package api_test

import (
	"testing"

	"github.com/jfraeys/fetch_ml/internal/api/helpers"
	"github.com/jfraeys/fetch_ml/internal/queue"
)

// logCompositeKey logs the three identifiers that together form a job's
// composite deduplication key. Marked as a test helper so failures report
// the caller's line.
func logCompositeKey(t *testing.T, commitID, datasetID, paramsHash string) {
	t.Helper()
	t.Logf(" Commit ID: %s", commitID)
	t.Logf(" Dataset ID: %s", datasetID)
	t.Logf(" Params Hash: %s", paramsHash)
	t.Logf(" Composite Key: (%s, %s, %s)", commitID, datasetID, paramsHash)
}

// TestDuplicateDetectionProcess demonstrates the duplicate detection process
// step by step: two jobs sharing the same (commit_id, dataset_id, params_hash)
// composite key are duplicates, while a change to either the args or the
// dataset spec yields a distinct key.
func TestDuplicateDetectionProcess(t *testing.T) {
	t.Log("=== Duplicate Detection Process Test ===")

	// Step 1: First job submission establishes the baseline composite key.
	t.Log("\n1. First job submission:")
	commitID := "abc123def456"
	args1 := "--epochs 10 --lr 0.001"
	datasets := []queue.DatasetSpec{{Name: "mnist", Checksum: "sha256:abc123"}}
	datasetID1 := helpers.ComputeDatasetID(datasets, nil)
	paramsHash1 := helpers.ComputeParamsHash(args1)
	t.Logf(" Commit ID: %s", commitID)
	t.Logf(" Dataset ID: %s (computed from %d datasets)", datasetID1, len(datasets))
	t.Logf(" Params Hash: %s (computed from args: %s)", paramsHash1, args1)
	t.Logf(" Composite Key: (%s, %s, %s)", commitID, datasetID1, paramsHash1)

	// Step 2: Second job with SAME parameters (should be duplicate).
	t.Log("\n2. Second job submission (same params):")
	args2 := "--epochs 10 --lr 0.001"                                            // Same args
	datasets2 := []queue.DatasetSpec{{Name: "mnist", Checksum: "sha256:abc123"}} // Same dataset
	datasetID2 := helpers.ComputeDatasetID(datasets2, nil)
	paramsHash2 := helpers.ComputeParamsHash(args2)
	logCompositeKey(t, commitID, datasetID2, paramsHash2)

	// Identical inputs must hash to the identical composite key.
	if datasetID1 == datasetID2 && paramsHash1 == paramsHash2 {
		t.Log(" ✓ DUPLICATE DETECTED - same composite key!")
	} else {
		t.Error(" ✗ Should have been detected as duplicate")
	}

	// Step 3: Third job with DIFFERENT parameters (not duplicate).
	t.Log("\n3. Third job submission (different params):")
	args3 := "--epochs 20 --lr 0.01"                                             // Different args
	datasets3 := []queue.DatasetSpec{{Name: "mnist", Checksum: "sha256:abc123"}} // Same dataset
	datasetID3 := helpers.ComputeDatasetID(datasets3, nil)
	paramsHash3 := helpers.ComputeParamsHash(args3)
	logCompositeKey(t, commitID, datasetID3, paramsHash3)

	// Changed args must produce a different params hash.
	if paramsHash1 != paramsHash3 {
		t.Log(" ✓ NOT A DUPLICATE - different params_hash")
	} else {
		t.Error(" ✗ Should have different params_hash")
	}

	// Step 4: Fourth job with DIFFERENT dataset (not duplicate).
	t.Log("\n4. Fourth job submission (different dataset):")
	args4 := "--epochs 10 --lr 0.001"                                              // Same args
	datasets4 := []queue.DatasetSpec{{Name: "cifar10", Checksum: "sha256:def456"}} // Different dataset
	datasetID4 := helpers.ComputeDatasetID(datasets4, nil)
	paramsHash4 := helpers.ComputeParamsHash(args4)
	logCompositeKey(t, commitID, datasetID4, paramsHash4)

	// Changed dataset spec must produce a different dataset ID.
	if datasetID1 != datasetID4 {
		t.Log(" ✓ NOT A DUPLICATE - different dataset_id")
	} else {
		t.Error(" ✗ Should have different dataset_id")
	}

	// Step 5: Summary of the scenarios exercised above.
	t.Log("\n5. Summary:")
	t.Log(" - Jobs 1 & 2: Same commit_id + dataset_id + params_hash = DUPLICATE")
	t.Log(" - Job 3: Different params_hash = NOT DUPLICATE")
	t.Log(" - Job 4: Different dataset_id = NOT DUPLICATE")
	t.Log("\n The composite key (commit_id, dataset_id, params_hash) ensures")
	t.Log(" only truly identical experiments are flagged as duplicates.")
}