fetch_ml/tests/unit/api/duplicate_detection_process_test.go
Jeremie Fraeys 7305e2bc21
test: add comprehensive test coverage and command improvements
- Add logs and debug end-to-end tests
- Add test helper utilities
- Improve test fixtures and templates
- Update API server and config lint commands
- Add multi-user database initialization
2026-02-16 20:38:15 -05:00

95 lines
3.7 KiB
Go

package api_test
import (
"testing"
"github.com/jfraeys/fetch_ml/internal/api/helpers"
"github.com/jfraeys/fetch_ml/internal/queue"
)
// TestDuplicateDetectionProcess demonstrates the duplicate detection process
// step by step. A job is identified by the composite key
// (commit_id, dataset_id, params_hash); only submissions where all three
// components match are flagged as duplicates.
func TestDuplicateDetectionProcess(t *testing.T) {
	t.Log("=== Duplicate Detection Process Test ===")

	// Step 1: First job submission — establishes the baseline composite key.
	t.Log("\n1. First job submission:")
	commitID := "abc123def456"
	args1 := "--epochs 10 --lr 0.001"
	datasets := []queue.DatasetSpec{{Name: "mnist", Checksum: "sha256:abc123"}}
	datasetID1 := helpers.ComputeDatasetID(datasets, nil)
	paramsHash1 := helpers.ComputeParamsHash(args1)
	t.Logf(" Commit ID: %s", commitID)
	t.Logf(" Dataset ID: %s (computed from %d datasets)", datasetID1, len(datasets))
	t.Logf(" Params Hash: %s (computed from args: %s)", paramsHash1, args1)
	t.Logf(" Composite Key: (%s, %s, %s)", commitID, datasetID1, paramsHash1)

	// Step 2: Second job with SAME parameters (should be duplicate).
	t.Log("\n2. Second job submission (same params):")
	args2 := "--epochs 10 --lr 0.001" // Same args
	datasets2 := []queue.DatasetSpec{{Name: "mnist", Checksum: "sha256:abc123"}} // Same dataset
	datasetID2 := helpers.ComputeDatasetID(datasets2, nil)
	paramsHash2 := helpers.ComputeParamsHash(args2)
	t.Logf(" Commit ID: %s", commitID)
	t.Logf(" Dataset ID: %s", datasetID2)
	t.Logf(" Params Hash: %s", paramsHash2)
	t.Logf(" Composite Key: (%s, %s, %s)", commitID, datasetID2, paramsHash2)

	// Identical inputs must hash to the identical composite key.
	if datasetID1 == datasetID2 && paramsHash1 == paramsHash2 {
		t.Log(" ✓ DUPLICATE DETECTED - same composite key!")
	} else {
		t.Error(" ✗ Should have been detected as duplicate")
	}

	// Step 3: Third job with DIFFERENT parameters (not duplicate).
	t.Log("\n3. Third job submission (different params):")
	args3 := "--epochs 20 --lr 0.01" // Different args
	datasets3 := []queue.DatasetSpec{{Name: "mnist", Checksum: "sha256:abc123"}} // Same dataset
	datasetID3 := helpers.ComputeDatasetID(datasets3, nil)
	paramsHash3 := helpers.ComputeParamsHash(args3)
	t.Logf(" Commit ID: %s", commitID)
	t.Logf(" Dataset ID: %s", datasetID3)
	t.Logf(" Params Hash: %s", paramsHash3)
	t.Logf(" Composite Key: (%s, %s, %s)", commitID, datasetID3, paramsHash3)

	// Only params differ, so dataset_id must stay stable while params_hash
	// changes — check both halves so the test can't pass for the wrong reason.
	if datasetID1 != datasetID3 {
		t.Error(" ✗ dataset_id should be unchanged when only args differ")
	}
	if paramsHash1 != paramsHash3 {
		t.Log(" ✓ NOT A DUPLICATE - different params_hash")
	} else {
		t.Error(" ✗ Should have different params_hash")
	}

	// Step 4: Fourth job with DIFFERENT dataset (not duplicate).
	t.Log("\n4. Fourth job submission (different dataset):")
	args4 := "--epochs 10 --lr 0.001" // Same args
	datasets4 := []queue.DatasetSpec{{Name: "cifar10", Checksum: "sha256:def456"}} // Different dataset
	datasetID4 := helpers.ComputeDatasetID(datasets4, nil)
	paramsHash4 := helpers.ComputeParamsHash(args4)
	t.Logf(" Commit ID: %s", commitID)
	t.Logf(" Dataset ID: %s", datasetID4)
	t.Logf(" Params Hash: %s", paramsHash4)
	t.Logf(" Composite Key: (%s, %s, %s)", commitID, datasetID4, paramsHash4)

	// Only the dataset differs, so params_hash must stay stable while
	// dataset_id changes — again check both halves.
	if paramsHash1 != paramsHash4 {
		t.Error(" ✗ params_hash should be unchanged when only dataset differs")
	}
	if datasetID1 != datasetID4 {
		t.Log(" ✓ NOT A DUPLICATE - different dataset_id")
	} else {
		t.Error(" ✗ Should have different dataset_id")
	}

	// Step 5: Summary of the demonstrated behavior.
	t.Log("\n5. Summary:")
	t.Log(" - Jobs 1 & 2: Same commit_id + dataset_id + params_hash = DUPLICATE")
	t.Log(" - Job 3: Different params_hash = NOT DUPLICATE")
	t.Log(" - Job 4: Different dataset_id = NOT DUPLICATE")
	t.Log("\n The composite key (commit_id, dataset_id, params_hash) ensures")
	t.Log(" only truly identical experiments are flagged as duplicates.")
}