package tests import ( "context" "fmt" "os" "path/filepath" "testing" "time" "github.com/jfraeys/fetch_ml/internal/experiment" "github.com/jfraeys/fetch_ml/internal/storage" "github.com/redis/go-redis/v9" ) // setupRedis creates a Redis client for testing func setupRedis(t *testing.T) *redis.Client { rdb := redis.NewClient(&redis.Options{ Addr: "localhost:6379", Password: "", DB: 2, // Use DB 2 for e2e tests to avoid conflicts }) ctx := context.Background() if err := rdb.Ping(ctx).Err(); err != nil { t.Skipf("Redis not available, skipping e2e test: %v", err) return nil } // Clean up the test database rdb.FlushDB(ctx) t.Cleanup(func() { rdb.FlushDB(ctx) rdb.Close() }) return rdb } func TestCompleteJobLifecycle(t *testing.T) { // t.Parallel() // Disable parallel to avoid Redis conflicts // Setup test environment tempDir := t.TempDir() rdb := setupRedis(t) if rdb == nil { return } defer rdb.Close() // Setup database db, err := storage.NewDBFromPath(filepath.Join(tempDir, "test.db")) if err != nil { t.Fatalf("Failed to create database: %v", err) } defer db.Close() // Initialize database schema schema := ` CREATE TABLE IF NOT EXISTS jobs ( id TEXT PRIMARY KEY, job_name TEXT NOT NULL, args TEXT, status TEXT NOT NULL DEFAULT 'pending', priority INTEGER DEFAULT 0, created_at DATETIME DEFAULT CURRENT_TIMESTAMP, started_at DATETIME, ended_at DATETIME, worker_id TEXT, error TEXT, datasets TEXT, metadata TEXT, updated_at DATETIME DEFAULT CURRENT_TIMESTAMP ); CREATE TABLE IF NOT EXISTS workers ( id TEXT PRIMARY KEY, hostname TEXT NOT NULL, last_heartbeat DATETIME DEFAULT CURRENT_TIMESTAMP, status TEXT NOT NULL DEFAULT 'active', current_jobs INTEGER DEFAULT 0, max_jobs INTEGER DEFAULT 1, metadata TEXT ); CREATE TABLE IF NOT EXISTS job_metrics ( job_id TEXT NOT NULL, metric_name TEXT NOT NULL, metric_value TEXT NOT NULL, timestamp DATETIME DEFAULT CURRENT_TIMESTAMP, created_at DATETIME DEFAULT CURRENT_TIMESTAMP, PRIMARY KEY (job_id, metric_name), FOREIGN KEY (job_id) REFERENCES jobs(id) ON DELETE CASCADE ); ` err = db.Initialize(schema) if err != nil { t.Fatalf("Failed to initialize database: %v", err) } // Setup experiment manager expManager := experiment.NewManager(filepath.Join(tempDir, "experiments")) // Test 1: Complete job lifecycle jobID := "lifecycle-job-1" // Step 1: Create job job := &storage.Job{ ID: jobID, JobName: "Lifecycle Test Job", Status: "pending", CreatedAt: time.Now(), UpdatedAt: time.Now(), Args: "", Priority: 0, } err = db.CreateJob(job) if err != nil { t.Fatalf("Failed to create job: %v", err) } // Step 2: Queue job in Redis ctx := context.Background() err = rdb.LPush(ctx, "ml:queue", jobID).Err() if err != nil { t.Fatalf("Failed to queue job: %v", err) } // Step 3: Create experiment err = expManager.CreateExperiment(jobID) if err != nil { t.Fatalf("Failed to create experiment: %v", err) } // Create experiment metadata expDir := filepath.Join(tempDir, "experiments") os.MkdirAll(expDir, 0755) expPath := filepath.Join(expDir, jobID+".yaml") expData := fmt.Sprintf(`name: %s commit_id: abc123 user: testuser created_at: %s `, job.JobName, job.CreatedAt.Format(time.RFC3339)) err = os.WriteFile(expPath, []byte(expData), 0644) if err != nil { t.Fatalf("Failed to create experiment metadata: %v", err) } // Step 4: Update job status to running err = db.UpdateJobStatus(job.ID, "running", "worker-1", "") if err != nil { t.Fatalf("Failed to update job status to running: %v", err) } // Update Redis status err = rdb.Set(ctx, "ml:status:"+jobID, "running", time.Hour).Err() if err != nil { t.Fatalf("Failed to set Redis status: %v", err) } // Step 5: Record metrics during execution err = db.RecordJobMetric(jobID, "cpu_usage", "75.5") if err != nil { t.Fatalf("Failed to record job metric: %v", err) } err = db.RecordJobMetric(jobID, "memory_usage", "1024.0") if err != nil { t.Fatalf("Failed to record job metric: %v", err) } // Step 6: Complete job err = db.UpdateJobStatus(jobID, "completed", "worker-1", "") if err != nil { t.Fatalf("Failed to update job status to completed: %v", err) } // Pop job from queue to simulate processing _, err = rdb.LPop(ctx, "ml:queue").Result() if err != nil { t.Fatalf("Failed to pop job from queue: %v", err) } err = rdb.Set(ctx, "ml:status:"+jobID, "completed", time.Hour).Err() if err != nil { t.Fatalf("Failed to update Redis status: %v", err) } // Step 7: Verify complete lifecycle // Check job in database finalJob, err := db.GetJob(jobID) if err != nil { t.Fatalf("Failed to get final job: %v", err) } if finalJob.Status != "completed" { t.Errorf("Expected job status 'completed', got '%s'", finalJob.Status) } // Check Redis status redisStatus := rdb.Get(ctx, "ml:status:"+jobID).Val() if redisStatus != "completed" { t.Errorf("Expected Redis status 'completed', got '%s'", redisStatus) } // Check experiment exists if !expManager.ExperimentExists(jobID) { t.Error("Experiment should exist") } // Check metrics metrics, err := db.GetJobMetrics(jobID) if err != nil { t.Fatalf("Failed to get job metrics: %v", err) } if len(metrics) != 2 { t.Errorf("Expected 2 metrics, got %d", len(metrics)) } // Check queue is empty queueLength := rdb.LLen(ctx, "ml:queue").Val() if queueLength != 0 { t.Errorf("Expected empty queue, got %d", queueLength) } } func TestMultipleJobsLifecycle(t *testing.T) { // t.Parallel() // Disable parallel to avoid Redis conflicts // Setup test environment tempDir := t.TempDir() rdb := setupRedis(t) if rdb == nil { return } defer rdb.Close() // Setup database db, err := storage.NewDBFromPath(filepath.Join(tempDir, "test.db")) if err != nil { t.Fatalf("Failed to create database: %v", err) } defer db.Close() // Initialize database schema schema := ` CREATE TABLE IF NOT EXISTS jobs ( id TEXT PRIMARY KEY, job_name TEXT NOT NULL, args TEXT, status TEXT NOT NULL DEFAULT 'pending', priority INTEGER DEFAULT 0, created_at DATETIME DEFAULT CURRENT_TIMESTAMP, started_at DATETIME, ended_at DATETIME, worker_id TEXT, error TEXT, datasets TEXT, metadata TEXT, updated_at DATETIME DEFAULT CURRENT_TIMESTAMP ); CREATE TABLE IF NOT EXISTS workers ( id TEXT PRIMARY KEY, hostname TEXT NOT NULL, last_heartbeat DATETIME DEFAULT CURRENT_TIMESTAMP, status TEXT NOT NULL DEFAULT 'active', current_jobs INTEGER DEFAULT 0, max_jobs INTEGER DEFAULT 1, metadata TEXT ); CREATE TABLE IF NOT EXISTS job_metrics ( job_id TEXT NOT NULL, metric_name TEXT NOT NULL, metric_value TEXT NOT NULL, timestamp DATETIME DEFAULT CURRENT_TIMESTAMP, created_at DATETIME DEFAULT CURRENT_TIMESTAMP, PRIMARY KEY (job_id, metric_name), FOREIGN KEY (job_id) REFERENCES jobs(id) ON DELETE CASCADE ); ` err = db.Initialize(schema) if err != nil { t.Fatalf("Failed to initialize database: %v", err) } // Test 2: Multiple concurrent jobs numJobs := 3 jobIDs := make([]string, numJobs) // Create multiple jobs for i := 0; i < numJobs; i++ { jobID := fmt.Sprintf("multi-job-%d", i) jobIDs[i] = jobID job := &storage.Job{ ID: jobID, JobName: fmt.Sprintf("Multi Job %d", i), Status: "pending", CreatedAt: time.Now(), UpdatedAt: time.Now(), Args: "", Priority: 0, } err = db.CreateJob(job) if err != nil { t.Fatalf("Failed to create job %d: %v", i, err) } // Queue job ctx := context.Background() err = rdb.LPush(ctx, "ml:queue", jobID).Err() if err != nil { t.Fatalf("Failed to queue job %d: %v", i, err) } } // Verify all jobs are queued ctx := context.Background() queueLength := rdb.LLen(ctx, "ml:queue").Val() if int(queueLength) != numJobs { t.Errorf("Expected queue length %d, got %d", numJobs, queueLength) } // Process jobs for i, jobID := range jobIDs { // Update to running err = db.UpdateJobStatus(jobID, "running", "worker-1", "") if err != nil { t.Fatalf("Failed to update job %d to running: %v", i, err) } err = rdb.Set(ctx, "ml:status:"+jobID, "running", time.Hour).Err() if err != nil { t.Fatalf("Failed to set Redis status for job %d: %v", i, err) } // Record metric err = db.RecordJobMetric(jobID, "cpu_usage", fmt.Sprintf("%.1f", float64(50+i*10))) if err != nil { t.Fatalf("Failed to record metric for job %d: %v", i, err) } // Complete job err = db.UpdateJobStatus(jobID, "completed", "worker-1", "") if err != nil { t.Fatalf("Failed to update job %d to completed: %v", i, err) } // Pop job from queue to simulate processing _, err = rdb.LPop(ctx, "ml:queue").Result() if err != nil { t.Fatalf("Failed to pop job %d from queue: %v", i, err) } err = rdb.Set(ctx, "ml:status:"+jobID, "completed", time.Hour).Err() if err != nil { t.Fatalf("Failed to update Redis status for job %d: %v", i, err) } } // Verify all jobs completed for i, jobID := range jobIDs { job, err := db.GetJob(jobID) if err != nil { t.Fatalf("Failed to get job %d: %v", i, err) } if job.Status != "completed" { t.Errorf("Job %d status should be completed, got '%s'", i, job.Status) } redisStatus := rdb.Get(ctx, "ml:status:"+jobID).Val() if redisStatus != "completed" { t.Errorf("Job %d Redis status should be completed, got '%s'", i, redisStatus) } } // Verify queue is empty queueLength = rdb.LLen(ctx, "ml:queue").Val() if queueLength != 0 { t.Errorf("Expected empty queue, got %d", queueLength) } } func TestFailedJobHandling(t *testing.T) { // t.Parallel() // Disable parallel to avoid Redis conflicts // Setup test environment tempDir := t.TempDir() rdb := setupRedis(t) if rdb == nil { return } defer rdb.Close() // Setup database db, err := storage.NewDBFromPath(filepath.Join(tempDir, "test.db")) if err != nil { t.Fatalf("Failed to create database: %v", err) } defer db.Close() // Initialize database schema schema := ` CREATE TABLE IF NOT EXISTS jobs ( id TEXT PRIMARY KEY, job_name TEXT NOT NULL, args TEXT, status TEXT NOT NULL DEFAULT 'pending', priority INTEGER DEFAULT 0, created_at DATETIME DEFAULT CURRENT_TIMESTAMP, started_at DATETIME, ended_at DATETIME, worker_id TEXT, error TEXT, datasets TEXT, metadata TEXT, updated_at DATETIME DEFAULT CURRENT_TIMESTAMP ); CREATE TABLE IF NOT EXISTS workers ( id TEXT PRIMARY KEY, hostname TEXT NOT NULL, last_heartbeat DATETIME DEFAULT CURRENT_TIMESTAMP, status TEXT NOT NULL DEFAULT 'active', current_jobs INTEGER DEFAULT 0, max_jobs INTEGER DEFAULT 1, metadata TEXT ); CREATE TABLE IF NOT EXISTS job_metrics ( job_id TEXT NOT NULL, metric_name TEXT NOT NULL, metric_value TEXT NOT NULL, timestamp DATETIME DEFAULT CURRENT_TIMESTAMP, created_at DATETIME DEFAULT CURRENT_TIMESTAMP, PRIMARY KEY (job_id, metric_name), FOREIGN KEY (job_id) REFERENCES jobs(id) ON DELETE CASCADE ); ` err = db.Initialize(schema) if err != nil { t.Fatalf("Failed to initialize database: %v", err) } // Test 3: Failed job handling jobID := "failed-job-1" // Create job job := &storage.Job{ ID: jobID, JobName: "Failed Test Job", Status: "pending", CreatedAt: time.Now(), UpdatedAt: time.Now(), Args: "", Priority: 0, } err = db.CreateJob(job) if err != nil { t.Fatalf("Failed to create job: %v", err) } // Queue job ctx := context.Background() err = rdb.LPush(ctx, "ml:queue", jobID).Err() if err != nil { t.Fatalf("Failed to queue job: %v", err) } // Update to running err = db.UpdateJobStatus(jobID, "running", "worker-1", "") if err != nil { t.Fatalf("Failed to update job to running: %v", err) } err = rdb.Set(ctx, "ml:status:"+jobID, "running", time.Hour).Err() if err != nil { t.Fatalf("Failed to set Redis status: %v", err) } // Simulate failure err = db.UpdateJobStatus(jobID, "failed", "worker-1", "simulated error") if err != nil { t.Fatalf("Failed to update job to failed: %v", err) } // Pop job from queue to simulate processing (even failed jobs are processed) _, err = rdb.LPop(ctx, "ml:queue").Result() if err != nil { t.Fatalf("Failed to pop job from queue: %v", err) } err = rdb.Set(ctx, "ml:status:"+jobID, "failed", time.Hour).Err() if err != nil { t.Fatalf("Failed to update Redis status: %v", err) } // Verify failed state finalJob, err := db.GetJob(jobID) if err != nil { t.Fatalf("Failed to get final job: %v", err) } if finalJob.Status != "failed" { t.Errorf("Expected job status 'failed', got '%s'", finalJob.Status) } redisStatus := rdb.Get(ctx, "ml:status:"+jobID).Val() if redisStatus != "failed" { t.Errorf("Expected Redis status 'failed', got '%s'", redisStatus) } // Verify queue is empty (job was processed) queueLength := rdb.LLen(ctx, "ml:queue").Val() if queueLength != 0 { t.Errorf("Expected empty queue, got %d", queueLength) } } func TestJobCleanup(t *testing.T) { // t.Parallel() // Disable parallel to avoid Redis conflicts // Setup test environment tempDir := t.TempDir() rdb := setupRedis(t) if rdb == nil { return } defer rdb.Close() // Setup database db, err := storage.NewDBFromPath(filepath.Join(tempDir, "test.db")) if err != nil { t.Fatalf("Failed to create database: %v", err) } defer db.Close() // Initialize database schema schema := ` CREATE TABLE IF NOT EXISTS jobs ( id TEXT PRIMARY KEY, job_name TEXT NOT NULL, args TEXT, status TEXT NOT NULL DEFAULT 'pending', priority INTEGER DEFAULT 0, created_at DATETIME DEFAULT CURRENT_TIMESTAMP, started_at DATETIME, ended_at DATETIME, worker_id TEXT, error TEXT, datasets TEXT, metadata TEXT, updated_at DATETIME DEFAULT CURRENT_TIMESTAMP ); CREATE TABLE IF NOT EXISTS workers ( id TEXT PRIMARY KEY, hostname TEXT NOT NULL, last_heartbeat DATETIME DEFAULT CURRENT_TIMESTAMP, status TEXT NOT NULL DEFAULT 'active', current_jobs INTEGER DEFAULT 0, max_jobs INTEGER DEFAULT 1, metadata TEXT ); CREATE TABLE IF NOT EXISTS job_metrics ( job_id TEXT NOT NULL, metric_name TEXT NOT NULL, metric_value TEXT NOT NULL, timestamp DATETIME DEFAULT CURRENT_TIMESTAMP, created_at DATETIME DEFAULT CURRENT_TIMESTAMP, PRIMARY KEY (job_id, metric_name), FOREIGN KEY (job_id) REFERENCES jobs(id) ON DELETE CASCADE ); ` err = db.Initialize(schema) if err != nil { t.Fatalf("Failed to initialize database: %v", err) } // Setup experiment manager expManager := experiment.NewManager(filepath.Join(tempDir, "experiments")) // Test 4: Job cleanup jobID := "cleanup-job-1" commitID := "cleanupcommit" // Create job and experiment job := &storage.Job{ ID: jobID, JobName: "Cleanup Test Job", Status: "pending", CreatedAt: time.Now(), UpdatedAt: time.Now(), Args: "", Priority: 0, } err = db.CreateJob(job) if err != nil { t.Fatalf("Failed to create job: %v", err) } // Create experiment with proper metadata err = expManager.CreateExperiment(commitID) if err != nil { t.Fatalf("Failed to create experiment: %v", err) } // Create proper metadata file metadata := &experiment.Metadata{ CommitID: commitID, Timestamp: time.Now().AddDate(0, 0, -2).Unix(), // 2 days ago JobName: "Cleanup Test Job", User: "testuser", } err = expManager.WriteMetadata(metadata) if err != nil { t.Fatalf("Failed to write metadata: %v", err) } // Add some files to experiment filesDir := expManager.GetFilesPath(commitID) testFile := filepath.Join(filesDir, "test.txt") err = os.WriteFile(testFile, []byte("test content"), 0644) if err != nil { t.Fatalf("Failed to create test file: %v", err) } // Verify experiment exists if !expManager.ExperimentExists(commitID) { t.Error("Experiment should exist") } // Complete job err = db.UpdateJobStatus(jobID, "completed", "worker-1", "") if err != nil { t.Fatalf("Failed to update job status: %v", err) } // Cleanup old experiments (keep 0 - should prune everything) pruned, err := expManager.PruneExperiments(0, 0) if err != nil { t.Fatalf("Failed to prune experiments: %v", err) } if len(pruned) != 1 { t.Errorf("Expected 1 pruned experiment, got %d", len(pruned)) } // Verify experiment is gone if expManager.ExperimentExists(commitID) { t.Error("Experiment should be pruned") } // Verify job still exists in database _, err = db.GetJob(jobID) if err != nil { t.Errorf("Job should still exist in database: %v", err) } }