fetch_ml/tests/e2e/job_lifecycle_e2e_test.go
Jeremie Fraeys ea15af1833 Fix multi-user authentication and clean up debug code
- Fix YAML tags in auth config struct (json -> yaml)
- Update CLI configs to use pre-hashed API keys
- Remove double hashing in WebSocket client
- Fix port mapping (9102 -> 9103) in CLI commands
- Update permission keys to use jobs:read, jobs:create, etc.
- Clean up all debug logging from CLI and server
- All user roles now authenticate correctly:
  * Admin: Can queue jobs and see all jobs
  * Researcher: Can queue jobs and see own jobs
  * Analyst: Can see status (read-only access)

Multi-user authentication is now fully functional.
2025-12-06 12:35:32 -05:00

598 lines
15 KiB
Go

package tests
import (
"context"
"fmt"
"os"
"path/filepath"
"testing"
"time"
"github.com/jfraeys/fetch_ml/internal/experiment"
"github.com/jfraeys/fetch_ml/internal/storage"
fixtures "github.com/jfraeys/fetch_ml/tests/fixtures"
"github.com/redis/go-redis/v9"
)
// Job status values shared by the lifecycle tests in this file.
const (
	// statusCompleted is the status string the tests write to, and expect back
	// from, both the database and Redis for a job that finished.
	statusCompleted = "completed"
)
// setupRedis returns a Redis client for e2e tests, connected to
// localhost:6379 and selecting DB 2.
//
// The calling test is skipped when the server does not answer a ping. The
// selected database is flushed before the client is handed out, and a cleanup
// is registered that flushes it again and closes the client once the test has
// finished. Callers should not close the returned client themselves: a Close
// deferred in the test body runs before this cleanup and makes the final flush
// fail.
func setupRedis(t *testing.T) *redis.Client {
	t.Helper() // report skips/failures at the caller's line

	rdb := redis.NewClient(&redis.Options{
		Addr:     "localhost:6379",
		Password: "",
		DB:       2, // Use DB 2 for e2e tests to avoid conflicts
	})
	ctx := context.Background()
	if err := rdb.Ping(ctx).Err(); err != nil {
		_ = rdb.Close() // nothing will use this client; release it before skipping
		t.Skipf("Redis not available, skipping e2e test: %v", err)
		return nil // not reached (Skipf stops the test) but required by the compiler
	}
	// Start from an empty test database. The tests assert on queue length, so
	// leftover keys would produce misleading failures; fail loudly instead.
	if err := rdb.FlushDB(ctx).Err(); err != nil {
		_ = rdb.Close()
		t.Fatalf("Failed to flush Redis test database: %v", err)
	}
	t.Cleanup(func() {
		// Best-effort teardown: the test outcome is already decided here.
		_ = rdb.FlushDB(ctx)
		_ = rdb.Close()
	})
	return rdb
}
// TestCompleteJobLifecycle walks a single job through its whole lifecycle
// (created, queued, running, metrics recorded, completed) and then checks that
// the database, Redis and the experiment manager agree on the final state.
func TestCompleteJobLifecycle(t *testing.T) {
	// t.Parallel() // Disable parallel to avoid Redis conflicts

	// Setup test environment
	tempDir := t.TempDir()
	// setupRedis skips the test when Redis is unreachable and registers its own
	// cleanup (flush + close). A Close deferred here would run before that
	// cleanup and make its final FlushDB fail, so the client is not closed here.
	rdb := setupRedis(t)
	ctx := context.Background()

	// Setup database
	db, err := storage.NewDBFromPath(filepath.Join(tempDir, "test.db"))
	if err != nil {
		t.Fatalf("Failed to create database: %v", err)
	}
	defer func() { _ = db.Close() }()

	// Initialize database schema
	if err = db.Initialize(fixtures.TestSchema); err != nil {
		t.Fatalf("Failed to initialize database: %v", err)
	}

	// Setup experiment manager
	expDir := filepath.Join(tempDir, "experiments")
	expManager := experiment.NewManager(expDir)

	// Test 1: Complete job lifecycle
	jobID := "lifecycle-job-1"
	statusKey := "ml:status:" + jobID

	// Step 1: Create job
	job := &storage.Job{
		ID:        jobID,
		JobName:   "Lifecycle Test Job",
		Status:    "pending",
		CreatedAt: time.Now(),
		UpdatedAt: time.Now(),
		Args:      "",
		Priority:  0,
	}
	if err = db.CreateJob(job); err != nil {
		t.Fatalf("Failed to create job: %v", err)
	}

	// Step 2: Queue job in Redis
	if err = rdb.LPush(ctx, "ml:queue", jobID).Err(); err != nil {
		t.Fatalf("Failed to queue job: %v", err)
	}

	// Step 3: Create experiment
	if err = expManager.CreateExperiment(jobID); err != nil {
		t.Fatalf("Failed to create experiment: %v", err)
	}

	// Create experiment metadata next to the experiments as <jobID>.yaml.
	if err = os.MkdirAll(expDir, 0750); err != nil {
		t.Fatalf("Failed to create experiment directory: %v", err)
	}
	expPath := filepath.Join(expDir, jobID+".yaml")
	expData := fmt.Sprintf(`name: %s
commit_id: abc123
user: testuser
created_at: %s
`, job.JobName, job.CreatedAt.Format(time.RFC3339))
	if err = os.WriteFile(expPath, []byte(expData), 0600); err != nil {
		t.Fatalf("Failed to create experiment metadata: %v", err)
	}

	// Step 4: Update job status to running
	if err = db.UpdateJobStatus(job.ID, "running", "worker-1", ""); err != nil {
		t.Fatalf("Failed to update job status to running: %v", err)
	}
	// Update Redis status
	if err = rdb.Set(ctx, statusKey, "running", time.Hour).Err(); err != nil {
		t.Fatalf("Failed to set Redis status: %v", err)
	}

	// Step 5: Record metrics during execution
	if err = db.RecordJobMetric(jobID, "cpu_usage", "75.5"); err != nil {
		t.Fatalf("Failed to record job metric: %v", err)
	}
	if err = db.RecordJobMetric(jobID, "memory_usage", "1024.0"); err != nil {
		t.Fatalf("Failed to record job metric: %v", err)
	}

	// Step 6: Complete job
	if err = db.UpdateJobStatus(jobID, statusCompleted, "worker-1", ""); err != nil {
		t.Fatalf("Failed to update job status to completed: %v", err)
	}
	// Pop job from queue to simulate processing. The queue holds exactly the
	// one ID pushed above, so that is what must come back.
	popped, err := rdb.LPop(ctx, "ml:queue").Result()
	if err != nil {
		t.Fatalf("Failed to pop job from queue: %v", err)
	}
	if popped != jobID {
		t.Errorf("Expected to pop job '%s' from queue, got '%s'", jobID, popped)
	}
	if err = rdb.Set(ctx, statusKey, statusCompleted, time.Hour).Err(); err != nil {
		t.Fatalf("Failed to update Redis status: %v", err)
	}

	// Step 7: Verify complete lifecycle
	// Check job in database
	finalJob, err := db.GetJob(jobID)
	if err != nil {
		t.Fatalf("Failed to get final job: %v", err)
	}
	if finalJob.Status != statusCompleted {
		t.Errorf("Expected job status 'completed', got '%s'", finalJob.Status)
	}

	// Check Redis status
	redisStatus, err := rdb.Get(ctx, statusKey).Result()
	if err != nil {
		t.Fatalf("Failed to get Redis status: %v", err)
	}
	if redisStatus != statusCompleted {
		t.Errorf("Expected Redis status 'completed', got '%s'", redisStatus)
	}

	// Check experiment exists
	if !expManager.ExperimentExists(jobID) {
		t.Error("Experiment should exist")
	}

	// Check metrics
	metrics, err := db.GetJobMetrics(jobID)
	if err != nil {
		t.Fatalf("Failed to get job metrics: %v", err)
	}
	if len(metrics) != 2 {
		t.Errorf("Expected 2 metrics, got %d", len(metrics))
	}

	// Check queue is empty
	queueLength, err := rdb.LLen(ctx, "ml:queue").Result()
	if err != nil {
		t.Fatalf("Failed to read queue length: %v", err)
	}
	if queueLength != 0 {
		t.Errorf("Expected empty queue, got %d", queueLength)
	}
}
// TestMultipleJobsLifecycle creates and queues several jobs, drives each one
// through running -> completed, and verifies that every job ends up completed
// in both the database and Redis and that the queue is drained.
func TestMultipleJobsLifecycle(t *testing.T) {
	// t.Parallel() // Disable parallel to avoid Redis conflicts

	// Setup test environment
	tempDir := t.TempDir()
	// setupRedis skips the test when Redis is unreachable and registers its own
	// cleanup (flush + close). A Close deferred here would run before that
	// cleanup and make its final FlushDB fail, so the client is not closed here.
	rdb := setupRedis(t)
	ctx := context.Background()

	// Setup database
	db, err := storage.NewDBFromPath(filepath.Join(tempDir, "test.db"))
	if err != nil {
		t.Fatalf("Failed to create database: %v", err)
	}
	defer func() { _ = db.Close() }()

	// Initialize database schema
	if err = db.Initialize(fixtures.TestSchema); err != nil {
		t.Fatalf("Failed to initialize database: %v", err)
	}

	// Test 2: Multiple concurrent jobs
	numJobs := 3
	jobIDs := make([]string, numJobs)

	// Create multiple jobs
	for i := range jobIDs {
		jobID := fmt.Sprintf("multi-job-%d", i)
		jobIDs[i] = jobID
		job := &storage.Job{
			ID:        jobID,
			JobName:   fmt.Sprintf("Multi Job %d", i),
			Status:    "pending",
			CreatedAt: time.Now(),
			UpdatedAt: time.Now(),
			Args:      "",
			Priority:  0,
		}
		if err = db.CreateJob(job); err != nil {
			t.Fatalf("Failed to create job %d: %v", i, err)
		}
		// Queue job
		if err = rdb.LPush(ctx, "ml:queue", jobID).Err(); err != nil {
			t.Fatalf("Failed to queue job %d: %v", i, err)
		}
	}

	// Verify all jobs are queued
	queueLength, err := rdb.LLen(ctx, "ml:queue").Result()
	if err != nil {
		t.Fatalf("Failed to read queue length: %v", err)
	}
	if int(queueLength) != numJobs {
		t.Errorf("Expected queue length %d, got %d", numJobs, queueLength)
	}

	// Process jobs
	for i, jobID := range jobIDs {
		statusKey := "ml:status:" + jobID

		// Update to running
		if err = db.UpdateJobStatus(jobID, "running", "worker-1", ""); err != nil {
			t.Fatalf("Failed to update job %d to running: %v", i, err)
		}
		if err = rdb.Set(ctx, statusKey, "running", time.Hour).Err(); err != nil {
			t.Fatalf("Failed to set Redis status for job %d: %v", i, err)
		}

		// Record metric
		cpu := fmt.Sprintf("%.1f", float64(50+i*10))
		if err = db.RecordJobMetric(jobID, "cpu_usage", cpu); err != nil {
			t.Fatalf("Failed to record metric for job %d: %v", i, err)
		}

		// Complete job
		if err = db.UpdateJobStatus(jobID, statusCompleted, "worker-1", ""); err != nil {
			t.Fatalf("Failed to update job %d to completed: %v", i, err)
		}

		// Pop job from queue to simulate processing. LPUSH followed by LPOP
		// removes the most recently queued ID, so the popped value is not
		// necessarily jobID; only the queue depth is being exercised here.
		if _, err = rdb.LPop(ctx, "ml:queue").Result(); err != nil {
			t.Fatalf("Failed to pop job %d from queue: %v", i, err)
		}
		if err = rdb.Set(ctx, statusKey, statusCompleted, time.Hour).Err(); err != nil {
			t.Fatalf("Failed to update Redis status for job %d: %v", i, err)
		}
	}

	// Verify all jobs completed
	for i, jobID := range jobIDs {
		job, err := db.GetJob(jobID)
		if err != nil {
			t.Fatalf("Failed to get job %d: %v", i, err)
		}
		if job.Status != statusCompleted {
			t.Errorf("Job %d status should be completed, got '%s'", i, job.Status)
		}
		redisStatus, err := rdb.Get(ctx, "ml:status:"+jobID).Result()
		if err != nil {
			t.Fatalf("Failed to get Redis status for job %d: %v", i, err)
		}
		if redisStatus != statusCompleted {
			t.Errorf("Job %d Redis status should be completed, got '%s'", i, redisStatus)
		}
	}

	// Verify queue is empty
	queueLength, err = rdb.LLen(ctx, "ml:queue").Result()
	if err != nil {
		t.Fatalf("Failed to read queue length: %v", err)
	}
	if queueLength != 0 {
		t.Errorf("Expected empty queue, got %d", queueLength)
	}
}
// TestFailedJobHandling queues a job, marks it running, then records a
// simulated failure and verifies that the database and Redis both report the
// job as failed and that the queue no longer holds it.
func TestFailedJobHandling(t *testing.T) {
	// t.Parallel() // Disable parallel to avoid Redis conflicts

	// Status string written and expected for the failed job.
	const statusFailed = "failed"

	// Setup test environment
	tempDir := t.TempDir()
	// setupRedis skips the test when Redis is unreachable and registers its own
	// cleanup (flush + close). A Close deferred here would run before that
	// cleanup and make its final FlushDB fail, so the client is not closed here.
	rdb := setupRedis(t)
	ctx := context.Background()

	// Setup database
	db, err := storage.NewDBFromPath(filepath.Join(tempDir, "test.db"))
	if err != nil {
		t.Fatalf("Failed to create database: %v", err)
	}
	defer func() { _ = db.Close() }()

	// Initialize database schema
	schema := `
CREATE TABLE IF NOT EXISTS jobs (
id TEXT PRIMARY KEY,
job_name TEXT NOT NULL,
args TEXT,
status TEXT NOT NULL DEFAULT 'pending',
priority INTEGER DEFAULT 0,
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
started_at DATETIME,
ended_at DATETIME,
worker_id TEXT,
error TEXT,
datasets TEXT,
metadata TEXT,
updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
);
CREATE TABLE IF NOT EXISTS workers (
id TEXT PRIMARY KEY,
hostname TEXT NOT NULL,
last_heartbeat DATETIME DEFAULT CURRENT_TIMESTAMP,
status TEXT NOT NULL DEFAULT 'active',
current_jobs INTEGER DEFAULT 0,
max_jobs INTEGER DEFAULT 1,
metadata TEXT
);
CREATE TABLE IF NOT EXISTS job_metrics (
job_id TEXT NOT NULL,
metric_name TEXT NOT NULL,
metric_value TEXT NOT NULL,
timestamp DATETIME DEFAULT CURRENT_TIMESTAMP,
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (job_id, metric_name),
FOREIGN KEY (job_id) REFERENCES jobs(id) ON DELETE CASCADE
);
`
	if err = db.Initialize(schema); err != nil {
		t.Fatalf("Failed to initialize database: %v", err)
	}

	// Test 3: Failed job handling
	jobID := "failed-job-1"
	statusKey := "ml:status:" + jobID

	// Create job
	job := &storage.Job{
		ID:        jobID,
		JobName:   "Failed Test Job",
		Status:    "pending",
		CreatedAt: time.Now(),
		UpdatedAt: time.Now(),
		Args:      "",
		Priority:  0,
	}
	if err = db.CreateJob(job); err != nil {
		t.Fatalf("Failed to create job: %v", err)
	}

	// Queue job
	if err = rdb.LPush(ctx, "ml:queue", jobID).Err(); err != nil {
		t.Fatalf("Failed to queue job: %v", err)
	}

	// Update to running
	if err = db.UpdateJobStatus(jobID, "running", "worker-1", ""); err != nil {
		t.Fatalf("Failed to update job to running: %v", err)
	}
	if err = rdb.Set(ctx, statusKey, "running", time.Hour).Err(); err != nil {
		t.Fatalf("Failed to set Redis status: %v", err)
	}

	// Simulate failure
	if err = db.UpdateJobStatus(jobID, statusFailed, "worker-1", "simulated error"); err != nil {
		t.Fatalf("Failed to update job to failed: %v", err)
	}

	// Pop job from queue to simulate processing (even failed jobs are processed).
	// The queue holds exactly the one ID pushed above, so that is what must
	// come back.
	popped, err := rdb.LPop(ctx, "ml:queue").Result()
	if err != nil {
		t.Fatalf("Failed to pop job from queue: %v", err)
	}
	if popped != jobID {
		t.Errorf("Expected to pop job '%s' from queue, got '%s'", jobID, popped)
	}
	if err = rdb.Set(ctx, statusKey, statusFailed, time.Hour).Err(); err != nil {
		t.Fatalf("Failed to update Redis status: %v", err)
	}

	// Verify failed state
	finalJob, err := db.GetJob(jobID)
	if err != nil {
		t.Fatalf("Failed to get final job: %v", err)
	}
	if finalJob.Status != statusFailed {
		t.Errorf("Expected job status 'failed', got '%s'", finalJob.Status)
	}
	redisStatus, err := rdb.Get(ctx, statusKey).Result()
	if err != nil {
		t.Fatalf("Failed to get Redis status: %v", err)
	}
	if redisStatus != statusFailed {
		t.Errorf("Expected Redis status 'failed', got '%s'", redisStatus)
	}

	// Verify queue is empty (job was processed)
	queueLength, err := rdb.LLen(ctx, "ml:queue").Result()
	if err != nil {
		t.Fatalf("Failed to read queue length: %v", err)
	}
	if queueLength != 0 {
		t.Errorf("Expected empty queue, got %d", queueLength)
	}
}
// TestJobCleanup creates a job plus an experiment with backdated metadata,
// completes the job, prunes experiments, and verifies that the experiment is
// removed while the job row is still retrievable from the database.
func TestJobCleanup(t *testing.T) {
	// t.Parallel() // Disable parallel to avoid Redis conflicts

	// Setup test environment
	tempDir := t.TempDir()
	// The Redis client is not used by this test. The call is kept so the test
	// is still skipped when Redis is unavailable and the test database is still
	// flushed, as before. setupRedis registers its own cleanup (flush + close);
	// closing the client here would run first and make that flush fail.
	_ = setupRedis(t)

	// Setup database
	db, err := storage.NewDBFromPath(filepath.Join(tempDir, "test.db"))
	if err != nil {
		t.Fatalf("Failed to create database: %v", err)
	}
	defer func() { _ = db.Close() }()

	// Initialize database schema
	schema := `
CREATE TABLE IF NOT EXISTS jobs (
id TEXT PRIMARY KEY,
job_name TEXT NOT NULL,
args TEXT,
status TEXT NOT NULL DEFAULT 'pending',
priority INTEGER DEFAULT 0,
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
started_at DATETIME,
ended_at DATETIME,
worker_id TEXT,
error TEXT,
datasets TEXT,
metadata TEXT,
updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
);
CREATE TABLE IF NOT EXISTS workers (
id TEXT PRIMARY KEY,
hostname TEXT NOT NULL,
last_heartbeat DATETIME DEFAULT CURRENT_TIMESTAMP,
status TEXT NOT NULL DEFAULT 'active',
current_jobs INTEGER DEFAULT 0,
max_jobs INTEGER DEFAULT 1,
metadata TEXT
);
CREATE TABLE IF NOT EXISTS job_metrics (
job_id TEXT NOT NULL,
metric_name TEXT NOT NULL,
metric_value TEXT NOT NULL,
timestamp DATETIME DEFAULT CURRENT_TIMESTAMP,
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (job_id, metric_name),
FOREIGN KEY (job_id) REFERENCES jobs(id) ON DELETE CASCADE
);
`
	if err = db.Initialize(schema); err != nil {
		t.Fatalf("Failed to initialize database: %v", err)
	}

	// Setup experiment manager
	expManager := experiment.NewManager(filepath.Join(tempDir, "experiments"))

	// Test 4: Job cleanup
	jobID := "cleanup-job-1"
	commitID := "cleanupcommit"

	// Create job and experiment
	job := &storage.Job{
		ID:        jobID,
		JobName:   "Cleanup Test Job",
		Status:    "pending",
		CreatedAt: time.Now(),
		UpdatedAt: time.Now(),
		Args:      "",
		Priority:  0,
	}
	if err = db.CreateJob(job); err != nil {
		t.Fatalf("Failed to create job: %v", err)
	}

	// Create experiment with proper metadata
	if err = expManager.CreateExperiment(commitID); err != nil {
		t.Fatalf("Failed to create experiment: %v", err)
	}

	// Create proper metadata file
	metadata := &experiment.Metadata{
		CommitID:  commitID,
		Timestamp: time.Now().AddDate(0, 0, -2).Unix(), // 2 days ago
		JobName:   "Cleanup Test Job",
		User:      "testuser",
	}
	if err = expManager.WriteMetadata(metadata); err != nil {
		t.Fatalf("Failed to write metadata: %v", err)
	}

	// Add some files to experiment
	filesDir := expManager.GetFilesPath(commitID)
	testFile := filepath.Join(filesDir, "test.txt")
	if err = os.WriteFile(testFile, []byte("test content"), 0600); err != nil {
		t.Fatalf("Failed to create test file: %v", err)
	}

	// Verify experiment exists
	if !expManager.ExperimentExists(commitID) {
		t.Error("Experiment should exist")
	}

	// Complete job
	if err = db.UpdateJobStatus(jobID, statusCompleted, "worker-1", ""); err != nil {
		t.Fatalf("Failed to update job status: %v", err)
	}

	// Cleanup old experiments (keep 0 - should prune everything)
	pruned, err := expManager.PruneExperiments(0, 0)
	if err != nil {
		t.Fatalf("Failed to prune experiments: %v", err)
	}
	if len(pruned) != 1 {
		t.Errorf("Expected 1 pruned experiment, got %d", len(pruned))
	}

	// Verify experiment is gone
	if expManager.ExperimentExists(commitID) {
		t.Error("Experiment should be pruned")
	}

	// Verify job still exists in database
	if _, err = db.GetJob(jobID); err != nil {
		t.Errorf("Job should still exist in database: %v", err)
	}
}