package chaos

import (
	"context"
	"fmt"
	"sync"
	"testing"
	"time"

	"github.com/jfraeys/fetch_ml/internal/storage"
	fixtures "github.com/jfraeys/fetch_ml/tests/fixtures"
	"github.com/redis/go-redis/v9"
)

// TestChaosTestSuite tests system resilience under various failure conditions.
// Subtests that intentionally close or corrupt their connections get dedicated
// database/Redis instances; the remaining subtests share one of each.
func TestChaosTestSuite(t *testing.T) {
	// Probe Redis once up front so a missing server produces one clear
	// warning instead of a wall of individually skipped subtests.
	quickRedis := redis.NewClient(&redis.Options{Addr: "localhost:6379", DB: 6})
	if err := quickRedis.Ping(context.Background()).Err(); err != nil {
		t.Logf("WARNING: Redis not available at localhost:6379 - chaos tests will be skipped")
		t.Logf(" To run these tests, start Redis: redis-server --port 6379")
	}
	quickRedis.Close()

	// Tests that intentionally close/corrupt connections get their own resources
	// to prevent cascading failures to subsequent subtests.
	t.Run("DatabaseConnectionFailure", func(t *testing.T) {
		// This test intentionally closes the database, so it needs its own instance.
		tempDir := t.TempDir()
		rdb := setupChaosRedis(t)
		if rdb == nil {
			t.Skip("Redis not available for chaos tests")
		}
		db, err := storage.NewDBFromPath(fmt.Sprintf("%s/chaos.db", tempDir))
		if err != nil {
			t.Fatalf("Failed to create database: %v", err)
		}
		// No deferred Close: testDatabaseConnectionFailure closes db itself.
		if err := db.Initialize(getChaosSchema()); err != nil {
			t.Fatalf("Failed to initialize database: %v", err)
		}
		testDatabaseConnectionFailure(t, db, rdb)
	})

	t.Run("RedisConnectionFailure", func(t *testing.T) {
		// This test intentionally closes Redis, so it needs its own instance
		// (isolated: no t.Cleanup that would double-close it).
		tempDir := t.TempDir()
		rdb := setupChaosRedisIsolated(t)
		if rdb == nil {
			t.Skip("Redis not available for chaos tests")
		}
		db, err := storage.NewDBFromPath(fmt.Sprintf("%s/chaos.db", tempDir))
		if err != nil {
			t.Fatalf("Failed to create database: %v", err)
		}
		defer func() {
			if err := db.Close(); err != nil {
				t.Logf("Warning: failed to close database: %v", err)
			}
		}()
		if err := db.Initialize(getChaosSchema()); err != nil {
			t.Fatalf("Failed to initialize database: %v", err)
		}
		testRedisConnectionFailure(t, db, rdb)
	})

	// Remaining tests share resources since they don't corrupt connections.
	tempDir := t.TempDir()
	rdb := setupChaosRedis(t)
	if rdb == nil {
		t.Skip("Redis not available for chaos tests")
	}
	defer func() { _ = rdb.Close() }()

	db, err := storage.NewDBFromPath(fmt.Sprintf("%s/chaos.db", tempDir))
	if err != nil {
		t.Fatalf("Failed to create database: %v", err)
	}
	defer func() { _ = db.Close() }()
	if err := db.Initialize(getChaosSchema()); err != nil {
		t.Fatalf("Failed to initialize database: %v", err)
	}

	t.Run("HighConcurrencyStress", func(t *testing.T) { testHighConcurrencyStress(t, db, rdb) })
	t.Run("MemoryPressure", func(t *testing.T) { testMemoryPressure(t, db, rdb) })
	t.Run("NetworkLatency", func(t *testing.T) { testNetworkLatency(t, db, rdb) })
	t.Run("ResourceExhaustion", func(t *testing.T) { testResourceExhaustion(t, db, rdb) })
}

// testDatabaseConnectionFailure tests system behavior when the database fails:
// operations against a closed handle must error, and a fresh database must be
// able to take over afterwards.
func testDatabaseConnectionFailure(t *testing.T, db *storage.DB, _ *redis.Client) {
	// Create some jobs before the failure.
	jobIDs := createTestJobs(t, db, 10)

	// Simulate a database connection failure by closing the database.
	if err := db.Close(); err != nil {
		t.Errorf("Failed to close database: %v", err)
	}

	// Every operation against the closed handle should fail loudly.
	for _, jobID := range jobIDs {
		if err := db.UpdateJobStatus(jobID, "completed", "worker-1", ""); err == nil {
			t.Errorf("Expected error when updating job %s on closed database", jobID)
		}
	}

	// "Recovery" opens a brand-new database in a fresh temp dir (the original
	// file's path is not visible to this helper). This proves the storage
	// layer can be stood back up, not that the old file is reusable.
	newDB, err := storage.NewDBFromPath(fmt.Sprintf("%s/chaos.db", t.TempDir()))
	if err != nil {
		t.Fatalf("Failed to reopen database: %v", err)
	}
	defer func() { _ = newDB.Close() }()
	if err := newDB.Initialize(getChaosSchema()); err != nil {
		t.Fatalf("Failed to reinitialize database: %v", err)
	}

	// Verify the system can recover and continue operations.
	newJobID := fmt.Sprintf("recovery-job-%d", time.Now().Unix())
	job := &storage.Job{
		ID:       newJobID,
		JobName:  "Recovery Test Job",
		Status:   "pending",
		Priority: 0,
	}
	if err := newDB.CreateJob(job); err != nil {
		t.Errorf("Failed to create job after database recovery: %v", err)
	}

	t.Log("Database connection failure test passed - system recovered gracefully")
}

// testRedisConnectionFailure tests system behavior when Redis fails: pushes
// against a closed client must error, and a reconnected client must work.
func testRedisConnectionFailure(t *testing.T, _ *storage.DB, rdb *redis.Client) {
	// Add jobs to the Redis queue before the failure.
	for i := range 10 {
		jobID := fmt.Sprintf("redis-chaos-job-%d", i)
		if err := rdb.LPush(context.Background(), "ml:queue", jobID).Err(); err != nil {
			t.Fatalf("Failed to add job to Redis: %v", err)
		}
	}

	// Simulate a Redis connection failure.
	if err := rdb.Close(); err != nil {
		t.Errorf("Failed to close Redis connection: %v", err)
	}

	// Operations on the closed client should fail.
	if _, err := rdb.LPop(context.Background(), "ml:queue").Result(); err == nil {
		t.Error("Expected error when popping from closed Redis connection")
	}

	// Reconnect to Redis and verify recovery.
	newRdb := redis.NewClient(&redis.Options{
		Addr:     "localhost:6379",
		Password: "",
		DB:       6, // Use different DB for chaos tests
	})

	// Wait up to ~1s for Redis to be available; if it never responds, the
	// LPush below reports the failure.
	for range 10 {
		if err := newRdb.Ping(context.Background()).Err(); err == nil {
			break
		}
		time.Sleep(100 * time.Millisecond)
	}

	// Verify the system can recover and continue operations.
	testJobID := fmt.Sprintf("recovery-redis-job-%d", time.Now().Unix())
	if err := newRdb.LPush(context.Background(), "ml:queue", testJobID).Err(); err != nil {
		t.Errorf("Failed to add job to Redis after recovery: %v", err)
	}
	_ = newRdb.Close()

	t.Log("Redis connection failure test passed - system recovered gracefully")
}

// testHighConcurrencyStress drives many concurrent workers through the full
// job lifecycle (create -> queue -> run -> complete -> dequeue) and checks
// error-rate and throughput thresholds.
func testHighConcurrencyStress(t *testing.T, db *storage.DB, rdb *redis.Client) {
	numWorkers := 50
	jobsPerWorker := 20

	var wg sync.WaitGroup
	// Each job iteration reports at most one error (continue after the first),
	// so this buffer can never block a worker.
	errors := make(chan error, numWorkers*jobsPerWorker)

	start := time.Now()

	// Launch many concurrent workers.
	for worker := range numWorkers {
		wg.Add(1)
		go func(workerID int) {
			defer wg.Done()
			for job := 0; job < jobsPerWorker; job++ {
				jobID := fmt.Sprintf("stress-job-w%d-j%d", workerID, job)

				// Create job in database.
				dbJob := &storage.Job{
					ID:       jobID,
					JobName:  fmt.Sprintf("Stress Job W%d J%d", workerID, job),
					Status:   "pending",
					Priority: 0,
				}
				if err := db.CreateJob(dbJob); err != nil {
					errors <- fmt.Errorf("failed to create job %s: %w", jobID, err)
					continue
				}

				// Add to Redis queue.
				if err := rdb.LPush(context.Background(), "ml:queue", jobID).Err(); err != nil {
					errors <- fmt.Errorf("failed to queue job %s: %w", jobID, err)
					continue
				}

				// Update job status.
				if err := db.UpdateJobStatus(jobID, "running", fmt.Sprintf("worker-%d", workerID), ""); err != nil {
					errors <- fmt.Errorf("failed to update job %s: %w", jobID, err)
					continue
				}

				// Complete job.
				if err := db.UpdateJobStatus(jobID, "completed", fmt.Sprintf("worker-%d", workerID), ""); err != nil {
					errors <- fmt.Errorf("failed to complete job %s: %w", jobID, err)
					continue
				}

				// Pop from queue.
				if _, err := rdb.LPop(context.Background(), "ml:queue").Result(); err != nil {
					errors <- fmt.Errorf("failed to pop job %s: %w", jobID, err)
					continue
				}
			}
		}(worker)
	}

	wg.Wait()
	close(errors)

	duration := time.Since(start)
	totalJobs := numWorkers * jobsPerWorker
	jobsPerSecond := float64(totalJobs) / duration.Seconds()

	// Count errors.
	errorCount := 0
	for err := range errors {
		t.Logf("Stress test error: %v", err)
		errorCount++
	}

	t.Logf("High concurrency stress test completed:")
	t.Logf(" Total jobs: %d", totalJobs)
	t.Logf(" Duration: %v", duration)
	t.Logf(" Jobs per second: %.2f", jobsPerSecond)
	t.Logf(" Error count: %d", errorCount)

	// Verify the system handled stress reasonably well.
	if errorCount > totalJobs/10 { // Allow up to 10% errors under stress
		t.Errorf("Too many errors under stress: %d/%d", errorCount, totalJobs)
	}
	if jobsPerSecond < 100 { // Should handle at least 100 jobs/sec
		t.Errorf("Performance too low under stress: %.2f jobs/sec", jobsPerSecond)
	}
}

// testMemoryPressure tests system behavior under memory pressure
func testMemoryPressure(t *testing.T, db *storage.DB, rdb *redis.Client) { // Create large payloads to stress memory largePayload := make([]byte, 1024*1024) // 1MB payload for i := range largePayload { largePayload[i] = byte(i % 256) } payloadString := string(largePayload) numJobs := 50 // Create jobs with large payloads for i := range numJobs { jobID := fmt.Sprintf("memory-pressure-job-%d", i) job := &storage.Job{ ID: jobID, JobName: fmt.Sprintf("Memory Pressure Job %d", i), Status: "pending", Priority: 0, Args: payloadString, } err := db.CreateJob(job) if err != nil { t.Errorf("Failed to create large job %d: %v", i, err) } // Add to Redis queue err = rdb.LPush(context.Background(), "ml:queue", jobID).Err() if err != nil { t.Errorf("Failed to queue large job %d: %v", i, err) } } // Process jobs to test memory handling during operations for i := range numJobs { jobID := fmt.Sprintf("memory-pressure-job-%d", i) // Update job status err := db.UpdateJobStatus(jobID, "completed", "memory-worker", "") if err != nil { t.Errorf("Failed to update large job %d: %v", i, err) } // Pop from queue _, err = rdb.LPop(context.Background(), "ml:queue").Result() if err != nil { t.Errorf("Failed to pop large job %d: %v", i, err) } } t.Log("Memory pressure test passed - system handled large payloads") } // testNetworkLatency simulates network latency effects func testNetworkLatency(t *testing.T, db *storage.DB, rdb *redis.Client) { // Simulate operations with artificial delays numJobs := 20 for i := range numJobs { jobID := fmt.Sprintf("latency-job-%d", i) // Add artificial delay to simulate network latency time.Sleep(time.Millisecond * 10) job := &storage.Job{ ID: jobID, JobName: fmt.Sprintf("Latency Job %d", i), Status: "pending", Priority: 0, } err := db.CreateJob(job) if err != nil { t.Errorf("Failed to create latency job %d: %v", i, err) } // Simulate network latency for Redis operations time.Sleep(time.Millisecond * 5) err = rdb.LPush(context.Background(), "ml:queue", 
jobID).Err() if err != nil { t.Errorf("Failed to queue latency job %d: %v", i, err) } } // Process jobs with latency simulation for i := range numJobs { jobID := fmt.Sprintf("latency-job-%d", i) time.Sleep(time.Millisecond * 8) err := db.UpdateJobStatus(jobID, "completed", "latency-worker", "") if err != nil { t.Errorf("Failed to complete latency job %d: %v", i, err) } time.Sleep(time.Millisecond * 3) _, err = rdb.LPop(context.Background(), "ml:queue").Result() if err != nil { t.Errorf("Failed to pop latency job %d: %v", i, err) } } t.Log("Network latency test passed - system handled delayed operations") } // testResourceExhaustion tests behavior when resources are exhausted func testResourceExhaustion(t *testing.T, db *storage.DB, rdb *redis.Client) { // Create many simultaneous operations to exhaust resources numOperations := 1000 done := make(chan bool, numOperations) errors := make(chan error, numOperations) for i := range numOperations { go func(opID int) { defer func() { done <- true }() jobID := fmt.Sprintf("exhaustion-job-%d", opID) // Rapid-fire operations to stress the system job := &storage.Job{ ID: jobID, JobName: fmt.Sprintf("Exhaustion Job %d", opID), Status: "pending", Priority: 0, } err := db.CreateJob(job) if err != nil { errors <- fmt.Errorf("create failed for job %d: %w", opID, err) return } err = rdb.LPush(context.Background(), "ml:queue", jobID).Err() if err != nil { errors <- fmt.Errorf("queue failed for job %d: %w", opID, err) return } err = db.UpdateJobStatus(jobID, "completed", "exhaustion-worker", "") if err != nil { errors <- fmt.Errorf("update failed for job %d: %w", opID, err) return } }(i) } // Wait for all operations to complete for range numOperations { <-done } close(errors) // Count errors errorCount := 0 for err := range errors { t.Logf("Resource exhaustion error: %v", err) errorCount++ } t.Logf("Resource exhaustion test completed:") t.Logf(" Total operations: %d", numOperations) t.Logf(" Error count: %d", errorCount) t.Logf(" 
Success rate: %.2f%%", float64(numOperations-errorCount)/float64(numOperations)*100) // Allow some errors under extreme resource pressure if errorCount > numOperations/20 { // Allow up to 5% errors t.Errorf("Too many errors under resource exhaustion: %d/%d", errorCount, numOperations) } } // Helper functions func setupChaosRedis(t *testing.T) *redis.Client { rdb := redis.NewClient(&redis.Options{ Addr: "localhost:6379", Password: "", DB: 6, // Use DB 6 for chaos tests }) ctx := context.Background() if err := rdb.Ping(ctx).Err(); err != nil { t.Logf("Skipping chaos test - Redis not available: %v", err) t.Skipf("Redis not available for chaos tests: %v", err) return nil } // Clean up the test database rdb.FlushDB(ctx) t.Cleanup(func() { rdb.FlushDB(ctx) _ = rdb.Close() }) return rdb } // setupChaosRedisIsolated creates a Redis client without cleanup handlers // for tests that intentionally close the connection func setupChaosRedisIsolated(t *testing.T) *redis.Client { rdb := redis.NewClient(&redis.Options{ Addr: "localhost:6379", Password: "", DB: 6, // Use DB 6 for chaos tests }) ctx := context.Background() if err := rdb.Ping(ctx).Err(); err != nil { t.Skipf("Redis not available for chaos tests: %v", err) return nil } // Clean up the test database rdb.FlushDB(ctx) // No cleanup handler - test will close this intentionally return rdb } func createTestJobs(t *testing.T, db *storage.DB, count int) []string { jobIDs := make([]string, count) for i := range count { jobID := fmt.Sprintf("chaos-test-job-%d", i) jobIDs[i] = jobID job := &storage.Job{ ID: jobID, JobName: fmt.Sprintf("Chaos Test Job %d", i), Status: "pending", Priority: 0, } err := db.CreateJob(job) if err != nil { t.Fatalf("Failed to create test job %d: %v", i, err) } } return jobIDs } func getChaosSchema() string { return fixtures.TestSchema }