// fetch_ml/tests/chaos/chaos_test.go

package chaos

import (
	"context"
	"fmt"
	"sync"
	"testing"
	"time"

	"github.com/jfraeys/fetch_ml/internal/storage"
	fixtures "github.com/jfraeys/fetch_ml/tests/fixtures"
	"github.com/redis/go-redis/v9"
)

// ChaosTestSuite tests system resilience under various failure conditions
func TestChaosTestSuite(t *testing.T) {
	// Tests that intentionally close/corrupt connections get their own resources
	// to prevent cascading failures to subsequent subtests

	t.Run("DatabaseConnectionFailure", func(t *testing.T) {
		// This test intentionally closes the database, so it needs its own instance
		tempDir := t.TempDir()
		rdb := setupChaosRedis(t)
		if rdb == nil {
			t.Skip("Redis not available for chaos tests")
		}

		db, err := storage.NewDBFromPath(fmt.Sprintf("%s/chaos.db", tempDir))
		if err != nil {
			t.Fatalf("Failed to create database: %v", err)
		}

		err = db.Initialize(getChaosSchema())
		if err != nil {
			t.Fatalf("Failed to initialize database: %v", err)
		}

		testDatabaseConnectionFailure(t, db, rdb)
	})

	t.Run("RedisConnectionFailure", func(t *testing.T) {
		// This test intentionally closes Redis, so it needs its own instance
		tempDir := t.TempDir()
		rdb := setupChaosRedisIsolated(t)
		if rdb == nil {
			t.Skip("Redis not available for chaos tests")
		}

		db, err := storage.NewDBFromPath(fmt.Sprintf("%s/chaos.db", tempDir))
		if err != nil {
			t.Fatalf("Failed to create database: %v", err)
		}
		defer func() {
			if err := db.Close(); err != nil {
				t.Logf("Warning: failed to close database: %v", err)
			}
		}()

		err = db.Initialize(getChaosSchema())
		if err != nil {
			t.Fatalf("Failed to initialize database: %v", err)
		}

		testRedisConnectionFailure(t, db, rdb)
	})

	// Remaining tests share resources since they don't corrupt connections
	tempDir := t.TempDir()
	rdb := setupChaosRedis(t)
	if rdb == nil {
		t.Skip("Redis not available for chaos tests")
	}
	defer func() { _ = rdb.Close() }()

	db, err := storage.NewDBFromPath(fmt.Sprintf("%s/chaos.db", tempDir))
	if err != nil {
		t.Fatalf("Failed to create database: %v", err)
	}
	defer func() { _ = db.Close() }()

	err = db.Initialize(getChaosSchema())
	if err != nil {
		t.Fatalf("Failed to initialize database: %v", err)
	}

	t.Run("HighConcurrencyStress", func(t *testing.T) {
		testHighConcurrencyStress(t, db, rdb)
	})

	t.Run("MemoryPressure", func(t *testing.T) {
		testMemoryPressure(t, db, rdb)
	})

	t.Run("NetworkLatency", func(t *testing.T) {
		testNetworkLatency(t, db, rdb)
	})

	t.Run("ResourceExhaustion", func(t *testing.T) {
		testResourceExhaustion(t, db, rdb)
	})
}

// testDatabaseConnectionFailure checks that job updates against a closed
// database report errors and that a newly opened database accepts work
// afterwards. The Redis client argument is unused.
//
// NOTE(review): the database opened for the recovery step lives in a new
// t.TempDir() directory, so it is a different file from the one closed
// earlier in this function — confirm whether reopening the original file
// was intended.
func testDatabaseConnectionFailure(t *testing.T, db *storage.DB, _ *redis.Client) {
	// Seed a few jobs while the connection is still healthy.
	seeded := createTestJobs(t, db, 10)

	// Closing the handle stands in for a lost database connection.
	if closeErr := db.Close(); closeErr != nil {
		t.Errorf("Failed to close database: %v", closeErr)
	}

	// Every update against the closed handle must surface an error.
	for i := range seeded {
		id := seeded[i]
		if updateErr := db.UpdateJobStatus(id, "completed", "worker-1", ""); updateErr == nil {
			t.Errorf("Expected error when updating job %s on closed database", id)
		}
	}

	// Open a database again and make sure it is usable.
	recoveryPath := fmt.Sprintf("%s/chaos.db", t.TempDir())
	recovered, err := storage.NewDBFromPath(recoveryPath)
	if err != nil {
		t.Fatalf("Failed to reopen database: %v", err)
	}
	defer func() { _ = recovered.Close() }()

	if err = recovered.Initialize(getChaosSchema()); err != nil {
		t.Fatalf("Failed to reinitialize database: %v", err)
	}

	// A successful insert shows normal operation can resume.
	recoveryJob := &storage.Job{
		ID:       fmt.Sprintf("recovery-job-%d", time.Now().Unix()),
		JobName:  "Recovery Test Job",
		Status:   "pending",
		Priority: 0,
	}
	if err = recovered.CreateJob(recoveryJob); err != nil {
		t.Errorf("Failed to create job after database recovery: %v", err)
	}

	t.Log("Database connection failure test passed - system recovered gracefully")
}

// testRedisConnectionFailure checks that operations on a closed Redis client
// report errors and that a new client can queue work afterwards. The database
// argument is unused.
//
// rdb is closed by this function; callers must not rely on it afterwards.
func testRedisConnectionFailure(t *testing.T, _ *storage.DB, rdb *redis.Client) {
	ctx := context.Background()

	// Add jobs to Redis queue while the connection is healthy.
	for i := 0; i < 10; i++ {
		jobID := fmt.Sprintf("redis-chaos-job-%d", i)
		if err := rdb.LPush(ctx, "ml:queue", jobID).Err(); err != nil {
			t.Fatalf("Failed to add job to Redis: %v", err)
		}
	}

	// Simulate Redis connection failure by closing the client.
	if err := rdb.Close(); err != nil {
		t.Errorf("Failed to close Redis connection: %v", err)
	}

	// Operations on the closed client should fail.
	if _, err := rdb.LPop(ctx, "ml:queue").Result(); err == nil {
		t.Error("Expected error when popping from closed Redis connection")
	}

	// Reconnect to Redis and verify recovery.
	newRdb := redis.NewClient(&redis.Options{
		Addr:     "localhost:6379",
		Password: "",
		DB:       6, // Use different DB for chaos tests
	})

	// Wait for Redis to be available, remembering the last failure so the
	// test can stop with a clear message instead of falling through.
	var pingErr error
	for attempt := 0; attempt < 10; attempt++ {
		if pingErr = newRdb.Ping(ctx).Err(); pingErr == nil {
			break
		}
		time.Sleep(100 * time.Millisecond)
	}
	if pingErr != nil {
		_ = newRdb.Close()
		t.Fatalf("Failed to reconnect to Redis: %v", pingErr)
	}

	// The client passed in was closed above, so nothing else will remove the
	// jobs queued by this test; flush the chaos database before closing.
	defer func() {
		if err := newRdb.FlushDB(ctx).Err(); err != nil {
			t.Logf("Warning: failed to flush Redis after recovery test: %v", err)
		}
		_ = newRdb.Close()
	}()

	// Verify system can recover and continue operations.
	testJobID := fmt.Sprintf("recovery-redis-job-%d", time.Now().Unix())
	if err := newRdb.LPush(ctx, "ml:queue", testJobID).Err(); err != nil {
		t.Errorf("Failed to add job to Redis after recovery: %v", err)
	}

	t.Log("Redis connection failure test passed - system recovered gracefully")
}

// testHighConcurrencyStress runs 50 concurrent workers that each push 20 jobs
// through the full lifecycle (create, queue, mark running, mark completed,
// pop). It fails when more than 10% of the jobs report an error or when the
// overall rate drops below 100 jobs per second.
func testHighConcurrencyStress(t *testing.T, db *storage.DB, rdb *redis.Client) {
	const (
		workerCount = 50
		perWorker   = 20
	)
	totalJobs := workerCount * perWorker

	// Buffered for the worst case (one error per job) so workers never block.
	errCh := make(chan error, totalJobs)

	// runJob drives a single job through its lifecycle and returns the first
	// failure, if any; later steps are skipped once a step fails.
	runJob := func(workerID, seq int) error {
		ctx := context.Background()
		jobID := fmt.Sprintf("stress-job-w%d-j%d", workerID, seq)
		workerName := fmt.Sprintf("worker-%d", workerID)

		record := &storage.Job{
			ID:       jobID,
			JobName:  fmt.Sprintf("Stress Job W%d J%d", workerID, seq),
			Status:   "pending",
			Priority: 0,
		}
		if err := db.CreateJob(record); err != nil {
			return fmt.Errorf("failed to create job %s: %w", jobID, err)
		}
		if err := rdb.LPush(ctx, "ml:queue", jobID).Err(); err != nil {
			return fmt.Errorf("failed to queue job %s: %w", jobID, err)
		}
		if err := db.UpdateJobStatus(jobID, "running", workerName, ""); err != nil {
			return fmt.Errorf("failed to update job %s: %w", jobID, err)
		}
		if err := db.UpdateJobStatus(jobID, "completed", workerName, ""); err != nil {
			return fmt.Errorf("failed to complete job %s: %w", jobID, err)
		}
		if _, err := rdb.LPop(ctx, "ml:queue").Result(); err != nil {
			return fmt.Errorf("failed to pop job %s: %w", jobID, err)
		}
		return nil
	}

	var workers sync.WaitGroup
	began := time.Now()

	for w := 0; w < workerCount; w++ {
		workers.Add(1)
		go func(workerID int) {
			defer workers.Done()
			for seq := 0; seq < perWorker; seq++ {
				if err := runJob(workerID, seq); err != nil {
					errCh <- err
				}
			}
		}(w)
	}

	workers.Wait()
	close(errCh)

	elapsed := time.Since(began)
	throughput := float64(totalJobs) / elapsed.Seconds()

	// Drain and report every failure collected from the workers.
	failures := 0
	for err := range errCh {
		t.Logf("Stress test error: %v", err)
		failures++
	}

	t.Logf("High concurrency stress test completed:")
	t.Logf("  Total jobs: %d", totalJobs)
	t.Logf("  Duration: %v", elapsed)
	t.Logf("  Jobs per second: %.2f", throughput)
	t.Logf("  Error count: %d", failures)

	// Up to 10% errors are tolerated under stress.
	if failures > totalJobs/10 {
		t.Errorf("Too many errors under stress: %d/%d", failures, totalJobs)
	}

	// The run should sustain at least 100 jobs per second.
	if throughput < 100 {
		t.Errorf("Performance too low under stress: %.2f jobs/sec", throughput)
	}
}

// testMemoryPressure creates 50 jobs that each carry a 1 MiB Args payload,
// queues their IDs in Redis, then marks every job completed and pops the
// queue once per job. Any failing step is reported with t.Errorf and the
// loop keeps going.
func testMemoryPressure(t *testing.T, db *storage.DB, rdb *redis.Client) {
	const (
		payloadSize = 1024 * 1024 // 1MB payload
		jobCount    = 50
	)
	ctx := context.Background()

	// Fill the payload with a repeating 0..255 byte pattern.
	raw := make([]byte, payloadSize)
	for idx := range raw {
		raw[idx] = byte(idx % 256)
	}
	args := string(raw)

	idFor := func(n int) string {
		return fmt.Sprintf("memory-pressure-job-%d", n)
	}

	// Phase 1: store the large jobs and enqueue their IDs.
	for n := 0; n < jobCount; n++ {
		id := idFor(n)
		large := &storage.Job{
			ID:       id,
			JobName:  fmt.Sprintf("Memory Pressure Job %d", n),
			Status:   "pending",
			Priority: 0,
			Args:     args,
		}

		if err := db.CreateJob(large); err != nil {
			t.Errorf("Failed to create large job %d: %v", n, err)
		}
		if err := rdb.LPush(ctx, "ml:queue", id).Err(); err != nil {
			t.Errorf("Failed to queue large job %d: %v", n, err)
		}
	}

	// Phase 2: complete each job and take one entry off the queue.
	for n := 0; n < jobCount; n++ {
		if err := db.UpdateJobStatus(idFor(n), "completed", "memory-worker", ""); err != nil {
			t.Errorf("Failed to update large job %d: %v", n, err)
		}
		if _, err := rdb.LPop(ctx, "ml:queue").Result(); err != nil {
			t.Errorf("Failed to pop large job %d: %v", n, err)
		}
	}

	t.Log("Memory pressure test passed - system handled large payloads")
}

// testNetworkLatency imitates slow network links by sleeping for a few
// milliseconds before every database and Redis call while 20 jobs are
// created, queued, completed and popped. Failures are reported with t.Errorf
// without stopping the run.
func testNetworkLatency(t *testing.T, db *storage.DB, rdb *redis.Client) {
	const (
		jobCount      = 20
		createDelay   = 10 * time.Millisecond
		enqueueDelay  = 5 * time.Millisecond
		completeDelay = 8 * time.Millisecond
		dequeueDelay  = 3 * time.Millisecond
	)
	ctx := context.Background()

	// Submission phase: a pause precedes each create and each enqueue.
	for n := 0; n < jobCount; n++ {
		id := fmt.Sprintf("latency-job-%d", n)

		time.Sleep(createDelay)
		pending := &storage.Job{
			ID:       id,
			JobName:  fmt.Sprintf("Latency Job %d", n),
			Status:   "pending",
			Priority: 0,
		}
		if err := db.CreateJob(pending); err != nil {
			t.Errorf("Failed to create latency job %d: %v", n, err)
		}

		time.Sleep(enqueueDelay)
		if err := rdb.LPush(ctx, "ml:queue", id).Err(); err != nil {
			t.Errorf("Failed to queue latency job %d: %v", n, err)
		}
	}

	// Processing phase: a pause precedes each status update and each pop.
	for n := 0; n < jobCount; n++ {
		id := fmt.Sprintf("latency-job-%d", n)

		time.Sleep(completeDelay)
		if err := db.UpdateJobStatus(id, "completed", "latency-worker", ""); err != nil {
			t.Errorf("Failed to complete latency job %d: %v", n, err)
		}

		time.Sleep(dequeueDelay)
		if _, err := rdb.LPop(ctx, "ml:queue").Result(); err != nil {
			t.Errorf("Failed to pop latency job %d: %v", n, err)
		}
	}

	t.Log("Network latency test passed - system handled delayed operations")
}

// testResourceExhaustion launches 1000 goroutines at once, each of which
// creates a job, queues it in Redis and marks it completed. Each goroutine
// reports at most one error; the test fails when more than 5% of the
// operations report one.
func testResourceExhaustion(t *testing.T, db *storage.DB, rdb *redis.Client) {
	const opCount = 1000

	// Buffered for one error per operation so goroutines never block on send.
	failures := make(chan error, opCount)

	// runOp performs one create/queue/complete sequence, stopping at the
	// first step that fails.
	runOp := func(opID int) error {
		id := fmt.Sprintf("exhaustion-job-%d", opID)
		entry := &storage.Job{
			ID:       id,
			JobName:  fmt.Sprintf("Exhaustion Job %d", opID),
			Status:   "pending",
			Priority: 0,
		}

		if err := db.CreateJob(entry); err != nil {
			return fmt.Errorf("create failed for job %d: %w", opID, err)
		}
		if err := rdb.LPush(context.Background(), "ml:queue", id).Err(); err != nil {
			return fmt.Errorf("queue failed for job %d: %w", opID, err)
		}
		if err := db.UpdateJobStatus(id, "completed", "exhaustion-worker", ""); err != nil {
			return fmt.Errorf("update failed for job %d: %w", opID, err)
		}
		return nil
	}

	var pending sync.WaitGroup
	for op := 0; op < opCount; op++ {
		pending.Add(1)
		go func(opID int) {
			defer pending.Done()
			if err := runOp(opID); err != nil {
				failures <- err
			}
		}(op)
	}

	// Block until every operation has finished, then stop collecting.
	pending.Wait()
	close(failures)

	failed := 0
	for err := range failures {
		t.Logf("Resource exhaustion error: %v", err)
		failed++
	}

	successRate := float64(opCount-failed) / float64(opCount) * 100
	t.Logf("Resource exhaustion test completed:")
	t.Logf("  Total operations: %d", opCount)
	t.Logf("  Error count: %d", failed)
	t.Logf("  Success rate: %.2f%%", successRate)

	// Up to 5% errors are tolerated under extreme resource pressure.
	if failed > opCount/20 {
		t.Errorf("Too many errors under resource exhaustion: %d/%d", failed, opCount)
	}
}

// Helper functions

// setupChaosRedis connects to Redis at localhost:6379 (DB 6), skips the
// calling test when the server does not answer a ping, and flushes the
// database so the test starts from an empty state. A cleanup handler flushes
// the database again and closes the client when the test finishes.
//
// t.Skipf stops the calling test, so a nil client is never returned to code
// that keeps running.
func setupChaosRedis(t *testing.T) *redis.Client {
	t.Helper()

	rdb := redis.NewClient(&redis.Options{
		Addr:     "localhost:6379",
		Password: "",
		DB:       6, // Use DB 6 for chaos tests
	})

	ctx := context.Background()
	if err := rdb.Ping(ctx).Err(); err != nil {
		// Release the client before skipping so it is not leaked.
		_ = rdb.Close()
		t.Skipf("Redis not available for chaos tests: %v", err)
	}

	// Clean up the test database; stale queue entries would skew the tests.
	if err := rdb.FlushDB(ctx).Err(); err != nil {
		_ = rdb.Close()
		t.Fatalf("Failed to flush Redis test database: %v", err)
	}

	t.Cleanup(func() {
		// Best effort: report, but do not fail, if the final flush is not possible.
		if err := rdb.FlushDB(ctx).Err(); err != nil {
			t.Logf("Warning: failed to flush Redis test database: %v", err)
		}
		_ = rdb.Close()
	})

	return rdb
}

// setupChaosRedisIsolated connects to Redis at localhost:6379 (DB 6), skips
// the calling test when the server does not answer a ping, and flushes the
// database before returning. Unlike setupChaosRedis it registers no cleanup
// handler, because it is meant for tests that intentionally close the
// connection themselves.
//
// t.Skipf stops the calling test, so a nil client is never returned to code
// that keeps running.
func setupChaosRedisIsolated(t *testing.T) *redis.Client {
	t.Helper()

	rdb := redis.NewClient(&redis.Options{
		Addr:     "localhost:6379",
		Password: "",
		DB:       6, // Use DB 6 for chaos tests
	})

	ctx := context.Background()
	if err := rdb.Ping(ctx).Err(); err != nil {
		// Release the client before skipping so it is not leaked.
		_ = rdb.Close()
		t.Skipf("Redis not available for chaos tests: %v", err)
	}

	// Clean up the test database; stale queue entries would skew the tests.
	if err := rdb.FlushDB(ctx).Err(); err != nil {
		_ = rdb.Close()
		t.Fatalf("Failed to flush Redis test database: %v", err)
	}

	// No cleanup handler - test will close this intentionally
	return rdb
}

// createTestJobs inserts count pending jobs named "chaos-test-job-<i>" into
// db and returns their IDs in creation order. The test is aborted with
// t.Fatalf on the first insert that fails.
func createTestJobs(t *testing.T, db *storage.DB, count int) []string {
	// Attribute failures to the calling test line rather than this helper.
	t.Helper()

	jobIDs := make([]string, count)
	for i := range jobIDs {
		jobID := fmt.Sprintf("chaos-test-job-%d", i)
		jobIDs[i] = jobID

		job := &storage.Job{
			ID:       jobID,
			JobName:  fmt.Sprintf("Chaos Test Job %d", i),
			Status:   "pending",
			Priority: 0,
		}

		if err := db.CreateJob(job); err != nil {
			t.Fatalf("Failed to create test job %d: %v", i, err)
		}
	}
	return jobIDs
}

// getChaosSchema returns the schema text used to initialize chaos-test
// databases, which is the shared fixtures.TestSchema value.
func getChaosSchema() string {
	var schema string = fixtures.TestSchema
	return schema
}