fetch_ml/tests/chaos/chaos_test.go
Jeremie Fraeys ea15af1833 Fix multi-user authentication and clean up debug code
- Fix YAML tags in auth config struct (json -> yaml)
- Update CLI configs to use pre-hashed API keys
- Remove double hashing in WebSocket client
- Fix port mapping (9102 -> 9103) in CLI commands
- Update permission keys to use jobs:read, jobs:create, etc.
- Clean up all debug logging from CLI and server
- All user roles now authenticate correctly:
  * Admin: Can queue jobs and see all jobs
  * Researcher: Can queue jobs and see own jobs
  * Analyst: Can see status (read-only access)

Multi-user authentication is now fully functional.
2025-12-06 12:35:32 -05:00

536 lines
14 KiB
Go

package chaos
import (
"context"
"fmt"
"sync"
"testing"
"time"
"github.com/jfraeys/fetch_ml/internal/storage"
fixtures "github.com/jfraeys/fetch_ml/tests/fixtures"
"github.com/redis/go-redis/v9"
)
// TestChaosTestSuite runs the chaos subtests, which exercise system
// resilience under various failure conditions.
//
// The two connection-failure subtests each build their own database and Redis
// client because they close them on purpose; the remaining subtests share a
// single database and Redis client created in the parent test.
func TestChaosTestSuite(t *testing.T) {
	// Tests that intentionally close/corrupt connections get their own resources
	// to prevent cascading failures to subsequent subtests.
	t.Run("DatabaseConnectionFailure", func(t *testing.T) {
		// This test intentionally closes the database, so it needs its own instance.
		tempDir := t.TempDir()
		rdb := setupChaosRedis(t)
		if rdb == nil {
			t.Skip("Redis not available for chaos tests")
		}

		db, err := storage.NewDBFromPath(fmt.Sprintf("%s/chaos.db", tempDir))
		if err != nil {
			t.Fatalf("Failed to create database: %v", err)
		}
		if err := db.Initialize(getChaosSchema()); err != nil {
			t.Fatalf("Failed to initialize database: %v", err)
		}

		testDatabaseConnectionFailure(t, db, rdb)
	})

	t.Run("RedisConnectionFailure", func(t *testing.T) {
		// This test intentionally closes Redis, so it needs its own instance.
		tempDir := t.TempDir()
		rdb := setupChaosRedisIsolated(t)
		if rdb == nil {
			t.Skip("Redis not available for chaos tests")
		}

		db, err := storage.NewDBFromPath(fmt.Sprintf("%s/chaos.db", tempDir))
		if err != nil {
			t.Fatalf("Failed to create database: %v", err)
		}
		defer func() {
			if err := db.Close(); err != nil {
				t.Logf("Warning: failed to close database: %v", err)
			}
		}()
		if err := db.Initialize(getChaosSchema()); err != nil {
			t.Fatalf("Failed to initialize database: %v", err)
		}

		testRedisConnectionFailure(t, db, rdb)
	})

	// Remaining tests share resources since they don't corrupt connections.
	tempDir := t.TempDir()
	rdb := setupChaosRedis(t)
	if rdb == nil {
		t.Skip("Redis not available for chaos tests")
	}
	// The client is deliberately not closed with defer here: setupChaosRedis
	// registers a t.Cleanup that flushes the Redis DB and then closes the
	// client. Deferred calls run before cleanups, so closing here would make
	// that flush run against an already-closed client.

	db, err := storage.NewDBFromPath(fmt.Sprintf("%s/chaos.db", tempDir))
	if err != nil {
		t.Fatalf("Failed to create database: %v", err)
	}
	defer func() { _ = db.Close() }()
	if err := db.Initialize(getChaosSchema()); err != nil {
		t.Fatalf("Failed to initialize database: %v", err)
	}

	t.Run("HighConcurrencyStress", func(t *testing.T) {
		testHighConcurrencyStress(t, db, rdb)
	})
	t.Run("MemoryPressure", func(t *testing.T) {
		testMemoryPressure(t, db, rdb)
	})
	t.Run("NetworkLatency", func(t *testing.T) {
		testNetworkLatency(t, db, rdb)
	})
	t.Run("ResourceExhaustion", func(t *testing.T) {
		testResourceExhaustion(t, db, rdb)
	})
}
// testDatabaseConnectionFailure checks that operations on a closed database
// report errors and that work can continue on a newly opened database.
//
// It seeds ten jobs through db, closes db, and expects every following
// UpdateJobStatus call to return an error. The "recovery" database is opened
// at chaos.db inside a directory obtained from a new t.TempDir() call and is
// initialized with the chaos schema before one more job is created in it.
// The Redis client argument is unused.
func testDatabaseConnectionFailure(t *testing.T, db *storage.DB, _ *redis.Client) {
	// Seed some jobs while the database is still healthy.
	seeded := createTestJobs(t, db, 10)

	// Closing the handle stands in for a lost database connection.
	if closeErr := db.Close(); closeErr != nil {
		t.Errorf("Failed to close database: %v", closeErr)
	}

	// Every update against the closed handle must surface an error.
	for i := range seeded {
		if updateErr := db.UpdateJobStatus(seeded[i], "completed", "worker-1", ""); updateErr == nil {
			t.Errorf("Expected error when updating job %s on closed database", seeded[i])
		}
	}

	// Open and initialize the database used for the recovery check.
	recoveryPath := fmt.Sprintf("%s/chaos.db", t.TempDir())
	recovered, err := storage.NewDBFromPath(recoveryPath)
	if err != nil {
		t.Fatalf("Failed to reopen database: %v", err)
	}
	defer func() { _ = recovered.Close() }()

	if err = recovered.Initialize(getChaosSchema()); err != nil {
		t.Fatalf("Failed to reinitialize database: %v", err)
	}

	// A job created after recovery proves normal operation has resumed.
	recoveryJob := &storage.Job{
		ID:       fmt.Sprintf("recovery-job-%d", time.Now().Unix()),
		JobName:  "Recovery Test Job",
		Status:   "pending",
		Priority: 0,
	}
	if err = recovered.CreateJob(recoveryJob); err != nil {
		t.Errorf("Failed to create job after database recovery: %v", err)
	}

	t.Log("Database connection failure test passed - system recovered gracefully")
}
// testRedisConnectionFailure checks that operations on a closed Redis client
// report errors and that a newly created client can continue queueing work.
//
// It pushes ten job IDs onto the "ml:queue" list, closes rdb, and expects the
// following LPop to fail. It then creates a second client for DB 6 on
// localhost:6379, pings it up to ten times (100ms apart), and pushes one more
// job ID. The test fails immediately if none of the pings succeeds. The
// database argument is unused.
func testRedisConnectionFailure(t *testing.T, _ *storage.DB, rdb *redis.Client) {
	ctx := context.Background()

	// Add jobs to the Redis queue while the connection is healthy.
	for i := 0; i < 10; i++ {
		jobID := fmt.Sprintf("redis-chaos-job-%d", i)
		if err := rdb.LPush(ctx, "ml:queue", jobID).Err(); err != nil {
			t.Fatalf("Failed to add job to Redis: %v", err)
		}
	}

	// Simulate a Redis connection failure by closing the client.
	if err := rdb.Close(); err != nil {
		t.Errorf("Failed to close Redis connection: %v", err)
	}

	// Operations on the closed client should fail.
	if _, err := rdb.LPop(ctx, "ml:queue").Result(); err == nil {
		t.Error("Expected error when popping from closed Redis connection")
	}

	// Reconnect to Redis and verify recovery.
	newRdb := redis.NewClient(&redis.Options{
		Addr:     "localhost:6379",
		Password: "",
		DB:       6, // Use different DB for chaos tests
	})
	// Deferred so the client is released even when the test aborts below.
	defer func() { _ = newRdb.Close() }()

	// Wait for Redis to be available, remembering the last ping result so a
	// Redis that never answers is reported as such instead of surfacing later
	// as an unrelated LPush failure.
	const pingAttempts = 10
	var pingErr error
	for attempt := 0; attempt < pingAttempts; attempt++ {
		if pingErr = newRdb.Ping(ctx).Err(); pingErr == nil {
			break
		}
		time.Sleep(100 * time.Millisecond)
	}
	if pingErr != nil {
		t.Fatalf("Redis did not become available after reconnect: %v", pingErr)
	}

	// Verify the system can continue operations on the new client.
	testJobID := fmt.Sprintf("recovery-redis-job-%d", time.Now().Unix())
	if err := newRdb.LPush(ctx, "ml:queue", testJobID).Err(); err != nil {
		t.Errorf("Failed to add job to Redis after recovery: %v", err)
	}

	t.Log("Redis connection failure test passed - system recovered gracefully")
}
// testHighConcurrencyStress drives the database and the Redis queue from 50
// concurrent workers that each process 20 jobs.
//
// For every job a worker creates the database row, pushes the job ID onto
// "ml:queue", marks the job running and then completed, and pops one entry
// from the queue. The first failing step of a job is recorded and the worker
// moves on to its next job. The test fails when more than 10% of the jobs
// recorded an error or when throughput falls below 100 jobs per second.
func testHighConcurrencyStress(t *testing.T, db *storage.DB, rdb *redis.Client) {
	const (
		numWorkers    = 50
		jobsPerWorker = 20
		totalJobs     = numWorkers * jobsPerWorker
	)

	ctx := context.Background()

	// processJob runs the full lifecycle of a single job and returns the
	// first error encountered, or nil when every step succeeded.
	processJob := func(workerID, jobNum int) error {
		jobID := fmt.Sprintf("stress-job-w%d-j%d", workerID, jobNum)
		workerName := fmt.Sprintf("worker-%d", workerID)

		record := &storage.Job{
			ID:       jobID,
			JobName:  fmt.Sprintf("Stress Job W%d J%d", workerID, jobNum),
			Status:   "pending",
			Priority: 0,
		}
		if err := db.CreateJob(record); err != nil {
			return fmt.Errorf("failed to create job %s: %w", jobID, err)
		}
		if err := rdb.LPush(ctx, "ml:queue", jobID).Err(); err != nil {
			return fmt.Errorf("failed to queue job %s: %w", jobID, err)
		}
		if err := db.UpdateJobStatus(jobID, "running", workerName, ""); err != nil {
			return fmt.Errorf("failed to update job %s: %w", jobID, err)
		}
		if err := db.UpdateJobStatus(jobID, "completed", workerName, ""); err != nil {
			return fmt.Errorf("failed to complete job %s: %w", jobID, err)
		}
		if _, err := rdb.LPop(ctx, "ml:queue").Result(); err != nil {
			return fmt.Errorf("failed to pop job %s: %w", jobID, err)
		}
		return nil
	}

	// Buffered for the worst case (one error per job) so workers never block.
	failures := make(chan error, totalJobs)
	began := time.Now()

	// Launch many concurrent workers.
	var workers sync.WaitGroup
	workers.Add(numWorkers)
	for w := 0; w < numWorkers; w++ {
		go func(workerID int) {
			defer workers.Done()
			for j := 0; j < jobsPerWorker; j++ {
				if err := processJob(workerID, j); err != nil {
					failures <- err
				}
			}
		}(w)
	}
	workers.Wait()
	close(failures)

	elapsed := time.Since(began)
	throughput := float64(totalJobs) / elapsed.Seconds()

	// Drain and count the recorded errors.
	errorCount := 0
	for failure := range failures {
		t.Logf("Stress test error: %v", failure)
		errorCount++
	}

	t.Logf("High concurrency stress test completed:")
	t.Logf(" Total jobs: %d", totalJobs)
	t.Logf(" Duration: %v", elapsed)
	t.Logf(" Jobs per second: %.2f", throughput)
	t.Logf(" Error count: %d", errorCount)

	// Allow up to 10% errors under stress.
	if errorCount > totalJobs/10 {
		t.Errorf("Too many errors under stress: %d/%d", errorCount, totalJobs)
	}
	// Should handle at least 100 jobs/sec.
	if throughput < 100 {
		t.Errorf("Performance too low under stress: %.2f jobs/sec", throughput)
	}
}
// testMemoryPressure stores and processes 50 jobs that each carry a 1MB Args
// payload.
//
// In the first pass every job is created in the database and its ID is pushed
// onto "ml:queue". In the second pass every job is marked completed and one
// entry is popped from the queue. Each failing step is reported with
// t.Errorf; the loops keep going after a failure.
func testMemoryPressure(t *testing.T, db *storage.DB, rdb *redis.Client) {
	const (
		payloadSize = 1024 * 1024 // 1MB payload
		numJobs     = 50
	)

	ctx := context.Background()

	// Build the large payload once; every job shares the same string.
	raw := make([]byte, payloadSize)
	for idx := range raw {
		raw[idx] = byte(idx % 256)
	}
	args := string(raw)

	jobIDFor := func(n int) string {
		return fmt.Sprintf("memory-pressure-job-%d", n)
	}

	// Create jobs with large payloads and enqueue them.
	for n := 0; n < numJobs; n++ {
		id := jobIDFor(n)
		record := &storage.Job{
			ID:       id,
			JobName:  fmt.Sprintf("Memory Pressure Job %d", n),
			Status:   "pending",
			Priority: 0,
			Args:     args,
		}
		if err := db.CreateJob(record); err != nil {
			t.Errorf("Failed to create large job %d: %v", n, err)
		}
		if err := rdb.LPush(ctx, "ml:queue", id).Err(); err != nil {
			t.Errorf("Failed to queue large job %d: %v", n, err)
		}
	}

	// Process the jobs to exercise memory handling during operations.
	for n := 0; n < numJobs; n++ {
		if err := db.UpdateJobStatus(jobIDFor(n), "completed", "memory-worker", ""); err != nil {
			t.Errorf("Failed to update large job %d: %v", n, err)
		}
		if _, err := rdb.LPop(ctx, "ml:queue").Result(); err != nil {
			t.Errorf("Failed to pop large job %d: %v", n, err)
		}
	}

	t.Log("Memory pressure test passed - system handled large payloads")
}
// testNetworkLatency runs 20 jobs through the database and the Redis queue
// with fixed sleeps inserted before each operation to imitate network delay.
//
// Submission sleeps 10ms before the database insert and 5ms before the queue
// push; processing sleeps 8ms before the status update and 3ms before the
// queue pop. Each failing step is reported with t.Errorf and the loops
// continue.
func testNetworkLatency(t *testing.T, db *storage.DB, rdb *redis.Client) {
	const (
		numJobs = 20

		createDelay = 10 * time.Millisecond
		pushDelay   = 5 * time.Millisecond
		updateDelay = 8 * time.Millisecond
		popDelay    = 3 * time.Millisecond
	)

	ctx := context.Background()

	// Submit the jobs with artificial delays in front of each operation.
	for n := 0; n < numJobs; n++ {
		id := fmt.Sprintf("latency-job-%d", n)

		time.Sleep(createDelay)
		record := &storage.Job{
			ID:       id,
			JobName:  fmt.Sprintf("Latency Job %d", n),
			Status:   "pending",
			Priority: 0,
		}
		if err := db.CreateJob(record); err != nil {
			t.Errorf("Failed to create latency job %d: %v", n, err)
		}

		time.Sleep(pushDelay)
		if err := rdb.LPush(ctx, "ml:queue", id).Err(); err != nil {
			t.Errorf("Failed to queue latency job %d: %v", n, err)
		}
	}

	// Process the jobs, again with simulated latency.
	for n := 0; n < numJobs; n++ {
		id := fmt.Sprintf("latency-job-%d", n)

		time.Sleep(updateDelay)
		if err := db.UpdateJobStatus(id, "completed", "latency-worker", ""); err != nil {
			t.Errorf("Failed to complete latency job %d: %v", n, err)
		}

		time.Sleep(popDelay)
		if _, err := rdb.LPop(ctx, "ml:queue").Result(); err != nil {
			t.Errorf("Failed to pop latency job %d: %v", n, err)
		}
	}

	t.Log("Network latency test passed - system handled delayed operations")
}
// testResourceExhaustion starts 1000 goroutines at once, each of which
// creates a job, pushes its ID onto "ml:queue", and marks the job completed.
//
// An operation stops at its first failing step and records that error. After
// all goroutines have finished, the errors are logged and counted; the test
// fails when more than 5% of the operations recorded an error.
func testResourceExhaustion(t *testing.T, db *storage.DB, rdb *redis.Client) {
	const numOperations = 1000

	ctx := context.Background()

	// runOp performs one rapid-fire create/queue/update sequence and returns
	// the first error encountered, or nil on success.
	runOp := func(opID int) error {
		jobID := fmt.Sprintf("exhaustion-job-%d", opID)
		record := &storage.Job{
			ID:       jobID,
			JobName:  fmt.Sprintf("Exhaustion Job %d", opID),
			Status:   "pending",
			Priority: 0,
		}
		if err := db.CreateJob(record); err != nil {
			return fmt.Errorf("create failed for job %d: %w", opID, err)
		}
		if err := rdb.LPush(ctx, "ml:queue", jobID).Err(); err != nil {
			return fmt.Errorf("queue failed for job %d: %w", opID, err)
		}
		if err := db.UpdateJobStatus(jobID, "completed", "exhaustion-worker", ""); err != nil {
			return fmt.Errorf("update failed for job %d: %w", opID, err)
		}
		return nil
	}

	// Buffered for one error per operation so no goroutine blocks on send.
	failures := make(chan error, numOperations)

	var pending sync.WaitGroup
	pending.Add(numOperations)
	for op := 0; op < numOperations; op++ {
		go func(opID int) {
			defer pending.Done()
			if err := runOp(opID); err != nil {
				failures <- err
			}
		}(op)
	}

	// Wait for all operations to complete before closing the error channel.
	pending.Wait()
	close(failures)

	errorCount := 0
	for failure := range failures {
		t.Logf("Resource exhaustion error: %v", failure)
		errorCount++
	}

	successRate := float64(numOperations-errorCount) / float64(numOperations) * 100
	t.Logf("Resource exhaustion test completed:")
	t.Logf(" Total operations: %d", numOperations)
	t.Logf(" Error count: %d", errorCount)
	t.Logf(" Success rate: %.2f%%", successRate)

	// Allow up to 5% errors under extreme resource pressure.
	if errorCount > numOperations/20 {
		t.Errorf("Too many errors under resource exhaustion: %d/%d", errorCount, numOperations)
	}
}
// Helper functions

// setupChaosRedis returns a Redis client for DB 6 on localhost:6379.
//
// The test is skipped when the server does not answer a ping. Otherwise the
// selected Redis DB is flushed immediately, and a cleanup is registered on t
// that flushes it again and closes the client. Flush failures are logged as
// warnings rather than failing the test.
func setupChaosRedis(t *testing.T) *redis.Client {
	t.Helper()

	rdb := redis.NewClient(&redis.Options{
		Addr:     "localhost:6379",
		Password: "",
		DB:       6, // Use DB 6 for chaos tests
	})
	ctx := context.Background()

	if err := rdb.Ping(ctx).Err(); err != nil {
		// Release the unused client before skipping so it is not leaked.
		_ = rdb.Close()
		t.Skipf("Redis not available for chaos tests: %v", err)
		return nil
	}

	// Clean up the test database so earlier runs cannot influence this one.
	if err := rdb.FlushDB(ctx).Err(); err != nil {
		t.Logf("Warning: failed to flush Redis test database: %v", err)
	}

	t.Cleanup(func() {
		if err := rdb.FlushDB(ctx).Err(); err != nil {
			t.Logf("Warning: failed to flush Redis test database during cleanup: %v", err)
		}
		// Best effort: the close error is not actionable at this point.
		_ = rdb.Close()
	})
	return rdb
}
// setupChaosRedisIsolated returns a Redis client for DB 6 on localhost:6379
// without registering any cleanup handler, for tests that intentionally close
// the connection themselves.
//
// The test is skipped when the server does not answer a ping. Otherwise the
// selected Redis DB is flushed before the client is returned; a flush failure
// is logged as a warning rather than failing the test.
func setupChaosRedisIsolated(t *testing.T) *redis.Client {
	t.Helper()

	rdb := redis.NewClient(&redis.Options{
		Addr:     "localhost:6379",
		Password: "",
		DB:       6, // Use DB 6 for chaos tests
	})
	ctx := context.Background()

	if err := rdb.Ping(ctx).Err(); err != nil {
		// Release the unused client before skipping so it is not leaked.
		_ = rdb.Close()
		t.Skipf("Redis not available for chaos tests: %v", err)
		return nil
	}

	// Clean up the test database so earlier runs cannot influence this one.
	if err := rdb.FlushDB(ctx).Err(); err != nil {
		t.Logf("Warning: failed to flush Redis test database: %v", err)
	}

	// No cleanup handler - the test will close this client intentionally.
	return rdb
}
// createTestJobs inserts count pending jobs named "chaos-test-job-<i>" into db
// and returns their IDs in creation order. The test is aborted with t.Fatalf
// on the first insert that fails.
func createTestJobs(t *testing.T, db *storage.DB, count int) []string {
	ids := make([]string, 0, count)
	for n := 0; n < count; n++ {
		id := fmt.Sprintf("chaos-test-job-%d", n)
		ids = append(ids, id)

		record := &storage.Job{
			ID:       id,
			JobName:  fmt.Sprintf("Chaos Test Job %d", n),
			Status:   "pending",
			Priority: 0,
		}
		if err := db.CreateJob(record); err != nil {
			t.Fatalf("Failed to create test job %d: %v", n, err)
		}
	}
	return ids
}
// getChaosSchema returns the schema used to initialize the chaos test
// databases, which is the shared fixtures.TestSchema value.
func getChaosSchema() string {
	schema := fixtures.TestSchema
	return schema
}