- Fix YAML tags in auth config struct (json -> yaml) - Update CLI configs to use pre-hashed API keys - Remove double hashing in WebSocket client - Fix port mapping (9102 -> 9103) in CLI commands - Update permission keys to use jobs:read, jobs:create, etc. - Clean up all debug logging from CLI and server - All user roles now authenticate correctly: * Admin: Can queue jobs and see all jobs * Researcher: Can queue jobs and see own jobs * Analyst: Can see status (read-only access) Multi-user authentication is now fully functional.
536 lines
14 KiB
Go
536 lines
14 KiB
Go
package chaos
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"sync"
|
|
"testing"
|
|
"time"
|
|
|
|
"github.com/jfraeys/fetch_ml/internal/storage"
|
|
fixtures "github.com/jfraeys/fetch_ml/tests/fixtures"
|
|
"github.com/redis/go-redis/v9"
|
|
)
|
|
|
|
// ChaosTestSuite tests system resilience under various failure conditions
|
|
func TestChaosTestSuite(t *testing.T) {
|
|
// Tests that intentionally close/corrupt connections get their own resources
|
|
// to prevent cascading failures to subsequent subtests
|
|
|
|
t.Run("DatabaseConnectionFailure", func(t *testing.T) {
|
|
// This test intentionally closes the database, so it needs its own instance
|
|
tempDir := t.TempDir()
|
|
rdb := setupChaosRedis(t)
|
|
if rdb == nil {
|
|
t.Skip("Redis not available for chaos tests")
|
|
}
|
|
|
|
db, err := storage.NewDBFromPath(fmt.Sprintf("%s/chaos.db", tempDir))
|
|
if err != nil {
|
|
t.Fatalf("Failed to create database: %v", err)
|
|
}
|
|
|
|
err = db.Initialize(getChaosSchema())
|
|
if err != nil {
|
|
t.Fatalf("Failed to initialize database: %v", err)
|
|
}
|
|
|
|
testDatabaseConnectionFailure(t, db, rdb)
|
|
})
|
|
|
|
t.Run("RedisConnectionFailure", func(t *testing.T) {
|
|
// This test intentionally closes Redis, so it needs its own instance
|
|
tempDir := t.TempDir()
|
|
rdb := setupChaosRedisIsolated(t)
|
|
if rdb == nil {
|
|
t.Skip("Redis not available for chaos tests")
|
|
}
|
|
|
|
db, err := storage.NewDBFromPath(fmt.Sprintf("%s/chaos.db", tempDir))
|
|
if err != nil {
|
|
t.Fatalf("Failed to create database: %v", err)
|
|
}
|
|
defer func() {
|
|
if err := db.Close(); err != nil {
|
|
t.Logf("Warning: failed to close database: %v", err)
|
|
}
|
|
}()
|
|
|
|
err = db.Initialize(getChaosSchema())
|
|
if err != nil {
|
|
t.Fatalf("Failed to initialize database: %v", err)
|
|
}
|
|
|
|
testRedisConnectionFailure(t, db, rdb)
|
|
})
|
|
|
|
// Remaining tests share resources since they don't corrupt connections
|
|
tempDir := t.TempDir()
|
|
rdb := setupChaosRedis(t)
|
|
if rdb == nil {
|
|
t.Skip("Redis not available for chaos tests")
|
|
}
|
|
defer func() { _ = rdb.Close() }()
|
|
|
|
db, err := storage.NewDBFromPath(fmt.Sprintf("%s/chaos.db", tempDir))
|
|
if err != nil {
|
|
t.Fatalf("Failed to create database: %v", err)
|
|
}
|
|
defer func() { _ = db.Close() }()
|
|
|
|
err = db.Initialize(getChaosSchema())
|
|
if err != nil {
|
|
t.Fatalf("Failed to initialize database: %v", err)
|
|
}
|
|
|
|
t.Run("HighConcurrencyStress", func(t *testing.T) {
|
|
testHighConcurrencyStress(t, db, rdb)
|
|
})
|
|
|
|
t.Run("MemoryPressure", func(t *testing.T) {
|
|
testMemoryPressure(t, db, rdb)
|
|
})
|
|
|
|
t.Run("NetworkLatency", func(t *testing.T) {
|
|
testNetworkLatency(t, db, rdb)
|
|
})
|
|
|
|
t.Run("ResourceExhaustion", func(t *testing.T) {
|
|
testResourceExhaustion(t, db, rdb)
|
|
})
|
|
}
|
|
|
|
// testDatabaseConnectionFailure tests system behavior when database fails
|
|
func testDatabaseConnectionFailure(t *testing.T, db *storage.DB, _ *redis.Client) {
|
|
// Create some jobs before failure
|
|
jobIDs := createTestJobs(t, db, 10)
|
|
|
|
// Simulate database connection failure by closing the database
|
|
err := db.Close()
|
|
if err != nil {
|
|
t.Errorf("Failed to close database: %v", err)
|
|
}
|
|
|
|
// Try to perform operations that should fail gracefully
|
|
for _, jobID := range jobIDs {
|
|
err := db.UpdateJobStatus(jobID, "completed", "worker-1", "")
|
|
if err == nil {
|
|
t.Errorf("Expected error when updating job %s on closed database", jobID)
|
|
}
|
|
}
|
|
|
|
// Reopen database and verify recovery
|
|
newDB, err := storage.NewDBFromPath(fmt.Sprintf("%s/chaos.db", t.TempDir()))
|
|
if err != nil {
|
|
t.Fatalf("Failed to reopen database: %v", err)
|
|
}
|
|
defer func() { _ = newDB.Close() }()
|
|
|
|
err = newDB.Initialize(getChaosSchema())
|
|
if err != nil {
|
|
t.Fatalf("Failed to reinitialize database: %v", err)
|
|
}
|
|
|
|
// Verify system can recover and continue operations
|
|
newJobID := fmt.Sprintf("recovery-job-%d", time.Now().Unix())
|
|
job := &storage.Job{
|
|
ID: newJobID,
|
|
JobName: "Recovery Test Job",
|
|
Status: "pending",
|
|
Priority: 0,
|
|
}
|
|
|
|
err = newDB.CreateJob(job)
|
|
if err != nil {
|
|
t.Errorf("Failed to create job after database recovery: %v", err)
|
|
}
|
|
|
|
t.Log("Database connection failure test passed - system recovered gracefully")
|
|
}
|
|
|
|
// testRedisConnectionFailure tests system behavior when Redis fails
|
|
func testRedisConnectionFailure(t *testing.T, _ *storage.DB, rdb *redis.Client) {
|
|
// Add jobs to Redis queue
|
|
for i := 0; i < 10; i++ {
|
|
jobID := fmt.Sprintf("redis-chaos-job-%d", i)
|
|
err := rdb.LPush(context.Background(), "ml:queue", jobID).Err()
|
|
if err != nil {
|
|
t.Fatalf("Failed to add job to Redis: %v", err)
|
|
}
|
|
}
|
|
|
|
// Simulate Redis connection failure
|
|
err := rdb.Close()
|
|
if err != nil {
|
|
t.Errorf("Failed to close Redis connection: %v", err)
|
|
}
|
|
|
|
// Try to perform Redis operations that should fail
|
|
_, err = rdb.LPop(context.Background(), "ml:queue").Result()
|
|
if err == nil {
|
|
t.Error("Expected error when popping from closed Redis connection")
|
|
}
|
|
|
|
// Reconnect to Redis and verify recovery
|
|
newRdb := redis.NewClient(&redis.Options{
|
|
Addr: "localhost:6379",
|
|
Password: "",
|
|
DB: 6, // Use different DB for chaos tests
|
|
})
|
|
|
|
// Wait for Redis to be available
|
|
for i := 0; i < 10; i++ {
|
|
err := newRdb.Ping(context.Background()).Err()
|
|
if err == nil {
|
|
break
|
|
}
|
|
time.Sleep(100 * time.Millisecond)
|
|
}
|
|
|
|
// Verify system can recover and continue operations
|
|
testJobID := fmt.Sprintf("recovery-redis-job-%d", time.Now().Unix())
|
|
err = newRdb.LPush(context.Background(), "ml:queue", testJobID).Err()
|
|
if err != nil {
|
|
t.Errorf("Failed to add job to Redis after recovery: %v", err)
|
|
}
|
|
|
|
_ = newRdb.Close()
|
|
t.Log("Redis connection failure test passed - system recovered gracefully")
|
|
}
|
|
|
|
// testHighConcurrencyStress tests system under high concurrent load
|
|
func testHighConcurrencyStress(t *testing.T, db *storage.DB, rdb *redis.Client) {
|
|
numWorkers := 50
|
|
jobsPerWorker := 20
|
|
|
|
var wg sync.WaitGroup
|
|
errors := make(chan error, numWorkers*jobsPerWorker)
|
|
|
|
start := time.Now()
|
|
|
|
// Launch many concurrent workers
|
|
for worker := 0; worker < numWorkers; worker++ {
|
|
wg.Add(1)
|
|
go func(workerID int) {
|
|
defer wg.Done()
|
|
|
|
for job := 0; job < jobsPerWorker; job++ {
|
|
jobID := fmt.Sprintf("stress-job-w%d-j%d", workerID, job)
|
|
|
|
// Create job in database
|
|
dbJob := &storage.Job{
|
|
ID: jobID,
|
|
JobName: fmt.Sprintf("Stress Job W%d J%d", workerID, job),
|
|
Status: "pending",
|
|
Priority: 0,
|
|
}
|
|
|
|
err := db.CreateJob(dbJob)
|
|
if err != nil {
|
|
errors <- fmt.Errorf("failed to create job %s: %w", jobID, err)
|
|
continue
|
|
}
|
|
|
|
// Add to Redis queue
|
|
err = rdb.LPush(context.Background(), "ml:queue", jobID).Err()
|
|
if err != nil {
|
|
errors <- fmt.Errorf("failed to queue job %s: %w", jobID, err)
|
|
continue
|
|
}
|
|
|
|
// Update job status
|
|
err = db.UpdateJobStatus(jobID, "running", fmt.Sprintf("worker-%d", workerID), "")
|
|
if err != nil {
|
|
errors <- fmt.Errorf("failed to update job %s: %w", jobID, err)
|
|
continue
|
|
}
|
|
|
|
// Complete job
|
|
err = db.UpdateJobStatus(jobID, "completed", fmt.Sprintf("worker-%d", workerID), "")
|
|
if err != nil {
|
|
errors <- fmt.Errorf("failed to complete job %s: %w", jobID, err)
|
|
continue
|
|
}
|
|
|
|
// Pop from queue
|
|
_, err = rdb.LPop(context.Background(), "ml:queue").Result()
|
|
if err != nil {
|
|
errors <- fmt.Errorf("failed to pop job %s: %w", jobID, err)
|
|
continue
|
|
}
|
|
}
|
|
}(worker)
|
|
}
|
|
|
|
wg.Wait()
|
|
close(errors)
|
|
|
|
duration := time.Since(start)
|
|
totalJobs := numWorkers * jobsPerWorker
|
|
jobsPerSecond := float64(totalJobs) / duration.Seconds()
|
|
|
|
// Count errors
|
|
errorCount := 0
|
|
for err := range errors {
|
|
t.Logf("Stress test error: %v", err)
|
|
errorCount++
|
|
}
|
|
|
|
t.Logf("High concurrency stress test completed:")
|
|
t.Logf(" Total jobs: %d", totalJobs)
|
|
t.Logf(" Duration: %v", duration)
|
|
t.Logf(" Jobs per second: %.2f", jobsPerSecond)
|
|
t.Logf(" Error count: %d", errorCount)
|
|
|
|
// Verify system handled stress reasonably well
|
|
if errorCount > totalJobs/10 { // Allow up to 10% errors under stress
|
|
t.Errorf("Too many errors under stress: %d/%d", errorCount, totalJobs)
|
|
}
|
|
|
|
if jobsPerSecond < 100 { // Should handle at least 100 jobs/sec
|
|
t.Errorf("Performance too low under stress: %.2f jobs/sec", jobsPerSecond)
|
|
}
|
|
}
|
|
|
|
// testMemoryPressure tests system behavior under memory pressure
|
|
func testMemoryPressure(t *testing.T, db *storage.DB, rdb *redis.Client) {
|
|
// Create large payloads to stress memory
|
|
largePayload := make([]byte, 1024*1024) // 1MB payload
|
|
for i := range largePayload {
|
|
largePayload[i] = byte(i % 256)
|
|
}
|
|
|
|
payloadString := string(largePayload)
|
|
numJobs := 50
|
|
|
|
// Create jobs with large payloads
|
|
for i := 0; i < numJobs; i++ {
|
|
jobID := fmt.Sprintf("memory-pressure-job-%d", i)
|
|
|
|
job := &storage.Job{
|
|
ID: jobID,
|
|
JobName: fmt.Sprintf("Memory Pressure Job %d", i),
|
|
Status: "pending",
|
|
Priority: 0,
|
|
Args: payloadString,
|
|
}
|
|
|
|
err := db.CreateJob(job)
|
|
if err != nil {
|
|
t.Errorf("Failed to create large job %d: %v", i, err)
|
|
}
|
|
|
|
// Add to Redis queue
|
|
err = rdb.LPush(context.Background(), "ml:queue", jobID).Err()
|
|
if err != nil {
|
|
t.Errorf("Failed to queue large job %d: %v", i, err)
|
|
}
|
|
}
|
|
|
|
// Process jobs to test memory handling during operations
|
|
for i := 0; i < numJobs; i++ {
|
|
jobID := fmt.Sprintf("memory-pressure-job-%d", i)
|
|
|
|
// Update job status
|
|
err := db.UpdateJobStatus(jobID, "completed", "memory-worker", "")
|
|
if err != nil {
|
|
t.Errorf("Failed to update large job %d: %v", i, err)
|
|
}
|
|
|
|
// Pop from queue
|
|
_, err = rdb.LPop(context.Background(), "ml:queue").Result()
|
|
if err != nil {
|
|
t.Errorf("Failed to pop large job %d: %v", i, err)
|
|
}
|
|
}
|
|
|
|
t.Log("Memory pressure test passed - system handled large payloads")
|
|
}
|
|
|
|
// testNetworkLatency simulates network latency effects
|
|
func testNetworkLatency(t *testing.T, db *storage.DB, rdb *redis.Client) {
|
|
// Simulate operations with artificial delays
|
|
numJobs := 20
|
|
for i := 0; i < numJobs; i++ {
|
|
jobID := fmt.Sprintf("latency-job-%d", i)
|
|
|
|
// Add artificial delay to simulate network latency
|
|
time.Sleep(time.Millisecond * 10)
|
|
|
|
job := &storage.Job{
|
|
ID: jobID,
|
|
JobName: fmt.Sprintf("Latency Job %d", i),
|
|
Status: "pending",
|
|
Priority: 0,
|
|
}
|
|
|
|
err := db.CreateJob(job)
|
|
if err != nil {
|
|
t.Errorf("Failed to create latency job %d: %v", i, err)
|
|
}
|
|
|
|
// Simulate network latency for Redis operations
|
|
time.Sleep(time.Millisecond * 5)
|
|
err = rdb.LPush(context.Background(), "ml:queue", jobID).Err()
|
|
if err != nil {
|
|
t.Errorf("Failed to queue latency job %d: %v", i, err)
|
|
}
|
|
}
|
|
|
|
// Process jobs with latency simulation
|
|
for i := 0; i < numJobs; i++ {
|
|
jobID := fmt.Sprintf("latency-job-%d", i)
|
|
|
|
time.Sleep(time.Millisecond * 8)
|
|
err := db.UpdateJobStatus(jobID, "completed", "latency-worker", "")
|
|
if err != nil {
|
|
t.Errorf("Failed to complete latency job %d: %v", i, err)
|
|
}
|
|
|
|
time.Sleep(time.Millisecond * 3)
|
|
_, err = rdb.LPop(context.Background(), "ml:queue").Result()
|
|
if err != nil {
|
|
t.Errorf("Failed to pop latency job %d: %v", i, err)
|
|
}
|
|
}
|
|
|
|
t.Log("Network latency test passed - system handled delayed operations")
|
|
}
|
|
|
|
// testResourceExhaustion tests behavior when resources are exhausted
|
|
func testResourceExhaustion(t *testing.T, db *storage.DB, rdb *redis.Client) {
|
|
// Create many simultaneous operations to exhaust resources
|
|
numOperations := 1000
|
|
done := make(chan bool, numOperations)
|
|
errors := make(chan error, numOperations)
|
|
|
|
for i := 0; i < numOperations; i++ {
|
|
go func(opID int) {
|
|
defer func() { done <- true }()
|
|
|
|
jobID := fmt.Sprintf("exhaustion-job-%d", opID)
|
|
|
|
// Rapid-fire operations to stress the system
|
|
job := &storage.Job{
|
|
ID: jobID,
|
|
JobName: fmt.Sprintf("Exhaustion Job %d", opID),
|
|
Status: "pending",
|
|
Priority: 0,
|
|
}
|
|
|
|
err := db.CreateJob(job)
|
|
if err != nil {
|
|
errors <- fmt.Errorf("create failed for job %d: %w", opID, err)
|
|
return
|
|
}
|
|
|
|
err = rdb.LPush(context.Background(), "ml:queue", jobID).Err()
|
|
if err != nil {
|
|
errors <- fmt.Errorf("queue failed for job %d: %w", opID, err)
|
|
return
|
|
}
|
|
|
|
err = db.UpdateJobStatus(jobID, "completed", "exhaustion-worker", "")
|
|
if err != nil {
|
|
errors <- fmt.Errorf("update failed for job %d: %w", opID, err)
|
|
return
|
|
}
|
|
}(i)
|
|
}
|
|
|
|
// Wait for all operations to complete
|
|
for i := 0; i < numOperations; i++ {
|
|
<-done
|
|
}
|
|
close(errors)
|
|
|
|
// Count errors
|
|
errorCount := 0
|
|
for err := range errors {
|
|
t.Logf("Resource exhaustion error: %v", err)
|
|
errorCount++
|
|
}
|
|
|
|
t.Logf("Resource exhaustion test completed:")
|
|
t.Logf(" Total operations: %d", numOperations)
|
|
t.Logf(" Error count: %d", errorCount)
|
|
t.Logf(" Success rate: %.2f%%", float64(numOperations-errorCount)/float64(numOperations)*100)
|
|
|
|
// Allow some errors under extreme resource pressure
|
|
if errorCount > numOperations/20 { // Allow up to 5% errors
|
|
t.Errorf("Too many errors under resource exhaustion: %d/%d", errorCount, numOperations)
|
|
}
|
|
}
|
|
|
|
// Helper functions
|
|
|
|
func setupChaosRedis(t *testing.T) *redis.Client {
|
|
rdb := redis.NewClient(&redis.Options{
|
|
Addr: "localhost:6379",
|
|
Password: "",
|
|
DB: 6, // Use DB 6 for chaos tests
|
|
})
|
|
|
|
ctx := context.Background()
|
|
if err := rdb.Ping(ctx).Err(); err != nil {
|
|
t.Skipf("Redis not available for chaos tests: %v", err)
|
|
return nil
|
|
}
|
|
|
|
// Clean up the test database
|
|
rdb.FlushDB(ctx)
|
|
|
|
t.Cleanup(func() {
|
|
rdb.FlushDB(ctx)
|
|
_ = rdb.Close()
|
|
})
|
|
|
|
return rdb
|
|
}
|
|
|
|
// setupChaosRedisIsolated creates a Redis client without cleanup handlers
|
|
// for tests that intentionally close the connection
|
|
func setupChaosRedisIsolated(t *testing.T) *redis.Client {
|
|
rdb := redis.NewClient(&redis.Options{
|
|
Addr: "localhost:6379",
|
|
Password: "",
|
|
DB: 6, // Use DB 6 for chaos tests
|
|
})
|
|
|
|
ctx := context.Background()
|
|
if err := rdb.Ping(ctx).Err(); err != nil {
|
|
t.Skipf("Redis not available for chaos tests: %v", err)
|
|
return nil
|
|
}
|
|
|
|
// Clean up the test database
|
|
rdb.FlushDB(ctx)
|
|
|
|
// No cleanup handler - test will close this intentionally
|
|
return rdb
|
|
}
|
|
|
|
func createTestJobs(t *testing.T, db *storage.DB, count int) []string {
|
|
jobIDs := make([]string, count)
|
|
for i := 0; i < count; i++ {
|
|
jobID := fmt.Sprintf("chaos-test-job-%d", i)
|
|
jobIDs[i] = jobID
|
|
|
|
job := &storage.Job{
|
|
ID: jobID,
|
|
JobName: fmt.Sprintf("Chaos Test Job %d", i),
|
|
Status: "pending",
|
|
Priority: 0,
|
|
}
|
|
|
|
err := db.CreateJob(job)
|
|
if err != nil {
|
|
t.Fatalf("Failed to create test job %d: %v", i, err)
|
|
}
|
|
}
|
|
return jobIDs
|
|
}
|
|
|
|
func getChaosSchema() string {
|
|
return fixtures.TestSchema
|
|
}
|