- Update E2E tests for consolidated docker-compose.test.yml
- Remove references to obsolete logs-debug.yml
- Enhance test fixtures and utilities
- Improve integration test coverage for KMS, queue, scheduler
- Update unit tests for config constants and worker execution
- Modernize cleanup-status.sh with new Makefile targets
298 lines
8.4 KiB
Go
298 lines
8.4 KiB
Go
package tests
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"testing"
|
|
"time"
|
|
|
|
tests "github.com/jfraeys/fetch_ml/tests/fixtures"
|
|
)
|
|
|
|
// Task lifecycle status values shared by the tests below. They mirror the
// directory names used for the job lifecycle (pending/running/finished) and
// the status strings stored in the task queue.
const (
	statusRunning   = "running"   // task has been picked up and is executing
	statusCompleted = "completed" // task finished successfully
)
|
|
|
|
// TestIntegrationE2E tests the complete end-to-end workflow:
// job directory setup -> Redis-backed task queue -> simulated worker
// execution -> status and metrics verification.
//
// The test is skipped when Redis is not reachable at localhost:6379 (DB 15).
// NOTE(review): t.Parallel() plus a fixed Redis DB assumes no sibling test
// uses DB 15 concurrently — confirm against the other integration tests.
func TestIntegrationE2E(t *testing.T) {
	t.Parallel() // Enable parallel execution

	testDir := t.TempDir()
	ctx := context.Background()

	// Create test job directory structure mirroring the scheduler's
	// pending/running/finished job lifecycle.
	jobBaseDir := filepath.Join(testDir, "jobs")
	pendingDir := filepath.Join(jobBaseDir, "pending")
	runningDir := filepath.Join(jobBaseDir, "running")
	finishedDir := filepath.Join(jobBaseDir, "finished")

	for _, dir := range []string{pendingDir, runningDir, finishedDir} {
		if err := os.MkdirAll(dir, 0750); err != nil {
			t.Fatalf("Failed to create directory %s: %v", dir, err)
		}
	}

	// Create standard ML experiment (zero-install style)
	jobDir := filepath.Join(pendingDir, "test_job")
	if err := os.MkdirAll(jobDir, 0750); err != nil {
		t.Fatalf("Failed to create job directory: %v", err)
	}

	// Create standard ML project files
	tEntrypoint := filepath.Join(jobDir, "train.py")
	requirementsFile := filepath.Join(jobDir, "requirements.txt")
	readmeFile := filepath.Join(jobDir, "README.md")

	// Create train.py (standard ML script). The script simulates a short
	// training loop and writes results.json into --output_dir; the worker
	// execution step below runs it via python3.
	trainCode := `#!/usr/bin/env python3
import argparse
import json
import logging
import time
from pathlib import Path

def main():
    parser = argparse.ArgumentParser(description="Train ML model")
    parser.add_argument("--epochs", type=int, default=10, help="Number of epochs")
    parser.add_argument("--lr", type=float, default=0.001, help="Learning rate")
    parser.add_argument("--batch_size", type=int, default=32, help="Batch size")
    parser.add_argument("--output_dir", type=str, required=True, help="Output directory")
    parser.add_argument("--data_dir", type=str, help="Data directory")
    parser.add_argument("--datasets", type=str, help="Comma-separated datasets")

    args = parser.parse_args()

    # Setup logging
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    logger = logging.getLogger(__name__)

    # Create output directory
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    logger.info(f"Starting training: {args.epochs} epochs, lr={args.lr}, batch_size={args.batch_size}")

    if args.datasets:
        logger.info(f"Using datasets: {args.datasets}")

    # Simulate training
    for epoch in range(args.epochs):
        loss = 1.0 - (epoch * 0.08)
        accuracy = 0.4 + (epoch * 0.055)

        logger.info(f"Epoch {epoch + 1}/{args.epochs}: loss={loss:.4f}, accuracy={accuracy:.4f}")
        time.sleep(0.01) # Minimal delay for testing

    # Save results
    results = {
        "model_type": "test_model",
        "epochs_trained": args.epochs,
        "learning_rate": args.lr,
        "batch_size": args.batch_size,
        "final_accuracy": accuracy,
        "final_loss": loss,
        "datasets": args.datasets.split(",") if args.datasets else []
    }

    results_file = output_dir / "results.json"
    with open(results_file, 'w') as f:
        json.dump(results, f, indent=2)

    logger.info(f"Training completed! Results saved to {results_file}")

if __name__ == "__main__":
    main()
`

	//nolint:gosec // G306: Script needs execute permissions
	if err := os.WriteFile(tEntrypoint, []byte(trainCode), 0750); err != nil {
		t.Fatalf("Failed to create train.py: %v", err)
	}

	// Create requirements.txt
	requirements := `torch>=1.9.0
numpy>=1.21.0
scikit-learn>=1.0.0
`

	if err := os.WriteFile(requirementsFile, []byte(requirements), 0600); err != nil {
		t.Fatalf("Failed to create requirements.txt: %v", err)
	}

	// Create README.md
	readme := `# Test Experiment

This is a test experiment for integration testing.

## Usage
python train.py --epochs 2 --lr 0.01 --output_dir ./results
`

	if err := os.WriteFile(readmeFile, []byte(readme), 0600); err != nil {
		t.Fatalf("Failed to create README.md: %v", err)
	}

	// Setup test Redis using fixtures. DB 15 keeps test keys away from any
	// default-DB data on the same instance.
	redisHelper, err := tests.NewRedisHelper("localhost:6379", 15)
	if err != nil {
		t.Skipf("Redis not available, skipping integration test: %v", err)
	}
	defer func() { _ = redisHelper.Close() }()

	// Test Redis connection; skip (not fail) so the suite passes without
	// a local Redis.
	if err := redisHelper.GetClient().Ping(ctx).Err(); err != nil {
		t.Skipf("Redis not available, skipping integration test: %v", err)
	}

	// Create task queue backed by the same Redis instance and DB.
	taskQueue, err := tests.NewTaskQueue(&tests.Config{
		RedisAddr: "localhost:6379",
		RedisDB:   15,
	})
	if err != nil {
		t.Fatalf("Failed to create task queue: %v", err)
	}
	defer func() { _ = taskQueue.Close() }()

	// Create ML server (local mode)
	mlServer := tests.NewMLServer()

	// Test 1: Enqueue task (as would happen from TUI)
	task, err := taskQueue.EnqueueTask("test_job", "--epochs 2 --lr 0.01", 5)
	if err != nil {
		t.Fatalf("Failed to enqueue task: %v", err)
	}

	if task.ID == "" {
		t.Fatal("Task ID should not be empty")
	}

	if task.JobName != "test_job" {
		t.Errorf("Expected job name 'test_job', got '%s'", task.JobName)
	}

	if task.Status != "queued" {
		t.Errorf("Expected status 'queued', got '%s'", task.Status)
	}

	// Test 2: Get next task (as worker would)
	nextTask, err := taskQueue.GetNextTask()
	if err != nil {
		t.Fatalf("Failed to get next task: %v", err)
	}

	if nextTask == nil {
		t.Fatal("Should have retrieved a task")
	}

	// The dequeued task must be the one enqueued in Test 1.
	if nextTask.ID != task.ID {
		t.Errorf("Expected task ID %s, got %s", task.ID, nextTask.ID)
	}

	// Test 3: Update task status to running
	now := time.Now()
	nextTask.Status = statusRunning
	nextTask.StartedAt = &now

	if err := taskQueue.UpdateTask(nextTask); err != nil {
		t.Fatalf("Failed to update task: %v", err)
	}

	// Test 4: Execute job (zero-install style). This moves the job through
	// pending -> running -> finished and runs train.py via the ML server.
	if err := executeZeroInstallJob(mlServer, nextTask, jobBaseDir, tEntrypoint); err != nil {
		t.Fatalf("Failed to execute job: %v", err)
	}

	// Test 5: Update task status to completed
	endTime := time.Now()
	nextTask.Status = statusCompleted
	nextTask.EndedAt = &endTime

	if err := taskQueue.UpdateTask(nextTask); err != nil {
		t.Fatalf("Failed to update final task status: %v", err)
	}

	// Test 6: Verify results round-trip through the queue store.
	retrievedTask, err := taskQueue.GetTask(nextTask.ID)
	if err != nil {
		t.Fatalf("Failed to retrieve completed task: %v", err)
	}

	if retrievedTask.Status != statusCompleted {
		t.Errorf("Expected status 'completed', got '%s'", retrievedTask.Status)
	}

	if retrievedTask.StartedAt == nil {
		t.Error("StartedAt should not be nil")
	}

	if retrievedTask.EndedAt == nil {
		t.Error("EndedAt should not be nil")
	}

	// Test 7: Check job status
	jobStatus, err := taskQueue.GetJobStatus("test_job")
	if err != nil {
		t.Fatalf("Failed to get job status: %v", err)
	}

	if jobStatus["status"] != statusCompleted {
		t.Errorf("Expected job status 'completed', got '%s'", jobStatus["status"])
	}

	// Test 8: Record and check metrics
	if err := taskQueue.RecordMetric("test_job", "accuracy", 0.95); err != nil {
		t.Fatalf("Failed to record metric: %v", err)
	}

	metrics, err := taskQueue.GetMetrics("test_job")
	if err != nil {
		t.Fatalf("Failed to get metrics: %v", err)
	}

	// Metrics come back as strings from the queue store.
	if metrics["accuracy"] != "0.95" {
		t.Errorf("Expected accuracy '0.95', got '%s'", metrics["accuracy"])
	}

	t.Log("End-to-end test completed successfully")
}
|
|
|
|
// executeZeroInstallJob simulates zero-install job execution
|
|
func executeZeroInstallJob(server *tests.MLServer, task *tests.Task, baseDir, tEntrypoint string) error {
|
|
// Move job to running directory
|
|
pendingPath := filepath.Join(baseDir, "pending", task.JobName)
|
|
runningPath := filepath.Join(baseDir, statusRunning, task.JobName)
|
|
|
|
if err := os.Rename(pendingPath, runningPath); err != nil {
|
|
return fmt.Errorf("failed to move job to running: %w", err)
|
|
}
|
|
|
|
// Execute the job (zero-install style - direct Python execution)
|
|
outputDir := filepath.Join(runningPath, "results")
|
|
if err := os.MkdirAll(outputDir, 0750); err != nil {
|
|
return fmt.Errorf("failed to create output directory: %w", err)
|
|
}
|
|
|
|
cmd := fmt.Sprintf("cd %s && python3 %s --output_dir %s %s",
|
|
runningPath,
|
|
filepath.Base(tEntrypoint),
|
|
outputDir,
|
|
task.Args,
|
|
)
|
|
|
|
output, err := server.Exec(cmd)
|
|
if err != nil {
|
|
return fmt.Errorf("job execution failed: %w, output: %s", err, output)
|
|
}
|
|
|
|
// Move to finished directory
|
|
finishedPath := filepath.Join(baseDir, "finished", task.JobName)
|
|
if err := os.Rename(runningPath, finishedPath); err != nil {
|
|
return fmt.Errorf("failed to move job to finished: %w", err)
|
|
}
|
|
|
|
return nil
|
|
}
|