fetch_ml/tests/integration/queue_execution_test.go
Jeremie Fraeys c980167041 test: implement comprehensive test suite with multiple test types
- Add end-to-end tests for complete workflow validation
- Include integration tests for API and database interactions
- Add unit tests for all major components and utilities
- Include performance tests for payload handling
- Add CLI API integration tests
- Include Podman container integration tests
- Add WebSocket and queue execution tests
- Include shell script tests for setup validation

Provides comprehensive test coverage ensuring platform reliability
and functionality across all components and interactions.
2025-12-04 16:55:13 -05:00

465 lines
16 KiB
Go

package tests
import (
"fmt"
"os"
"path/filepath"
"testing"
"time"
tests "github.com/jfraeys/fetch_ml/tests/fixtures"
)
// TestQueueExecution tests that experiments are processed sequentially through the queue
// by simulating a directory-based job queue (pending -> running -> finished) and
// asserting that exactly one job is "running" at a time, in priority order.
func TestQueueExecution(t *testing.T) {
	t.Parallel() // Enable parallel execution
	// Scratch root shared by both subtests: SequentialProcessing consumes the
	// queue entries that QueueSubmission creates (t.Run subtests here run in order).
	testDir := t.TempDir()
	// Use fixtures for examples directory operations
	examplesDir := tests.NewExamplesDir(filepath.Join("..", "fixtures", "examples"))
	// Test 1: Create multiple experiments from actual examples and add them to queue
	t.Run("QueueSubmission", func(t *testing.T) {
		// Create server queue structure
		queueDir := filepath.Join(testDir, "server", "home", "mluser", "ml_jobs", "pending")
		// Use actual examples with different priorities
		experiments := []struct {
			name       string
			priority   int
			exampleDir string
		}{
			{"sklearn_classification", 1, "sklearn_project"},
			{"xgboost_classification", 2, "xgboost_project"},
			{"pytorch_nn", 3, "pytorch_project"},
		}
		for _, exp := range experiments {
			// Copy actual example files using fixtures
			sourceDir := examplesDir.GetPath(exp.exampleDir)
			experimentDir := filepath.Join(testDir, exp.name)
			// Copy all files from example directory
			if err := tests.CopyDir(sourceDir, experimentDir); err != nil {
				t.Fatalf("Failed to copy example %s: %v", exp.exampleDir, err)
			}
			// Add to queue (simulate job submission). The directory name encodes
			// experiment name, timestamp, and priority so the priority glob
			// in SequentialProcessing can locate it.
			timestamp := time.Now().Format("20060102_150405")
			jobName := fmt.Sprintf("%s_%s_priority_%d", exp.name, timestamp, exp.priority)
			jobDir := filepath.Join(queueDir, jobName)
			if err := os.MkdirAll(jobDir, 0755); err != nil {
				t.Fatalf("Failed to create queue directory for %s: %v", exp.name, err)
			}
			// Copy experiment files to queue
			files := []string{"train.py", "requirements.txt", "README.md"}
			for _, file := range files {
				src := filepath.Join(experimentDir, file)
				dst := filepath.Join(jobDir, file)
				if _, err := os.Stat(src); os.IsNotExist(err) {
					continue // Skip if file doesn't exist
				}
				data, err := os.ReadFile(src)
				if err != nil {
					t.Fatalf("Failed to read %s for %s: %v", file, exp.name, err)
				}
				// NOTE(review): 0755 marks requirements.txt/README.md executable too;
				// presumably only train.py needs the execute bit — confirm.
				if err := os.WriteFile(dst, data, 0755); err != nil {
					t.Fatalf("Failed to copy %s for %s: %v", file, exp.name, err)
				}
			}
			// Create queue metadata file describing the pending job
			queueMetadata := filepath.Join(jobDir, "queue_metadata.json")
			metadata := fmt.Sprintf(`{
"job_name": "%s",
"experiment_name": "%s",
"example_source": "%s",
"priority": %d,
"status": "pending",
"submitted_at": "%s"
}`, jobName, exp.name, exp.exampleDir, exp.priority, time.Now().Format(time.RFC3339))
			if err := os.WriteFile(queueMetadata, []byte(metadata), 0644); err != nil {
				t.Fatalf("Failed to create queue metadata for %s: %v", exp.name, err)
			}
		}
		// Verify all experiments are in queue
		for _, exp := range experiments {
			// Note: a Glob error is reported with the same (missing-job) message,
			// since queueJobs is empty on error.
			queueJobs, err := filepath.Glob(filepath.Join(queueDir, fmt.Sprintf("%s_*_priority_%d", exp.name, exp.priority)))
			if err != nil || len(queueJobs) == 0 {
				t.Errorf("Queue job should exist for %s with priority %d", exp.name, exp.priority)
			}
		}
	})
	// Test 2: Simulate sequential processing (queue behavior)
	t.Run("SequentialProcessing", func(t *testing.T) {
		// Same queue layout that QueueSubmission populated above.
		pendingDir := filepath.Join(testDir, "server", "home", "mluser", "ml_jobs", "pending")
		runningDir := filepath.Join(testDir, "server", "home", "mluser", "ml_jobs", "running")
		finishedDir := filepath.Join(testDir, "server", "home", "mluser", "ml_jobs", "finished")
		// Create directories if they don't exist
		if err := os.MkdirAll(runningDir, 0755); err != nil {
			t.Fatalf("Failed to create running directory: %v", err)
		}
		if err := os.MkdirAll(finishedDir, 0755); err != nil {
			t.Fatalf("Failed to create finished directory: %v", err)
		}
		// Process jobs in priority order (1, 2, 3)
		for priority := 1; priority <= 3; priority++ {
			// Find job with this priority
			jobs, err := filepath.Glob(filepath.Join(pendingDir, fmt.Sprintf("*_priority_%d", priority)))
			if err != nil {
				t.Fatalf("Failed to find jobs with priority %d: %v", priority, err)
			}
			if len(jobs) == 0 {
				t.Fatalf("No job found with priority %d", priority)
			}
			jobDir := jobs[0] // Take first job with this priority
			jobName := filepath.Base(jobDir)
			// Move from pending to running (rename models the queue "claim" step)
			runningJobDir := filepath.Join(runningDir, jobName)
			if err := os.Rename(jobDir, runningJobDir); err != nil {
				t.Fatalf("Failed to move job %s to running: %v", jobName, err)
			}
			// Verify only one job is running at this time
			runningJobs, err := filepath.Glob(filepath.Join(runningDir, "*"))
			if err != nil || len(runningJobs) != 1 {
				t.Errorf("Expected exactly 1 running job, found %d", len(runningJobs))
			}
			// Simulate execution by creating results (using actual framework patterns)
			outputDir := filepath.Join(runningJobDir, "results")
			if err := os.MkdirAll(outputDir, 0755); err != nil {
				t.Fatalf("Failed to create output directory for %s: %v", jobName, err)
			}
			// Read the actual train.py to determine framework
			trainScript := filepath.Join(runningJobDir, "train.py")
			scriptContent, err := os.ReadFile(trainScript)
			if err != nil {
				t.Fatalf("Failed to read train.py for %s: %v", jobName, err)
			}
			// Determine framework from script content (first keyword match wins;
			// "torch" is checked before "tensorflow" so mixed imports resolve to pytorch)
			framework := "unknown"
			scriptStr := string(scriptContent)
			if contains(scriptStr, "sklearn") {
				framework = "scikit-learn"
			} else if contains(scriptStr, "xgboost") {
				framework = "xgboost"
			} else if contains(scriptStr, "torch") {
				framework = "pytorch"
			} else if contains(scriptStr, "tensorflow") {
				framework = "tensorflow"
			} else if contains(scriptStr, "statsmodels") {
				framework = "statsmodels"
			}
			// Write a synthetic results file; started_at is backdated by the
			// priority (in minutes) to fake a plausible execution window.
			resultsFile := filepath.Join(outputDir, "results.json")
			results := fmt.Sprintf(`{
"job_name": "%s",
"framework": "%s",
"priority": %d,
"status": "completed",
"execution_order": %d,
"started_at": "%s",
"completed_at": "%s",
"source": "actual_example"
}`, jobName, framework, priority, priority, time.Now().Add(-time.Duration(priority)*time.Minute).Format(time.RFC3339), time.Now().Format(time.RFC3339))
			if err := os.WriteFile(resultsFile, []byte(results), 0644); err != nil {
				t.Fatalf("Failed to create results for %s: %v", jobName, err)
			}
			// Move from running to finished
			finishedJobDir := filepath.Join(finishedDir, jobName)
			if err := os.Rename(runningJobDir, finishedJobDir); err != nil {
				t.Fatalf("Failed to move job %s to finished: %v", jobName, err)
			}
			// Verify job is no longer in pending or running
			if _, err := os.Stat(jobDir); !os.IsNotExist(err) {
				t.Errorf("Job %s should no longer be in pending directory", jobName)
			}
			if _, err := os.Stat(runningJobDir); !os.IsNotExist(err) {
				t.Errorf("Job %s should no longer be in running directory", jobName)
			}
		}
		// Verify all jobs completed
		finishedJobs, err := filepath.Glob(filepath.Join(finishedDir, "*"))
		if err != nil || len(finishedJobs) != 3 {
			t.Errorf("Expected 3 finished jobs, got %d", len(finishedJobs))
		}
		// Verify queue is empty
		pendingJobs, err := filepath.Glob(filepath.Join(pendingDir, "*"))
		if err != nil || len(pendingJobs) != 0 {
			t.Errorf("Expected 0 pending jobs after processing, found %d", len(pendingJobs))
		}
		// Verify no jobs are running
		runningJobs, err := filepath.Glob(filepath.Join(runningDir, "*"))
		if err != nil || len(runningJobs) != 0 {
			t.Errorf("Expected 0 running jobs after processing, found %d", len(runningJobs))
		}
	})
}
// TestQueueCapacity tests queue capacity and resource limits by enqueueing more
// jobs than run concurrently and draining them one at a time, asserting the
// running directory never holds more than a single job.
func TestQueueCapacity(t *testing.T) {
	t.Parallel() // Enable parallel execution
	// Scratch root for this test's simulated server queue tree.
	testDir := t.TempDir()
	t.Run("QueueCapacityLimits", func(t *testing.T) {
		// Use fixtures for examples directory operations
		examplesDir := tests.NewExamplesDir(filepath.Join("..", "fixtures", "examples"))
		pendingDir := filepath.Join(testDir, "server", "home", "mluser", "ml_jobs", "pending")
		runningDir := filepath.Join(testDir, "server", "home", "mluser", "ml_jobs", "running")
		finishedDir := filepath.Join(testDir, "server", "home", "mluser", "ml_jobs", "finished")
		// Create directories
		if err := os.MkdirAll(pendingDir, 0755); err != nil {
			t.Fatalf("Failed to create pending directory: %v", err)
		}
		if err := os.MkdirAll(runningDir, 0755); err != nil {
			t.Fatalf("Failed to create running directory: %v", err)
		}
		if err := os.MkdirAll(finishedDir, 0755); err != nil {
			t.Fatalf("Failed to create finished directory: %v", err)
		}
		// Create more jobs than server can handle simultaneously using actual examples
		examples := []string{"standard_ml_project", "sklearn_project", "xgboost_project", "pytorch_project", "tensorflow_project"}
		totalJobs := len(examples)
		for i, example := range examples {
			jobName := fmt.Sprintf("capacity_test_job_%d", i)
			jobDir := filepath.Join(pendingDir, jobName)
			if err := os.MkdirAll(jobDir, 0755); err != nil {
				t.Fatalf("Failed to create job directory %s: %v", jobDir, err)
			}
			// Copy actual example files using fixtures
			sourceDir := examplesDir.GetPath(example)
			// Copy actual example files
			if _, err := os.Stat(sourceDir); os.IsNotExist(err) {
				// Create minimal files if example doesn't exist — keeps the test
				// usable when a fixture project is absent.
				trainScript := filepath.Join(jobDir, "train.py")
				// NOTE(review): the embedded script below appears to have lost its
				// indentation; confirm it is valid Python if anything executes it.
				script := fmt.Sprintf(`#!/usr/bin/env python3
import json, time
from pathlib import Path
def main():
results = {
"job_id": %d,
"example": "%s",
"status": "completed",
"completion_time": time.strftime("%%Y-%%m-%%d %%H:%%M:%%S")
}
output_dir = Path("./results")
output_dir.mkdir(parents=True, exist_ok=True)
with open(output_dir / "results.json", "w") as f:
json.dump(results, f, indent=2)
if __name__ == "__main__":
main()
`, i, example)
				if err := os.WriteFile(trainScript, []byte(script), 0755); err != nil {
					t.Fatalf("Failed to create train script for job %d: %v", i, err)
				}
			} else {
				// Copy actual example files
				files := []string{"train.py", "requirements.txt"}
				for _, file := range files {
					src := filepath.Join(sourceDir, file)
					dst := filepath.Join(jobDir, file)
					if _, err := os.Stat(src); os.IsNotExist(err) {
						continue // Skip if file doesn't exist
					}
					data, err := os.ReadFile(src)
					if err != nil {
						t.Fatalf("Failed to read %s for job %d: %v", file, i, err)
					}
					if err := os.WriteFile(dst, data, 0755); err != nil {
						t.Fatalf("Failed to copy %s for job %d: %v", file, i, err)
					}
				}
			}
		}
		// Verify all jobs are in pending queue
		pendingJobs, err := filepath.Glob(filepath.Join(pendingDir, "capacity_test_job_*"))
		if err != nil || len(pendingJobs) != totalJobs {
			t.Errorf("Expected %d pending jobs, found %d", totalJobs, len(pendingJobs))
		}
		// Process one job at a time (sequential execution)
		for i := 0; i < totalJobs; i++ {
			// Move one job to running
			jobName := fmt.Sprintf("capacity_test_job_%d", i)
			pendingJobDir := filepath.Join(pendingDir, jobName)
			runningJobDir := filepath.Join(runningDir, jobName)
			if err := os.Rename(pendingJobDir, runningJobDir); err != nil {
				t.Fatalf("Failed to move job %d to running: %v", i, err)
			}
			// Verify only one job is running
			runningJobs, err := filepath.Glob(filepath.Join(runningDir, "*"))
			if err != nil || len(runningJobs) != 1 {
				t.Errorf("Expected exactly 1 running job, found %d", len(runningJobs))
			}
			// Simulate job completion
			time.Sleep(5 * time.Millisecond) // Reduced from 10ms
			// Move to finished
			finishedJobDir := filepath.Join(finishedDir, jobName)
			if err := os.Rename(runningJobDir, finishedJobDir); err != nil {
				t.Fatalf("Failed to move job %d to finished: %v", i, err)
			}
			// Verify no jobs are running between jobs
			runningJobs, err = filepath.Glob(filepath.Join(runningDir, "*"))
			if err != nil || len(runningJobs) != 0 {
				t.Errorf("Expected 0 running jobs between jobs, found %d", len(runningJobs))
			}
		}
		// Verify all jobs completed
		finishedJobs, err := filepath.Glob(filepath.Join(finishedDir, "capacity_test_job_*"))
		if err != nil || len(finishedJobs) != totalJobs {
			t.Errorf("Expected %d finished jobs, found %d", totalJobs, len(finishedJobs))
		}
		// Verify queue is empty
		pendingJobs, err = filepath.Glob(filepath.Join(pendingDir, "capacity_test_job_*"))
		if err != nil || len(pendingJobs) != 0 {
			t.Errorf("Expected 0 pending jobs after processing, found %d", len(pendingJobs))
		}
	})
}
// TestResourceIsolation tests that experiments have isolated resources: jobs
// created with an identical timestamp still get distinct per-job results
// directories whose contents do not collide.
func TestResourceIsolation(t *testing.T) {
	t.Parallel() // Enable parallel execution
	testDir := t.TempDir()
	t.Run("OutputDirectoryIsolation", func(t *testing.T) {
		// Use fixtures for examples directory operations
		examplesDir := tests.NewExamplesDir(filepath.Join("..", "fixtures", "examples"))
		// Create multiple experiments with same timestamp using actual examples;
		// the fixed timestamp means only the exp index keeps job names distinct.
		timestamp := "20231201_143022"
		examples := []string{"sklearn_project", "xgboost_project", "pytorch_project"}
		runningDir := filepath.Join(testDir, "server", "home", "mluser", "ml_jobs", "running")
		for i, expName := range examples {
			jobName := fmt.Sprintf("exp%d_%s", i, timestamp)
			outputDir := filepath.Join(runningDir, jobName, "results")
			if err := os.MkdirAll(outputDir, 0755); err != nil {
				t.Fatalf("Failed to create output directory: %v", err)
			}
			// Copy actual example files using fixtures
			sourceDir := examplesDir.GetPath(expName)
			// Read actual example to create realistic results; a missing or
			// unreadable train.py silently falls back to "unknown".
			trainScript := filepath.Join(sourceDir, "train.py")
			framework := "unknown"
			if content, err := os.ReadFile(trainScript); err == nil {
				scriptStr := string(content)
				if contains(scriptStr, "sklearn") {
					framework = "scikit-learn"
				} else if contains(scriptStr, "xgboost") {
					framework = "xgboost"
				} else if contains(scriptStr, "torch") {
					framework = "pytorch"
				}
			}
			// Create unique results file based on actual framework; unique_id
			// mixes the index with a nanosecond clock reading.
			resultsFile := filepath.Join(outputDir, "results.json")
			results := fmt.Sprintf(`{
"experiment": "exp%d",
"framework": "%s",
"job_name": "%s",
"output_dir": "%s",
"example_source": "%s",
"unique_id": "exp%d_%d"
}`, i, framework, jobName, outputDir, expName, i, time.Now().UnixNano())
			if err := os.WriteFile(resultsFile, []byte(results), 0644); err != nil {
				t.Fatalf("Failed to create results for %s: %v", expName, err)
			}
		}
		// Verify each experiment has its own isolated output directory
		for i, expName := range examples {
			jobName := fmt.Sprintf("exp%d_%s", i, timestamp)
			outputDir := filepath.Join(runningDir, jobName, "results")
			resultsFile := filepath.Join(outputDir, "results.json")
			if _, err := os.Stat(resultsFile); os.IsNotExist(err) {
				t.Errorf("Results file should exist for %s in isolated directory", expName)
			}
			// Verify content is unique
			content, err := os.ReadFile(resultsFile)
			if err != nil {
				t.Fatalf("Failed to read results for %s: %v", expName, err)
			}
			if !contains(string(content), fmt.Sprintf("exp%d", i)) {
				t.Errorf("Results file should contain experiment ID exp%d", i)
			}
			if !contains(string(content), expName) {
				t.Errorf("Results file should contain example source %s", expName)
			}
		}
	})
}
// contains reports whether substr occurs anywhere within s.
// The original implementation special-cased equality, prefix, and suffix
// before delegating to a scan — all cases the scan itself already covers —
// so this is the equivalent single linear scan, kept self-contained.
// An empty substr always matches, mirroring strings.Contains semantics.
func contains(s, substr string) bool {
	// The <= bound admits a zero-length window, so contains(s, "") is true.
	// Matching is byte-wise, so offsets may fall mid-rune in UTF-8 text.
	for i := 0; i+len(substr) <= len(s); i++ {
		if s[i:i+len(substr)] == substr {
			return true
		}
	}
	return false
}
// findSubstring performs a naive linear scan for substr within s, returning
// true as soon as a byte-for-byte window match is found.
func findSubstring(s, substr string) bool {
	// Hoist the last valid starting offset; when substr is longer than s the
	// bound is negative and the loop never runs.
	last := len(s) - len(substr)
	for pos := 0; pos <= last; pos++ {
		if substr == s[pos:pos+len(substr)] {
			return true
		}
	}
	return false
}