- Add end-to-end tests for complete workflow validation
- Include integration tests for API and database interactions
- Add unit tests for all major components and utilities
- Include performance tests for payload handling
- Add CLI API integration tests
- Include Podman container integration tests
- Add WebSocket and queue execution tests
- Include shell script tests for setup validation

Provides comprehensive test coverage, ensuring platform reliability and functionality across all components and interactions.
465 lines
16 KiB
Go
465 lines
16 KiB
Go
package tests
|
|
|
|
import (
	"fmt"
	"os"
	"path/filepath"
	"strings"
	"testing"
	"time"

	tests "github.com/jfraeys/fetch_ml/tests/fixtures"
)
|
|
|
|
// TestQueueExecution tests that experiments are processed sequentially through the queue.
//
// The queue is modeled purely on disk: jobs are directories that move through
// pending -> running -> finished under a simulated server home. Job payloads are
// copied from the real example projects in tests/fixtures/examples.
//
// NOTE(review): the two subtests are order-dependent — "SequentialProcessing"
// consumes the jobs that "QueueSubmission" created in the shared testDir. t.Run
// subtests execute sequentially (neither calls t.Parallel), so this ordering holds.
func TestQueueExecution(t *testing.T) {
	t.Parallel() // Enable parallel execution

	// Shared root for both subtests; cleaned up automatically by the testing package.
	testDir := t.TempDir()

	// Use fixtures for examples directory operations
	examplesDir := tests.NewExamplesDir(filepath.Join("..", "fixtures", "examples"))

	// Test 1: Create multiple experiments from actual examples and add them to queue
	t.Run("QueueSubmission", func(t *testing.T) {
		// Create server queue structure
		queueDir := filepath.Join(testDir, "server", "home", "mluser", "ml_jobs", "pending")

		// Use actual examples with different priorities
		experiments := []struct {
			name       string
			priority   int
			exampleDir string
		}{
			{"sklearn_classification", 1, "sklearn_project"},
			{"xgboost_classification", 2, "xgboost_project"},
			{"pytorch_nn", 3, "pytorch_project"},
		}

		for _, exp := range experiments {
			// Copy actual example files using fixtures
			sourceDir := examplesDir.GetPath(exp.exampleDir)
			experimentDir := filepath.Join(testDir, exp.name)

			// Copy all files from example directory
			if err := tests.CopyDir(sourceDir, experimentDir); err != nil {
				t.Fatalf("Failed to copy example %s: %v", exp.exampleDir, err)
			}

			// Add to queue (simulate job submission).
			// The priority suffix in the job name is what SequentialProcessing
			// later globs on to pick jobs in priority order.
			timestamp := time.Now().Format("20060102_150405")
			jobName := fmt.Sprintf("%s_%s_priority_%d", exp.name, timestamp, exp.priority)
			jobDir := filepath.Join(queueDir, jobName)

			if err := os.MkdirAll(jobDir, 0755); err != nil {
				t.Fatalf("Failed to create queue directory for %s: %v", exp.name, err)
			}

			// Copy experiment files to queue
			files := []string{"train.py", "requirements.txt", "README.md"}
			for _, file := range files {
				src := filepath.Join(experimentDir, file)
				dst := filepath.Join(jobDir, file)

				if _, err := os.Stat(src); os.IsNotExist(err) {
					continue // Skip if file doesn't exist
				}

				data, err := os.ReadFile(src)
				if err != nil {
					t.Fatalf("Failed to read %s for %s: %v", file, exp.name, err)
				}

				if err := os.WriteFile(dst, data, 0755); err != nil {
					t.Fatalf("Failed to copy %s for %s: %v", file, exp.name, err)
				}
			}

			// Create queue metadata file describing the submitted job.
			queueMetadata := filepath.Join(jobDir, "queue_metadata.json")
			metadata := fmt.Sprintf(`{
  "job_name": "%s",
  "experiment_name": "%s",
  "example_source": "%s",
  "priority": %d,
  "status": "pending",
  "submitted_at": "%s"
}`, jobName, exp.name, exp.exampleDir, exp.priority, time.Now().Format(time.RFC3339))

			if err := os.WriteFile(queueMetadata, []byte(metadata), 0644); err != nil {
				t.Fatalf("Failed to create queue metadata for %s: %v", exp.name, err)
			}
		}

		// Verify all experiments are in queue
		for _, exp := range experiments {
			queueJobs, err := filepath.Glob(filepath.Join(queueDir, fmt.Sprintf("%s_*_priority_%d", exp.name, exp.priority)))
			if err != nil || len(queueJobs) == 0 {
				t.Errorf("Queue job should exist for %s with priority %d", exp.name, exp.priority)
			}
		}
	})

	// Test 2: Simulate sequential processing (queue behavior)
	t.Run("SequentialProcessing", func(t *testing.T) {
		pendingDir := filepath.Join(testDir, "server", "home", "mluser", "ml_jobs", "pending")
		runningDir := filepath.Join(testDir, "server", "home", "mluser", "ml_jobs", "running")
		finishedDir := filepath.Join(testDir, "server", "home", "mluser", "ml_jobs", "finished")

		// Create directories if they don't exist
		if err := os.MkdirAll(runningDir, 0755); err != nil {
			t.Fatalf("Failed to create running directory: %v", err)
		}
		if err := os.MkdirAll(finishedDir, 0755); err != nil {
			t.Fatalf("Failed to create finished directory: %v", err)
		}

		// Process jobs in priority order (1, 2, 3)
		for priority := 1; priority <= 3; priority++ {
			// Find job with this priority via the _priority_N suffix in the name.
			jobs, err := filepath.Glob(filepath.Join(pendingDir, fmt.Sprintf("*_priority_%d", priority)))
			if err != nil {
				t.Fatalf("Failed to find jobs with priority %d: %v", priority, err)
			}

			if len(jobs) == 0 {
				t.Fatalf("No job found with priority %d", priority)
			}

			jobDir := jobs[0] // Take first job with this priority
			jobName := filepath.Base(jobDir)

			// Move from pending to running (atomic rename models "claiming" the job).
			runningJobDir := filepath.Join(runningDir, jobName)
			if err := os.Rename(jobDir, runningJobDir); err != nil {
				t.Fatalf("Failed to move job %s to running: %v", jobName, err)
			}

			// Verify only one job is running at this time
			runningJobs, err := filepath.Glob(filepath.Join(runningDir, "*"))
			if err != nil || len(runningJobs) != 1 {
				t.Errorf("Expected exactly 1 running job, found %d", len(runningJobs))
			}

			// Simulate execution by creating results (using actual framework patterns)
			outputDir := filepath.Join(runningJobDir, "results")
			if err := os.MkdirAll(outputDir, 0755); err != nil {
				t.Fatalf("Failed to create output directory for %s: %v", jobName, err)
			}

			// Read the actual train.py to determine framework
			trainScript := filepath.Join(runningJobDir, "train.py")
			scriptContent, err := os.ReadFile(trainScript)
			if err != nil {
				t.Fatalf("Failed to read train.py for %s: %v", jobName, err)
			}

			// Determine framework from script content by keyword sniffing.
			// First match wins, so a script importing both sklearn and torch
			// is classified as scikit-learn.
			framework := "unknown"
			scriptStr := string(scriptContent)
			if contains(scriptStr, "sklearn") {
				framework = "scikit-learn"
			} else if contains(scriptStr, "xgboost") {
				framework = "xgboost"
			} else if contains(scriptStr, "torch") {
				framework = "pytorch"
			} else if contains(scriptStr, "tensorflow") {
				framework = "tensorflow"
			} else if contains(scriptStr, "statsmodels") {
				framework = "statsmodels"
			}

			// Fabricate a results file; started_at is back-dated by the priority
			// in minutes so execution_order and timestamps stay consistent.
			resultsFile := filepath.Join(outputDir, "results.json")
			results := fmt.Sprintf(`{
  "job_name": "%s",
  "framework": "%s",
  "priority": %d,
  "status": "completed",
  "execution_order": %d,
  "started_at": "%s",
  "completed_at": "%s",
  "source": "actual_example"
}`, jobName, framework, priority, priority, time.Now().Add(-time.Duration(priority)*time.Minute).Format(time.RFC3339), time.Now().Format(time.RFC3339))

			if err := os.WriteFile(resultsFile, []byte(results), 0644); err != nil {
				t.Fatalf("Failed to create results for %s: %v", jobName, err)
			}

			// Move from running to finished
			finishedJobDir := filepath.Join(finishedDir, jobName)
			if err := os.Rename(runningJobDir, finishedJobDir); err != nil {
				t.Fatalf("Failed to move job %s to finished: %v", jobName, err)
			}

			// Verify job is no longer in pending or running
			if _, err := os.Stat(jobDir); !os.IsNotExist(err) {
				t.Errorf("Job %s should no longer be in pending directory", jobName)
			}
			if _, err := os.Stat(runningJobDir); !os.IsNotExist(err) {
				t.Errorf("Job %s should no longer be in running directory", jobName)
			}
		}

		// Verify all jobs completed
		finishedJobs, err := filepath.Glob(filepath.Join(finishedDir, "*"))
		if err != nil || len(finishedJobs) != 3 {
			t.Errorf("Expected 3 finished jobs, got %d", len(finishedJobs))
		}

		// Verify queue is empty
		pendingJobs, err := filepath.Glob(filepath.Join(pendingDir, "*"))
		if err != nil || len(pendingJobs) != 0 {
			t.Errorf("Expected 0 pending jobs after processing, found %d", len(pendingJobs))
		}

		// Verify no jobs are running
		runningJobs, err := filepath.Glob(filepath.Join(runningDir, "*"))
		if err != nil || len(runningJobs) != 0 {
			t.Errorf("Expected 0 running jobs after processing, found %d", len(runningJobs))
		}
	})
}
|
|
|
|
// TestQueueCapacity tests queue capacity and resource limits.
//
// It enqueues more jobs than the (simulated) server runs at once, then drains
// them strictly one at a time, asserting that the running directory never holds
// more than a single job. Jobs are directories moved between on-disk
// pending/running/finished folders; no real processes are started.
func TestQueueCapacity(t *testing.T) {
	t.Parallel() // Enable parallel execution

	testDir := t.TempDir()

	t.Run("QueueCapacityLimits", func(t *testing.T) {
		// Use fixtures for examples directory operations
		examplesDir := tests.NewExamplesDir(filepath.Join("..", "fixtures", "examples"))

		pendingDir := filepath.Join(testDir, "server", "home", "mluser", "ml_jobs", "pending")
		runningDir := filepath.Join(testDir, "server", "home", "mluser", "ml_jobs", "running")
		finishedDir := filepath.Join(testDir, "server", "home", "mluser", "ml_jobs", "finished")

		// Create directories
		if err := os.MkdirAll(pendingDir, 0755); err != nil {
			t.Fatalf("Failed to create pending directory: %v", err)
		}
		if err := os.MkdirAll(runningDir, 0755); err != nil {
			t.Fatalf("Failed to create running directory: %v", err)
		}
		if err := os.MkdirAll(finishedDir, 0755); err != nil {
			t.Fatalf("Failed to create finished directory: %v", err)
		}

		// Create more jobs than server can handle simultaneously using actual examples
		examples := []string{"standard_ml_project", "sklearn_project", "xgboost_project", "pytorch_project", "tensorflow_project"}
		totalJobs := len(examples)

		for i, example := range examples {
			jobName := fmt.Sprintf("capacity_test_job_%d", i)
			jobDir := filepath.Join(pendingDir, jobName)

			if err := os.MkdirAll(jobDir, 0755); err != nil {
				t.Fatalf("Failed to create job directory %s: %v", jobDir, err)
			}

			// Copy actual example files using fixtures
			sourceDir := examplesDir.GetPath(example)

			// Copy actual example files; fall back to a synthesized train.py
			// so the test still works when a fixture example is missing.
			if _, err := os.Stat(sourceDir); os.IsNotExist(err) {
				// Create minimal files if example doesn't exist.
				// %% in the Sprintf template yields literal % signs for strftime.
				trainScript := filepath.Join(jobDir, "train.py")
				script := fmt.Sprintf(`#!/usr/bin/env python3
import json, time
from pathlib import Path

def main():
    results = {
        "job_id": %d,
        "example": "%s",
        "status": "completed",
        "completion_time": time.strftime("%%Y-%%m-%%d %%H:%%M:%%S")
    }

    output_dir = Path("./results")
    output_dir.mkdir(parents=True, exist_ok=True)

    with open(output_dir / "results.json", "w") as f:
        json.dump(results, f, indent=2)

if __name__ == "__main__":
    main()
`, i, example)

				if err := os.WriteFile(trainScript, []byte(script), 0755); err != nil {
					t.Fatalf("Failed to create train script for job %d: %v", i, err)
				}
			} else {
				// Copy actual example files
				files := []string{"train.py", "requirements.txt"}
				for _, file := range files {
					src := filepath.Join(sourceDir, file)
					dst := filepath.Join(jobDir, file)

					if _, err := os.Stat(src); os.IsNotExist(err) {
						continue // Skip if file doesn't exist
					}

					data, err := os.ReadFile(src)
					if err != nil {
						t.Fatalf("Failed to read %s for job %d: %v", file, i, err)
					}

					if err := os.WriteFile(dst, data, 0755); err != nil {
						t.Fatalf("Failed to copy %s for job %d: %v", file, i, err)
					}
				}
			}
		}

		// Verify all jobs are in pending queue
		pendingJobs, err := filepath.Glob(filepath.Join(pendingDir, "capacity_test_job_*"))
		if err != nil || len(pendingJobs) != totalJobs {
			t.Errorf("Expected %d pending jobs, found %d", totalJobs, len(pendingJobs))
		}

		// Process one job at a time (sequential execution)
		for i := 0; i < totalJobs; i++ {
			// Move one job to running
			jobName := fmt.Sprintf("capacity_test_job_%d", i)
			pendingJobDir := filepath.Join(pendingDir, jobName)
			runningJobDir := filepath.Join(runningDir, jobName)

			if err := os.Rename(pendingJobDir, runningJobDir); err != nil {
				t.Fatalf("Failed to move job %d to running: %v", i, err)
			}

			// Verify only one job is running — the capacity invariant under test.
			runningJobs, err := filepath.Glob(filepath.Join(runningDir, "*"))
			if err != nil || len(runningJobs) != 1 {
				t.Errorf("Expected exactly 1 running job, found %d", len(runningJobs))
			}

			// Simulate job completion
			time.Sleep(5 * time.Millisecond) // Reduced from 10ms

			// Move to finished
			finishedJobDir := filepath.Join(finishedDir, jobName)
			if err := os.Rename(runningJobDir, finishedJobDir); err != nil {
				t.Fatalf("Failed to move job %d to finished: %v", i, err)
			}

			// Verify no jobs are running between jobs
			runningJobs, err = filepath.Glob(filepath.Join(runningDir, "*"))
			if err != nil || len(runningJobs) != 0 {
				t.Errorf("Expected 0 running jobs between jobs, found %d", len(runningJobs))
			}
		}

		// Verify all jobs completed
		finishedJobs, err := filepath.Glob(filepath.Join(finishedDir, "capacity_test_job_*"))
		if err != nil || len(finishedJobs) != totalJobs {
			t.Errorf("Expected %d finished jobs, found %d", totalJobs, len(finishedJobs))
		}

		// Verify queue is empty
		pendingJobs, err = filepath.Glob(filepath.Join(pendingDir, "capacity_test_job_*"))
		if err != nil || len(pendingJobs) != 0 {
			t.Errorf("Expected 0 pending jobs after processing, found %d", len(pendingJobs))
		}
	})
}
|
|
|
|
// TestResourceIsolation tests that experiments have isolated resources.
//
// Three experiments sharing the same timestamp each get their own
// running/<job>/results directory; the test then verifies each results.json
// exists in its own directory and carries experiment-specific content,
// proving outputs cannot collide even with identical submission times.
func TestResourceIsolation(t *testing.T) {
	t.Parallel() // Enable parallel execution
	testDir := t.TempDir()

	t.Run("OutputDirectoryIsolation", func(t *testing.T) {
		// Use fixtures for examples directory operations
		examplesDir := tests.NewExamplesDir(filepath.Join("..", "fixtures", "examples"))

		// Create multiple experiments with same timestamp using actual examples.
		// A fixed timestamp deliberately forces the collision scenario.
		timestamp := "20231201_143022"
		examples := []string{"sklearn_project", "xgboost_project", "pytorch_project"}

		runningDir := filepath.Join(testDir, "server", "home", "mluser", "ml_jobs", "running")

		for i, expName := range examples {
			jobName := fmt.Sprintf("exp%d_%s", i, timestamp)
			outputDir := filepath.Join(runningDir, jobName, "results")

			if err := os.MkdirAll(outputDir, 0755); err != nil {
				t.Fatalf("Failed to create output directory: %v", err)
			}

			// Copy actual example files using fixtures
			sourceDir := examplesDir.GetPath(expName)

			// Read actual example to create realistic results
			trainScript := filepath.Join(sourceDir, "train.py")

			// Framework detection is best-effort: a missing/unreadable train.py
			// leaves framework as "unknown" rather than failing the test.
			framework := "unknown"
			if content, err := os.ReadFile(trainScript); err == nil {
				scriptStr := string(content)
				if contains(scriptStr, "sklearn") {
					framework = "scikit-learn"
				} else if contains(scriptStr, "xgboost") {
					framework = "xgboost"
				} else if contains(scriptStr, "torch") {
					framework = "pytorch"
				}
			}

			// Create unique results file based on actual framework.
			// unique_id includes UnixNano so every run's content is distinct.
			resultsFile := filepath.Join(outputDir, "results.json")
			results := fmt.Sprintf(`{
  "experiment": "exp%d",
  "framework": "%s",
  "job_name": "%s",
  "output_dir": "%s",
  "example_source": "%s",
  "unique_id": "exp%d_%d"
}`, i, framework, jobName, outputDir, expName, i, time.Now().UnixNano())

			if err := os.WriteFile(resultsFile, []byte(results), 0644); err != nil {
				t.Fatalf("Failed to create results for %s: %v", expName, err)
			}
		}

		// Verify each experiment has its own isolated output directory
		for i, expName := range examples {
			jobName := fmt.Sprintf("exp%d_%s", i, timestamp)
			outputDir := filepath.Join(runningDir, jobName, "results")
			resultsFile := filepath.Join(outputDir, "results.json")

			if _, err := os.Stat(resultsFile); os.IsNotExist(err) {
				t.Errorf("Results file should exist for %s in isolated directory", expName)
			}

			// Verify content is unique
			content, err := os.ReadFile(resultsFile)
			if err != nil {
				t.Fatalf("Failed to read results for %s: %v", expName, err)
			}

			if !contains(string(content), fmt.Sprintf("exp%d", i)) {
				t.Errorf("Results file should contain experiment ID exp%d", i)
			}

			if !contains(string(content), expName) {
				t.Errorf("Results file should contain example source %s", expName)
			}
		}
	})
}
|
|
|
|
// contains reports whether substr occurs anywhere within s.
//
// The previous implementation hand-rolled prefix/suffix checks plus a manual
// scan, duplicating the standard library; strings.Contains covers every case
// (including the empty substring, which matches any string) with identical
// semantics and an optimized implementation.
func contains(s, substr string) bool {
	return strings.Contains(s, substr)
}
|
|
|
|
// findSubstring reports whether substr occurs anywhere within s, scanning
// every candidate start position with a naive substring comparison.
func findSubstring(s, substr string) bool {
	n := len(substr)
	lastStart := len(s) - n
	for start := 0; start <= lastStart; start++ {
		if s[start:start+n] == substr {
			return true
		}
	}
	return false
}
|