// Change summary (from commit message):
// - Update E2E tests for consolidated docker-compose.test.yml
// - Remove references to obsolete logs-debug.yml
// - Enhance test fixtures and utilities
// - Improve integration test coverage for KMS, queue, scheduler
// - Update unit tests for config constants and worker execution
// - Modernize cleanup-status.sh with new Makefile targets
package tests
|
|
|
|
import (
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"testing"
|
|
"time"
|
|
|
|
"github.com/jfraeys/fetch_ml/internal/fileutil"
|
|
tests "github.com/jfraeys/fetch_ml/tests/fixtures"
|
|
)
|
|
|
|
// TestQueueExecution tests that experiments are processed sequentially through the queue
|
|
func TestQueueExecution(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
testDir := t.TempDir()
|
|
examplesDir := tests.NewExamplesDir(filepath.Join("..", "fixtures", "examples"))
|
|
|
|
t.Run("QueueSubmission", func(t *testing.T) {
|
|
runQueueSubmissionTest(t, testDir, examplesDir)
|
|
})
|
|
|
|
t.Run("SequentialProcessing", func(t *testing.T) {
|
|
runSequentialProcessingTest(t, testDir)
|
|
})
|
|
}
|
|
|
|
// experimentCase describes one experiment submitted to the queue in these tests.
type experimentCase struct {
	name       string // logical experiment name; becomes the prefix of the job directory name
	priority   int    // queue priority; the processing tests drain priorities in ascending order (1 first)
	exampleDir string // fixture directory under tests/fixtures/examples to copy artifacts from
}
|
|
|
|
func runQueueSubmissionTest(t *testing.T, testDir string, examplesDir *tests.ExamplesDir) {
|
|
t.Helper()
|
|
|
|
queueDir := filepath.Join(testDir, "server", "home", "mluser", "ml_jobs", "pending")
|
|
experiments := []experimentCase{
|
|
{"sklearn_classification", 1, "sklearn_project"},
|
|
{"xgboost_classification", 2, "xgboost_project"},
|
|
{"pytorch_nn", 3, "pytorch_project"},
|
|
}
|
|
|
|
for _, exp := range experiments {
|
|
setupQueueExperiment(t, testDir, queueDir, examplesDir, exp)
|
|
}
|
|
|
|
verifyQueuedExperiments(t, queueDir, experiments)
|
|
}
|
|
|
|
func setupQueueExperiment(t *testing.T, testDir, queueDir string, examplesDir *tests.ExamplesDir, exp experimentCase) {
|
|
t.Helper()
|
|
|
|
sourceDir := examplesDir.GetPath(exp.exampleDir)
|
|
experimentDir := filepath.Join(testDir, exp.name)
|
|
if err := tests.CopyDir(sourceDir, experimentDir); err != nil {
|
|
t.Fatalf("Failed to copy example %s: %v", exp.exampleDir, err)
|
|
}
|
|
|
|
jobDir := createQueueJobDir(t, queueDir, exp)
|
|
copyExperimentArtifacts(t, experimentDir, jobDir)
|
|
writeQueueMetadata(t, jobDir, exp)
|
|
}
|
|
|
|
func createQueueJobDir(t *testing.T, queueDir string, exp experimentCase) string {
|
|
t.Helper()
|
|
|
|
timestamp := time.Now().Format("20060102_150405")
|
|
jobName := fmt.Sprintf("%s_%s_priority_%d", exp.name, timestamp, exp.priority)
|
|
jobDir := filepath.Join(queueDir, jobName)
|
|
if err := os.MkdirAll(jobDir, 0750); err != nil {
|
|
t.Fatalf("Failed to create queue directory for %s: %v", exp.name, err)
|
|
}
|
|
return jobDir
|
|
}
|
|
|
|
func copyExperimentArtifacts(t *testing.T, experimentDir, jobDir string) {
|
|
t.Helper()
|
|
|
|
files := []string{"train.py", "requirements.txt", "README.md"}
|
|
for _, file := range files {
|
|
src := filepath.Join(experimentDir, file)
|
|
dst := filepath.Join(jobDir, file)
|
|
|
|
if _, err := os.Stat(src); os.IsNotExist(err) {
|
|
continue
|
|
}
|
|
|
|
data, err := fileutil.SecureFileRead(src)
|
|
if err != nil {
|
|
t.Fatalf("Failed to read %s: %v", file, err)
|
|
}
|
|
|
|
//nolint:gosec // G306: Script needs execute permissions
|
|
if err := os.WriteFile(dst, data, 0750); err != nil {
|
|
t.Fatalf("Failed to copy %s: %v", file, err)
|
|
}
|
|
}
|
|
}
|
|
|
|
func writeQueueMetadata(t *testing.T, jobDir string, exp experimentCase) {
|
|
t.Helper()
|
|
|
|
jobName := filepath.Base(jobDir)
|
|
queueMetadata := filepath.Join(jobDir, "queue_metadata.json")
|
|
metadata := fmt.Sprintf(`{
|
|
"job_name": "%s",
|
|
"experiment_name": "%s",
|
|
"example_source": "%s",
|
|
"priority": %d,
|
|
"status": "pending",
|
|
"submitted_at": "%s"
|
|
}`, jobName, exp.name, exp.exampleDir, exp.priority, time.Now().Format(time.RFC3339))
|
|
|
|
if err := os.WriteFile(queueMetadata, []byte(metadata), 0600); err != nil {
|
|
t.Fatalf("Failed to create queue metadata for %s: %v", exp.name, err)
|
|
}
|
|
}
|
|
|
|
func verifyQueuedExperiments(t *testing.T, queueDir string, experiments []experimentCase) {
|
|
t.Helper()
|
|
|
|
for _, exp := range experiments {
|
|
pattern := filepath.Join(queueDir, fmt.Sprintf("%s_*_priority_%d", exp.name, exp.priority))
|
|
queueJobs, err := filepath.Glob(pattern)
|
|
if err != nil || len(queueJobs) == 0 {
|
|
t.Errorf("Queue job should exist for %s with priority %d", exp.name, exp.priority)
|
|
}
|
|
}
|
|
}
|
|
|
|
func runSequentialProcessingTest(t *testing.T, testDir string) {
|
|
t.Helper()
|
|
|
|
pendingDir := filepath.Join(testDir, "server", "home", "mluser", "ml_jobs", "pending")
|
|
runningDir := filepath.Join(testDir, "server", "home", "mluser", "ml_jobs", "running")
|
|
finishedDir := filepath.Join(testDir, "server", "home", "mluser", "ml_jobs", "finished")
|
|
|
|
ensureDir(t, runningDir)
|
|
ensureDir(t, finishedDir)
|
|
|
|
for priority := 1; priority <= 3; priority++ {
|
|
processJobByPriority(t, pendingDir, runningDir, finishedDir, priority)
|
|
}
|
|
|
|
verifyFinalQueueState(t, pendingDir, runningDir, finishedDir)
|
|
}
|
|
|
|
func ensureDir(t *testing.T, dir string) {
|
|
t.Helper()
|
|
if err := os.MkdirAll(dir, 0750); err != nil {
|
|
t.Fatalf("Failed to create directory %s: %v", dir, err)
|
|
}
|
|
}
|
|
|
|
func processJobByPriority(t *testing.T, pendingDir, runningDir, finishedDir string, priority int) {
|
|
t.Helper()
|
|
|
|
jobDir := selectJobByPriority(t, pendingDir, priority)
|
|
jobName := filepath.Base(jobDir)
|
|
runningJobDir := filepath.Join(runningDir, jobName)
|
|
if err := os.Rename(jobDir, runningJobDir); err != nil {
|
|
t.Fatalf("Failed to move job %s to running: %v", jobName, err)
|
|
}
|
|
|
|
ensureSingleRunningJob(t, runningDir)
|
|
simulateJobExecution(t, runningJobDir, jobName, priority)
|
|
|
|
finishedJobDir := filepath.Join(finishedDir, jobName)
|
|
if err := os.Rename(runningJobDir, finishedJobDir); err != nil {
|
|
t.Fatalf("Failed to move job %s to finished: %v", jobName, err)
|
|
}
|
|
|
|
assertDirectoryAbsent(t, jobDir, "pending")
|
|
assertDirectoryAbsent(t, runningJobDir, "running")
|
|
}
|
|
|
|
func selectJobByPriority(t *testing.T, pendingDir string, priority int) string {
|
|
t.Helper()
|
|
|
|
pattern := filepath.Join(pendingDir, fmt.Sprintf("*_priority_%d", priority))
|
|
jobs, err := filepath.Glob(pattern)
|
|
if err != nil {
|
|
t.Fatalf("Failed to find jobs with priority %d: %v", priority, err)
|
|
}
|
|
if len(jobs) == 0 {
|
|
t.Fatalf("No job found with priority %d", priority)
|
|
}
|
|
return jobs[0]
|
|
}
|
|
|
|
func ensureSingleRunningJob(t *testing.T, runningDir string) {
|
|
t.Helper()
|
|
|
|
runningJobs, err := filepath.Glob(filepath.Join(runningDir, "*"))
|
|
if err != nil || len(runningJobs) != 1 {
|
|
t.Errorf("Expected exactly 1 running job, found %d", len(runningJobs))
|
|
}
|
|
}
|
|
|
|
func simulateJobExecution(t *testing.T, runningJobDir, jobName string, priority int) {
|
|
t.Helper()
|
|
|
|
outputDir := filepath.Join(runningJobDir, "results")
|
|
if err := os.MkdirAll(outputDir, 0750); err != nil {
|
|
t.Fatalf("Failed to create output directory for %s: %v", jobName, err)
|
|
}
|
|
|
|
framework := detectFramework(t, filepath.Join(runningJobDir, "train.py"))
|
|
results := fmt.Sprintf(`{
|
|
"job_name": "%s",
|
|
"framework": "%s",
|
|
"priority": %d,
|
|
"status": "completed",
|
|
"execution_order": %d,
|
|
"started_at": "%s",
|
|
"completed_at": "%s",
|
|
"source": "actual_example"
|
|
}`, jobName, framework, priority, priority,
|
|
time.Now().Add(-time.Duration(priority)*time.Minute).Format(time.RFC3339),
|
|
time.Now().Format(time.RFC3339))
|
|
|
|
resultsFile := filepath.Join(outputDir, "results.json")
|
|
if err := os.WriteFile(resultsFile, []byte(results), 0600); err != nil {
|
|
t.Fatalf("Failed to create results for %s: %v", jobName, err)
|
|
}
|
|
}
|
|
|
|
func detectFramework(t *testing.T, entrypoint string) string {
|
|
t.Helper()
|
|
|
|
scriptContent, err := fileutil.SecureFileRead(entrypoint)
|
|
if err != nil {
|
|
t.Fatalf("Failed to read entrypoint script file: %v", err)
|
|
}
|
|
|
|
scriptStr := string(scriptContent)
|
|
switch {
|
|
case contains(scriptStr, "sklearn"):
|
|
return "scikit-learn"
|
|
case contains(scriptStr, "xgboost"):
|
|
return "xgboost"
|
|
case contains(scriptStr, "torch"):
|
|
return "pytorch"
|
|
case contains(scriptStr, "tensorflow"):
|
|
return "tensorflow"
|
|
case contains(scriptStr, "statsmodels"):
|
|
return "statsmodels"
|
|
default:
|
|
return "unknown"
|
|
}
|
|
}
|
|
|
|
func assertDirectoryAbsent(t *testing.T, path, location string) {
|
|
t.Helper()
|
|
if _, err := os.Stat(path); !os.IsNotExist(err) {
|
|
t.Errorf("Job should no longer be in %s directory: %s", location, path)
|
|
}
|
|
}
|
|
|
|
func verifyFinalQueueState(t *testing.T, pendingDir, runningDir, finishedDir string) {
|
|
t.Helper()
|
|
|
|
finishedJobs, err := filepath.Glob(filepath.Join(finishedDir, "*"))
|
|
if err != nil || len(finishedJobs) != 3 {
|
|
t.Errorf("Expected 3 finished jobs, got %d", len(finishedJobs))
|
|
}
|
|
|
|
pendingJobs, err := filepath.Glob(filepath.Join(pendingDir, "*"))
|
|
if err != nil || len(pendingJobs) != 0 {
|
|
t.Errorf("Expected 0 pending jobs after processing, found %d", len(pendingJobs))
|
|
}
|
|
|
|
runningJobs, err := filepath.Glob(filepath.Join(runningDir, "*"))
|
|
if err != nil || len(runningJobs) != 0 {
|
|
t.Errorf("Expected 0 running jobs after processing, found %d", len(runningJobs))
|
|
}
|
|
}
|
|
|
|
// TestQueueCapacity tests queue capacity and resource limits
|
|
func TestQueueCapacity(t *testing.T) {
|
|
t.Parallel() // Enable parallel execution
|
|
|
|
testDir := t.TempDir()
|
|
|
|
t.Run("QueueCapacityLimits", func(t *testing.T) {
|
|
// Use fixtures for examples directory operations
|
|
examplesDir := tests.NewExamplesDir(filepath.Join("..", "fixtures", "examples"))
|
|
|
|
pendingDir := filepath.Join(testDir, "server", "home", "mluser", "ml_jobs", "pending")
|
|
runningDir := filepath.Join(testDir, "server", "home", "mluser", "ml_jobs", "running")
|
|
finishedDir := filepath.Join(testDir, "server", "home", "mluser", "ml_jobs", "finished")
|
|
|
|
// Create directories
|
|
if err := os.MkdirAll(pendingDir, 0750); err != nil {
|
|
t.Fatalf("Failed to create pending directory: %v", err)
|
|
}
|
|
if err := os.MkdirAll(runningDir, 0750); err != nil {
|
|
t.Fatalf("Failed to create running directory: %v", err)
|
|
}
|
|
if err := os.MkdirAll(finishedDir, 0750); err != nil {
|
|
t.Fatalf("Failed to create finished directory: %v", err)
|
|
}
|
|
|
|
// Create more jobs than server can handle simultaneously using actual examples
|
|
examples := []string{
|
|
"standard_ml_project", "sklearn_project", "xgboost_project",
|
|
"pytorch_project", "tensorflow_project",
|
|
}
|
|
totalJobs := len(examples)
|
|
|
|
for i, example := range examples {
|
|
jobName := fmt.Sprintf("capacity_test_job_%d", i)
|
|
jobDir := filepath.Join(pendingDir, jobName)
|
|
|
|
if err := os.MkdirAll(jobDir, 0750); err != nil {
|
|
t.Fatalf("Failed to create job directory %s: %v", jobDir, err)
|
|
}
|
|
|
|
// Copy actual example files using fixtures
|
|
sourceDir := examplesDir.GetPath(example)
|
|
|
|
// Copy actual example files
|
|
if _, err := os.Stat(sourceDir); os.IsNotExist(err) {
|
|
// Create minimal files if example doesn't exist
|
|
entrypoint := filepath.Join(jobDir, "train.py")
|
|
script := fmt.Sprintf(`#!/usr/bin/env python3
|
|
import json, time
|
|
from pathlib import Path
|
|
|
|
def main():
|
|
results = {
|
|
"job_id": %d,
|
|
"example": "%s",
|
|
"status": "completed",
|
|
"completion_time": time.strftime("%%Y-%%m-%%d %%H:%%M:%%S")
|
|
}
|
|
|
|
output_dir = Path("./results")
|
|
output_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
with open(output_dir / "results.json", "w") as f:
|
|
json.dump(results, f, indent=2)
|
|
|
|
if __name__ == "__main__":
|
|
main()
|
|
`, i, example)
|
|
|
|
//nolint:gosec // G306: Script needs execute permissions
|
|
if err := os.WriteFile(entrypoint, []byte(script), 0750); err != nil {
|
|
t.Fatalf("Failed to create train script for job %d: %v", i, err)
|
|
}
|
|
} else {
|
|
// Copy actual example files
|
|
files := []string{"train.py", "requirements.txt"}
|
|
for _, file := range files {
|
|
src := filepath.Join(sourceDir, file)
|
|
dst := filepath.Join(jobDir, file)
|
|
|
|
if _, err := os.Stat(src); os.IsNotExist(err) {
|
|
continue // Skip if file doesn't exist
|
|
}
|
|
|
|
data, err := fileutil.SecureFileRead(src)
|
|
if err != nil {
|
|
t.Fatalf("Failed to read %s for job %d: %v", file, i, err)
|
|
}
|
|
|
|
//nolint:gosec // G306: Script needs execute permissions
|
|
if err := os.WriteFile(dst, data, 0750); err != nil {
|
|
t.Fatalf("Failed to copy %s for job %d: %v", file, i, err)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Verify all jobs are in pending queue
|
|
pendingJobs, err := filepath.Glob(filepath.Join(pendingDir, "capacity_test_job_*"))
|
|
if err != nil || len(pendingJobs) != totalJobs {
|
|
t.Errorf("Expected %d pending jobs, found %d", totalJobs, len(pendingJobs))
|
|
}
|
|
|
|
// Process one job at a time (sequential execution)
|
|
for i := 0; i < totalJobs; i++ {
|
|
// Move one job to running
|
|
jobName := fmt.Sprintf("capacity_test_job_%d", i)
|
|
pendingJobDir := filepath.Join(pendingDir, jobName)
|
|
runningJobDir := filepath.Join(runningDir, jobName)
|
|
|
|
if err := os.Rename(pendingJobDir, runningJobDir); err != nil {
|
|
t.Fatalf("Failed to move job %d to running: %v", i, err)
|
|
}
|
|
|
|
// Verify only one job is running
|
|
runningJobs, err := filepath.Glob(filepath.Join(runningDir, "*"))
|
|
if err != nil || len(runningJobs) != 1 {
|
|
t.Errorf("Expected exactly 1 running job, found %d", len(runningJobs))
|
|
}
|
|
|
|
// Simulate job completion
|
|
time.Sleep(5 * time.Millisecond) // Reduced from 10ms
|
|
|
|
// Move to finished
|
|
finishedJobDir := filepath.Join(finishedDir, jobName)
|
|
if err := os.Rename(runningJobDir, finishedJobDir); err != nil {
|
|
t.Fatalf("Failed to move job %d to finished: %v", i, err)
|
|
}
|
|
|
|
// Verify no jobs are running between jobs
|
|
runningJobs, err = filepath.Glob(filepath.Join(runningDir, "*"))
|
|
if err != nil || len(runningJobs) != 0 {
|
|
t.Errorf("Expected 0 running jobs between jobs, found %d", len(runningJobs))
|
|
}
|
|
}
|
|
|
|
// Verify all jobs completed
|
|
finishedJobs, err := filepath.Glob(filepath.Join(finishedDir, "capacity_test_job_*"))
|
|
if err != nil || len(finishedJobs) != totalJobs {
|
|
t.Errorf("Expected %d finished jobs, found %d", totalJobs, len(finishedJobs))
|
|
}
|
|
|
|
// Verify queue is empty
|
|
pendingJobs, err = filepath.Glob(filepath.Join(pendingDir, "capacity_test_job_*"))
|
|
if err != nil || len(pendingJobs) != 0 {
|
|
t.Errorf("Expected 0 pending jobs after processing, found %d", len(pendingJobs))
|
|
}
|
|
})
|
|
}
|
|
|
|
// TestResourceIsolation tests that experiments have isolated resources:
// several jobs sharing the same timestamp must each get their own results
// directory and results file with distinguishable content.
func TestResourceIsolation(t *testing.T) {
	t.Parallel() // Enable parallel execution
	testDir := t.TempDir()

	t.Run("OutputDirectoryIsolation", func(t *testing.T) {
		// Use fixtures for examples directory operations
		examplesDir := tests.NewExamplesDir(filepath.Join("..", "fixtures", "examples"))

		// Create multiple experiments with same timestamp using actual examples.
		// A fixed timestamp forces the isolation to come from the exp<i> prefix,
		// not from time-based uniqueness.
		timestamp := "20231201_143022"
		examples := []string{"sklearn_project", "xgboost_project", "pytorch_project"}

		runningDir := filepath.Join(testDir, "server", "home", "mluser", "ml_jobs", "running")

		for i, expName := range examples {
			jobName := fmt.Sprintf("exp%d_%s", i, timestamp)
			outputDir := filepath.Join(runningDir, jobName, "results")

			if err := os.MkdirAll(outputDir, 0750); err != nil {
				t.Fatalf("Failed to create output directory: %v", err)
			}

			// Copy actual example files using fixtures
			sourceDir := examplesDir.GetPath(expName)

			// Read actual example to create realistic results
			entrypoint := filepath.Join(sourceDir, "train.py")

			// Best-effort framework detection: unlike detectFramework, a missing
			// or unreadable script is tolerated and leaves framework "unknown".
			framework := "unknown"
			if content, err := fileutil.SecureFileRead(entrypoint); err == nil {
				scriptStr := string(content)
				switch {
				case contains(scriptStr, "sklearn"):
					framework = "scikit-learn"
				case contains(scriptStr, "xgboost"):
					framework = "xgboost"
				case contains(scriptStr, "torch"):
					framework = "pytorch"
				}
			}

			// Create unique results file based on actual framework.
			// unique_id combines the experiment index with a nanosecond clock
			// reading so no two results files can share it.
			resultsFile := filepath.Join(outputDir, "results.json")
			results := fmt.Sprintf(`{
  "experiment": "exp%d",
  "framework": "%s",
  "job_name": "%s",
  "output_dir": "%s",
  "example_source": "%s",
  "unique_id": "exp%d_%d"
}`, i, framework, jobName, outputDir, expName, i, time.Now().UnixNano())

			if err := os.WriteFile(resultsFile, []byte(results), 0600); err != nil {
				t.Fatalf("Failed to create results for %s: %v", expName, err)
			}
		}

		// Verify each experiment has its own isolated output directory
		for i, expName := range examples {
			jobName := fmt.Sprintf("exp%d_%s", i, timestamp)
			outputDir := filepath.Join(runningDir, jobName, "results")
			resultsFile := filepath.Join(outputDir, "results.json")

			if _, err := os.Stat(resultsFile); os.IsNotExist(err) {
				t.Errorf("Results file should exist for %s in isolated directory", expName)
			}

			// Verify content is unique: each file must mention its own
			// experiment ID and its own example source.
			content, err := fileutil.SecureFileRead(resultsFile)
			if err != nil {
				t.Fatalf("Failed to read results for %s: %v", expName, err)
			}

			if !contains(string(content), fmt.Sprintf("exp%d", i)) {
				t.Errorf("Results file should contain experiment ID exp%d", i)
			}

			if !contains(string(content), expName) {
				t.Errorf("Results file should contain example source %s", expName)
			}
		}
	})
}
|
|
|
|
// contains reports whether substr occurs anywhere within s. An empty substr
// is contained in every string, matching strings.Contains semantics.
// NOTE(review): this hand-rolled scan avoids adding a "strings" import;
// strings.Contains is the idiomatic stdlib equivalent.
func contains(s, substr string) bool {
	// A single linear scan covers every case the original expressed as
	// separate equality/prefix/suffix checks — those were all redundant.
	for i := 0; i+len(substr) <= len(s); i++ {
		if s[i:i+len(substr)] == substr {
			return true
		}
	}
	return false
}
|
|
|
|
// findSubstring performs a naive linear scan for substr inside s, returning
// true at the first position where the window matches.
func findSubstring(s, substr string) bool {
	last := len(s) - len(substr)
	for start := 0; start <= last; start++ {
		if s[start:start+len(substr)] == substr {
			return true
		}
	}
	return false
}
|