- Add end-to-end tests for complete workflow validation
- Include integration tests for API and database interactions
- Add unit tests for all major components and utilities
- Include performance tests for payload handling
- Add CLI API integration tests
- Include Podman container integration tests
- Add WebSocket and queue execution tests
- Include shell script tests for setup validation

Provides comprehensive test coverage, ensuring platform reliability and functionality across all components and interactions.
465 lines
16 KiB
Go
465 lines
16 KiB
Go
package tests
|
|
|
|
import (
	"fmt"
	"os"
	"path/filepath"
	"strings"
	"testing"
	"time"

	tests "github.com/jfraeys/fetch_ml/tests/fixtures"
)
|
|
|
|
// TestQueueExecution tests that experiments are processed sequentially through the queue.
//
// The queue is modeled purely on disk: jobs are directories that move through
// pending -> running -> finished under a simulated server home. Job payloads are
// copied from the real example projects in tests/fixtures/examples.
//
// NOTE(review): the two subtests are order-dependent — "SequentialProcessing"
// consumes the jobs that "QueueSubmission" created in the shared testDir. t.Run
// subtests execute sequentially (neither calls t.Parallel), so this ordering holds.
func TestQueueExecution(t *testing.T) {
	t.Parallel() // Enable parallel execution

	// Shared root for both subtests; cleaned up automatically by the testing package.
	testDir := t.TempDir()

	// Use fixtures for examples directory operations
	examplesDir := tests.NewExamplesDir(filepath.Join("..", "fixtures", "examples"))

	// Test 1: Create multiple experiments from actual examples and add them to queue
	t.Run("QueueSubmission", func(t *testing.T) {
		// Create server queue structure
		queueDir := filepath.Join(testDir, "server", "home", "mluser", "ml_jobs", "pending")

		// Use actual examples with different priorities
		experiments := []struct {
			name       string
			priority   int
			exampleDir string
		}{
			{"sklearn_classification", 1, "sklearn_project"},
			{"xgboost_classification", 2, "xgboost_project"},
			{"pytorch_nn", 3, "pytorch_project"},
		}

		for _, exp := range experiments {
			// Copy actual example files using fixtures
			sourceDir := examplesDir.GetPath(exp.exampleDir)
			experimentDir := filepath.Join(testDir, exp.name)

			// Copy all files from example directory
			if err := tests.CopyDir(sourceDir, experimentDir); err != nil {
				t.Fatalf("Failed to copy example %s: %v", exp.exampleDir, err)
			}

			// Add to queue (simulate job submission).
			// The priority suffix in the job name is what SequentialProcessing
			// later globs on to pick jobs in priority order.
			timestamp := time.Now().Format("20060102_150405")
			jobName := fmt.Sprintf("%s_%s_priority_%d", exp.name, timestamp, exp.priority)
			jobDir := filepath.Join(queueDir, jobName)

			if err := os.MkdirAll(jobDir, 0755); err != nil {
				t.Fatalf("Failed to create queue directory for %s: %v", exp.name, err)
			}

			// Copy experiment files to queue
			files := []string{"train.py", "requirements.txt", "README.md"}
			for _, file := range files {
				src := filepath.Join(experimentDir, file)
				dst := filepath.Join(jobDir, file)

				if _, err := os.Stat(src); os.IsNotExist(err) {
					continue // Skip if file doesn't exist
				}

				data, err := os.ReadFile(src)
				if err != nil {
					t.Fatalf("Failed to read %s for %s: %v", file, exp.name, err)
				}

				if err := os.WriteFile(dst, data, 0755); err != nil {
					t.Fatalf("Failed to copy %s for %s: %v", file, exp.name, err)
				}
			}

			// Create queue metadata file describing the submitted job.
			queueMetadata := filepath.Join(jobDir, "queue_metadata.json")
			metadata := fmt.Sprintf(`{
  "job_name": "%s",
  "experiment_name": "%s",
  "example_source": "%s",
  "priority": %d,
  "status": "pending",
  "submitted_at": "%s"
}`, jobName, exp.name, exp.exampleDir, exp.priority, time.Now().Format(time.RFC3339))

			if err := os.WriteFile(queueMetadata, []byte(metadata), 0644); err != nil {
				t.Fatalf("Failed to create queue metadata for %s: %v", exp.name, err)
			}
		}

		// Verify all experiments are in queue
		for _, exp := range experiments {
			queueJobs, err := filepath.Glob(filepath.Join(queueDir, fmt.Sprintf("%s_*_priority_%d", exp.name, exp.priority)))
			if err != nil || len(queueJobs) == 0 {
				t.Errorf("Queue job should exist for %s with priority %d", exp.name, exp.priority)
			}
		}
	})

	// Test 2: Simulate sequential processing (queue behavior)
	t.Run("SequentialProcessing", func(t *testing.T) {
		pendingDir := filepath.Join(testDir, "server", "home", "mluser", "ml_jobs", "pending")
		runningDir := filepath.Join(testDir, "server", "home", "mluser", "ml_jobs", "running")
		finishedDir := filepath.Join(testDir, "server", "home", "mluser", "ml_jobs", "finished")

		// Create directories if they don't exist
		if err := os.MkdirAll(runningDir, 0755); err != nil {
			t.Fatalf("Failed to create running directory: %v", err)
		}
		if err := os.MkdirAll(finishedDir, 0755); err != nil {
			t.Fatalf("Failed to create finished directory: %v", err)
		}

		// Process jobs in priority order (1, 2, 3)
		for priority := 1; priority <= 3; priority++ {
			// Find job with this priority via the _priority_N suffix in the name.
			jobs, err := filepath.Glob(filepath.Join(pendingDir, fmt.Sprintf("*_priority_%d", priority)))
			if err != nil {
				t.Fatalf("Failed to find jobs with priority %d: %v", priority, err)
			}

			if len(jobs) == 0 {
				t.Fatalf("No job found with priority %d", priority)
			}

			jobDir := jobs[0] // Take first job with this priority
			jobName := filepath.Base(jobDir)

			// Move from pending to running (atomic rename models "claiming" the job).
			runningJobDir := filepath.Join(runningDir, jobName)
			if err := os.Rename(jobDir, runningJobDir); err != nil {
				t.Fatalf("Failed to move job %s to running: %v", jobName, err)
			}

			// Verify only one job is running at this time
			runningJobs, err := filepath.Glob(filepath.Join(runningDir, "*"))
			if err != nil || len(runningJobs) != 1 {
				t.Errorf("Expected exactly 1 running job, found %d", len(runningJobs))
			}

			// Simulate execution by creating results (using actual framework patterns)
			outputDir := filepath.Join(runningJobDir, "results")
			if err := os.MkdirAll(outputDir, 0755); err != nil {
				t.Fatalf("Failed to create output directory for %s: %v", jobName, err)
			}

			// Read the actual train.py to determine framework
			trainScript := filepath.Join(runningJobDir, "train.py")
			scriptContent, err := os.ReadFile(trainScript)
			if err != nil {
				t.Fatalf("Failed to read train.py for %s: %v", jobName, err)
			}

			// Determine framework from script content by keyword sniffing.
			// First match wins, so a script importing both sklearn and torch
			// is classified as scikit-learn.
			framework := "unknown"
			scriptStr := string(scriptContent)
			if contains(scriptStr, "sklearn") {
				framework = "scikit-learn"
			} else if contains(scriptStr, "xgboost") {
				framework = "xgboost"
			} else if contains(scriptStr, "torch") {
				framework = "pytorch"
			} else if contains(scriptStr, "tensorflow") {
				framework = "tensorflow"
			} else if contains(scriptStr, "statsmodels") {
				framework = "statsmodels"
			}

			// Fabricate a results file; started_at is back-dated by the priority
			// in minutes so execution_order and timestamps stay consistent.
			resultsFile := filepath.Join(outputDir, "results.json")
			results := fmt.Sprintf(`{
  "job_name": "%s",
  "framework": "%s",
  "priority": %d,
  "status": "completed",
  "execution_order": %d,
  "started_at": "%s",
  "completed_at": "%s",
  "source": "actual_example"
}`, jobName, framework, priority, priority, time.Now().Add(-time.Duration(priority)*time.Minute).Format(time.RFC3339), time.Now().Format(time.RFC3339))

			if err := os.WriteFile(resultsFile, []byte(results), 0644); err != nil {
				t.Fatalf("Failed to create results for %s: %v", jobName, err)
			}

			// Move from running to finished
			finishedJobDir := filepath.Join(finishedDir, jobName)
			if err := os.Rename(runningJobDir, finishedJobDir); err != nil {
				t.Fatalf("Failed to move job %s to finished: %v", jobName, err)
			}

			// Verify job is no longer in pending or running
			if _, err := os.Stat(jobDir); !os.IsNotExist(err) {
				t.Errorf("Job %s should no longer be in pending directory", jobName)
			}
			if _, err := os.Stat(runningJobDir); !os.IsNotExist(err) {
				t.Errorf("Job %s should no longer be in running directory", jobName)
			}
		}

		// Verify all jobs completed
		finishedJobs, err := filepath.Glob(filepath.Join(finishedDir, "*"))
		if err != nil || len(finishedJobs) != 3 {
			t.Errorf("Expected 3 finished jobs, got %d", len(finishedJobs))
		}

		// Verify queue is empty
		pendingJobs, err := filepath.Glob(filepath.Join(pendingDir, "*"))
		if err != nil || len(pendingJobs) != 0 {
			t.Errorf("Expected 0 pending jobs after processing, found %d", len(pendingJobs))
		}

		// Verify no jobs are running
		runningJobs, err := filepath.Glob(filepath.Join(runningDir, "*"))
		if err != nil || len(runningJobs) != 0 {
			t.Errorf("Expected 0 running jobs after processing, found %d", len(runningJobs))
		}
	})
}
|
|
|
|
// TestQueueCapacity tests queue capacity and resource limits.
//
// It enqueues more jobs than the (simulated) server runs at once, then drains
// them strictly one at a time, asserting that the running directory never holds
// more than a single job. Jobs are directories moved between on-disk
// pending/running/finished folders; no real processes are started.
func TestQueueCapacity(t *testing.T) {
	t.Parallel() // Enable parallel execution

	testDir := t.TempDir()

	t.Run("QueueCapacityLimits", func(t *testing.T) {
		// Use fixtures for examples directory operations
		examplesDir := tests.NewExamplesDir(filepath.Join("..", "fixtures", "examples"))

		pendingDir := filepath.Join(testDir, "server", "home", "mluser", "ml_jobs", "pending")
		runningDir := filepath.Join(testDir, "server", "home", "mluser", "ml_jobs", "running")
		finishedDir := filepath.Join(testDir, "server", "home", "mluser", "ml_jobs", "finished")

		// Create directories
		if err := os.MkdirAll(pendingDir, 0755); err != nil {
			t.Fatalf("Failed to create pending directory: %v", err)
		}
		if err := os.MkdirAll(runningDir, 0755); err != nil {
			t.Fatalf("Failed to create running directory: %v", err)
		}
		if err := os.MkdirAll(finishedDir, 0755); err != nil {
			t.Fatalf("Failed to create finished directory: %v", err)
		}

		// Create more jobs than server can handle simultaneously using actual examples
		examples := []string{"standard_ml_project", "sklearn_project", "xgboost_project", "pytorch_project", "tensorflow_project"}
		totalJobs := len(examples)

		for i, example := range examples {
			jobName := fmt.Sprintf("capacity_test_job_%d", i)
			jobDir := filepath.Join(pendingDir, jobName)

			if err := os.MkdirAll(jobDir, 0755); err != nil {
				t.Fatalf("Failed to create job directory %s: %v", jobDir, err)
			}

			// Copy actual example files using fixtures
			sourceDir := examplesDir.GetPath(example)

			// Copy actual example files; fall back to a synthesized train.py
			// so the test still works when a fixture example is missing.
			if _, err := os.Stat(sourceDir); os.IsNotExist(err) {
				// Create minimal files if example doesn't exist.
				// %% in the Sprintf template yields literal % signs for strftime.
				trainScript := filepath.Join(jobDir, "train.py")
				script := fmt.Sprintf(`#!/usr/bin/env python3
import json, time
from pathlib import Path

def main():
    results = {
        "job_id": %d,
        "example": "%s",
        "status": "completed",
        "completion_time": time.strftime("%%Y-%%m-%%d %%H:%%M:%%S")
    }

    output_dir = Path("./results")
    output_dir.mkdir(parents=True, exist_ok=True)

    with open(output_dir / "results.json", "w") as f:
        json.dump(results, f, indent=2)

if __name__ == "__main__":
    main()
`, i, example)

				if err := os.WriteFile(trainScript, []byte(script), 0755); err != nil {
					t.Fatalf("Failed to create train script for job %d: %v", i, err)
				}
			} else {
				// Copy actual example files
				files := []string{"train.py", "requirements.txt"}
				for _, file := range files {
					src := filepath.Join(sourceDir, file)
					dst := filepath.Join(jobDir, file)

					if _, err := os.Stat(src); os.IsNotExist(err) {
						continue // Skip if file doesn't exist
					}

					data, err := os.ReadFile(src)
					if err != nil {
						t.Fatalf("Failed to read %s for job %d: %v", file, i, err)
					}

					if err := os.WriteFile(dst, data, 0755); err != nil {
						t.Fatalf("Failed to copy %s for job %d: %v", file, i, err)
					}
				}
			}
		}

		// Verify all jobs are in pending queue
		pendingJobs, err := filepath.Glob(filepath.Join(pendingDir, "capacity_test_job_*"))
		if err != nil || len(pendingJobs) != totalJobs {
			t.Errorf("Expected %d pending jobs, found %d", totalJobs, len(pendingJobs))
		}

		// Process one job at a time (sequential execution)
		for i := 0; i < totalJobs; i++ {
			// Move one job to running
			jobName := fmt.Sprintf("capacity_test_job_%d", i)
			pendingJobDir := filepath.Join(pendingDir, jobName)
			runningJobDir := filepath.Join(runningDir, jobName)

			if err := os.Rename(pendingJobDir, runningJobDir); err != nil {
				t.Fatalf("Failed to move job %d to running: %v", i, err)
			}

			// Verify only one job is running — the capacity invariant under test.
			runningJobs, err := filepath.Glob(filepath.Join(runningDir, "*"))
			if err != nil || len(runningJobs) != 1 {
				t.Errorf("Expected exactly 1 running job, found %d", len(runningJobs))
			}

			// Simulate job completion
			time.Sleep(5 * time.Millisecond) // Reduced from 10ms

			// Move to finished
			finishedJobDir := filepath.Join(finishedDir, jobName)
			if err := os.Rename(runningJobDir, finishedJobDir); err != nil {
				t.Fatalf("Failed to move job %d to finished: %v", i, err)
			}

			// Verify no jobs are running between jobs
			runningJobs, err = filepath.Glob(filepath.Join(runningDir, "*"))
			if err != nil || len(runningJobs) != 0 {
				t.Errorf("Expected 0 running jobs between jobs, found %d", len(runningJobs))
			}
		}

		// Verify all jobs completed
		finishedJobs, err := filepath.Glob(filepath.Join(finishedDir, "capacity_test_job_*"))
		if err != nil || len(finishedJobs) != totalJobs {
			t.Errorf("Expected %d finished jobs, found %d", totalJobs, len(finishedJobs))
		}

		// Verify queue is empty
		pendingJobs, err = filepath.Glob(filepath.Join(pendingDir, "capacity_test_job_*"))
		if err != nil || len(pendingJobs) != 0 {
			t.Errorf("Expected 0 pending jobs after processing, found %d", len(pendingJobs))
		}
	})
}
|
|
|
|
// TestResourceIsolation tests that experiments have isolated resources.
//
// Three experiments sharing the same timestamp each get their own
// running/<job>/results directory; the test then verifies each results.json
// exists in its own directory and carries experiment-specific content,
// proving outputs cannot collide even with identical submission times.
func TestResourceIsolation(t *testing.T) {
	t.Parallel() // Enable parallel execution
	testDir := t.TempDir()

	t.Run("OutputDirectoryIsolation", func(t *testing.T) {
		// Use fixtures for examples directory operations
		examplesDir := tests.NewExamplesDir(filepath.Join("..", "fixtures", "examples"))

		// Create multiple experiments with same timestamp using actual examples.
		// A fixed timestamp deliberately forces the collision scenario.
		timestamp := "20231201_143022"
		examples := []string{"sklearn_project", "xgboost_project", "pytorch_project"}

		runningDir := filepath.Join(testDir, "server", "home", "mluser", "ml_jobs", "running")

		for i, expName := range examples {
			jobName := fmt.Sprintf("exp%d_%s", i, timestamp)
			outputDir := filepath.Join(runningDir, jobName, "results")

			if err := os.MkdirAll(outputDir, 0755); err != nil {
				t.Fatalf("Failed to create output directory: %v", err)
			}

			// Copy actual example files using fixtures
			sourceDir := examplesDir.GetPath(expName)

			// Read actual example to create realistic results
			trainScript := filepath.Join(sourceDir, "train.py")

			// Framework detection is best-effort: a missing/unreadable train.py
			// leaves framework as "unknown" rather than failing the test.
			framework := "unknown"
			if content, err := os.ReadFile(trainScript); err == nil {
				scriptStr := string(content)
				if contains(scriptStr, "sklearn") {
					framework = "scikit-learn"
				} else if contains(scriptStr, "xgboost") {
					framework = "xgboost"
				} else if contains(scriptStr, "torch") {
					framework = "pytorch"
				}
			}

			// Create unique results file based on actual framework.
			// unique_id includes UnixNano so every run's content is distinct.
			resultsFile := filepath.Join(outputDir, "results.json")
			results := fmt.Sprintf(`{
  "experiment": "exp%d",
  "framework": "%s",
  "job_name": "%s",
  "output_dir": "%s",
  "example_source": "%s",
  "unique_id": "exp%d_%d"
}`, i, framework, jobName, outputDir, expName, i, time.Now().UnixNano())

			if err := os.WriteFile(resultsFile, []byte(results), 0644); err != nil {
				t.Fatalf("Failed to create results for %s: %v", expName, err)
			}
		}

		// Verify each experiment has its own isolated output directory
		for i, expName := range examples {
			jobName := fmt.Sprintf("exp%d_%s", i, timestamp)
			outputDir := filepath.Join(runningDir, jobName, "results")
			resultsFile := filepath.Join(outputDir, "results.json")

			if _, err := os.Stat(resultsFile); os.IsNotExist(err) {
				t.Errorf("Results file should exist for %s in isolated directory", expName)
			}

			// Verify content is unique
			content, err := os.ReadFile(resultsFile)
			if err != nil {
				t.Fatalf("Failed to read results for %s: %v", expName, err)
			}

			if !contains(string(content), fmt.Sprintf("exp%d", i)) {
				t.Errorf("Results file should contain experiment ID exp%d", i)
			}

			if !contains(string(content), expName) {
				t.Errorf("Results file should contain example source %s", expName)
			}
		}
	})
}
|
|
|
|
// contains reports whether substr occurs anywhere within s.
//
// The previous implementation hand-rolled prefix/suffix checks plus a manual
// scan, duplicating the standard library; strings.Contains covers every case
// (including the empty substring, which matches any string) with identical
// semantics and an optimized implementation.
func contains(s, substr string) bool {
	return strings.Contains(s, substr)
}
|
|
|
|
// findSubstring reports whether substr occurs anywhere within s, scanning
// every candidate start position with a naive substring comparison.
func findSubstring(s, substr string) bool {
	n := len(substr)
	lastStart := len(s) - n
	for start := 0; start <= lastStart; start++ {
		if s[start:start+n] == substr {
			return true
		}
	}
	return false
}
|