// Package executor provides job execution implementations package executor import ( "context" "fmt" "log" "os" "os/exec" "path/filepath" "strings" "github.com/jfraeys/fetch_ml/internal/errtypes" "github.com/jfraeys/fetch_ml/internal/fileutil" "github.com/jfraeys/fetch_ml/internal/logging" "github.com/jfraeys/fetch_ml/internal/manifest" "github.com/jfraeys/fetch_ml/internal/queue" "github.com/jfraeys/fetch_ml/internal/worker/interfaces" ) // LocalExecutor executes jobs locally using bash scripts type LocalExecutor struct { logger *logging.Logger writer interfaces.ManifestWriter } // NewLocalExecutor creates a new local job executor func NewLocalExecutor(logger *logging.Logger, writer interfaces.ManifestWriter) *LocalExecutor { return &LocalExecutor{ logger: logger, writer: writer, } } // Execute runs a job locally func (e *LocalExecutor) Execute(ctx context.Context, task *queue.Task, env interfaces.ExecutionEnv) error { // Generate and write script with crash safety (fsync) scriptContent := generateScript(task) scriptPath := filepath.Join(env.OutputDir, "run.sh") if err := fileutil.WriteFileSafe(scriptPath, []byte(scriptContent), 0600); err != nil { return &errtypes.TaskExecutionError{ TaskID: task.ID, JobName: task.JobName, Phase: "execution", Err: fmt.Errorf("failed to write script: %w", err), } } // Update manifest if e.writer != nil { e.writer.Upsert(env.OutputDir, task, func(m *manifest.RunManifest) { m.Command = "bash" m.Args = scriptPath }) } // Open log file logFileHandle, err := fileutil.SecureOpenFile(env.LogFile, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0600) if err != nil { e.logger.Warn("failed to open log file for local output", "path", env.LogFile, "error", err) return &errtypes.TaskExecutionError{ TaskID: task.ID, JobName: task.JobName, Phase: "execution", Err: fmt.Errorf("failed to open log file: %w", err), } } defer func() { if err := logFileHandle.Close(); err != nil { log.Printf("Warning: failed to close log file: %v", err) } }() // Execute the script localCmd := exec.CommandContext(ctx, "bash", scriptPath) localEnv := os.Environ() if strings.TrimSpace(env.GPUEnvVar) != "" { localEnv = append(localEnv, fmt.Sprintf("%s=%s", env.GPUEnvVar, strings.TrimSpace(env.GPUDevicesStr))) } snap := filepath.Join(env.OutputDir, "snapshot") if info, err := os.Stat(snap); err == nil && info.IsDir() { localEnv = append(localEnv, fmt.Sprintf("FETCH_ML_SNAPSHOT_DIR=%s", snap)) if strings.TrimSpace(task.SnapshotID) != "" { localEnv = append(localEnv, fmt.Sprintf("FETCH_ML_SNAPSHOT_ID=%s", strings.TrimSpace(task.SnapshotID))) } } localCmd.Env = localEnv localCmd.Stdout = logFileHandle localCmd.Stderr = logFileHandle e.logger.Info("executing local job", "job", task.JobName, "task_id", task.ID, "script", scriptPath) if err := localCmd.Run(); err != nil { return &errtypes.TaskExecutionError{ TaskID: task.ID, JobName: task.JobName, Phase: "execution", Err: fmt.Errorf("execution failed: %w", err), } } return nil } // generateScript creates a bash script for the experiment func generateScript(task *queue.Task) string { return `#!/bin/bash set -e echo "Starting experiment: ` + task.JobName + `" echo "Task ID: ` + task.ID + `" echo "Timestamp: $(date)" # Simulate ML experiment echo "Loading data..." sleep 1 echo "Training model..." sleep 2 echo "Evaluating model..." sleep 1 # Generate results ACCURACY=0.95 LOSS=0.05 EPOCHS=10 echo "" echo "=== EXPERIMENT RESULTS ===" echo "Accuracy: $ACCURACY" echo "Loss: $LOSS" echo "Epochs: $EPOCHS" echo "Status: SUCCESS" echo "=========================" echo "Experiment completed successfully!" ` }