fetch_ml/internal/worker/executor/local.go
Jeremie Fraeys 22f3d66f1d
refactor: Phase 2 - Extract executor implementations
Created executor package with extracted job execution logic:

1. internal/worker/executor/local.go (104 lines)
   - LocalExecutor implements JobExecutor interface
   - Execute() method for local bash script execution
   - generateScript() helper for creating experiment scripts

2. internal/worker/executor/container.go (229 lines)
   - ContainerExecutor implements JobExecutor interface
   - Execute() method for podman container execution
   - EnvironmentPool interface for image caching
   - Tracking tool provisioning (MLflow, TensorBoard, Wandb)
   - Volume and cache setup
   - selectDependencyManifest() helper

3. internal/worker/executor/runner.go (131 lines)
   - JobRunner orchestrates execution
   - ExecutionMode enum (Auto, Local, Container)
   - Run() method with directory setup and executor selection
   - finalize() for success/failure handling

Key design decisions:
- Executors depend on interfaces (ManifestWriter, not Worker)
- JobRunner composes both executors
- No direct Worker dependencies in executor package
- SetupJobDirectories reused from execution package

Build status: Compiles successfully
2026-02-17 14:14:04 -05:00

145 lines
3.6 KiB
Go

// Package executor provides job execution implementations
package executor
import (
"context"
"fmt"
"log"
"os"
"os/exec"
"path/filepath"
"strings"
"github.com/jfraeys/fetch_ml/internal/errtypes"
"github.com/jfraeys/fetch_ml/internal/fileutil"
"github.com/jfraeys/fetch_ml/internal/logging"
"github.com/jfraeys/fetch_ml/internal/manifest"
"github.com/jfraeys/fetch_ml/internal/queue"
"github.com/jfraeys/fetch_ml/internal/worker/interfaces"
)
// LocalExecutor executes jobs locally using bash scripts.
//
// Its Execute method writes a generated run.sh into the job's output
// directory and runs it with bash, sending the script's stdout and stderr to
// the job log file.
type LocalExecutor struct {
// logger receives the executor's structured log messages.
logger *logging.Logger
// writer, when non-nil, is used by Execute to record the launched command
// in the run manifest; Execute skips the manifest update when it is nil.
writer interfaces.ManifestWriter
}
// NewLocalExecutor returns a LocalExecutor that logs through logger and
// records run metadata through writer.
//
// writer may be nil, in which case Execute skips the manifest update.
// NOTE(review): Execute calls methods on logger without a nil check; whether
// a nil *logging.Logger is usable depends on the logging package — confirm
// before passing nil.
func NewLocalExecutor(logger *logging.Logger, writer interfaces.ManifestWriter) *LocalExecutor {
	le := new(LocalExecutor)
	le.logger = logger
	le.writer = writer
	return le
}
// Execute runs task on the local host.
//
// It writes the script produced by generateScript to <env.OutputDir>/run.sh,
// records the command in the run manifest (when a writer is configured),
// opens env.LogFile for appending, and runs the script with bash. The
// script's stdout and stderr both go to the log file.
//
// The child process inherits the worker's environment plus:
//   - env.GPUEnvVar=env.GPUDevicesStr, when env.GPUEnvVar is non-blank
//     (both values are whitespace-trimmed);
//   - FETCH_ML_SNAPSHOT_DIR, when <env.OutputDir>/snapshot is a directory;
//   - FETCH_ML_SNAPSHOT_ID, when that directory exists and task.SnapshotID
//     is non-blank.
//
// Cancelling ctx kills the bash process (exec.CommandContext semantics).
//
// Every failure is returned as *errtypes.TaskExecutionError with Phase
// "execution"; nil is returned when the script exits successfully.
func (e *LocalExecutor) Execute(ctx context.Context, task *queue.Task, env interfaces.ExecutionEnv) error {
	// fail wraps err in the TaskExecutionError shape shared by every error
	// path in this method.
	fail := func(err error) error {
		return &errtypes.TaskExecutionError{
			TaskID:  task.ID,
			JobName: task.JobName,
			Phase:   "execution",
			Err:     err,
		}
	}

	// Generate and write the script. It is run as "bash <path>", so it does
	// not need the executable bit; 0600 keeps it private to the worker user.
	scriptPath := filepath.Join(env.OutputDir, "run.sh")
	if err := os.WriteFile(scriptPath, []byte(generateScript(task)), 0600); err != nil {
		return fail(fmt.Errorf("failed to write script: %w", err))
	}

	// Record what is about to be run in the manifest. The writer is optional.
	if e.writer != nil {
		e.writer.Upsert(env.OutputDir, task, func(m *manifest.RunManifest) {
			m.Command = "bash"
			m.Args = scriptPath
		})
	}

	// Open the log file in append mode so earlier output for this job is kept.
	logFileHandle, err := fileutil.SecureOpenFile(env.LogFile, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0600)
	if err != nil {
		e.logger.Warn("failed to open log file for local output", "path", env.LogFile, "error", err)
		return fail(fmt.Errorf("failed to open log file: %w", err))
	}
	defer func() {
		if err := logFileHandle.Close(); err != nil {
			log.Printf("Warning: failed to close log file: %v", err)
		}
	}()

	// Build the child environment. Entries appended here come after the
	// inherited ones, and os/exec uses the last value for a duplicated key,
	// so they override anything inherited from the worker.
	localEnv := os.Environ()

	// Use the trimmed variable name as the key: the blank check is done on
	// the trimmed value, and a name with stray whitespace would be exported
	// under a key the child never looks up.
	if gpuVar := strings.TrimSpace(env.GPUEnvVar); gpuVar != "" {
		localEnv = append(localEnv, gpuVar+"="+strings.TrimSpace(env.GPUDevicesStr))
	}

	// Snapshot variables are only exported when the snapshot directory
	// actually exists in the output directory.
	snap := filepath.Join(env.OutputDir, "snapshot")
	if info, err := os.Stat(snap); err == nil && info.IsDir() {
		localEnv = append(localEnv, "FETCH_ML_SNAPSHOT_DIR="+snap)
		if snapshotID := strings.TrimSpace(task.SnapshotID); snapshotID != "" {
			localEnv = append(localEnv, "FETCH_ML_SNAPSHOT_ID="+snapshotID)
		}
	}

	// Execute the script.
	localCmd := exec.CommandContext(ctx, "bash", scriptPath)
	localCmd.Env = localEnv
	localCmd.Stdout = logFileHandle
	localCmd.Stderr = logFileHandle

	e.logger.Info("executing local job",
		"job", task.JobName,
		"task_id", task.ID,
		"script", scriptPath)

	if err := localCmd.Run(); err != nil {
		return fail(fmt.Errorf("execution failed: %w", err))
	}
	return nil
}
// generateScript returns the text of the bash script run for task.
//
// The script is a placeholder experiment: it echoes the job name and task
// ID, sleeps through simulated load/train/evaluate phases, and prints fixed
// result values. It runs under "set -e", so any failing command aborts it.
//
// task.JobName and task.ID are embedded as single-quoted shell words rather
// than spliced into double-quoted strings, so characters such as $, `, " and
// \ in those fields are printed literally instead of being interpreted by
// bash. For values without such characters the script's output is unchanged.
func generateScript(task *queue.Task) string {
	// quote renders s as one single-quoted bash word. Nothing is special
	// inside single quotes except the quote itself, which is emitted by
	// closing the quoted span, adding an escaped quote, and reopening: '\''.
	quote := func(s string) string {
		return "'" + strings.ReplaceAll(s, "'", `'\''`) + "'"
	}

	// Adjacent quoted strings form a single word in bash, so
	// "Starting experiment: "'name' is passed to echo as one argument.
	return `#!/bin/bash
set -e
echo "Starting experiment: "` + quote(task.JobName) + `
echo "Task ID: "` + quote(task.ID) + `
echo "Timestamp: $(date)"
# Simulate ML experiment
echo "Loading data..."
sleep 1
echo "Training model..."
sleep 2
echo "Evaluating model..."
sleep 1
# Generate results
ACCURACY=0.95
LOSS=0.05
EPOCHS=10
echo ""
echo "=== EXPERIMENT RESULTS ==="
echo "Accuracy: $ACCURACY"
echo "Loss: $LOSS"
echo "Epochs: $EPOCHS"
echo "Status: SUCCESS"
echo "========================="
echo "Experiment completed successfully!"
`
}