// fetch_ml/internal/worker/executor/local.go

// Package executor provides job execution implementations
package executor

import (
	"context"
	"fmt"
	"log"
	"os"
	"os/exec"
	"path/filepath"
	"strings"

	"github.com/jfraeys/fetch_ml/internal/errtypes"
	"github.com/jfraeys/fetch_ml/internal/fileutil"
	"github.com/jfraeys/fetch_ml/internal/logging"
	"github.com/jfraeys/fetch_ml/internal/manifest"
	"github.com/jfraeys/fetch_ml/internal/queue"
	"github.com/jfraeys/fetch_ml/internal/worker/interfaces"
)

// LocalExecutor executes jobs locally using bash scripts
type LocalExecutor struct {
	// logger receives the executor's informational and warning messages.
	logger *logging.Logger
	// writer records the command of each run in its manifest. Execute skips
	// the manifest update when writer is nil.
	writer interfaces.ManifestWriter
}

// NewLocalExecutor returns a LocalExecutor that logs through logger and
// records run metadata through writer. Both values are stored as given.
func NewLocalExecutor(logger *logging.Logger, writer interfaces.ManifestWriter) *LocalExecutor {
	le := new(LocalExecutor)
	le.logger = logger
	le.writer = writer
	return le
}

// Execute runs a job locally.
//
// It writes the generated bash script to run.sh inside env.OutputDir, records
// the command in the run manifest when a manifest writer is configured, and
// then runs the script with bash. The job's stdout and stderr are both
// appended to env.LogFile. The command is created with exec.CommandContext,
// so the bash process is killed if ctx is done before it exits.
//
// Any failure (writing the script, opening the log file, or a failing script)
// is returned as an *errtypes.TaskExecutionError with Phase "execution" that
// wraps the underlying error. A nil return means the script exited
// successfully.
func (e *LocalExecutor) Execute(ctx context.Context, task *queue.Task, env interfaces.ExecutionEnv) error {
	// Write the script through fileutil.WriteFileSafe; 0600 is sufficient
	// because the script is passed to bash as an argument, not executed directly.
	scriptPath := filepath.Join(env.OutputDir, "run.sh")
	if err := fileutil.WriteFileSafe(scriptPath, []byte(generateScript(task)), 0600); err != nil {
		return &errtypes.TaskExecutionError{
			TaskID:  task.ID,
			JobName: task.JobName,
			Phase:   "execution",
			Err:     fmt.Errorf("failed to write script: %w", err),
		}
	}

	// Record how the job is launched; the manifest writer is optional.
	if e.writer != nil {
		e.writer.Upsert(env.OutputDir, task, func(m *manifest.RunManifest) {
			m.Command = "bash"
			m.Args = scriptPath
		})
	}

	// Open the log file in append mode so earlier output is preserved.
	logFileHandle, err := fileutil.SecureOpenFile(env.LogFile, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0600)
	if err != nil {
		e.logger.Warn("failed to open log file for local output", "path", env.LogFile, "error", err)
		return &errtypes.TaskExecutionError{
			TaskID:  task.ID,
			JobName: task.JobName,
			Phase:   "execution",
			Err:     fmt.Errorf("failed to open log file: %w", err),
		}
	}
	defer func() {
		// A close failure does not change the job's outcome, so it is only reported.
		if closeErr := logFileHandle.Close(); closeErr != nil {
			log.Printf("Warning: failed to close log file: %v", closeErr)
		}
	}()

	// Run the script with the job environment, sending all output to the log.
	localCmd := exec.CommandContext(ctx, "bash", scriptPath)
	localCmd.Env = localJobEnv(task, env)
	localCmd.Stdout = logFileHandle
	localCmd.Stderr = logFileHandle

	e.logger.Info("executing local job",
		"job", task.JobName,
		"task_id", task.ID,
		"script", scriptPath)

	if err := localCmd.Run(); err != nil {
		return &errtypes.TaskExecutionError{
			TaskID:  task.ID,
			JobName: task.JobName,
			Phase:   "execution",
			Err:     fmt.Errorf("execution failed: %w", err),
		}
	}

	return nil
}

// localJobEnv returns the environment for the job process: the worker's own
// environment followed by the GPU selection variable and, when a snapshot
// directory exists under env.OutputDir, the snapshot variables.
func localJobEnv(task *queue.Task, env interfaces.ExecutionEnv) []string {
	jobEnv := os.Environ()

	// Trim the variable name once and use the trimmed value for the entry
	// itself; surrounding whitespace would otherwise become part of the name
	// and the job would never see the intended variable.
	if gpuVar := strings.TrimSpace(env.GPUEnvVar); gpuVar != "" {
		jobEnv = append(jobEnv, fmt.Sprintf("%s=%s", gpuVar, strings.TrimSpace(env.GPUDevicesStr)))
	}

	// Snapshot variables are exported only when the snapshot directory exists.
	snap := filepath.Join(env.OutputDir, "snapshot")
	if info, err := os.Stat(snap); err == nil && info.IsDir() {
		jobEnv = append(jobEnv, fmt.Sprintf("FETCH_ML_SNAPSHOT_DIR=%s", snap))
		if snapshotID := strings.TrimSpace(task.SnapshotID); snapshotID != "" {
			jobEnv = append(jobEnv, fmt.Sprintf("FETCH_ML_SNAPSHOT_ID=%s", snapshotID))
		}
	}

	return jobEnv
}

// quoteForBash wraps s in single quotes so bash treats it as one literal
// word. An embedded single quote is written as '\'' (close the quoted span,
// emit an escaped quote, reopen), the only character needing special handling
// inside single quotes.
func quoteForBash(s string) string {
	return "'" + strings.ReplaceAll(s, "'", `'\''`) + "'"
}

// generateScript creates a bash script for the experiment.
//
// The script prints the job name and task ID, simulates the load, train and
// evaluate phases with short sleeps, and prints fixed result values.
//
// task.JobName and task.ID are passed to printf as single-quoted arguments
// rather than being spliced into double-quoted strings, so characters such as
// ", $, or backticks in those values are printed literally and can never be
// interpreted as shell syntax.
func generateScript(task *queue.Task) string {
	return `#!/bin/bash
set -e

printf 'Starting experiment: %s\n' ` + quoteForBash(task.JobName) + `
printf 'Task ID: %s\n' ` + quoteForBash(task.ID) + `
echo "Timestamp: $(date)"

# Simulate ML experiment
echo "Loading data..."
sleep 1

echo "Training model..."
sleep 2

echo "Evaluating model..."
sleep 1

# Generate results
ACCURACY=0.95
LOSS=0.05
EPOCHS=10

echo ""
echo "=== EXPERIMENT RESULTS ==="
echo "Accuracy: $ACCURACY"
echo "Loss: $LOSS"
echo "Epochs: $EPOCHS"
echo "Status: SUCCESS"
echo "========================="
echo "Experiment completed successfully!"
`
}