Update worker system for scheduler integration: - Worker server with scheduler registration - Configuration with scheduler endpoint support - Artifact handling with integrity verification - Container executor with supply chain validation - Local executor enhancements - GPU detection improvements (cross-platform) - Error handling with execution context - Factory pattern for executor instantiation - Hash integrity with native library support
145 lines
3.7 KiB
Go
145 lines
3.7 KiB
Go
// Package executor provides job execution implementations.
|
|
package executor
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"log"
|
|
"os"
|
|
"os/exec"
|
|
"path/filepath"
|
|
"strings"
|
|
|
|
"github.com/jfraeys/fetch_ml/internal/errtypes"
|
|
"github.com/jfraeys/fetch_ml/internal/fileutil"
|
|
"github.com/jfraeys/fetch_ml/internal/logging"
|
|
"github.com/jfraeys/fetch_ml/internal/manifest"
|
|
"github.com/jfraeys/fetch_ml/internal/queue"
|
|
"github.com/jfraeys/fetch_ml/internal/worker/interfaces"
|
|
)
|
|
|
|
// LocalExecutor executes jobs locally using bash scripts
|
|
type LocalExecutor struct {
|
|
logger *logging.Logger
|
|
writer interfaces.ManifestWriter
|
|
}
|
|
|
|
// NewLocalExecutor creates a new local job executor
|
|
func NewLocalExecutor(logger *logging.Logger, writer interfaces.ManifestWriter) *LocalExecutor {
|
|
return &LocalExecutor{
|
|
logger: logger,
|
|
writer: writer,
|
|
}
|
|
}
|
|
|
|
// Execute runs a job locally
|
|
func (e *LocalExecutor) Execute(ctx context.Context, task *queue.Task, env interfaces.ExecutionEnv) error {
|
|
// Generate and write script with crash safety (fsync)
|
|
scriptContent := generateScript(task)
|
|
scriptPath := filepath.Join(env.OutputDir, "run.sh")
|
|
|
|
if err := fileutil.WriteFileSafe(scriptPath, []byte(scriptContent), 0600); err != nil {
|
|
return &errtypes.TaskExecutionError{
|
|
TaskID: task.ID,
|
|
JobName: task.JobName,
|
|
Phase: "execution",
|
|
Err: fmt.Errorf("failed to write script: %w", err),
|
|
}
|
|
}
|
|
|
|
// Update manifest
|
|
if e.writer != nil {
|
|
e.writer.Upsert(env.OutputDir, task, func(m *manifest.RunManifest) {
|
|
m.Command = "bash"
|
|
m.Args = scriptPath
|
|
})
|
|
}
|
|
|
|
// Open log file
|
|
logFileHandle, err := fileutil.SecureOpenFile(env.LogFile, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0600)
|
|
if err != nil {
|
|
e.logger.Warn("failed to open log file for local output", "path", env.LogFile, "error", err)
|
|
return &errtypes.TaskExecutionError{
|
|
TaskID: task.ID,
|
|
JobName: task.JobName,
|
|
Phase: "execution",
|
|
Err: fmt.Errorf("failed to open log file: %w", err),
|
|
}
|
|
}
|
|
defer func() {
|
|
if err := logFileHandle.Close(); err != nil {
|
|
log.Printf("Warning: failed to close log file: %v", err)
|
|
}
|
|
}()
|
|
|
|
// Execute the script
|
|
localCmd := exec.CommandContext(ctx, "bash", scriptPath)
|
|
localEnv := os.Environ()
|
|
|
|
if strings.TrimSpace(env.GPUEnvVar) != "" {
|
|
localEnv = append(localEnv, fmt.Sprintf("%s=%s", env.GPUEnvVar, strings.TrimSpace(env.GPUDevicesStr)))
|
|
}
|
|
|
|
snap := filepath.Join(env.OutputDir, "snapshot")
|
|
if info, err := os.Stat(snap); err == nil && info.IsDir() {
|
|
localEnv = append(localEnv, fmt.Sprintf("FETCH_ML_SNAPSHOT_DIR=%s", snap))
|
|
if strings.TrimSpace(task.SnapshotID) != "" {
|
|
localEnv = append(localEnv, fmt.Sprintf("FETCH_ML_SNAPSHOT_ID=%s", strings.TrimSpace(task.SnapshotID)))
|
|
}
|
|
}
|
|
|
|
localCmd.Env = localEnv
|
|
localCmd.Stdout = logFileHandle
|
|
localCmd.Stderr = logFileHandle
|
|
|
|
e.logger.Info("executing local job",
|
|
"job", task.JobName,
|
|
"task_id", task.ID,
|
|
"script", scriptPath)
|
|
|
|
if err := localCmd.Run(); err != nil {
|
|
return &errtypes.TaskExecutionError{
|
|
TaskID: task.ID,
|
|
JobName: task.JobName,
|
|
Phase: "execution",
|
|
Err: fmt.Errorf("execution failed: %w", err),
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// generateScript creates a bash script for the experiment
|
|
func generateScript(task *queue.Task) string {
|
|
return `#!/bin/bash
|
|
set -e
|
|
|
|
echo "Starting experiment: ` + task.JobName + `"
|
|
echo "Task ID: ` + task.ID + `"
|
|
echo "Timestamp: $(date)"
|
|
|
|
# Simulate ML experiment
|
|
echo "Loading data..."
|
|
sleep 1
|
|
|
|
echo "Training model..."
|
|
sleep 2
|
|
|
|
echo "Evaluating model..."
|
|
sleep 1
|
|
|
|
# Generate results
|
|
ACCURACY=0.95
|
|
LOSS=0.05
|
|
EPOCHS=10
|
|
|
|
echo ""
|
|
echo "=== EXPERIMENT RESULTS ==="
|
|
echo "Accuracy: $ACCURACY"
|
|
echo "Loss: $LOSS"
|
|
echo "Epochs: $EPOCHS"
|
|
echo "Status: SUCCESS"
|
|
echo "========================="
|
|
echo "Experiment completed successfully!"
|
|
`
|
|
}
|