fetch_ml/internal/worker/execution/setup.go
Jeremie Fraeys 0b5e99f720
refactor(scheduler,worker): improve service management and GPU detection
Scheduler enhancements:
- auth.go: Group membership validation in authentication
- hub.go: Task distribution with group affinity
- port_allocator.go: Dynamic port allocation with conflict resolution
- scheduler_conn.go: Connection pooling and retry logic
- service_manager.go: Lifecycle management for scheduler services
- service_templates.go: Template-based service configuration
- state.go: Persistent state management with recovery

Worker improvements:
- config.go: Extended configuration for task visibility rules
- execution/setup.go: Sandboxed execution environment setup
- executor/container.go: Container runtime integration
- executor/runner.go: Task runner with visibility enforcement
- gpu_detector.go: Robust GPU detection (NVIDIA, AMD, Apple Silicon, CPU fallback)
- integrity/validate.go: Data integrity validation
- lifecycle/runloop.go: Improved runloop with graceful shutdown
- lifecycle/service_manager.go: Service lifecycle coordination
- process/isolation.go + isolation_unix.go: Process isolation with namespaces/cgroups
- tenant/manager.go: Multi-tenant resource isolation
- tenant/middleware.go: Tenant context propagation
- worker.go: Core worker with group-scoped task execution
2026-03-08 13:03:15 -04:00

225 lines
5.4 KiB
Go

// Package execution provides job execution utilities for the worker.
package execution
import (
"context"
"fmt"
"io"
"os"
"path/filepath"
"strings"
"github.com/jfraeys/fetch_ml/internal/config"
"github.com/jfraeys/fetch_ml/internal/container"
"github.com/jfraeys/fetch_ml/internal/errtypes"
"github.com/jfraeys/fetch_ml/internal/logging"
"github.com/jfraeys/fetch_ml/internal/storage"
)
// PrepareContainerEnv builds the environment map and the Podman secrets used
// to run a task's container.
//
// The returned map always contains FETCH_ML_JOB_NAME and FETCH_ML_TASK_ID.
// When allowSecrets is true, every entry of requestedSecrets that passes
// isSecretAllowed against allowedSecrets and that getSecretValue can resolve
// is turned into a container.PodmanSecret whose EnvVar is the requested name.
// Secrets that are not allowlisted or cannot be resolved are logged with
// logger.Warn and skipped, so the returned error is always nil.
//
// ctx is accepted for API symmetry but is not used by this function.
func PrepareContainerEnv(
	ctx context.Context,
	taskID string,
	jobName string,
	requestedSecrets []string,
	allowedSecrets []string,
	allowSecrets bool,
	logger *logging.Logger,
) ([]container.PodmanSecret, map[string]string, error) {
	// Standard env vars present for every job.
	env := map[string]string{
		"FETCH_ML_JOB_NAME": jobName,
		"FETCH_ML_TASK_ID":  taskID,
	}
	secrets := []container.PodmanSecret{}

	if !allowSecrets || len(requestedSecrets) == 0 {
		return secrets, env, nil
	}

	// Secret names carry a short task-ID suffix. Slicing taskID[:8] directly
	// panics for IDs shorter than eight bytes, so clamp the length first.
	shortID := taskID
	if len(shortID) > 8 {
		shortID = shortID[:8]
	}

	for _, secretName := range requestedSecrets {
		if !isSecretAllowed(secretName, allowedSecrets) {
			logger.Warn("secret not in allowlist, skipping", "secret", secretName)
			continue
		}

		value, err := getSecretValue(secretName)
		if err != nil {
			// Best effort: a missing secret does not abort the whole task setup.
			logger.Warn("failed to get secret value", "secret", secretName, "error", err)
			continue
		}

		secrets = append(secrets, container.PodmanSecret{
			Name:   fmt.Sprintf("fetchml_%s_%s", strings.ToLower(secretName), shortID),
			Data:   []byte(value),
			EnvVar: secretName, // Mount as env var
		})
		logger.Info("injected secret", "secret", secretName, "task", taskID)
	}

	return secrets, env, nil
}
// isSecretAllowed reports whether name matches an entry of allowedList,
// comparing case-insensitively. An empty or nil allowlist permits nothing.
func isSecretAllowed(name string, allowedList []string) bool {
	for i := range allowedList {
		if strings.EqualFold(allowedList[i], name) {
			return true
		}
	}
	// Reached for a non-matching name and for an empty list alike: default deny.
	return false
}
// getSecretValue looks up the secret called name in the process environment.
// An unset variable and a variable set to the empty string are both reported
// as "not found".
func getSecretValue(name string) (string, error) {
	secret := os.Getenv(name)
	if secret == "" {
		return "", fmt.Errorf("secret %s not found in environment", name)
	}
	return secret, nil
}
// JobPaths groups the filesystem locations associated with a single job.
//
// NOTE(review): nothing in this file populates JobPaths; the field meanings
// below presumably mirror the values returned by SetupJobDirectories —
// confirm against callers.
type JobPaths struct {
// JobDir is presumably the job's own directory.
JobDir string
// OutputDir is presumably the directory that receives the job's output.
OutputDir string
// LogFile is presumably the path of the job's log file.
LogFile string
}
// SetupJobDirectories creates the necessary directories for a job using PathRegistry
func SetupJobDirectories(
basePath string,
jobName string,
taskID string,
) (jobDir, outputDir, logFile string, err error) {
jobPaths := storage.NewJobPaths(basePath)
pendingDir := jobPaths.PendingPath()
jobDir = filepath.Join(pendingDir, jobName)
outputDir = filepath.Join(jobPaths.RunningPath(), jobName)
logFile = filepath.Join(outputDir, "output.log")
// Use PathRegistry for consistent directory creation
paths := config.FromEnv()
// Create pending directory
if err := paths.EnsureDir(pendingDir); err != nil {
return "", "", "", &errtypes.TaskExecutionError{
TaskID: taskID,
JobName: jobName,
Phase: "setup",
Err: fmt.Errorf("failed to create pending dir: %w", err),
}
}
// Create job directory in pending
if err := paths.EnsureDir(jobDir); err != nil {
return "", "", "", &errtypes.TaskExecutionError{
TaskID: taskID,
JobName: jobName,
Phase: "setup",
Err: fmt.Errorf("failed to create job dir: %w", err),
}
}
// Sanitize paths
jobDir, err = container.SanitizePath(jobDir)
if err != nil {
return "", "", "", &errtypes.TaskExecutionError{
TaskID: taskID,
JobName: jobName,
Phase: "validation",
Err: err,
}
}
outputDir, err = container.SanitizePath(outputDir)
if err != nil {
return "", "", "", &errtypes.TaskExecutionError{
TaskID: taskID,
JobName: jobName,
Phase: "validation",
Err: err,
}
}
// Create running directory
if err := paths.EnsureDir(outputDir); err != nil {
return "", "", "", &errtypes.TaskExecutionError{
TaskID: taskID,
JobName: jobName,
Phase: "setup",
Err: fmt.Errorf("failed to create running dir: %w", err),
}
}
return jobDir, outputDir, logFile, nil
}
// CopyDir recursively copies the directory tree rooted at src into dst.
//
// src must exist and be a directory. dst and every sub-directory are created
// with mode 0750 (os.MkdirAll, so existing directories are reused). Each
// non-directory entry is copied with copyFile using the permission bits
// (mode & 0777) reported for the source entry. The first error encountered
// aborts the walk and is returned.
//
// NOTE(review): filepath.WalkDir does not follow symbolic links, so symlink
// entries reach copyFile like ordinary files — confirm that is intended.
func CopyDir(src, dst string) error {
	src = filepath.Clean(src)
	dst = filepath.Clean(dst)

	srcInfo, err := os.Stat(src)
	if err != nil {
		return err
	}
	if !srcInfo.IsDir() {
		return fmt.Errorf("source is not a directory")
	}
	if err := os.MkdirAll(dst, 0750); err != nil {
		return err
	}

	// A relative path escapes src only when its first component is "..".
	// Names that merely begin with two dots ("..cache", "...") are legitimate.
	parentPrefix := ".." + string(filepath.Separator)

	return filepath.WalkDir(src, func(path string, d os.DirEntry, walkErr error) error {
		if walkErr != nil {
			return walkErr
		}

		rel, err := filepath.Rel(src, path)
		if err != nil {
			return err
		}
		rel = filepath.Clean(rel)
		if rel == "." {
			// The root itself was already created above.
			return nil
		}
		if rel == ".." || strings.HasPrefix(rel, parentPrefix) {
			return fmt.Errorf("invalid relative path")
		}

		outPath := filepath.Join(dst, rel)
		if d.IsDir() {
			return os.MkdirAll(outPath, 0750)
		}

		info, err := d.Info()
		if err != nil {
			return err
		}
		// Keep only the permission bits; type and special bits are dropped.
		mode := info.Mode() & 0777
		return copyFile(filepath.Clean(path), outPath, mode)
	})
}
// copyFile copies the contents of the file at src to dst.
//
// dst is created with the given mode if it does not exist and truncated if it
// does. The returned error is the first failure among opening either file,
// copying the data, or closing dst; the close error is reported because a
// failed close on a written file can mean the data never reached storage.
func copyFile(src, dst string, mode os.FileMode) (err error) {
	// #nosec G304 -- src is validated path for job files
	srcFile, err := os.Open(src)
	if err != nil {
		return err
	}
	// Close errors on a read-only handle carry no data-loss risk.
	defer srcFile.Close()

	// #nosec G304 -- dst is validated output path for job files
	dstFile, err := os.OpenFile(dst, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, mode)
	if err != nil {
		return err
	}
	defer func() {
		// Surface the close error unless an earlier error is already being returned.
		if closeErr := dstFile.Close(); closeErr != nil && err == nil {
			err = closeErr
		}
	}()

	_, err = io.Copy(dstFile, srcFile)
	return err
}