diff --git a/configs/worker/homelab-sandbox.yaml b/configs/worker/homelab-sandbox.yaml
new file mode 100644
index 0000000..8f071d4
--- /dev/null
+++ b/configs/worker/homelab-sandbox.yaml
@@ -0,0 +1,53 @@
+# Worker configuration with sandboxing enabled
+# This configuration provides strict isolation for sensitive workloads
+
+host: "127.0.0.1"
+user: "worker"
+base_path: "/var/lib/fetchml/experiments"
+max_workers: 4
+
+# Sandboxing configuration
+sandbox:
+  # Network isolation: "none" (no network), "slirp4netns" (user-mode networking),
+  # "bridge" (bridge networking), or "" (no --network flag is passed to Podman)
+  network_mode: "none"
+
+  # Mount root filesystem as read-only
+  read_only_root: true
+
+  # Enable secret injection for API keys
+  allow_secrets: true
+
+  # Allowed secrets (explicit allowlist for security)
+  allowed_secrets:
+    - HF_TOKEN # Hugging Face API token
+    - WANDB_API_KEY # Weights & Biases API key
+    - OPENAI_API_KEY # OpenAI API key
+    - ANTHROPIC_API_KEY # Anthropic API key
+    - AWS_ACCESS_KEY_ID # AWS credentials (access key ID)
+    - AWS_SECRET_ACCESS_KEY # AWS credentials (secret access key)
+
+  # Seccomp profile for syscall filtering
+  seccomp_profile: "ml-research.json"
+
+  # Maximum runtime before automatic termination (hours); must not be negative
+  max_runtime_hours: 48
+
+# Resource limits
+resources:
+  max_memory_gb: 64
+  max_cpu_cores: 16
+  max_gpu_devices: 4
+
+# Podman configuration
+podman_image: "fetchml-ml:latest"
+gpu_vendor: "nvidia"
+
+# Queue backend
+queue:
+  backend: "redis"
+  redis_url: "redis://localhost:6379/0"
+
+# Snapshot store (optional)
+snapshot_store:
+  enabled: false
diff --git a/internal/config/security.go b/internal/config/security.go
index 1adc50b..35f6177 100644
--- a/internal/config/security.go
+++ b/internal/config/security.go
@@ -30,6 +30,26 @@ type AuditLoggingConfig struct {
 	LogPath string `yaml:"log_path"`
 }
 
+// PrivacyConfig holds privacy enforcement configuration loaded from YAML.
+type PrivacyConfig struct {
+	Enabled      bool   `yaml:"enabled"`       // when false, Validate skips every check
+	DefaultLevel string `yaml:"default_level"` // private, team, public, anonymized; empty is also accepted
+	EnforceTeams bool   `yaml:"enforce_teams"`
+	AuditAccess  bool   `yaml:"audit_access"`
+}
+
+// Validate returns an error only when privacy is enabled and DefaultLevel is non-empty but not one of the known levels.
+func (p *PrivacyConfig) Validate() error {
+	if !p.Enabled {
+		return nil
+	}
+	validLevels := map[string]bool{"private": true, "team": true, "public": true, "anonymized": true}
+	if p.DefaultLevel != "" && !validLevels[p.DefaultLevel] {
+		return fmt.Errorf("invalid default privacy level: %s", p.DefaultLevel)
+	}
+	return nil
+}
+
 // MonitoringConfig holds monitoring-related configuration
 type MonitoringConfig struct {
 	Prometheus PrometheusConfig `yaml:"prometheus"`
diff --git a/internal/container/podman.go b/internal/container/podman.go
index 8715c52..15b988e 100644
--- a/internal/container/podman.go
+++ b/internal/container/podman.go
@@ -349,6 +349,16 @@ func BuildPodmanCommand(
 		"--cap-drop", "ALL",
 	}
 
+	// Pass --network only when a mode is configured; an empty value adds no flag.
+	if cfg.Network != "" {
+		args = append(args, "--network", cfg.Network)
+	}
+
+	// Pass --read-only (read-only container root filesystem) when cfg.ReadOnlyMounts is set.
+	if cfg.ReadOnlyMounts {
+		args = append(args, "--read-only")
+	}
+
 	if cfg.Memory != "" {
 		args = append(args, "--memory", cfg.Memory)
 	} else {
diff --git a/internal/worker/config.go b/internal/worker/config.go
index 9e564bc..1f018fc 100644
--- a/internal/worker/config.go
+++ b/internal/worker/config.go
@@ -96,6 +96,9 @@ type Config struct {
 
 	// Plugins configuration
 	Plugins map[string]factory.PluginConfig `yaml:"plugins"`
+
+	// Sandboxing configuration, read from the top-level "sandbox" YAML key
+	Sandbox SandboxConfig `yaml:"sandbox"`
 }
 
 // MetricsConfig controls the Prometheus exporter.
@@ -125,6 +128,32 @@ type AppleGPUConfig struct {
 	MPSRuntime string `yaml:"mps_runtime"`
 }
 
+// SandboxConfig holds container sandbox settings
+type SandboxConfig struct {
+	NetworkMode     string   `yaml:"network_mode"` // "none", "slirp4netns", "bridge", or ""
+	ReadOnlyRoot    bool     `yaml:"read_only_root"`
+	AllowSecrets    bool     `yaml:"allow_secrets"`
+	AllowedSecrets  []string `yaml:"allowed_secrets"` // e.g., ["HF_TOKEN", "WANDB_API_KEY"]
+	SeccompProfile  string   `yaml:"seccomp_profile"`
+	MaxRuntimeHours int      `yaml:"max_runtime_hours"` // must be >= 0; see Validate
+}
+
+// Validate checks sandbox configuration.
+//
+// It returns an error when NetworkMode is not one of "none", "slirp4netns",
+// "bridge" or the empty string, or when MaxRuntimeHours is negative. Zero is
+// accepted for MaxRuntimeHours.
+func (s *SandboxConfig) Validate() error {
+	validNetworks := map[string]bool{"none": true, "slirp4netns": true, "bridge": true, "": true}
+	if !validNetworks[s.NetworkMode] {
+		return fmt.Errorf("invalid network_mode: %s", s.NetworkMode)
+	}
+	if s.MaxRuntimeHours < 0 {
+		return fmt.Errorf("max_runtime_hours must not be negative, got %d", s.MaxRuntimeHours)
+	}
+	return nil
+}
+
 // LoadConfig loads worker configuration from a YAML file.
 func LoadConfig(path string) (*Config, error) {
 	data, err := fileutil.SecureFileRead(path)
diff --git a/internal/worker/execution/setup.go b/internal/worker/execution/setup.go
index 18cd794..79c1306 100644
--- a/internal/worker/execution/setup.go
+++ b/internal/worker/execution/setup.go
@@ -2,6 +2,7 @@
 package execution
 
 import (
+	"context"
 	"fmt"
 	"io"
 	"os"
@@ -11,9 +12,95 @@ import (
 	"github.com/jfraeys/fetch_ml/internal/config"
 	"github.com/jfraeys/fetch_ml/internal/container"
 	"github.com/jfraeys/fetch_ml/internal/errtypes"
+	"github.com/jfraeys/fetch_ml/internal/logging"
 	"github.com/jfraeys/fetch_ml/internal/storage"
 )
 
+// PrepareContainerEnv prepares environment and secrets for container execution.
+//
+// The returned env map always carries FETCH_ML_JOB_NAME and FETCH_ML_TASK_ID.
+// When allowSecrets is true, every entry of requestedSecrets that matches the
+// allowedSecrets allowlist and resolves to a value is returned as a
+// container.PodmanSecret. Entries that are not allowlisted or cannot be
+// resolved are logged and skipped instead of failing the call, so the returned
+// error is currently always nil. ctx is accepted but not used yet.
+func PrepareContainerEnv(
+	ctx context.Context,
+	taskID string,
+	jobName string,
+	requestedSecrets []string,
+	allowedSecrets []string,
+	allowSecrets bool,
+	logger *logging.Logger,
+) ([]container.PodmanSecret, map[string]string, error) {
+	env := make(map[string]string)
+	secrets := []container.PodmanSecret{}
+
+	// Add standard env vars
+	env["FETCH_ML_JOB_NAME"] = jobName
+	env["FETCH_ML_TASK_ID"] = taskID
+
+	// Secret names carry at most the first 8 bytes of the task ID. Clamp the
+	// slice bound so a task ID shorter than 8 bytes cannot cause a panic.
+	shortID := taskID
+	if len(shortID) > 8 {
+		shortID = shortID[:8]
+	}
+
+	// Inject requested secrets if allowed
+	if allowSecrets && len(requestedSecrets) > 0 {
+		for _, secretName := range requestedSecrets {
+			if !isSecretAllowed(secretName, allowedSecrets) {
+				logger.Warn("secret not in allowlist, skipping", "secret", secretName)
+				continue
+			}
+
+			value, err := getSecretValue(secretName)
+			if err != nil {
+				logger.Warn("failed to get secret value", "secret", secretName, "error", err)
+				continue
+			}
+
+			secret := container.PodmanSecret{
+				Name:   fmt.Sprintf("fetchml_%s_%s", strings.ToLower(secretName), shortID),
+				Data:   []byte(value),
+				EnvVar: secretName, // Mount as env var
+			}
+			secrets = append(secrets, secret)
+
+			logger.Info("injected secret", "secret", secretName, "task", taskID)
+		}
+	}
+
+	return secrets, env, nil
+}
+
+// isSecretAllowed reports whether name matches an entry of allowedList.
+// An empty allowlist denies everything; matching ignores case (NOTE(review): env var names are case-sensitive on Unix — confirm this is intended).
+func isSecretAllowed(name string, allowedList []string) bool {
+	if len(allowedList) == 0 {
+		return false // Default deny
+	}
+	for _, allowed := range allowedList {
+		if strings.EqualFold(name, allowed) {
+			return true
+		}
+	}
+	return false
+}
+
+// getSecretValue returns the value of the environment variable called name.
+// An unset or empty variable is reported as an error.
+func getSecretValue(name string) (string, error) {
+	// The process environment is the only source consulted.
+	value := os.Getenv(name)
+	if value != "" {
+		return value, nil
+	}
+
+	return "", fmt.Errorf("secret %s not found in environment", name)
+}
+
 // JobPaths holds the directory paths for a job
 type JobPaths struct {
 	JobDir string