feat: Worker sandboxing and security configuration

Add security hardening features for worker execution:
- Worker config with sandboxing options (network_mode, read_only, secrets)
- Execution setup with security context propagation
- Podman container runtime security enhancements
- Security configuration management in config package
- Add homelab-sandbox.yaml example configuration

Supports running jobs in isolated, restricted environments.
2026-02-18 21:27:59 -05:00 · 2026-02-18 21:27:59 -05:00 · 4756348c48
commit 4756348c48
parent cb826b74a3
5 changed files with 177 additions and 0 deletions
--- a/configs/worker/homelab-sandbox.yaml
+++ b/configs/worker/homelab-sandbox.yaml
@ -0,0 +1,53 @@
+# Worker configuration with sandboxing enabled
+# This configuration provides strict isolation for sensitive workloads
+
+host: "127.0.0.1"
+user: "worker"
+base_path: "/var/lib/fetchml/experiments"
+max_workers: 4
+
+# Sandboxing configuration
+sandbox:
+  # Network isolation: "none" (no network), "slirp4netns" (user-mode networking),
+  # "bridge" (bridge networking), or "" (default)
+  network_mode: "none"
+
+  # Mount root filesystem as read-only
+  read_only_root: true
+
+  # Enable secret injection for API keys
+  allow_secrets: true
+
+  # Allowed secrets (explicit allowlist for security)
+  allowed_secrets:
+    - HF_TOKEN          # Hugging Face API token
+    - WANDB_API_KEY     # Weights & Biases API key
+    - OPENAI_API_KEY    # OpenAI API key
+    - ANTHROPIC_API_KEY # Anthropic API key
+    - AWS_ACCESS_KEY_ID # AWS credentials
+    - AWS_SECRET_ACCESS_KEY
+
+  # Seccomp profile for syscall filtering
+  seccomp_profile: "ml-research.json"
+
+  # Maximum runtime before automatic termination (hours)
+  max_runtime_hours: 48
+
+# Resource limits
+resources:
+  max_memory_gb: 64
+  max_cpu_cores: 16
+  max_gpu_devices: 4
+
+# Podman configuration
+podman_image: "fetchml-ml:latest"
+gpu_vendor: "nvidia"
+
+# Queue backend
+queue:
+  backend: "redis"
+  redis_url: "redis://localhost:6379/0"
+
+# Snapshot store (optional)
+snapshot_store:
+  enabled: false
--- a/internal/config/security.go
+++ b/internal/config/security.go
@ -30,6 +30,26 @@ type AuditLoggingConfig struct {
 	LogPath string `yaml:"log_path"`
 }

+// PrivacyConfig holds privacy enforcement configuration.
+type PrivacyConfig struct {
+	// Enabled switches privacy validation on; while it is false Validate
+	// accepts the configuration without inspecting the other fields.
+	Enabled bool `yaml:"enabled"`
+	// DefaultLevel is one of "private", "team", "public", or "anonymized".
+	// Validate also accepts an empty value.
+	DefaultLevel string `yaml:"default_level"`
+	// EnforceTeams is loaded from the enforce_teams key. Validate does not
+	// inspect it.
+	EnforceTeams bool `yaml:"enforce_teams"`
+	// AuditAccess is loaded from the audit_access key. Validate does not
+	// inspect it.
+	AuditAccess bool `yaml:"audit_access"`
+}
+
+// Validate returns an error when privacy enforcement is enabled and
+// DefaultLevel is set to a value outside the supported levels. The comparison
+// is case-sensitive.
+func (p *PrivacyConfig) Validate() error {
+	if !p.Enabled {
+		return nil
+	}
+	switch p.DefaultLevel {
+	case "", "private", "team", "public", "anonymized":
+		return nil
+	default:
+		return fmt.Errorf("invalid default privacy level: %s", p.DefaultLevel)
+	}
+}
+
 // MonitoringConfig holds monitoring-related configuration
 type MonitoringConfig struct {
 	Prometheus   PrometheusConfig   `yaml:"prometheus"`
--- a/internal/container/podman.go
+++ b/internal/container/podman.go
@ -349,6 +349,16 @@ func BuildPodmanCommand(
 		"--cap-drop", "ALL",
 	}

+	// Add network mode if specified
+	if cfg.Network != "" {
+		args = append(args, "--network", cfg.Network)
+	}
+
+	// Add read-only root filesystem
+	if cfg.ReadOnlyMounts {
+		args = append(args, "--read-only")
+	}
+
 	if cfg.Memory != "" {
 		args = append(args, "--memory", cfg.Memory)
 	} else {
--- a/internal/worker/config.go
+++ b/internal/worker/config.go
@ -96,6 +96,9 @@ type Config struct {

 	// Plugins configuration
 	Plugins map[string]factory.PluginConfig `yaml:"plugins"`
+
+	// Sandboxing configuration
+	Sandbox SandboxConfig `yaml:"sandbox"`
 }

 // MetricsConfig controls the Prometheus exporter.
@ -125,6 +128,28 @@ type AppleGPUConfig struct {
 	MPSRuntime  string `yaml:"mps_runtime"`
 }

+// SandboxConfig holds container sandbox settings loaded from the worker
+// configuration's sandbox section.
+type SandboxConfig struct {
+	// NetworkMode selects container networking: "none", "slirp4netns",
+	// "bridge", or "" to leave the mode unspecified.
+	NetworkMode string `yaml:"network_mode"`
+	// ReadOnlyRoot requests a read-only root filesystem.
+	ReadOnlyRoot bool `yaml:"read_only_root"`
+	// AllowSecrets enables secret injection.
+	AllowSecrets bool `yaml:"allow_secrets"`
+	// AllowedSecrets is the explicit allowlist of secret names,
+	// e.g. ["HF_TOKEN", "WANDB_API_KEY"].
+	AllowedSecrets []string `yaml:"allowed_secrets"`
+	// SeccompProfile names the seccomp profile used for syscall filtering.
+	// Validate does not check it.
+	SeccompProfile string `yaml:"seccomp_profile"`
+	// MaxRuntimeHours is the maximum runtime in hours. Validate rejects
+	// negative values and accepts zero.
+	MaxRuntimeHours int `yaml:"max_runtime_hours"`
+}
+
+// Validate checks sandbox configuration. It returns an error when NetworkMode
+// is not one of the supported modes or when MaxRuntimeHours is negative.
+func (s *SandboxConfig) Validate() error {
+	switch s.NetworkMode {
+	case "", "none", "slirp4netns", "bridge":
+	default:
+		return fmt.Errorf("invalid network_mode: %s", s.NetworkMode)
+	}
+	// Zero is accepted, so the message says "not negative" rather than
+	// "positive".
+	if s.MaxRuntimeHours < 0 {
+		return fmt.Errorf("max_runtime_hours must not be negative, got %d", s.MaxRuntimeHours)
+	}
+	return nil
+}
+
 // LoadConfig loads worker configuration from a YAML file.
 func LoadConfig(path string) (*Config, error) {
 	data, err := fileutil.SecureFileRead(path)
--- a/internal/worker/execution/setup.go
+++ b/internal/worker/execution/setup.go
@ -2,6 +2,7 @@
 package execution

 import (
+	"context"
 	"fmt"
 	"io"
 	"os"
@ -11,9 +12,77 @@ import (
 	"github.com/jfraeys/fetch_ml/internal/config"
 	"github.com/jfraeys/fetch_ml/internal/container"
 	"github.com/jfraeys/fetch_ml/internal/errtypes"
+	"github.com/jfraeys/fetch_ml/internal/logging"
 	"github.com/jfraeys/fetch_ml/internal/storage"
 )

+// PrepareContainerEnv prepares environment variables and Podman secrets for
+// container execution.
+//
+// The returned map always contains FETCH_ML_JOB_NAME and FETCH_ML_TASK_ID.
+// When allowSecrets is true, each name in requestedSecrets that passes the
+// allowedSecrets allowlist and resolves to a value becomes a PodmanSecret
+// exposed under the same environment variable name. Requests that are not on
+// the allowlist, or whose value cannot be resolved, are logged and skipped, so
+// the returned error is currently always nil.
+//
+// ctx is not used by the current implementation.
+func PrepareContainerEnv(
+	ctx context.Context,
+	taskID string,
+	jobName string,
+	requestedSecrets []string,
+	allowedSecrets []string,
+	allowSecrets bool,
+	logger *logging.Logger,
+) ([]container.PodmanSecret, map[string]string, error) {
+	// Standard env vars present for every task.
+	env := map[string]string{
+		"FETCH_ML_JOB_NAME": jobName,
+		"FETCH_ML_TASK_ID":  taskID,
+	}
+	secrets := []container.PodmanSecret{}
+
+	if !allowSecrets {
+		return secrets, env, nil
+	}
+
+	// Secret names carry a short task prefix. Task IDs shorter than eight
+	// bytes are used whole; slicing them with [:8] would panic.
+	shortID := taskID
+	if len(shortID) > 8 {
+		shortID = shortID[:8]
+	}
+
+	for _, secretName := range requestedSecrets {
+		if !isSecretAllowed(secretName, allowedSecrets) {
+			logger.Warn("secret not in allowlist, skipping", "secret", secretName)
+			continue
+		}
+
+		value, err := getSecretValue(secretName)
+		if err != nil {
+			// Best effort: a missing secret is reported but does not fail the task.
+			logger.Warn("failed to get secret value", "secret", secretName, "error", err)
+			continue
+		}
+
+		secrets = append(secrets, container.PodmanSecret{
+			Name:   fmt.Sprintf("fetchml_%s_%s", strings.ToLower(secretName), shortID),
+			Data:   []byte(value),
+			EnvVar: secretName, // Mount as env var
+		})
+
+		// Only the secret's name is logged, never its value.
+		logger.Info("injected secret", "secret", secretName, "task", taskID)
+	}
+
+	return secrets, env, nil
+}
+
+// isSecretAllowed reports whether name appears in allowedList, comparing
+// case-insensitively. An empty or nil allowlist denies every name.
+func isSecretAllowed(name string, allowedList []string) bool {
+	// With no entries the loop body never runs, which yields the default deny.
+	for i := range allowedList {
+		if strings.EqualFold(allowedList[i], name) {
+			return true
+		}
+	}
+	return false
+}
+
+// getSecretValue resolves a secret by name from the process environment.
+// An unset or empty variable is reported as an error.
+func getSecretValue(name string) (string, error) {
+	// The environment is the only source consulted here.
+	if v := os.Getenv(name); v != "" {
+		return v, nil
+	}
+	return "", fmt.Errorf("secret %s not found in environment", name)
+}
+
 // JobPaths holds the directory paths for a job
 type JobPaths struct {
 	JobDir    string