feat: Worker sandboxing and security configuration

Add security hardening features for worker execution:
- Worker config with sandboxing options (network_mode, read_only, secrets)
- Execution setup with security context propagation
- Podman container runtime security enhancements
- Security configuration management in config package
- Add homelab-sandbox.yaml example configuration

Supports running jobs in isolated, restricted environments.
This commit is contained in:
Jeremie Fraeys 2026-02-18 21:27:59 -05:00
parent cb826b74a3
commit 4756348c48
No known key found for this signature in database
5 changed files with 177 additions and 0 deletions

View file

@ -0,0 +1,53 @@
# Worker configuration with sandboxing enabled
# This configuration provides strict isolation for sensitive workloads
host: "127.0.0.1"
user: "worker"
base_path: "/var/lib/fetchml/experiments"
max_workers: 4
# Sandboxing configuration
sandbox:
  # Network isolation: "none" (no network), "slirp4netns" (user-mode networking),
  # "bridge" (bridge networking), or "" (default)
  network_mode: "none"
  # Mount root filesystem as read-only
  read_only_root: true
  # Enable secret injection for API keys
  allow_secrets: true
  # Allowed secrets (explicit allowlist for security)
  allowed_secrets:
    - HF_TOKEN # Hugging Face API token
    - WANDB_API_KEY # Weights & Biases API key
    - OPENAI_API_KEY # OpenAI API key
    - ANTHROPIC_API_KEY # Anthropic API key
    - AWS_ACCESS_KEY_ID # AWS credentials
    - AWS_SECRET_ACCESS_KEY
  # Seccomp profile for syscall filtering
  seccomp_profile: "ml-research.json"
  # Maximum runtime before automatic termination (hours)
  max_runtime_hours: 48
# Resource limits
resources:
  max_memory_gb: 64
  max_cpu_cores: 16
  max_gpu_devices: 4
# Podman configuration
podman_image: "fetchml-ml:latest"
gpu_vendor: "nvidia"
# Queue backend
queue:
  backend: "redis"
  redis_url: "redis://localhost:6379/0"
# Snapshot store (optional)
snapshot_store:
  enabled: false

View file

@ -30,6 +30,26 @@ type AuditLoggingConfig struct {
LogPath string `yaml:"log_path"`
}
// PrivacyConfig holds privacy enforcement configuration.
type PrivacyConfig struct {
	Enabled bool `yaml:"enabled"`
	// DefaultLevel is one of: private, team, public, anonymized.
	DefaultLevel string `yaml:"default_level"`
	EnforceTeams bool   `yaml:"enforce_teams"`
	AuditAccess  bool   `yaml:"audit_access"`
}

// Validate checks the privacy configuration.
//
// A disabled configuration is always accepted. When enabled, DefaultLevel
// must be empty or one of the recognised levels; anything else yields an
// error naming the rejected value.
func (p *PrivacyConfig) Validate() error {
	if !p.Enabled {
		return nil
	}

	switch p.DefaultLevel {
	case "", "private", "team", "public", "anonymized":
		return nil
	default:
		return fmt.Errorf("invalid default privacy level: %s", p.DefaultLevel)
	}
}
// MonitoringConfig holds monitoring-related configuration
type MonitoringConfig struct {
Prometheus PrometheusConfig `yaml:"prometheus"`

View file

@ -349,6 +349,16 @@ func BuildPodmanCommand(
"--cap-drop", "ALL",
}
// Add network mode if specified
if cfg.Network != "" {
args = append(args, "--network", cfg.Network)
}
// Add read-only root filesystem
if cfg.ReadOnlyMounts {
args = append(args, "--read-only")
}
if cfg.Memory != "" {
args = append(args, "--memory", cfg.Memory)
} else {

View file

@ -96,6 +96,9 @@ type Config struct {
// Plugins configuration
Plugins map[string]factory.PluginConfig `yaml:"plugins"`
// Sandboxing configuration
Sandbox SandboxConfig `yaml:"sandbox"`
}
// MetricsConfig controls the Prometheus exporter.
@ -125,6 +128,28 @@ type AppleGPUConfig struct {
MPSRuntime string `yaml:"mps_runtime"`
}
// SandboxConfig holds container sandbox settings.
type SandboxConfig struct {
	// NetworkMode is "none", "slirp4netns", "bridge", or "" to leave the
	// network mode unspecified.
	NetworkMode  string `yaml:"network_mode"`
	ReadOnlyRoot bool   `yaml:"read_only_root"`
	AllowSecrets bool   `yaml:"allow_secrets"`
	// AllowedSecrets is the secret allowlist, e.g. ["HF_TOKEN", "WANDB_API_KEY"].
	AllowedSecrets  []string `yaml:"allowed_secrets"`
	SeccompProfile  string   `yaml:"seccomp_profile"`
	MaxRuntimeHours int      `yaml:"max_runtime_hours"`
}

// Validate checks the sandbox configuration.
//
// It returns an error when NetworkMode is not one of the recognised values
// or when MaxRuntimeHours is negative. Zero is accepted for MaxRuntimeHours,
// so the error message says "must not be negative" rather than "positive".
func (s *SandboxConfig) Validate() error {
	switch s.NetworkMode {
	case "", "none", "slirp4netns", "bridge":
		// Recognised network mode.
	default:
		return fmt.Errorf("invalid network_mode: %s", s.NetworkMode)
	}

	if s.MaxRuntimeHours < 0 {
		return fmt.Errorf("max_runtime_hours must not be negative, got %d", s.MaxRuntimeHours)
	}
	return nil
}
// LoadConfig loads worker configuration from a YAML file.
func LoadConfig(path string) (*Config, error) {
data, err := fileutil.SecureFileRead(path)

View file

@ -2,6 +2,7 @@
package execution
import (
"context"
"fmt"
"io"
"os"
@ -11,9 +12,77 @@ import (
"github.com/jfraeys/fetch_ml/internal/config"
"github.com/jfraeys/fetch_ml/internal/container"
"github.com/jfraeys/fetch_ml/internal/errtypes"
"github.com/jfraeys/fetch_ml/internal/logging"
"github.com/jfraeys/fetch_ml/internal/storage"
)
// PrepareContainerEnv prepares environment variables and secrets for
// container execution.
//
// The returned env map always carries FETCH_ML_JOB_NAME and FETCH_ML_TASK_ID.
// When allowSecrets is true, every name in requestedSecrets that passes
// isSecretAllowed against allowedSecrets and resolves through getSecretValue
// is returned as a container.PodmanSecret exposed under the same env var
// name. Names that are not allowlisted or cannot be resolved are logged as
// warnings and skipped instead of failing the call, so the returned error is
// currently always nil.
func PrepareContainerEnv(
	ctx context.Context,
	taskID string,
	jobName string,
	requestedSecrets []string,
	allowedSecrets []string,
	allowSecrets bool,
	logger *logging.Logger,
) ([]container.PodmanSecret, map[string]string, error) {
	// Standard env vars present on every run.
	env := map[string]string{
		"FETCH_ML_JOB_NAME": jobName,
		"FETCH_ML_TASK_ID":  taskID,
	}
	secrets := []container.PodmanSecret{}

	if !allowSecrets {
		return secrets, env, nil
	}

	// Secret names embed a short task ID prefix. A task ID shorter than eight
	// bytes is used whole; slicing it with [:8] would panic.
	idPrefix := taskID
	if len(idPrefix) > 8 {
		idPrefix = idPrefix[:8]
	}

	for _, secretName := range requestedSecrets {
		if !isSecretAllowed(secretName, allowedSecrets) {
			logger.Warn("secret not in allowlist, skipping", "secret", secretName)
			continue
		}

		value, err := getSecretValue(secretName)
		if err != nil {
			logger.Warn("failed to get secret value", "secret", secretName, "error", err)
			continue
		}

		secrets = append(secrets, container.PodmanSecret{
			Name:   fmt.Sprintf("fetchml_%s_%s", strings.ToLower(secretName), idPrefix),
			Data:   []byte(value),
			EnvVar: secretName, // Mount as env var
		})
		logger.Info("injected secret", "secret", secretName, "task", taskID)
	}

	return secrets, env, nil
}
// isSecretAllowed reports whether name appears in allowedList, comparing
// case-insensitively. An empty or nil allowedList matches nothing, so the
// default is to deny.
//
// NOTE(review): matching ignores case while environment variable lookups are
// usually case-sensitive — confirm that accepting case variants is intended.
func isSecretAllowed(name string, allowedList []string) bool {
	for i := range allowedList {
		if strings.EqualFold(allowedList[i], name) {
			return true
		}
	}
	// Reached for an empty list as well as for a list without a match.
	return false
}
// getSecretValue resolves a secret by reading the process environment
// variable of the same name. A variable that is unset or set to the empty
// string is reported as not found.
func getSecretValue(name string) (string, error) {
	fromEnv := os.Getenv(name)
	if fromEnv == "" {
		return "", fmt.Errorf("secret %s not found in environment", name)
	}
	return fromEnv, nil
}
// JobPaths holds the directory paths for a job
type JobPaths struct {
JobDir string