feat: Worker sandboxing and security configuration
Add security hardening features for worker execution:
- Worker config with sandboxing options (network_mode, read_only_root, allow_secrets/allowed_secrets)
- Execution setup with security context propagation
- Podman container runtime security enhancements
- Security configuration management in the config package
- New homelab-sandbox.yaml example configuration

Supports running jobs in isolated, restricted environments.
This commit is contained in:
parent
cb826b74a3
commit
4756348c48
5 changed files with 177 additions and 0 deletions
53
configs/worker/homelab-sandbox.yaml
Normal file
53
configs/worker/homelab-sandbox.yaml
Normal file
|
|
@ -0,0 +1,53 @@
|
|||
# Worker configuration with sandboxing enabled
|
||||
# This configuration provides strict isolation for sensitive workloads
|
||||
|
||||
host: "127.0.0.1"
|
||||
user: "worker"
|
||||
base_path: "/var/lib/fetchml/experiments"
|
||||
max_workers: 4
|
||||
|
||||
# Sandboxing configuration
|
||||
sandbox:
|
||||
# Network isolation: "none" (no network), "slirp4netns" (user-mode networking),
|
||||
# "bridge" (bridge networking), or "" (default)
|
||||
network_mode: "none"
|
||||
|
||||
# Mount root filesystem as read-only
|
||||
read_only_root: true
|
||||
|
||||
# Enable secret injection for API keys
|
||||
allow_secrets: true
|
||||
|
||||
# Allowed secrets (explicit allowlist, matched case-insensitively; an empty list denies every secret request)
|
||||
allowed_secrets:
|
||||
- HF_TOKEN # Hugging Face API token
|
||||
- WANDB_API_KEY # Weights & Biases API key
|
||||
- OPENAI_API_KEY # OpenAI API key
|
||||
- ANTHROPIC_API_KEY # Anthropic API key
|
||||
- AWS_ACCESS_KEY_ID # AWS credentials
|
||||
- AWS_SECRET_ACCESS_KEY
|
||||
|
||||
# Seccomp profile for syscall filtering
|
||||
seccomp_profile: "ml-research.json"
|
||||
|
||||
# Maximum runtime before automatic termination (hours)
|
||||
max_runtime_hours: 48
|
||||
|
||||
# Resource limits
|
||||
resources:
|
||||
max_memory_gb: 64
|
||||
max_cpu_cores: 16
|
||||
max_gpu_devices: 4
|
||||
|
||||
# Podman configuration
|
||||
podman_image: "fetchml-ml:latest"
|
||||
gpu_vendor: "nvidia"
|
||||
|
||||
# Queue backend
|
||||
queue:
|
||||
backend: "redis"
|
||||
redis_url: "redis://localhost:6379/0"
|
||||
|
||||
# Snapshot store (optional)
|
||||
snapshot_store:
|
||||
enabled: false
|
||||
|
|
@ -30,6 +30,26 @@ type AuditLoggingConfig struct {
|
|||
LogPath string `yaml:"log_path"`
|
||||
}
|
||||
|
||||
// PrivacyConfig holds privacy enforcement configuration.
type PrivacyConfig struct {
	// Enabled gates validation: Validate accepts any settings when it is false.
	Enabled bool `yaml:"enabled"`
	// DefaultLevel is one of: private, team, public, anonymized.
	// An empty value is also accepted by Validate.
	DefaultLevel string `yaml:"default_level"`
	// EnforceTeams and AuditAccess are not consulted by Validate.
	EnforceTeams bool `yaml:"enforce_teams"`
	AuditAccess  bool `yaml:"audit_access"`
}

// Validate checks the privacy configuration.
//
// It returns nil when privacy enforcement is disabled, or when DefaultLevel is
// empty or one of the recognized levels. Any other DefaultLevel yields an
// error naming the rejected value.
func (p *PrivacyConfig) Validate() error {
	// A disabled section is never rejected, whatever its other fields hold.
	if !p.Enabled {
		return nil
	}

	switch p.DefaultLevel {
	case "", "private", "team", "public", "anonymized":
		return nil
	default:
		return fmt.Errorf("invalid default privacy level: %s", p.DefaultLevel)
	}
}
|
||||
|
||||
// MonitoringConfig holds monitoring-related configuration
|
||||
type MonitoringConfig struct {
|
||||
Prometheus PrometheusConfig `yaml:"prometheus"`
|
||||
|
|
|
|||
|
|
@ -349,6 +349,16 @@ func BuildPodmanCommand(
|
|||
"--cap-drop", "ALL",
|
||||
}
|
||||
|
||||
// Add network mode if specified
|
||||
if cfg.Network != "" {
|
||||
args = append(args, "--network", cfg.Network)
|
||||
}
|
||||
|
||||
// Add read-only root filesystem
|
||||
if cfg.ReadOnlyMounts {
|
||||
args = append(args, "--read-only")
|
||||
}
|
||||
|
||||
if cfg.Memory != "" {
|
||||
args = append(args, "--memory", cfg.Memory)
|
||||
} else {
|
||||
|
|
|
|||
|
|
@ -96,6 +96,9 @@ type Config struct {
|
|||
|
||||
// Plugins configuration
|
||||
Plugins map[string]factory.PluginConfig `yaml:"plugins"`
|
||||
|
||||
// Sandboxing configuration
|
||||
Sandbox SandboxConfig `yaml:"sandbox"`
|
||||
}
|
||||
|
||||
// MetricsConfig controls the Prometheus exporter.
|
||||
|
|
@ -125,6 +128,28 @@ type AppleGPUConfig struct {
|
|||
MPSRuntime string `yaml:"mps_runtime"`
|
||||
}
|
||||
|
||||
// SandboxConfig holds container sandbox settings.
type SandboxConfig struct {
	NetworkMode     string   `yaml:"network_mode"` // "none", "slirp4netns", "bridge", or "" for the default
	ReadOnlyRoot    bool     `yaml:"read_only_root"`
	AllowSecrets    bool     `yaml:"allow_secrets"`
	AllowedSecrets  []string `yaml:"allowed_secrets"` // e.g., ["HF_TOKEN", "WANDB_API_KEY"]
	SeccompProfile  string   `yaml:"seccomp_profile"`
	MaxRuntimeHours int      `yaml:"max_runtime_hours"` // zero passes validation; negative values are rejected
}

// Validate checks the sandbox configuration.
//
// NetworkMode must be "none", "slirp4netns", "bridge", or empty.
// MaxRuntimeHours must not be negative; zero is accepted so that a config
// which omits the field (and therefore carries the zero value) stays valid.
// The remaining fields are not validated here.
func (s *SandboxConfig) Validate() error {
	switch s.NetworkMode {
	case "", "none", "slirp4netns", "bridge":
		// Recognized network mode.
	default:
		return fmt.Errorf("invalid network_mode: %s", s.NetworkMode)
	}

	// Only negative values are rejected, so the message says "non-negative"
	// rather than "positive" to match the check that is actually performed.
	if s.MaxRuntimeHours < 0 {
		return fmt.Errorf("max_runtime_hours must be non-negative, got %d", s.MaxRuntimeHours)
	}

	return nil
}
|
||||
|
||||
// LoadConfig loads worker configuration from a YAML file.
|
||||
func LoadConfig(path string) (*Config, error) {
|
||||
data, err := fileutil.SecureFileRead(path)
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@
|
|||
package execution
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
|
|
@ -11,9 +12,77 @@ import (
|
|||
"github.com/jfraeys/fetch_ml/internal/config"
|
||||
"github.com/jfraeys/fetch_ml/internal/container"
|
||||
"github.com/jfraeys/fetch_ml/internal/errtypes"
|
||||
"github.com/jfraeys/fetch_ml/internal/logging"
|
||||
"github.com/jfraeys/fetch_ml/internal/storage"
|
||||
)
|
||||
|
||||
// PrepareContainerEnv prepares environment and secrets for container execution
|
||||
func PrepareContainerEnv(
|
||||
ctx context.Context,
|
||||
taskID string,
|
||||
jobName string,
|
||||
requestedSecrets []string,
|
||||
allowedSecrets []string,
|
||||
allowSecrets bool,
|
||||
logger *logging.Logger,
|
||||
) ([]container.PodmanSecret, map[string]string, error) {
|
||||
env := make(map[string]string)
|
||||
secrets := []container.PodmanSecret{}
|
||||
|
||||
// Add standard env vars
|
||||
env["FETCH_ML_JOB_NAME"] = jobName
|
||||
env["FETCH_ML_TASK_ID"] = taskID
|
||||
|
||||
// Inject requested secrets if allowed
|
||||
if allowSecrets && len(requestedSecrets) > 0 {
|
||||
for _, secretName := range requestedSecrets {
|
||||
if !isSecretAllowed(secretName, allowedSecrets) {
|
||||
logger.Warn("secret not in allowlist, skipping", "secret", secretName)
|
||||
continue
|
||||
}
|
||||
|
||||
value, err := getSecretValue(secretName)
|
||||
if err != nil {
|
||||
logger.Warn("failed to get secret value", "secret", secretName, "error", err)
|
||||
continue
|
||||
}
|
||||
|
||||
secret := container.PodmanSecret{
|
||||
Name: fmt.Sprintf("fetchml_%s_%s", strings.ToLower(secretName), taskID[:8]),
|
||||
Data: []byte(value),
|
||||
EnvVar: secretName, // Mount as env var
|
||||
}
|
||||
secrets = append(secrets, secret)
|
||||
|
||||
logger.Info("injected secret", "secret", secretName, "task", taskID)
|
||||
}
|
||||
}
|
||||
|
||||
return secrets, env, nil
|
||||
}
|
||||
|
||||
// isSecretAllowed reports whether name appears in allowedList, comparing
// names case-insensitively. An empty or nil allowlist permits nothing.
func isSecretAllowed(name string, allowedList []string) bool {
	for i := range allowedList {
		if strings.EqualFold(allowedList[i], name) {
			return true
		}
	}
	// Default deny: reached for unlisted names and for an empty allowlist alike.
	return false
}
|
||||
|
||||
// getSecretValue resolves a secret by name from the process environment.
// A variable that is unset or set to the empty string is reported as not
// found.
func getSecretValue(name string) (string, error) {
	// The environment is the only source consulted here.
	secret := os.Getenv(name)
	if secret == "" {
		return "", fmt.Errorf("secret %s not found in environment", name)
	}
	return secret, nil
}
|
||||
|
||||
// JobPaths holds the directory paths for a job
|
||||
type JobPaths struct {
|
||||
JobDir string
|
||||
|
|
|
|||
Loading…
Reference in a new issue