// Package container provides Podman container management utilities. package container import ( "context" "fmt" "os" "os/exec" "path/filepath" "strconv" "strings" "github.com/jfraeys/fetch_ml/internal/config" "github.com/jfraeys/fetch_ml/internal/logging" ) // PodmanManager manages Podman containers type PodmanManager struct { logger *logging.Logger } // NewPodmanManager creates a new Podman manager func NewPodmanManager(logger *logging.Logger) (*PodmanManager, error) { return &PodmanManager{ logger: logger, }, nil } // ContainerConfig holds configuration for starting a container type ContainerConfig struct { Name string `json:"name"` Image string `json:"image"` Command []string `json:"command"` Env map[string]string `json:"env"` Secrets []PodmanSecret `json:"secrets"` // Sensitive data as Podman secrets Volumes map[string]string `json:"volumes"` Ports map[int]int `json:"ports"` SecurityOpts []string `json:"security_opts"` Resources ResourceConfig `json:"resources"` Network NetworkConfig `json:"network"` } // ResourceConfig defines resource limits for containers type ResourceConfig struct { MemoryLimit string `json:"memory_limit"` CPULimit string `json:"cpu_limit"` GPUDevices []string `json:"gpu_devices"` AppleGPU bool `json:"apple_gpu"` } // NetworkConfig defines network settings for containers type NetworkConfig struct { AllowNetwork bool `json:"allow_network"` } func podmanCgroupsMode() string { return strings.TrimSpace(os.Getenv("FETCHML_PODMAN_CGROUPS")) } func BuildRunArgs(config *ContainerConfig) []string { args := []string{"run", "-d"} if podmanCgroupsMode() == "disabled" { args = append(args, "--cgroups=disabled") } // Add name if config.Name != "" { args = append(args, "--name", config.Name) } // Add security options for _, opt := range config.SecurityOpts { args = append(args, "--security-opt", opt) } // Add resource limits if config.Resources.MemoryLimit != "" { args = append(args, "--memory", config.Resources.MemoryLimit) } if config.Resources.CPULimit != "" { args = append(args, "--cpus", config.Resources.CPULimit) } if config.Resources.AppleGPU { args = append(args, "--device", "/dev/metal") args = append(args, "--device", "/dev/mps") } for _, device := range config.Resources.GPUDevices { args = append(args, "--device", device) } // Add volumes for hostPath, containerPath := range config.Volumes { mount := fmt.Sprintf("%s:%s", hostPath, containerPath) args = append(args, "-v", mount) } // Add ports for hostPort, containerPort := range config.Ports { portMapping := fmt.Sprintf("%d:%d", hostPort, containerPort) args = append(args, "-p", portMapping) } // Add environment variables for key, value := range config.Env { args = append(args, "-e", fmt.Sprintf("%s=%s", key, value)) } // Add image and command args = append(args, config.Image) args = append(args, config.Command...) return args } func ParseContainerID(output string) (string, error) { out := strings.TrimSpace(output) if out == "" { return "", fmt.Errorf("no container ID returned") } lines := strings.Split(out, "\n") for i := len(lines) - 1; i >= 0; i-- { line := strings.TrimSpace(lines[i]) if line == "" { continue } return line, nil } return "", fmt.Errorf("no container ID returned") } // StartContainer starts a new container with secret support func (pm *PodmanManager) StartContainer( ctx context.Context, config *ContainerConfig, ) (string, error) { // Create Podman secrets for sensitive data for _, secret := range config.Secrets { if err := pm.CreateSecret(ctx, secret.Name, secret.Data); err != nil { // Clean up any secrets we already created for _, s := range config.Secrets { if s.Name == secret.Name { break } _ = pm.DeleteSecret(ctx, s.Name) } return "", fmt.Errorf("failed to create secret %s: %w", secret.Name, err) } } // Build run args including secrets args := BuildRunArgs(config) // Add secret mount arguments secretArgs := BuildSecretArgs(config.Secrets) // Insert secrets after "run -d" and before other args if len(secretArgs) > 0 { // args[0] = "run", args[1] = "-d" newArgs := append([]string{args[0], args[1]}, secretArgs...) newArgs = append(newArgs, args[2:]...) args = newArgs } // Execute command cmd := exec.CommandContext(ctx, "podman", args...) output, err := cmd.CombinedOutput() if err != nil { // Clean up secrets on failure for _, secret := range config.Secrets { _ = pm.DeleteSecret(ctx, secret.Name) } return "", fmt.Errorf("failed to start container: %w, output: %s", err, string(output)) } containerID, err := ParseContainerID(string(output)) if err != nil { // Clean up secrets on failure for _, secret := range config.Secrets { _ = pm.DeleteSecret(ctx, secret.Name) } return "", err } pm.logger.Info("container started", "container_id", containerID, "name", config.Name, "secrets", len(config.Secrets)) return containerID, nil } // StopContainer stops a container func (pm *PodmanManager) StopContainer(ctx context.Context, containerID string) error { cmd := exec.CommandContext(ctx, "podman", "stop", containerID) output, err := cmd.CombinedOutput() if err != nil { return fmt.Errorf("failed to stop container: %w, output: %s", err, string(output)) } pm.logger.Info("container stopped", "container_id", containerID) return nil } // GetContainerStateStatus returns the container's lifecycle state from `podman inspect`. // Typical values: running, exited, created, paused. func (pm *PodmanManager) GetContainerStateStatus( ctx context.Context, containerID string, ) (string, error) { // Validate containerID to prevent injection if containerID == "" || strings.ContainsAny(containerID, "&;|<>$`\"'") { return "", fmt.Errorf("invalid container ID: %s", containerID) } cmd := exec.CommandContext(ctx, "podman", "inspect", "--format", "{{.State.Status}}", containerID) //nolint:gosec output, err := cmd.CombinedOutput() if err != nil { return "", fmt.Errorf("failed to inspect container: %w, output: %s", err, string(output)) } status := strings.TrimSpace(string(output)) if status == "" { return "unknown", nil } return status, nil } // RemoveContainer removes a container func (pm *PodmanManager) RemoveContainer(ctx context.Context, containerID string) error { cmd := exec.CommandContext(ctx, "podman", "rm", containerID) output, err := cmd.CombinedOutput() if err != nil { return fmt.Errorf("failed to remove container: %w, output: %s", err, string(output)) } pm.logger.Info("container removed", "container_id", containerID) return nil } // GetContainerStatus gets the status of a container func (pm *PodmanManager) GetContainerStatus( ctx context.Context, containerID string, ) (string, error) { // Validate containerID to prevent injection if containerID == "" || strings.ContainsAny(containerID, "&;|<>$`\"'") { return "", fmt.Errorf("invalid container ID: %s", containerID) } cmd := exec.CommandContext(ctx, "podman", "ps", "--filter", "id="+containerID, "--format", "{{.Status}}") //nolint:gosec output, err := cmd.CombinedOutput() if err != nil { return "", fmt.Errorf("failed to get container status: %w, output: %s", err, string(output)) } status := strings.TrimSpace(string(output)) if status == "" { // Container might be stopped, check all containers cmd = exec.CommandContext( ctx, "podman", "ps", "-a", "--filter", "id="+containerID, "--format", "{{.Status}}", ) //nolint:gosec output, err = cmd.CombinedOutput() if err != nil { return "", fmt.Errorf("failed to get container status: %w, output: %s", err, string(output)) } status = strings.TrimSpace(string(output)) if status == "" { return "unknown", nil } } return status, nil } // ExecContainer executes a command inside a running container and returns the output func (pm *PodmanManager) ExecContainer(ctx context.Context, containerID string, command []string) (string, error) { // Validate containerID to prevent injection if containerID == "" || strings.ContainsAny(containerID, "&;|<>$`\"'") { return "", fmt.Errorf("invalid container ID: %s", containerID) } // Validate command to prevent injection for _, arg := range command { if strings.ContainsAny(arg, "&;|<>$`\"'") { return "", fmt.Errorf("invalid command argument: %s", arg) } } // Build podman exec command args := []string{"exec", containerID} args = append(args, command...) cmd := exec.CommandContext(ctx, "podman", args...) //nolint:gosec output, err := cmd.CombinedOutput() if err != nil { return "", fmt.Errorf("failed to execute command in container: %w, output: %s", err, string(output)) } return string(output), nil } // PodmanConfig holds configuration for Podman container execution type PodmanConfig struct { Image string Workspace string Results string ContainerWorkspace string ContainerResults string AppleGPU bool GPUDevices []string Env map[string]string Volumes map[string]string Memory string CPUs string Privileged bool // Security: must be false Network string // Security: must not be "host" ReadOnlyMounts bool // Security: true for dataset mounts } // PodmanResourceOverrides converts per-task resource requests into Podman-compatible // `--cpus` and `--memory` flag values. // // cpu and memoryGB are treated as optional; values <= 0 return empty overrides. func PodmanResourceOverrides(cpu int, memoryGB int) (cpus string, memory string) { if cpu > 0 { cpus = strconv.Itoa(cpu) } if memoryGB > 0 { memory = fmt.Sprintf("%dg", memoryGB) } return cpus, memory } // PodmanSecurityConfig holds security configuration for Podman containers type PodmanSecurityConfig struct { NoNewPrivileges bool DropAllCaps bool AllowedCaps []string UserNS bool RunAsUID int RunAsGID int SeccompProfile string ReadOnlyRoot bool NetworkMode string } // BuildSecurityArgs builds security-related podman arguments from PodmanSecurityConfig func BuildSecurityArgs(sandbox PodmanSecurityConfig) []string { args := []string{} // No new privileges if sandbox.NoNewPrivileges { args = append(args, "--security-opt", "no-new-privileges:true") } // Capability dropping if sandbox.DropAllCaps { args = append(args, "--cap-drop=all") for _, cap := range sandbox.AllowedCaps { if cap != "" { args = append(args, "--cap-add="+cap) } } } // User namespace mapping if sandbox.UserNS && sandbox.RunAsUID > 0 && sandbox.RunAsGID > 0 { // Map container root to specified UID/GID on host args = append(args, "--userns", "keep-id") args = append(args, "--user", fmt.Sprintf("%d:%d", sandbox.RunAsUID, sandbox.RunAsGID)) } // Seccomp profile if sandbox.SeccompProfile != "" && sandbox.SeccompProfile != "unconfined" { profilePath := GetSeccompProfilePath(sandbox.SeccompProfile) if profilePath != "" { args = append(args, "--security-opt", fmt.Sprintf("seccomp=%s", profilePath)) } } // Read-only root filesystem if sandbox.ReadOnlyRoot { args = append(args, "--read-only") } // Network mode (default: none) networkMode := sandbox.NetworkMode if networkMode == "" { networkMode = "none" } args = append(args, "--network", networkMode) return args } // GetSeccompProfilePath returns the filesystem path for a named seccomp profile func GetSeccompProfilePath(profileName string) string { // Check standard locations searchPaths := []string{ filepath.Join("configs", "seccomp", profileName+".json"), filepath.Join("/etc", "fetchml", "seccomp", profileName+".json"), filepath.Join("/usr", "share", "fetchml", "seccomp", profileName+".json"), } for _, path := range searchPaths { if _, err := os.Stat(path); err == nil { return path } } // If profileName is already a path, return it if filepath.IsAbs(profileName) { return profileName } return "" } // BuildPodmanCommand builds a Podman command for executing ML experiments with security options func BuildPodmanCommand( ctx context.Context, cfg PodmanConfig, sandbox PodmanSecurityConfig, scriptPath, depsPath string, extraArgs []string, ) *exec.Cmd { args := []string{"run", "--rm"} // Add security options from sandbox config securityArgs := BuildSecurityArgs(sandbox) args = append(args, securityArgs...) // Resource limits if cfg.Memory != "" { args = append(args, "--memory", cfg.Memory) } else { args = append(args, "--memory", config.DefaultPodmanMemory) } if cfg.CPUs != "" { args = append(args, "--cpus", cfg.CPUs) } else { args = append(args, "--cpus", config.DefaultPodmanCPUs) } // Mount workspace workspaceMount := fmt.Sprintf("%s:%s:rw", cfg.Workspace, cfg.ContainerWorkspace) args = append(args, "-v", workspaceMount) // Mount results resultsMount := fmt.Sprintf("%s:%s:rw", cfg.Results, cfg.ContainerResults) args = append(args, "-v", resultsMount) // Mount additional volumes for hostPath, containerPath := range cfg.Volumes { mount := fmt.Sprintf("%s:%s", hostPath, containerPath) args = append(args, "-v", mount) } // Use injected GPU device paths for Apple GPU or custom configurations for _, device := range cfg.GPUDevices { args = append(args, "--device", device) } // Add environment variables for key, value := range cfg.Env { args = append(args, "-e", fmt.Sprintf("%s=%s", key, value)) } // Image and command args = append(args, cfg.Image, "--workspace", cfg.ContainerWorkspace, "--deps", depsPath, "--script", scriptPath, ) // Add extra arguments via --args flag if len(extraArgs) > 0 { args = append(args, "--args") args = append(args, extraArgs...) } return exec.CommandContext(ctx, "podman", args...) } // BuildPodmanCommandLegacy builds a Podman command using legacy security settings // Deprecated: Use BuildPodmanCommand with SandboxConfig instead func BuildPodmanCommandLegacy( ctx context.Context, cfg PodmanConfig, scriptPath, depsPath string, extraArgs []string, ) *exec.Cmd { args := []string{ "run", "--rm", "--security-opt", "no-new-privileges", "--cap-drop", "ALL", } // Add network mode if specified if cfg.Network != "" { args = append(args, "--network", cfg.Network) } // Add read-only root filesystem if cfg.ReadOnlyMounts { args = append(args, "--read-only") } if cfg.Memory != "" { args = append(args, "--memory", cfg.Memory) } else { args = append(args, "--memory", config.DefaultPodmanMemory) } if cfg.CPUs != "" { args = append(args, "--cpus", cfg.CPUs) } else { args = append(args, "--cpus", config.DefaultPodmanCPUs) } args = append(args, "--userns", "keep-id") // Mount workspace workspaceMount := fmt.Sprintf("%s:%s:rw", cfg.Workspace, cfg.ContainerWorkspace) args = append(args, "-v", workspaceMount) // Mount results resultsMount := fmt.Sprintf("%s:%s:rw", cfg.Results, cfg.ContainerResults) args = append(args, "-v", resultsMount) // Mount additional volumes for hostPath, containerPath := range cfg.Volumes { mount := fmt.Sprintf("%s:%s", hostPath, containerPath) args = append(args, "-v", mount) } // Use injected GPU device paths for Apple GPU or custom configurations for _, device := range cfg.GPUDevices { args = append(args, "--device", device) } // Add environment variables for key, value := range cfg.Env { args = append(args, "-e", fmt.Sprintf("%s=%s", key, value)) } // Image and command args = append(args, cfg.Image, "--workspace", cfg.ContainerWorkspace, "--deps", depsPath, "--script", scriptPath, ) // Add extra arguments via --args flag if len(extraArgs) > 0 { args = append(args, "--args") args = append(args, extraArgs...) } return exec.CommandContext(ctx, "podman", args...) } // ValidateSecurityPolicy validates that the container configuration meets security requirements. // Returns an error if the configuration violates security policies. func ValidateSecurityPolicy(cfg PodmanConfig) error { if cfg.Privileged { return fmt.Errorf("privileged containers are not allowed: %w", ErrSecurityViolation) } if cfg.Network == "host" { return fmt.Errorf("host network mode is not allowed: %w", ErrSecurityViolation) } // Validate volume mounts are read-only where required if !cfg.ReadOnlyMounts { // This is a warning-level issue, not a hard error // but we document it for audit purposes } return nil } // PodmanSecret represents a secret to be mounted in a container type PodmanSecret struct { Name string // Secret name in Podman Data []byte // Secret data (will be base64 encoded) Target string // Mount path inside container EnvVar string // Environment variable name (optional, if set mounts as env var instead of file) } // CreateSecret creates a Podman secret from the given data func (pm *PodmanManager) CreateSecret(ctx context.Context, name string, data []byte) error { // Create secret via podman command // podman secret create name - << data cmd := exec.CommandContext(ctx, "podman", "secret", "create", name, "-") cmd.Stdin = strings.NewReader(string(data)) output, err := cmd.CombinedOutput() if err != nil { return fmt.Errorf("failed to create secret %s: %w, output: %s", name, err, string(output)) } pm.logger.Info("secret created", "name", name) return nil } // DeleteSecret removes a Podman secret func (pm *PodmanManager) DeleteSecret(ctx context.Context, name string) error { cmd := exec.CommandContext(ctx, "podman", "secret", "rm", name) output, err := cmd.CombinedOutput() if err != nil { return fmt.Errorf("failed to delete secret %s: %w, output: %s", name, err, string(output)) } pm.logger.Info("secret deleted", "name", name) return nil } // BuildSecretArgs builds podman run arguments for mounting secrets func BuildSecretArgs(secrets []PodmanSecret) []string { args := []string{} for _, secret := range secrets { if secret.EnvVar != "" { // Mount as environment variable args = append(args, "--secret", fmt.Sprintf("%s,type=env,target=%s", secret.Name, secret.EnvVar)) } else { // Mount as file target := secret.Target if target == "" { target = fmt.Sprintf("/run/secrets/%s", secret.Name) } args = append(args, "--secret", fmt.Sprintf("%s,type=mount,target=%s", secret.Name, target)) } } return args } // SanitizeContainerEnv removes sensitive values from env map and returns secrets to create func SanitizeContainerEnv(env map[string]string, sensitiveKeys []string) ([]PodmanSecret, map[string]string) { secrets := []PodmanSecret{} cleanEnv := make(map[string]string) for key, value := range env { isSensitive := false lowerKey := strings.ToLower(key) for _, sensitive := range sensitiveKeys { if strings.Contains(lowerKey, strings.ToLower(sensitive)) { isSensitive = true break } } if isSensitive && value != "" { // Create secret for this value secretName := fmt.Sprintf("fetchml_%s_%d", strings.ToLower(key), os.Getpid()) secrets = append(secrets, PodmanSecret{ Name: secretName, Data: []byte(value), EnvVar: key, // Mount as env var to maintain compatibility }) // Don't include in env - it will be mounted as secret } else { cleanEnv[key] = value } } return secrets, cleanEnv } // ErrSecurityViolation is returned when a security policy is violated. var ErrSecurityViolation = fmt.Errorf("security policy violation") func SanitizePath(path string) (string, error) { // Clean the path to remove any .. or . components cleaned := filepath.Clean(path) // Check for path traversal attempts if strings.Contains(cleaned, "..") { return "", fmt.Errorf("path traversal detected: %s", path) } return cleaned, nil } // ValidateJobName validates a job name is safe func ValidateJobName(jobName string) error { if jobName == "" { return fmt.Errorf("job name cannot be empty") } // Check for dangerous characters if strings.ContainsAny(jobName, "/\\<>:\"|?*") { return fmt.Errorf("job name contains invalid characters: %s", jobName) } // Check for path traversal if strings.Contains(jobName, "..") { return fmt.Errorf("job name contains path traversal: %s", jobName) } return nil }