fetch_ml/internal/container/podman.go
Jeremie Fraeys cd5640ebd2 Slim and secure: move scripts, clean configs, remove secrets
- Move ci-test.sh and setup.sh to scripts/
- Trim docs/src/zig-cli.md to current structure
- Replace hardcoded secrets with placeholders in configs
- Update .gitignore to block .env*, secrets/, keys, build artifacts
- Slim README.md to reflect current CLI/TUI split
- Add cleanup trap to ci-test.sh
- Ensure no secrets are committed
2025-12-07 13:57:51 -05:00

267 lines
7.5 KiB
Go

// Package container provides Podman container management utilities.
package container
import (
"context"
"fmt"
"os/exec"
"path/filepath"
"strings"
"github.com/jfraeys/fetch_ml/internal/config"
"github.com/jfraeys/fetch_ml/internal/logging"
)
// PodmanManager drives container lifecycle operations (start, stop, remove,
// status) by invoking the podman command-line tool, and reports lifecycle
// events through its logger.
type PodmanManager struct {
	// logger receives an Info entry after each successful start, stop or
	// remove operation.
	logger *logging.Logger
}
// NewPodmanManager returns a PodmanManager that logs through logger.
//
// The returned error is always nil in this implementation.
func NewPodmanManager(logger *logging.Logger) (*PodmanManager, error) {
	manager := &PodmanManager{logger: logger}
	return manager, nil
}
// ContainerConfig describes a container to be launched by
// PodmanManager.StartContainer. Each field carries a JSON tag so the
// configuration can be (de)serialized.
type ContainerConfig struct {
	// Name is passed as --name; the flag is omitted when Name is empty.
	Name string `json:"name"`

	// Image is the container image to run.
	Image string `json:"image"`

	// Command is appended after the image on the podman command line.
	Command []string `json:"command"`

	// Env maps variable names to values; each entry becomes "-e KEY=VALUE".
	Env map[string]string `json:"env"`

	// Volumes maps a host path to a container path; each entry becomes
	// "-v host:container".
	Volumes map[string]string `json:"volumes"`

	// Ports maps a host port to a container port; each entry becomes
	// "-p host:container".
	Ports map[int]int `json:"ports"`

	// SecurityOpts lists values passed one by one as --security-opt.
	SecurityOpts []string `json:"security_opts"`

	// Resources holds the memory, CPU and GPU settings.
	Resources ResourceConfig `json:"resources"`

	// Network holds the network settings.
	Network NetworkConfig `json:"network"`
}
// ResourceConfig lists the resource settings applied to a container when it
// is started.
type ResourceConfig struct {
	// MemoryLimit is passed to podman as --memory; the flag is omitted when
	// the value is empty.
	MemoryLimit string `json:"memory_limit"`

	// CPULimit is passed to podman as --cpus; the flag is omitted when the
	// value is empty.
	CPULimit string `json:"cpu_limit"`

	// GPUAccess, when true, exposes /dev/dri to the container via --device.
	GPUAccess bool `json:"gpu_access"`
}
// NetworkConfig groups the network-related settings of a container.
type NetworkConfig struct {
	// AllowNetwork states whether the container may use the network.
	// NOTE(review): no code visible in this file reads this field; confirm
	// where (or whether) it is enforced.
	AllowNetwork bool `json:"allow_network"`
}
// StartContainer launches a detached container with "podman run -d" and
// returns the container ID that podman prints on stdout.
//
// The command line is assembled from config in this order: name, security
// options, memory/CPU limits, the /dev/dri device when GPU access is
// requested, volume mounts, port mappings, environment variables, and finally
// the image followed by its command. Volumes, Ports and Env are maps, so the
// relative order of the flags inside each of those groups is not fixed.
// config.Network is not consulted by this function.
//
// An error is returned when config is nil, when no image is set, when podman
// cannot be run or exits unsuccessfully (the error then carries podman's
// stderr), or when podman prints no container ID.
func (pm *PodmanManager) StartContainer(ctx context.Context, config *ContainerConfig) (string, error) {
	if config == nil {
		return "", fmt.Errorf("container config is nil")
	}
	if config.Image == "" {
		return "", fmt.Errorf("container image is required")
	}

	args := []string{"run", "-d"}

	// Add name
	if config.Name != "" {
		args = append(args, "--name", config.Name)
	}

	// Add security options
	for _, opt := range config.SecurityOpts {
		args = append(args, "--security-opt", opt)
	}

	// Add resource limits
	if config.Resources.MemoryLimit != "" {
		args = append(args, "--memory", config.Resources.MemoryLimit)
	}
	if config.Resources.CPULimit != "" {
		args = append(args, "--cpus", config.Resources.CPULimit)
	}
	if config.Resources.GPUAccess {
		args = append(args, "--device", "/dev/dri")
	}

	// Add volumes
	for hostPath, containerPath := range config.Volumes {
		args = append(args, "-v", fmt.Sprintf("%s:%s", hostPath, containerPath))
	}

	// Add ports
	for hostPort, containerPort := range config.Ports {
		args = append(args, "-p", fmt.Sprintf("%d:%d", hostPort, containerPort))
	}

	// Add environment variables
	for key, value := range config.Env {
		args = append(args, "-e", fmt.Sprintf("%s=%s", key, value))
	}

	// Add image and command
	args = append(args, config.Image)
	args = append(args, config.Command...)

	// Keep stderr apart from stdout: podman writes image-pull progress and
	// warnings to stderr, and mixing them into stdout would corrupt the
	// container ID returned to the caller.
	cmd := exec.CommandContext(ctx, "podman", args...)
	var stderr strings.Builder
	cmd.Stderr = &stderr
	stdout, err := cmd.Output()
	if err != nil {
		return "", fmt.Errorf("failed to start container: %w, output: %s", err, stderr.String())
	}

	containerID := strings.TrimSpace(string(stdout))
	if containerID == "" {
		return "", fmt.Errorf("no container ID returned")
	}

	pm.logger.Info("container started", "container_id", containerID, "name", config.Name)
	return containerID, nil
}
// StopContainer stops the container identified by containerID by running
// "podman stop <containerID>" and logs the event on success.
//
// containerID is handed to podman as a bare argument, so an empty value or
// one that begins with "-" is rejected up front: podman would otherwise parse
// it as an option (for example "--all") instead of a container reference.
//
// On failure the returned error wraps the exec error and includes podman's
// combined stdout/stderr.
func (pm *PodmanManager) StopContainer(ctx context.Context, containerID string) error {
	if containerID == "" || containerID[0] == '-' {
		return fmt.Errorf("invalid container ID: %s", containerID)
	}

	cmd := exec.CommandContext(ctx, "podman", "stop", containerID)
	output, err := cmd.CombinedOutput()
	if err != nil {
		return fmt.Errorf("failed to stop container: %w, output: %s", err, string(output))
	}

	pm.logger.Info("container stopped", "container_id", containerID)
	return nil
}
// RemoveContainer removes the container identified by containerID by running
// "podman rm <containerID>" and logs the event on success.
//
// containerID is handed to podman as a bare argument, so an empty value or
// one that begins with "-" is rejected up front: podman would otherwise parse
// it as an option (for example "--all" or "-f") instead of a container
// reference.
//
// On failure the returned error wraps the exec error and includes podman's
// combined stdout/stderr.
func (pm *PodmanManager) RemoveContainer(ctx context.Context, containerID string) error {
	if containerID == "" || containerID[0] == '-' {
		return fmt.Errorf("invalid container ID: %s", containerID)
	}

	cmd := exec.CommandContext(ctx, "podman", "rm", containerID)
	output, err := cmd.CombinedOutput()
	if err != nil {
		return fmt.Errorf("failed to remove container: %w, output: %s", err, string(output))
	}

	pm.logger.Info("container removed", "container_id", containerID)
	return nil
}
// GetContainerStatus reports the status string podman prints for the
// container matching containerID.
//
// The ID is rejected when it is empty or contains any of the characters
// & ; | < > $ ` " '. Running containers are queried first ("podman ps"); if
// that prints nothing, the query is repeated with "-a" so stopped containers
// are covered as well. When neither query prints anything the result is
// "unknown" with a nil error. A failing podman invocation yields an error
// that wraps the exec error and includes the combined output.
func (pm *PodmanManager) GetContainerStatus(ctx context.Context, containerID string) (string, error) {
	const forbidden = "&;|<>$`\"'"
	if len(containerID) == 0 || strings.ContainsAny(containerID, forbidden) {
		return "", fmt.Errorf("invalid container ID: %s", containerID)
	}

	idFilter := "id=" + containerID
	queries := [][]string{
		// Running containers only.
		{"ps", "--filter", idFilter, "--format", "{{.Status}}"},
		// Fallback: include stopped containers too.
		{"ps", "-a", "--filter", idFilter, "--format", "{{.Status}}"},
	}

	for _, query := range queries {
		raw, err := exec.CommandContext(ctx, "podman", query...).CombinedOutput() //nolint:gosec
		if err != nil {
			return "", fmt.Errorf("failed to get container status: %w, output: %s", err, string(raw))
		}
		if found := strings.TrimSpace(string(raw)); found != "" {
			return found, nil
		}
	}

	return "unknown", nil
}
// PodmanConfig carries the settings BuildPodmanCommand needs to assemble a
// "podman run" invocation for an experiment.
type PodmanConfig struct {
	// Image is the container image to run.
	Image string

	// Workspace is the host side of the workspace mount.
	Workspace string

	// Results is the host side of the results mount.
	Results string

	// ContainerWorkspace is the container side of the workspace mount; it is
	// also passed to the image as --workspace.
	ContainerWorkspace string

	// ContainerResults is the container side of the results mount.
	ContainerResults string

	// GPUAccess, when true, exposes /dev/dri to the container.
	GPUAccess bool

	// Memory is the --memory value; the package default is used when empty.
	Memory string

	// CPUs is the --cpus value; the package default is used when empty.
	CPUs string
}
// BuildPodmanCommand assembles, but does not start, the "podman run" command
// used to execute an ML experiment.
//
// The container is run with --rm, no-new-privileges, all capabilities
// dropped and --userns keep-id. Memory and CPU limits come from cfg, falling
// back to config.DefaultPodmanMemory and config.DefaultPodmanCPUs when the
// respective field is empty. The workspace and results directories are
// bind-mounted read-write, and /dev/dri is exposed when cfg.GPUAccess is set.
// After the image name the command passes --workspace, --requirements and
// --script, followed by "--args" and extraArgs when extraArgs is non-empty.
func BuildPodmanCommand(
	ctx context.Context,
	cfg PodmanConfig,
	scriptPath, requirementsPath string,
	extraArgs []string,
) *exec.Cmd {
	// Resolve resource limits first so the flag list can be written in one go.
	memory := cfg.Memory
	if memory == "" {
		memory = config.DefaultPodmanMemory
	}
	cpus := cfg.CPUs
	if cpus == "" {
		cpus = config.DefaultPodmanCPUs
	}

	argv := []string{
		"run", "--rm",
		"--security-opt", "no-new-privileges",
		"--cap-drop", "ALL",
		"--memory", memory,
		"--cpus", cpus,
		"--userns", "keep-id",
		"-v", fmt.Sprintf("%s:%s:rw", cfg.Workspace, cfg.ContainerWorkspace),
		"-v", fmt.Sprintf("%s:%s:rw", cfg.Results, cfg.ContainerResults),
	}

	if cfg.GPUAccess {
		argv = append(argv, "--device", "/dev/dri")
	}

	// Everything after the image name is interpreted by the image itself.
	argv = append(argv,
		cfg.Image,
		"--workspace", cfg.ContainerWorkspace,
		"--requirements", requirementsPath,
		"--script", scriptPath,
	)

	if len(extraArgs) != 0 {
		argv = append(argv, "--args")
		argv = append(argv, extraArgs...)
	}

	return exec.CommandContext(ctx, "podman", argv...)
}
// SanitizePath normalizes path with filepath.Clean and rejects it when the
// cleaned result still contains a ".." element, i.e. when the path would
// climb above its starting directory.
//
// The check compares whole path elements rather than searching for the
// substring "..", so ordinary names that merely contain two dots (for
// example "my..file.txt") are accepted. Absolute paths are accepted; an
// empty path cleans to ".".
//
// It returns the cleaned path, or an error naming the original input when
// traversal is detected.
func SanitizePath(path string) (string, error) {
	cleaned := filepath.Clean(path)

	// After Clean, any ".." that could not be resolved lexically survives as
	// a standalone path element.
	for _, elem := range strings.Split(cleaned, string(filepath.Separator)) {
		if elem == ".." {
			return "", fmt.Errorf("path traversal detected: %s", path)
		}
	}

	return cleaned, nil
}
// ValidateJobName reports whether jobName is acceptable as a job name.
//
// A name is rejected when it is empty, when it contains any of the
// characters / \ < > : " | ? *, or when it contains the sequence "..".
// The checks run in that order and the first failing one determines the
// error. A nil return means the name passed every check.
func ValidateJobName(jobName string) error {
	const forbidden = "/\\<>:\"|?*"

	switch {
	case jobName == "":
		return fmt.Errorf("job name cannot be empty")
	case strings.ContainsAny(jobName, forbidden):
		return fmt.Errorf("job name contains invalid characters: %s", jobName)
	case strings.Contains(jobName, ".."):
		return fmt.Errorf("job name contains path traversal: %s", jobName)
	default:
		return nil
	}
}