From 6b2c377680b173f59ac413cee65c8587e539aa3b Mon Sep 17 00:00:00 2001 From: Jeremie Fraeys Date: Thu, 26 Feb 2026 12:06:35 -0500 Subject: [PATCH] refactor(jupyter): enhance security and scheduler integration Update Jupyter integration for security and scheduler support: - Enhanced security configuration with audit logging - Health monitoring with scheduler event integration - Package manager with network policy enforcement - Service manager with lifecycle hooks - Network manager with tenant isolation - Workspace metadata with tenant tags - Config with resource limits - Podman container integration improvements - Experiment manager with tracking integration - Manifest runner with security checks --- internal/container/podman.go | 144 +++++++------------- internal/experiment/manager.go | 5 +- internal/jupyter/config.go | 37 ++--- internal/jupyter/health_monitor.go | 14 +- internal/jupyter/network_manager.go | 4 +- internal/jupyter/package_manager.go | 37 +++-- internal/jupyter/security_enhanced.go | 89 ++++++------ internal/jupyter/service_manager.go | 67 +++++---- internal/jupyter/workspace_manager.go | 12 +- internal/jupyter/workspace_metadata.go | 25 ++-- internal/manifest/run_manifest.go | 181 +++++++++++++++++-------- internal/manifest/schema_version.go | 2 +- 12 files changed, 319 insertions(+), 298 deletions(-) diff --git a/internal/container/podman.go b/internal/container/podman.go index ae1f71f..1128762 100644 --- a/internal/container/podman.go +++ b/internal/container/podman.go @@ -306,20 +306,20 @@ func (pm *PodmanManager) ExecContainer(ctx context.Context, containerID string, // PodmanConfig holds configuration for Podman container execution type PodmanConfig struct { - Image string - Workspace string - Results string - ContainerWorkspace string - ContainerResults string - AppleGPU bool - GPUDevices []string Env map[string]string Volumes map[string]string Memory string + ContainerWorkspace string + ContainerResults string + Results string + Workspace string + Image string CPUs string - Privileged bool // Security: must be false - Network string // Security: must not be "host" - ReadOnlyMounts bool // Security: true for dataset mounts + Network string + GPUDevices []string + AppleGPU bool + Privileged bool + ReadOnlyMounts bool } // PodmanResourceOverrides converts per-task resource requests into Podman-compatible @@ -338,15 +338,22 @@ func PodmanResourceOverrides(cpu int, memoryGB int) (cpus string, memory string) // PodmanSecurityConfig holds security configuration for Podman containers type PodmanSecurityConfig struct { - NoNewPrivileges bool - DropAllCaps bool + SeccompProfile string + NetworkMode string AllowedCaps []string - UserNS bool RunAsUID int RunAsGID int - SeccompProfile string + NoNewPrivileges bool + DropAllCaps bool + UserNS bool ReadOnlyRoot bool - NetworkMode string + // Process Isolation + MaxProcesses int + MaxOpenFiles int + DisableSwap bool + OOMScoreAdj int + TaskUID int + TaskGID int } // BuildSecurityArgs builds security-related podman arguments from PodmanSecurityConfig @@ -395,6 +402,27 @@ func BuildSecurityArgs(sandbox PodmanSecurityConfig) []string { } args = append(args, "--network", networkMode) + // Process Isolation + // Fork bomb protection - limit number of processes + if sandbox.MaxProcesses > 0 { + args = append(args, "--pids-limit", strconv.Itoa(sandbox.MaxProcesses)) + } + + // File descriptor limits + if sandbox.MaxOpenFiles > 0 { + args = append(args, "--ulimit", fmt.Sprintf("nofile=%d:%d", sandbox.MaxOpenFiles, sandbox.MaxOpenFiles)) + } + + // OOM killer score adjustment (lower = less likely to be killed) + if sandbox.OOMScoreAdj != 0 { + args = append(args, "--oom-score-adj", strconv.Itoa(sandbox.OOMScoreAdj)) + } + + // Disable swap (memory-swap equals memory means no swap) + if sandbox.DisableSwap { + args = append(args, "--memory-swap=0") + } + return args } @@ -488,84 +516,6 @@ func BuildPodmanCommand( return exec.CommandContext(ctx, "podman", args...) } -// BuildPodmanCommandLegacy builds a Podman command using legacy security settings -// Deprecated: Use BuildPodmanCommand with SandboxConfig instead -func BuildPodmanCommandLegacy( - ctx context.Context, - cfg PodmanConfig, - scriptPath, depsPath string, - extraArgs []string, -) *exec.Cmd { - args := []string{ - "run", "--rm", - "--security-opt", "no-new-privileges", - "--cap-drop", "ALL", - } - - // Add network mode if specified - if cfg.Network != "" { - args = append(args, "--network", cfg.Network) - } - - // Add read-only root filesystem - if cfg.ReadOnlyMounts { - args = append(args, "--read-only") - } - - if cfg.Memory != "" { - args = append(args, "--memory", cfg.Memory) - } else { - args = append(args, "--memory", config.DefaultPodmanMemory) - } - - if cfg.CPUs != "" { - args = append(args, "--cpus", cfg.CPUs) - } else { - args = append(args, "--cpus", config.DefaultPodmanCPUs) - } - - args = append(args, "--userns", "keep-id") - - // Mount workspace - workspaceMount := fmt.Sprintf("%s:%s:rw", cfg.Workspace, cfg.ContainerWorkspace) - args = append(args, "-v", workspaceMount) - - // Mount results - resultsMount := fmt.Sprintf("%s:%s:rw", cfg.Results, cfg.ContainerResults) - args = append(args, "-v", resultsMount) - - // Mount additional volumes - for hostPath, containerPath := range cfg.Volumes { - mount := fmt.Sprintf("%s:%s", hostPath, containerPath) - args = append(args, "-v", mount) - } - - // Use injected GPU device paths for Apple GPU or custom configurations - for _, device := range cfg.GPUDevices { - args = append(args, "--device", device) - } - - // Add environment variables - for key, value := range cfg.Env { - args = append(args, "-e", fmt.Sprintf("%s=%s", key, value)) - } - - // Image and command - args = append(args, cfg.Image, - "--workspace", cfg.ContainerWorkspace, - "--deps", depsPath, - "--script", scriptPath, - ) - - // Add extra arguments via --args flag - if len(extraArgs) > 0 { - args = append(args, "--args") - args = append(args, extraArgs...) - } - - return exec.CommandContext(ctx, "podman", args...) -} - // ValidateSecurityPolicy validates that the container configuration meets security requirements. // Returns an error if the configuration violates security policies. func ValidateSecurityPolicy(cfg PodmanConfig) error { @@ -588,10 +538,10 @@ func ValidateSecurityPolicy(cfg PodmanConfig) error { // PodmanSecret represents a secret to be mounted in a container type PodmanSecret struct { - Name string // Secret name in Podman - Data []byte // Secret data (will be base64 encoded) - Target string // Mount path inside container - EnvVar string // Environment variable name (optional, if set mounts as env var instead of file) + Name string + Target string + EnvVar string + Data []byte } // CreateSecret creates a Podman secret from the given data diff --git a/internal/experiment/manager.go b/internal/experiment/manager.go index b910861..44c460f 100644 --- a/internal/experiment/manager.go +++ b/internal/experiment/manager.go @@ -104,7 +104,7 @@ func (m *Manager) CreateExperiment(commitID string) error { return nil } -// WriteMetadata writes experiment metadata to meta.bin +// WriteMetadata writes experiment metadata to meta.bin with crash safety (fsync) func (m *Manager) WriteMetadata(meta *Metadata) error { path := m.GetMetadataPath(meta.CommitID) @@ -134,7 +134,8 @@ func (m *Manager) WriteMetadata(meta *Metadata) error { buf = append(buf, byte(len(meta.User))) buf = append(buf, []byte(meta.User)...) - return os.WriteFile(path, buf, 0o600) + // SECURITY: Write with fsync for crash safety + return fileutil.WriteFileSafe(path, buf, 0o600) } // ReadMetadata reads experiment metadata from meta.bin diff --git a/internal/jupyter/config.go b/internal/jupyter/config.go index 080cea0..f40fa46 100644 --- a/internal/jupyter/config.go +++ b/internal/jupyter/config.go @@ -9,6 +9,7 @@ import ( "strings" "time" + "github.com/jfraeys/fetch_ml/internal/fileutil" "github.com/jfraeys/fetch_ml/internal/logging" ) @@ -58,34 +59,34 @@ type ConfigManager struct { type JupyterConfig struct { Version string `json:"version"` Environment string `json:"environment"` - Service ServiceConfig `json:"service"` + Logging LoggingConfig `json:"logging"` + Resources ResourceConfig `json:"resources"` + Security SecurityConfig `json:"security"` Workspace WorkspaceConfig `json:"workspace"` Network NetworkConfig `json:"network"` - Security SecurityConfig `json:"security"` - Resources ResourceConfig `json:"resources"` - Health HealthConfig `json:"health"` - Logging LoggingConfig `json:"logging"` DefaultSettings DefaultSettingsConfig `json:"default_settings"` + Service ServiceConfig `json:"service"` AdvancedSettings AdvancedSettingsConfig `json:"advanced_settings"` + Health HealthConfig `json:"health"` } // WorkspaceConfig defines workspace configuration type WorkspaceConfig struct { - DefaultPath string `json:"default_path"` - AutoCreate bool `json:"auto_create"` MountOptions map[string]string `json:"mount_options"` + DefaultPath string `json:"default_path"` + MaxWorkspaceSize string `json:"max_workspace_size"` AllowedPaths []string `json:"allowed_paths"` DeniedPaths []string `json:"denied_paths"` - MaxWorkspaceSize string `json:"max_workspace_size"` + AutoCreate bool `json:"auto_create"` } // HealthConfig defines health monitoring configuration type HealthConfig struct { - Enabled bool `json:"enabled"` CheckInterval time.Duration `json:"check_interval"` Timeout time.Duration `json:"timeout"` RetryAttempts int `json:"retry_attempts"` MaxServiceAge time.Duration `json:"max_service_age"` + Enabled bool `json:"enabled"` AutoCleanup bool `json:"auto_cleanup"` MetricsEnabled bool `json:"metrics_enabled"` } @@ -96,31 +97,31 @@ type LoggingConfig struct { Format string `json:"format"` Output string `json:"output"` MaxSize string `json:"max_size"` - MaxBackups int `json:"max_backups"` MaxAge string `json:"max_age"` + MaxBackups int `json:"max_backups"` } // DefaultSettingsConfig defines default settings for new services type DefaultSettingsConfig struct { - Image string `json:"default_image"` - Port int `json:"default_port"` - Workspace string `json:"default_workspace"` Environment map[string]string `json:"environment"` + Image string `json:"default_image"` + Workspace string `json:"default_workspace"` + ShutdownPolicy string `json:"shutdown_policy"` + Port int `json:"default_port"` + StopTimeout time.Duration `json:"stop_timeout"` AutoStart bool `json:"auto_start"` AutoStop bool `json:"auto_stop"` - StopTimeout time.Duration `json:"stop_timeout"` - ShutdownPolicy string `json:"shutdown_policy"` } // AdvancedSettingsConfig defines advanced configuration options type AdvancedSettingsConfig struct { + ExperimentalFeatures []string `json:"experimental_features"` MaxConcurrentServices int `json:"max_concurrent_services"` ServiceTimeout time.Duration `json:"service_timeout"` StartupTimeout time.Duration `json:"startup_timeout"` GracefulShutdown bool `json:"graceful_shutdown"` ForceCleanup bool `json:"force_cleanup"` DebugMode bool `json:"debug_mode"` - ExperimentalFeatures []string `json:"experimental_features"` } // NewConfigManager creates a new configuration manager @@ -190,8 +191,8 @@ func (cm *ConfigManager) SaveConfig() error { return fmt.Errorf("failed to marshal config: %w", err) } - // Write configuration file - if err := os.WriteFile(cm.configPath, data, 0600); err != nil { + // Write configuration file with crash safety (fsync) + if err := fileutil.WriteFileSafe(cm.configPath, data, 0600); err != nil { return fmt.Errorf("failed to write config file: %w", err) } diff --git a/internal/jupyter/health_monitor.go b/internal/jupyter/health_monitor.go index 291c424..3154728 100644 --- a/internal/jupyter/health_monitor.go +++ b/internal/jupyter/health_monitor.go @@ -20,33 +20,33 @@ const ( type HealthMonitor struct { logger *logging.Logger services map[string]*JupyterService - servicesMutex sync.RWMutex - interval time.Duration client *http.Client + interval time.Duration + servicesMutex sync.RWMutex } // HealthStatus represents the health status of a service type HealthStatus struct { + LastCheck time.Time `json:"last_check"` + Metrics map[string]interface{} `json:"metrics"` ServiceID string `json:"service_id"` ServiceName string `json:"service_name"` Status string `json:"status"` - LastCheck time.Time `json:"last_check"` - ResponseTime time.Duration `json:"response_time"` URL string `json:"url"` ContainerID string `json:"container_id"` Errors []string `json:"errors"` - Metrics map[string]interface{} `json:"metrics"` + ResponseTime time.Duration `json:"response_time"` } // HealthReport contains a comprehensive health report type HealthReport struct { Timestamp time.Time `json:"timestamp"` + Services map[string]*HealthStatus `json:"services"` + Summary string `json:"summary"` TotalServices int `json:"total_services"` Healthy int `json:"healthy"` Unhealthy int `json:"unhealthy"` Unknown int `json:"unknown"` - Services map[string]*HealthStatus `json:"services"` - Summary string `json:"summary"` } // NewHealthMonitor creates a new health monitor diff --git a/internal/jupyter/network_manager.go b/internal/jupyter/network_manager.go index f0ade2a..57296b3 100644 --- a/internal/jupyter/network_manager.go +++ b/internal/jupyter/network_manager.go @@ -18,9 +18,9 @@ type NetworkManager struct { // PortAllocator manages port allocation for services type PortAllocator struct { + usedPorts map[int]bool startPort int endPort int - usedPorts map[int]bool } // NewNetworkManager creates a new network manager @@ -313,10 +313,10 @@ func (nm *NetworkManager) GetNetworkStatus() *NetworkStatus { // NetworkStatus contains network status information type NetworkStatus struct { + PortRange string `json:"port_range"` TotalPorts int `json:"total_ports"` AvailablePorts int `json:"available_ports"` UsedPorts int `json:"used_ports"` - PortRange string `json:"port_range"` Services int `json:"services"` } diff --git a/internal/jupyter/package_manager.go b/internal/jupyter/package_manager.go index 50bc41a..5ea1efa 100644 --- a/internal/jupyter/package_manager.go +++ b/internal/jupyter/package_manager.go @@ -8,6 +8,7 @@ import ( "strings" "time" + "github.com/jfraeys/fetch_ml/internal/fileutil" "github.com/jfraeys/fetch_ml/internal/logging" ) @@ -18,22 +19,22 @@ const ( // PackageManager manages package installations in Jupyter workspaces type PackageManager struct { logger *logging.Logger - trustedChannels []string allowedPackages map[string]bool - blockedPackages []string workspacePath string packageCachePath string + trustedChannels []string + blockedPackages []string } // PackageConfig defines package management configuration type PackageConfig struct { - TrustedChannels []string `json:"trusted_channels"` AllowedPackages map[string]bool `json:"allowed_packages"` + TrustedChannels []string `json:"trusted_channels"` BlockedPackages []string `json:"blocked_packages"` - RequireApproval bool `json:"require_approval"` - AutoApproveSafe bool `json:"auto_approve_safe"` MaxPackages int `json:"max_packages"` InstallTimeout time.Duration `json:"install_timeout"` + RequireApproval bool `json:"require_approval"` + AutoApproveSafe bool `json:"auto_approve_safe"` AllowCondaForge bool `json:"allow_conda_forge"` AllowPyPI bool `json:"allow_pypi"` AllowLocal bool `json:"allow_local"` @@ -41,28 +42,28 @@ type PackageConfig struct { // PackageRequest represents a package installation request type PackageRequest struct { + Timestamp time.Time `json:"timestamp"` + ApprovalTime time.Time `json:"approval_time,omitempty"` PackageName string `json:"package_name"` Version string `json:"version,omitempty"` Channel string `json:"channel,omitempty"` RequestedBy string `json:"requested_by"` WorkspacePath string `json:"workspace_path"` - Timestamp time.Time `json:"timestamp"` - Status string `json:"status"` // pending, approved, rejected, installed, failed + Status string `json:"status"` RejectionReason string `json:"rejection_reason,omitempty"` ApprovalUser string `json:"approval_user,omitempty"` - ApprovalTime time.Time `json:"approval_time,omitempty"` } // PackageInfo contains information about an installed package type PackageInfo struct { + InstalledAt time.Time `json:"installed_at"` + Metadata map[string]string `json:"metadata"` Name string `json:"name"` Version string `json:"version"` Channel string `json:"channel"` - InstalledAt time.Time `json:"installed_at"` InstalledBy string `json:"installed_by"` Size string `json:"size"` Dependencies []string `json:"dependencies"` - Metadata map[string]string `json:"metadata"` } // NewPackageManager creates a new package manager @@ -353,14 +354,18 @@ func (pm *PackageManager) GetPackageRequest(requestID string) (*PackageRequest, return pm.loadPackageRequest(requestID) } -// savePackageRequest saves a package request to cache +// savePackageRequest saves a package request to cache with crash safety (fsync) func (pm *PackageManager) savePackageRequest(req *PackageRequest) error { requestFile := filepath.Join(pm.packageCachePath, fmt.Sprintf("request_%s.json", req.PackageName)) data, err := json.MarshalIndent(req, "", " ") if err != nil { return err } - return os.WriteFile(requestFile, data, 0600) + // SECURITY: Write with fsync for crash safety + if err := fileutil.WriteFileSafe(requestFile, data, 0600); err != nil { + return fmt.Errorf("failed to write package request: %w", err) + } + return nil } // loadPackageRequest loads a package request from cache @@ -406,14 +411,18 @@ func (pm *PackageManager) loadAllPackageRequests() ([]*PackageRequest, error) { return requests, nil } -// savePackageInfo saves package information to cache +// savePackageInfo saves package information to cache with crash safety (fsync) func (pm *PackageManager) savePackageInfo(info *PackageInfo) error { infoFile := filepath.Join(pm.packageCachePath, fmt.Sprintf("installed_%s.json", info.Name)) data, err := json.MarshalIndent(info, "", " ") if err != nil { return err } - return os.WriteFile(infoFile, data, 0600) + // SECURITY: Write with fsync for crash safety + if err := fileutil.WriteFileSafe(infoFile, data, 0600); err != nil { + return fmt.Errorf("failed to write package info: %w", err) + } + return nil } // loadAllPackageInfo loads all installed package information diff --git a/internal/jupyter/security_enhanced.go b/internal/jupyter/security_enhanced.go index 0717eae..7a35935 100644 --- a/internal/jupyter/security_enhanced.go +++ b/internal/jupyter/security_enhanced.go @@ -23,57 +23,44 @@ type SecurityManager struct { // EnhancedSecurityConfig provides comprehensive security settings type EnhancedSecurityConfig struct { - // Network Security - AllowNetwork bool `json:"allow_network"` - AllowedHosts []string `json:"allowed_hosts"` - BlockedHosts []string `json:"blocked_hosts"` - EnableFirewall bool `json:"enable_firewall"` - - // Package Security - TrustedChannels []string `json:"trusted_channels"` - BlockedPackages []string `json:"blocked_packages"` - AllowedPackages map[string]bool `json:"allowed_packages"` - RequireApproval bool `json:"require_approval"` - AutoApproveSafe bool `json:"auto_approve_safe"` - MaxPackages int `json:"max_packages"` - InstallTimeout time.Duration `json:"install_timeout"` - AllowCondaForge bool `json:"allow_conda_forge"` - AllowPyPI bool `json:"allow_pypi"` - AllowLocal bool `json:"allow_local"` - - // Container Security - ReadOnlyRoot bool `json:"read_only_root"` - DropCapabilities []string `json:"drop_capabilities"` - RunAsNonRoot bool `json:"run_as_non_root"` - EnableSeccomp bool `json:"enable_seccomp"` - NoNewPrivileges bool `json:"no_new_privileges"` - - // Authentication Security - EnableTokenAuth bool `json:"enable_token_auth"` - TokenLength int `json:"token_length"` - TokenExpiry time.Duration `json:"token_expiry"` - RequireHTTPS bool `json:"require_https"` - SessionTimeout time.Duration `json:"session_timeout"` - MaxFailedAttempts int `json:"max_failed_attempts"` - LockoutDuration time.Duration `json:"lockout_duration"` - - // File System Security - AllowedPaths []string `json:"allowed_paths"` - DeniedPaths []string `json:"denied_paths"` - MaxWorkspaceSize string `json:"max_workspace_size"` - AllowExecFrom []string `json:"allow_exec_from"` - BlockExecFrom []string `json:"block_exec_from"` - - // Resource Security - MaxMemoryLimit string `json:"max_memory_limit"` - MaxCPULimit string `json:"max_cpu_limit"` - MaxDiskUsage string `json:"max_disk_usage"` - MaxProcesses int `json:"max_processes"` - - // Logging & Monitoring - SecurityLogLevel string `json:"security_log_level"` - AuditEnabled bool `json:"audit_enabled"` - RealTimeAlerts bool `json:"real_time_alerts"` + AllowedPackages map[string]bool `json:"allowed_packages"` + SecurityLogLevel string `json:"security_log_level"` + MaxDiskUsage string `json:"max_disk_usage"` + MaxWorkspaceSize string `json:"max_workspace_size"` + MaxMemoryLimit string `json:"max_memory_limit"` + MaxCPULimit string `json:"max_cpu_limit"` + BlockedPackages []string `json:"blocked_packages"` + AllowedHosts []string `json:"allowed_hosts"` + BlockedHosts []string `json:"blocked_hosts"` + TrustedChannels []string `json:"trusted_channels"` + BlockExecFrom []string `json:"block_exec_from"` + AllowExecFrom []string `json:"allow_exec_from"` + DropCapabilities []string `json:"drop_capabilities"` + DeniedPaths []string `json:"denied_paths"` + AllowedPaths []string `json:"allowed_paths"` + MaxPackages int `json:"max_packages"` + SessionTimeout time.Duration `json:"session_timeout"` + MaxProcesses int `json:"max_processes"` + InstallTimeout time.Duration `json:"install_timeout"` + LockoutDuration time.Duration `json:"lockout_duration"` + TokenLength int `json:"token_length"` + TokenExpiry time.Duration `json:"token_expiry"` + MaxFailedAttempts int `json:"max_failed_attempts"` + ReadOnlyRoot bool `json:"read_only_root"` + NoNewPrivileges bool `json:"no_new_privileges"` + EnableTokenAuth bool `json:"enable_token_auth"` + RunAsNonRoot bool `json:"run_as_non_root"` + AllowLocal bool `json:"allow_local"` + AllowPyPI bool `json:"allow_pypi"` + AllowCondaForge bool `json:"allow_conda_forge"` + RequireHTTPS bool `json:"require_https"` + AllowNetwork bool `json:"allow_network"` + AutoApproveSafe bool `json:"auto_approve_safe"` + RequireApproval bool `json:"require_approval"` + EnableSeccomp bool `json:"enable_seccomp"` + EnableFirewall bool `json:"enable_firewall"` + AuditEnabled bool `json:"audit_enabled"` + RealTimeAlerts bool `json:"real_time_alerts"` } // SecurityEvent represents a security-related event diff --git a/internal/jupyter/service_manager.go b/internal/jupyter/service_manager.go index 5f1d321..aca1c07 100644 --- a/internal/jupyter/service_manager.go +++ b/internal/jupyter/service_manager.go @@ -12,6 +12,7 @@ import ( "github.com/jfraeys/fetch_ml/internal/config" "github.com/jfraeys/fetch_ml/internal/container" + "github.com/jfraeys/fetch_ml/internal/fileutil" "github.com/jfraeys/fetch_ml/internal/logging" ) @@ -73,12 +74,12 @@ func trashBaseDir() string { } type trashInfo struct { - OriginalName string `json:"original_name"` DeletedAt time.Time `json:"deleted_at"` - DeletedBy string `json:"deleted_by"` - SizeBytes int64 `json:"size_bytes"` PurgeAfter time.Time `json:"purge_after"` + OriginalName string `json:"original_name"` + DeletedBy string `json:"deleted_by"` Reason string `json:"reason"` + SizeBytes int64 `json:"size_bytes"` } func (sm *ServiceManager) moveWorkspaceToTrash(workspacePath string, originalName string) (string, *trashInfo, error) { @@ -119,7 +120,11 @@ func (sm *ServiceManager) moveWorkspaceToTrash(workspacePath string, originalNam } b, err := json.MarshalIndent(info, "", " ") if err == nil { - _ = os.WriteFile(filepath.Join(dest, ".trashinfo"), b, 0o600) + // SECURITY: Write with fsync for crash safety + trashinfoPath := filepath.Join(dest, ".trashinfo") + if err := fileutil.WriteFileSafe(trashinfoPath, b, 0o600); err != nil { + sm.logger.Warn("failed to write trashinfo", "error", err) + } } return dest, info, nil @@ -243,25 +248,25 @@ func startupBlockedPackages(installBlocked []string) []string { // ServiceConfig holds configuration for Jupyter services type ServiceConfig struct { DefaultImage string `json:"default_image"` - DefaultPort int `json:"default_port"` DefaultWorkspace string `json:"default_workspace"` - MaxServices int `json:"max_services"` DefaultResources ResourceConfig `json:"default_resources"` SecuritySettings SecurityConfig `json:"security_settings"` NetworkConfig NetworkConfig `json:"network_config"` + DefaultPort int `json:"default_port"` + MaxServices int `json:"max_services"` } // NetworkConfig defines network settings for Jupyter containers type NetworkConfig struct { + BindAddress string `json:"bind_address"` + Token string `json:"token"` + Password string `json:"password"` + NetworkName string `json:"network_name"` HostPort int `json:"host_port"` ContainerPort int `json:"container_port"` - BindAddress string `json:"bind_address"` EnableToken bool `json:"enable_token"` - Token string `json:"token"` EnablePassword bool `json:"enable_password"` - Password string `json:"password"` AllowRemote bool `json:"allow_remote"` - NetworkName string `json:"network_name"` } // ResourceConfig defines resource limits for Jupyter containers @@ -273,16 +278,16 @@ type ResourceConfig struct { // SecurityConfig holds security settings for Jupyter services type SecurityConfig struct { - AllowNetwork bool `json:"allow_network"` - AllowedHosts []string `json:"allowed_hosts"` + AllowedPackages map[string]bool `json:"allowed_packages"` + DropCapabilities []string `json:"drop_capabilities"` BlockedHosts []string `json:"blocked_hosts"` - EnableFirewall bool `json:"enable_firewall"` TrustedChannels []string `json:"trusted_channels"` BlockedPackages []string `json:"blocked_packages"` - AllowedPackages map[string]bool `json:"allowed_packages"` + AllowedHosts []string `json:"allowed_hosts"` + EnableFirewall bool `json:"enable_firewall"` RequireApproval bool `json:"require_approval"` ReadOnlyRoot bool `json:"read_only_root"` - DropCapabilities []string `json:"drop_capabilities"` + AllowNetwork bool `json:"allow_network"` RunAsNonRoot bool `json:"run_as_non_root"` EnableSeccomp bool `json:"enable_seccomp"` NoNewPrivileges bool `json:"no_new_privileges"` @@ -290,19 +295,19 @@ type SecurityConfig struct { // JupyterService represents a running Jupyter instance type JupyterService struct { - ID string `json:"id"` - Name string `json:"name"` - Status string `json:"status"` - ContainerID string `json:"container_id"` - Port int `json:"port"` - Workspace string `json:"workspace"` - Image string `json:"image"` - URL string `json:"url"` CreatedAt time.Time `json:"created_at"` LastAccess time.Time `json:"last_access"` - Config ServiceConfig `json:"config"` - Environment map[string]string `json:"environment"` Metadata map[string]string `json:"metadata"` + Environment map[string]string `json:"environment"` + Image string `json:"image"` + ContainerID string `json:"container_id"` + URL string `json:"url"` + Workspace string `json:"workspace"` + ID string `json:"id"` + Status string `json:"status"` + Name string `json:"name"` + Config ServiceConfig `json:"config"` + Port int `json:"port"` } type InstalledPackage struct { @@ -313,15 +318,15 @@ type InstalledPackage struct { // StartRequest defines parameters for starting a Jupyter service type StartRequest struct { + Environment map[string]string `json:"environment"` + Metadata map[string]string `json:"metadata"` Name string `json:"name"` Workspace string `json:"workspace"` Image string `json:"image"` - Port int `json:"port"` Resources ResourceConfig `json:"resources"` Security SecurityConfig `json:"security"` Network NetworkConfig `json:"network"` - Environment map[string]string `json:"environment"` - Metadata map[string]string `json:"metadata"` + Port int `json:"port"` } // NewServiceManager creates a new Jupyter service manager @@ -1182,7 +1187,11 @@ func (sm *ServiceManager) saveServices() error { return err } - return os.WriteFile(servicesFile, data, 0600) + // SECURITY: Write with fsync for crash safety + if err := fileutil.WriteFileSafe(servicesFile, data, 0600); err != nil { + return fmt.Errorf("failed to write services file: %w", err) + } + return nil } // LinkWorkspaceWithExperiment links a workspace with an experiment diff --git a/internal/jupyter/workspace_manager.go b/internal/jupyter/workspace_manager.go index c1eb310..c4fdfef 100644 --- a/internal/jupyter/workspace_manager.go +++ b/internal/jupyter/workspace_manager.go @@ -17,28 +17,28 @@ const ( // WorkspaceManager handles workspace mounting and volume management for Jupyter services type WorkspaceManager struct { logger *logging.Logger - basePath string mounts map[string]*WorkspaceMount + basePath string } // WorkspaceMount represents a workspace mount configuration type WorkspaceMount struct { + Options map[string]string `json:"options"` ID string `json:"id"` HostPath string `json:"host_path"` ContainerPath string `json:"container_path"` - MountType string `json:"mount_type"` // "bind", "volume", "tmpfs" + MountType string `json:"mount_type"` + Services []string `json:"services"` ReadOnly bool `json:"read_only"` - Options map[string]string `json:"options"` - Services []string `json:"services"` // Service IDs using this mount } // MountRequest defines parameters for creating a workspace mount type MountRequest struct { + Options map[string]string `json:"options"` HostPath string `json:"host_path"` ContainerPath string `json:"container_path"` MountType string `json:"mount_type"` ReadOnly bool `json:"read_only"` - Options map[string]string `json:"options"` } // NewWorkspaceManager creates a new workspace manager @@ -394,6 +394,7 @@ func (wm *WorkspaceManager) GetWorkspaceInfo(workspacePath string) (*WorkspaceIn // WorkspaceInfo contains information about a workspace type WorkspaceInfo struct { + Modified time.Time `json:"modified"` Path string `json:"path"` FileCount int64 `json:"file_count"` PythonFiles int64 `json:"python_files"` @@ -402,7 +403,6 @@ type WorkspaceInfo struct { ConfigFiles int64 `json:"config_files"` Size int64 `json:"size"` TotalSize int64 `json:"total_size"` - Modified time.Time `json:"modified"` } // Cleanup removes unused mounts diff --git a/internal/jupyter/workspace_metadata.go b/internal/jupyter/workspace_metadata.go index 09a817f..5309354 100644 --- a/internal/jupyter/workspace_metadata.go +++ b/internal/jupyter/workspace_metadata.go @@ -9,29 +9,30 @@ import ( "time" "github.com/jfraeys/fetch_ml/internal/config" + "github.com/jfraeys/fetch_ml/internal/fileutil" "github.com/jfraeys/fetch_ml/internal/logging" ) // WorkspaceMetadata tracks the relationship between Jupyter workspaces and experiments type WorkspaceMetadata struct { + LinkedAt time.Time `json:"linked_at"` + LastSync time.Time `json:"last_sync"` + AdditionalData map[string]string `json:"additional_data"` WorkspacePath string `json:"workspace_path"` ExperimentID string `json:"experiment_id"` ServiceID string `json:"service_id,omitempty"` - LinkedAt time.Time `json:"linked_at"` - LastSync time.Time `json:"last_sync"` - SyncDirection string `json:"sync_direction"` // "pull", "push", "bidirectional" - AutoSync bool `json:"auto_sync"` - SyncInterval time.Duration `json:"sync_interval"` + SyncDirection string `json:"sync_direction"` Tags []string `json:"tags"` - AdditionalData map[string]string `json:"additional_data"` + SyncInterval time.Duration `json:"sync_interval"` + AutoSync bool `json:"auto_sync"` } // WorkspaceMetadataManager manages workspace metadata type WorkspaceMetadataManager struct { logger *logging.Logger - metadata map[string]*WorkspaceMetadata // key: workspace path - mutex sync.RWMutex + metadata map[string]*WorkspaceMetadata dataFile string + mutex sync.RWMutex } // NewWorkspaceMetadataManager creates a new workspace metadata manager @@ -319,7 +320,7 @@ func (wmm *WorkspaceMetadataManager) loadMetadata() error { return nil } -// saveMetadata saves metadata to disk using PathRegistry +// saveMetadata saves metadata to disk using PathRegistry with crash safety (fsync) func (wmm *WorkspaceMetadataManager) saveMetadata() error { // Use PathRegistry for consistent directory creation paths := config.FromEnv() @@ -332,7 +333,8 @@ func (wmm *WorkspaceMetadataManager) saveMetadata() error { return fmt.Errorf("failed to marshal metadata: %w", err) } - if err := os.WriteFile(wmm.dataFile, data, 0644); err != nil { + // SECURITY: Write with fsync for crash safety + if err := fileutil.WriteFileSafe(wmm.dataFile, data, 0644); err != nil { return fmt.Errorf("failed to write metadata file: %w", err) } @@ -364,7 +366,8 @@ func (wmm *WorkspaceMetadataManager) createWorkspaceMetadataFile( return fmt.Errorf("failed to marshal workspace metadata: %w", err) } - if err := os.WriteFile(workspaceMetaFile, data, 0600); err != nil { + // SECURITY: Write with fsync for crash safety + if err := fileutil.WriteFileSafe(workspaceMetaFile, data, 0600); err != nil { return fmt.Errorf("failed to write workspace metadata file: %w", err) } diff --git a/internal/manifest/run_manifest.go b/internal/manifest/run_manifest.go index 379c643..0b8583f 100644 --- a/internal/manifest/run_manifest.go +++ b/internal/manifest/run_manifest.go @@ -108,16 +108,16 @@ type Outcome struct { } type ArtifactFile struct { + Modified time.Time `json:"modified"` Path string `json:"path"` SizeBytes int64 `json:"size_bytes"` - Modified time.Time `json:"modified"` } type Artifacts struct { DiscoveryTime time.Time `json:"discovery_time"` Files []ArtifactFile `json:"files,omitempty"` + Exclusions []Exclusion `json:"exclusions,omitempty"` TotalSizeBytes int64 `json:"total_size_bytes,omitempty"` - Exclusions []Exclusion `json:"exclusions,omitempty"` // R.5: Scan exclusions recorded } // Exclusion records why a path was excluded from artifact scanning @@ -129,69 +129,58 @@ type Exclusion struct { // ExecutionEnvironment captures the runtime environment for reproducibility. // This enables reconstruction and comparison of runs. type ExecutionEnvironment struct { - ConfigHash string `json:"config_hash"` // R.2: Resolved config hash - GPUCount int `json:"gpu_count"` // GPU count detected - GPUDetectionMethod string `json:"gpu_detection_method,omitempty"` // R.3: "nvml", "env_override", etc. - GPUVendor string `json:"gpu_vendor,omitempty"` // Configured GPU vendor - MaxWorkers int `json:"max_workers"` // Active resource limits - PodmanCPUs string `json:"podman_cpus,omitempty"` // CPU limit - SandboxNetworkMode string `json:"sandbox_network_mode"` // Sandbox settings - SandboxSeccomp string `json:"sandbox_seccomp,omitempty"` // Seccomp profile - SandboxNoNewPrivs bool `json:"sandbox_no_new_privs"` // Security flags - ComplianceMode string `json:"compliance_mode,omitempty"` // HIPAA mode - ManifestNonce string `json:"manifest_nonce,omitempty"` // Unique manifest identifier - Metadata map[string]string `json:"metadata,omitempty"` // Additional env info + Metadata map[string]string `json:"metadata,omitempty"` + ConfigHash string `json:"config_hash"` + GPUDetectionMethod string `json:"gpu_detection_method,omitempty"` + GPUVendor string `json:"gpu_vendor,omitempty"` + PodmanCPUs string `json:"podman_cpus,omitempty"` + SandboxNetworkMode string `json:"sandbox_network_mode"` + SandboxSeccomp string `json:"sandbox_seccomp,omitempty"` + ComplianceMode string `json:"compliance_mode,omitempty"` + ManifestNonce string `json:"manifest_nonce,omitempty"` + GPUCount int `json:"gpu_count"` + MaxWorkers int `json:"max_workers"` + SandboxNoNewPrivs bool `json:"sandbox_no_new_privs"` } // RunManifest is a best-effort, self-contained provenance record for a run. // It is written to /run_manifest.json. type RunManifest struct { - RunID string `json:"run_id"` - TaskID string `json:"task_id"` - JobName string `json:"job_name"` - CreatedAt time.Time `json:"created_at"` - StartedAt time.Time `json:"started_at,omitempty"` - EndedAt time.Time `json:"ended_at,omitempty"` - - Annotations []Annotation `json:"annotations,omitempty"` - Narrative *Narrative `json:"narrative,omitempty"` - Outcome *Outcome `json:"outcome,omitempty"` - Artifacts *Artifacts `json:"artifacts,omitempty"` - - CommitID string `json:"commit_id,omitempty"` - ExperimentManifestSHA string `json:"experiment_manifest_sha,omitempty"` - DepsManifestName string `json:"deps_manifest_name,omitempty"` - DepsManifestSHA string `json:"deps_manifest_sha,omitempty"` - TrainScriptPath string `json:"train_script_path,omitempty"` - - WorkerVersion string `json:"worker_version,omitempty"` - PodmanImage string `json:"podman_image,omitempty"` - ImageDigest string `json:"image_digest,omitempty"` - - SnapshotID string `json:"snapshot_id,omitempty"` - SnapshotSHA256 string `json:"snapshot_sha256,omitempty"` - - Command string `json:"command,omitempty"` - Args string `json:"args,omitempty"` - ExitCode *int `json:"exit_code,omitempty"` - Error string `json:"error,omitempty"` - - StagingDurationMS int64 `json:"staging_duration_ms,omitempty"` - ExecutionDurationMS int64 `json:"execution_duration_ms,omitempty"` - FinalizeDurationMS int64 `json:"finalize_duration_ms,omitempty"` - TotalDurationMS int64 `json:"total_duration_ms,omitempty"` - - GPUDevices []string `json:"gpu_devices,omitempty"` - WorkerHost string `json:"worker_host,omitempty"` - Metadata map[string]string `json:"metadata,omitempty"` - - // Environment captures execution environment for reproducibility (R.1) - Environment *ExecutionEnvironment `json:"environment,omitempty"` - - // Signature fields for tamper detection - Signature string `json:"signature,omitempty"` - SignerKeyID string `json:"signer_key_id,omitempty"` - SigAlg string `json:"sig_alg,omitempty"` + CreatedAt time.Time `json:"created_at"` + StartedAt time.Time `json:"started_at,omitempty"` + EndedAt time.Time `json:"ended_at,omitempty"` + Artifacts *Artifacts `json:"artifacts,omitempty"` + Environment *ExecutionEnvironment `json:"environment,omitempty"` + Metadata map[string]string `json:"metadata,omitempty"` + ExitCode *int `json:"exit_code,omitempty"` + Narrative *Narrative `json:"narrative,omitempty"` + Outcome *Outcome `json:"outcome,omitempty"` + PodmanImage string `json:"podman_image,omitempty"` + SnapshotID string `json:"snapshot_id,omitempty"` + ExperimentManifestSHA string `json:"experiment_manifest_sha,omitempty"` + DepsManifestName string `json:"deps_manifest_name,omitempty"` + DepsManifestSHA string `json:"deps_manifest_sha,omitempty"` + TrainScriptPath string `json:"train_script_path,omitempty"` + WorkerVersion string `json:"worker_version,omitempty"` + RunID string `json:"run_id"` + ImageDigest string `json:"image_digest,omitempty"` + WorkerHost string `json:"worker_host,omitempty"` + SnapshotSHA256 string `json:"snapshot_sha256,omitempty"` + Command string `json:"command,omitempty"` + Args string `json:"args,omitempty"` + CommitID string `json:"commit_id,omitempty"` + Error string `json:"error,omitempty"` + SigAlg string `json:"sig_alg,omitempty"` + SignerKeyID string `json:"signer_key_id,omitempty"` + Signature string `json:"signature,omitempty"` + TaskID string `json:"task_id"` + JobName string `json:"job_name"` + Annotations []Annotation `json:"annotations,omitempty"` + GPUDevices []string `json:"gpu_devices,omitempty"` + TotalDurationMS int64 `json:"total_duration_ms,omitempty"` + FinalizeDurationMS int64 `json:"finalize_duration_ms,omitempty"` + ExecutionDurationMS int64 `json:"execution_duration_ms,omitempty"` + StagingDurationMS int64 `json:"staging_duration_ms,omitempty"` } func NewRunManifest(runID, taskID, jobName string, createdAt time.Time) *RunManifest { @@ -402,3 +391,75 @@ func (m *RunManifest) ValidateStrict() error { v := NewValidator() return v.ValidateStrict(m) } + +// ValidateProvenance checks if the manifest has complete provenance information. +// When ProvenanceBestEffort is false (default), this must pass before writing. +// Returns an error describing what's missing if validation fails. +func (m *RunManifest) ValidateProvenance() error { + if m == nil { + return fmt.Errorf("manifest is nil") + } + + var missing []string + + // Check Environment is present + if m.Environment == nil { + missing = append(missing, "environment metadata") + } else { + // Check ConfigHash - critical for reproducibility + if m.Environment.ConfigHash == "" { + missing = append(missing, "config_hash") + } + + // Check GPU detection method + if m.Environment.GPUDetectionMethod == "" { + missing = append(missing, "gpu_detection_method") + } + + // Check Sandbox configuration + if m.Environment.SandboxNetworkMode == "" { + missing = append(missing, "sandbox_network_mode") + } + } + + // Check Artifacts are present (though they may be empty for new runs) + if m.Artifacts == nil { + missing = append(missing, "artifacts metadata") + } + + // Check basic run identification + if m.RunID == "" { + missing = append(missing, "run_id") + } + if m.TaskID == "" { + missing = append(missing, "task_id") + } + + if len(missing) > 0 { + return fmt.Errorf("incomplete provenance record: missing %v", missing) + } + + return nil +} + +// CanWrite checks if the manifest can be written given the provenance requirements. +// When ProvenanceBestEffort is false (default), requires complete environment capture. +// When true, allows partial manifests with warnings. +func (m *RunManifest) CanWrite(provenanceBestEffort bool) error { + // Always validate basic structure + if err := m.Validate(); err != nil { + return fmt.Errorf("manifest validation failed: %w", err) + } + + // If best-effort is enabled, allow partial manifests + if provenanceBestEffort { + return nil + } + + // Otherwise, require complete provenance (fail-closed default) + if err := m.ValidateProvenance(); err != nil { + return fmt.Errorf("provenance validation failed (set provenance_best_effort: true to allow partial manifests): %w", err) + } + + return nil +} diff --git a/internal/manifest/schema_version.go b/internal/manifest/schema_version.go index ca743e4..c77023c 100644 --- a/internal/manifest/schema_version.go +++ b/internal/manifest/schema_version.go @@ -8,8 +8,8 @@ const SchemaVersion = "1.0.0" type SchemaVersionInfo struct { Version string Date string - Breaking bool Description string + Breaking bool } // SchemaChangeHistory documents all schema versions