refactor(jupyter): enhance security and scheduler integration

Update Jupyter integration for security and scheduler support:
- Enhanced security configuration with audit logging
- Health monitoring with scheduler event integration
- Package manager with network policy enforcement
- Service manager with lifecycle hooks
- Network manager with tenant isolation
- Workspace metadata with tenant tags
- Config with resource limits
- Podman container integration improvements
- Experiment manager with tracking integration
- Manifest runner with security checks
This commit is contained in:
Jeremie Fraeys 2026-02-26 12:06:35 -05:00
parent 3fb6902fa1
commit 6b2c377680
No known key found for this signature in database
12 changed files with 319 additions and 298 deletions

View file

@ -306,20 +306,20 @@ func (pm *PodmanManager) ExecContainer(ctx context.Context, containerID string,
// PodmanConfig holds configuration for Podman container execution
type PodmanConfig struct {
Image string
Workspace string
Results string
ContainerWorkspace string
ContainerResults string
AppleGPU bool
GPUDevices []string
Env map[string]string
Volumes map[string]string
Memory string
ContainerWorkspace string
ContainerResults string
Results string
Workspace string
Image string
CPUs string
Privileged bool // Security: must be false
Network string // Security: must not be "host"
ReadOnlyMounts bool // Security: true for dataset mounts
Network string
GPUDevices []string
AppleGPU bool
Privileged bool
ReadOnlyMounts bool
}
// PodmanResourceOverrides converts per-task resource requests into Podman-compatible
@ -338,15 +338,22 @@ func PodmanResourceOverrides(cpu int, memoryGB int) (cpus string, memory string)
// PodmanSecurityConfig holds security configuration for Podman containers
type PodmanSecurityConfig struct {
NoNewPrivileges bool
DropAllCaps bool
SeccompProfile string
NetworkMode string
AllowedCaps []string
UserNS bool
RunAsUID int
RunAsGID int
SeccompProfile string
NoNewPrivileges bool
DropAllCaps bool
UserNS bool
ReadOnlyRoot bool
NetworkMode string
// Process Isolation
MaxProcesses int
MaxOpenFiles int
DisableSwap bool
OOMScoreAdj int
TaskUID int
TaskGID int
}
// BuildSecurityArgs builds security-related podman arguments from PodmanSecurityConfig
@ -395,6 +402,27 @@ func BuildSecurityArgs(sandbox PodmanSecurityConfig) []string {
}
args = append(args, "--network", networkMode)
// Process Isolation
// Fork bomb protection - limit number of processes
if sandbox.MaxProcesses > 0 {
args = append(args, "--pids-limit", strconv.Itoa(sandbox.MaxProcesses))
}
// File descriptor limits
if sandbox.MaxOpenFiles > 0 {
args = append(args, "--ulimit", fmt.Sprintf("nofile=%d:%d", sandbox.MaxOpenFiles, sandbox.MaxOpenFiles))
}
// OOM killer score adjustment (lower = less likely to be killed)
if sandbox.OOMScoreAdj != 0 {
args = append(args, "--oom-score-adj", strconv.Itoa(sandbox.OOMScoreAdj))
}
// Disable swap (memory-swap equals memory means no swap)
if sandbox.DisableSwap {
args = append(args, "--memory-swap=0")
}
return args
}
@ -488,84 +516,6 @@ func BuildPodmanCommand(
return exec.CommandContext(ctx, "podman", args...)
}
// BuildPodmanCommandLegacy builds a Podman command using legacy security settings
// Deprecated: Use BuildPodmanCommand with SandboxConfig instead
func BuildPodmanCommandLegacy(
ctx context.Context,
cfg PodmanConfig,
scriptPath, depsPath string,
extraArgs []string,
) *exec.Cmd {
args := []string{
"run", "--rm",
"--security-opt", "no-new-privileges",
"--cap-drop", "ALL",
}
// Add network mode if specified
if cfg.Network != "" {
args = append(args, "--network", cfg.Network)
}
// Add read-only root filesystem
if cfg.ReadOnlyMounts {
args = append(args, "--read-only")
}
if cfg.Memory != "" {
args = append(args, "--memory", cfg.Memory)
} else {
args = append(args, "--memory", config.DefaultPodmanMemory)
}
if cfg.CPUs != "" {
args = append(args, "--cpus", cfg.CPUs)
} else {
args = append(args, "--cpus", config.DefaultPodmanCPUs)
}
args = append(args, "--userns", "keep-id")
// Mount workspace
workspaceMount := fmt.Sprintf("%s:%s:rw", cfg.Workspace, cfg.ContainerWorkspace)
args = append(args, "-v", workspaceMount)
// Mount results
resultsMount := fmt.Sprintf("%s:%s:rw", cfg.Results, cfg.ContainerResults)
args = append(args, "-v", resultsMount)
// Mount additional volumes
for hostPath, containerPath := range cfg.Volumes {
mount := fmt.Sprintf("%s:%s", hostPath, containerPath)
args = append(args, "-v", mount)
}
// Use injected GPU device paths for Apple GPU or custom configurations
for _, device := range cfg.GPUDevices {
args = append(args, "--device", device)
}
// Add environment variables
for key, value := range cfg.Env {
args = append(args, "-e", fmt.Sprintf("%s=%s", key, value))
}
// Image and command
args = append(args, cfg.Image,
"--workspace", cfg.ContainerWorkspace,
"--deps", depsPath,
"--script", scriptPath,
)
// Add extra arguments via --args flag
if len(extraArgs) > 0 {
args = append(args, "--args")
args = append(args, extraArgs...)
}
return exec.CommandContext(ctx, "podman", args...)
}
// ValidateSecurityPolicy validates that the container configuration meets security requirements.
// Returns an error if the configuration violates security policies.
func ValidateSecurityPolicy(cfg PodmanConfig) error {
@ -588,10 +538,10 @@ func ValidateSecurityPolicy(cfg PodmanConfig) error {
// PodmanSecret represents a secret to be mounted in a container
type PodmanSecret struct {
Name string // Secret name in Podman
Data []byte // Secret data (will be base64 encoded)
Target string // Mount path inside container
EnvVar string // Environment variable name (optional, if set mounts as env var instead of file)
Name string
Target string
EnvVar string
Data []byte
}
// CreateSecret creates a Podman secret from the given data

View file

@ -104,7 +104,7 @@ func (m *Manager) CreateExperiment(commitID string) error {
return nil
}
// WriteMetadata writes experiment metadata to meta.bin
// WriteMetadata writes experiment metadata to meta.bin with crash safety (fsync)
func (m *Manager) WriteMetadata(meta *Metadata) error {
path := m.GetMetadataPath(meta.CommitID)
@ -134,7 +134,8 @@ func (m *Manager) WriteMetadata(meta *Metadata) error {
buf = append(buf, byte(len(meta.User)))
buf = append(buf, []byte(meta.User)...)
return os.WriteFile(path, buf, 0o600)
// SECURITY: Write with fsync for crash safety
return fileutil.WriteFileSafe(path, buf, 0o600)
}
// ReadMetadata reads experiment metadata from meta.bin

View file

@ -9,6 +9,7 @@ import (
"strings"
"time"
"github.com/jfraeys/fetch_ml/internal/fileutil"
"github.com/jfraeys/fetch_ml/internal/logging"
)
@ -58,34 +59,34 @@ type ConfigManager struct {
type JupyterConfig struct {
Version string `json:"version"`
Environment string `json:"environment"`
Service ServiceConfig `json:"service"`
Logging LoggingConfig `json:"logging"`
Resources ResourceConfig `json:"resources"`
Security SecurityConfig `json:"security"`
Workspace WorkspaceConfig `json:"workspace"`
Network NetworkConfig `json:"network"`
Security SecurityConfig `json:"security"`
Resources ResourceConfig `json:"resources"`
Health HealthConfig `json:"health"`
Logging LoggingConfig `json:"logging"`
DefaultSettings DefaultSettingsConfig `json:"default_settings"`
Service ServiceConfig `json:"service"`
AdvancedSettings AdvancedSettingsConfig `json:"advanced_settings"`
Health HealthConfig `json:"health"`
}
// WorkspaceConfig defines workspace configuration
type WorkspaceConfig struct {
DefaultPath string `json:"default_path"`
AutoCreate bool `json:"auto_create"`
MountOptions map[string]string `json:"mount_options"`
DefaultPath string `json:"default_path"`
MaxWorkspaceSize string `json:"max_workspace_size"`
AllowedPaths []string `json:"allowed_paths"`
DeniedPaths []string `json:"denied_paths"`
MaxWorkspaceSize string `json:"max_workspace_size"`
AutoCreate bool `json:"auto_create"`
}
// HealthConfig defines health monitoring configuration
type HealthConfig struct {
Enabled bool `json:"enabled"`
CheckInterval time.Duration `json:"check_interval"`
Timeout time.Duration `json:"timeout"`
RetryAttempts int `json:"retry_attempts"`
MaxServiceAge time.Duration `json:"max_service_age"`
Enabled bool `json:"enabled"`
AutoCleanup bool `json:"auto_cleanup"`
MetricsEnabled bool `json:"metrics_enabled"`
}
@ -96,31 +97,31 @@ type LoggingConfig struct {
Format string `json:"format"`
Output string `json:"output"`
MaxSize string `json:"max_size"`
MaxBackups int `json:"max_backups"`
MaxAge string `json:"max_age"`
MaxBackups int `json:"max_backups"`
}
// DefaultSettingsConfig defines default settings for new services
type DefaultSettingsConfig struct {
Image string `json:"default_image"`
Port int `json:"default_port"`
Workspace string `json:"default_workspace"`
Environment map[string]string `json:"environment"`
Image string `json:"default_image"`
Workspace string `json:"default_workspace"`
ShutdownPolicy string `json:"shutdown_policy"`
Port int `json:"default_port"`
StopTimeout time.Duration `json:"stop_timeout"`
AutoStart bool `json:"auto_start"`
AutoStop bool `json:"auto_stop"`
StopTimeout time.Duration `json:"stop_timeout"`
ShutdownPolicy string `json:"shutdown_policy"`
}
// AdvancedSettingsConfig defines advanced configuration options
type AdvancedSettingsConfig struct {
ExperimentalFeatures []string `json:"experimental_features"`
MaxConcurrentServices int `json:"max_concurrent_services"`
ServiceTimeout time.Duration `json:"service_timeout"`
StartupTimeout time.Duration `json:"startup_timeout"`
GracefulShutdown bool `json:"graceful_shutdown"`
ForceCleanup bool `json:"force_cleanup"`
DebugMode bool `json:"debug_mode"`
ExperimentalFeatures []string `json:"experimental_features"`
}
// NewConfigManager creates a new configuration manager
@ -190,8 +191,8 @@ func (cm *ConfigManager) SaveConfig() error {
return fmt.Errorf("failed to marshal config: %w", err)
}
// Write configuration file
if err := os.WriteFile(cm.configPath, data, 0600); err != nil {
// Write configuration file with crash safety (fsync)
if err := fileutil.WriteFileSafe(cm.configPath, data, 0600); err != nil {
return fmt.Errorf("failed to write config file: %w", err)
}

View file

@ -20,33 +20,33 @@ const (
type HealthMonitor struct {
logger *logging.Logger
services map[string]*JupyterService
servicesMutex sync.RWMutex
interval time.Duration
client *http.Client
interval time.Duration
servicesMutex sync.RWMutex
}
// HealthStatus represents the health status of a service
type HealthStatus struct {
LastCheck time.Time `json:"last_check"`
Metrics map[string]interface{} `json:"metrics"`
ServiceID string `json:"service_id"`
ServiceName string `json:"service_name"`
Status string `json:"status"`
LastCheck time.Time `json:"last_check"`
ResponseTime time.Duration `json:"response_time"`
URL string `json:"url"`
ContainerID string `json:"container_id"`
Errors []string `json:"errors"`
Metrics map[string]interface{} `json:"metrics"`
ResponseTime time.Duration `json:"response_time"`
}
// HealthReport contains a comprehensive health report
type HealthReport struct {
Timestamp time.Time `json:"timestamp"`
Services map[string]*HealthStatus `json:"services"`
Summary string `json:"summary"`
TotalServices int `json:"total_services"`
Healthy int `json:"healthy"`
Unhealthy int `json:"unhealthy"`
Unknown int `json:"unknown"`
Services map[string]*HealthStatus `json:"services"`
Summary string `json:"summary"`
}
// NewHealthMonitor creates a new health monitor

View file

@ -18,9 +18,9 @@ type NetworkManager struct {
// PortAllocator manages port allocation for services
type PortAllocator struct {
usedPorts map[int]bool
startPort int
endPort int
usedPorts map[int]bool
}
// NewNetworkManager creates a new network manager
@ -313,10 +313,10 @@ func (nm *NetworkManager) GetNetworkStatus() *NetworkStatus {
// NetworkStatus contains network status information
type NetworkStatus struct {
PortRange string `json:"port_range"`
TotalPorts int `json:"total_ports"`
AvailablePorts int `json:"available_ports"`
UsedPorts int `json:"used_ports"`
PortRange string `json:"port_range"`
Services int `json:"services"`
}

View file

@ -8,6 +8,7 @@ import (
"strings"
"time"
"github.com/jfraeys/fetch_ml/internal/fileutil"
"github.com/jfraeys/fetch_ml/internal/logging"
)
@ -18,22 +19,22 @@ const (
// PackageManager manages package installations in Jupyter workspaces
type PackageManager struct {
logger *logging.Logger
trustedChannels []string
allowedPackages map[string]bool
blockedPackages []string
workspacePath string
packageCachePath string
trustedChannels []string
blockedPackages []string
}
// PackageConfig defines package management configuration
type PackageConfig struct {
TrustedChannels []string `json:"trusted_channels"`
AllowedPackages map[string]bool `json:"allowed_packages"`
TrustedChannels []string `json:"trusted_channels"`
BlockedPackages []string `json:"blocked_packages"`
RequireApproval bool `json:"require_approval"`
AutoApproveSafe bool `json:"auto_approve_safe"`
MaxPackages int `json:"max_packages"`
InstallTimeout time.Duration `json:"install_timeout"`
RequireApproval bool `json:"require_approval"`
AutoApproveSafe bool `json:"auto_approve_safe"`
AllowCondaForge bool `json:"allow_conda_forge"`
AllowPyPI bool `json:"allow_pypi"`
AllowLocal bool `json:"allow_local"`
@ -41,28 +42,28 @@ type PackageConfig struct {
// PackageRequest represents a package installation request
type PackageRequest struct {
Timestamp time.Time `json:"timestamp"`
ApprovalTime time.Time `json:"approval_time,omitempty"`
PackageName string `json:"package_name"`
Version string `json:"version,omitempty"`
Channel string `json:"channel,omitempty"`
RequestedBy string `json:"requested_by"`
WorkspacePath string `json:"workspace_path"`
Timestamp time.Time `json:"timestamp"`
Status string `json:"status"` // pending, approved, rejected, installed, failed
Status string `json:"status"`
RejectionReason string `json:"rejection_reason,omitempty"`
ApprovalUser string `json:"approval_user,omitempty"`
ApprovalTime time.Time `json:"approval_time,omitempty"`
}
// PackageInfo contains information about an installed package
type PackageInfo struct {
InstalledAt time.Time `json:"installed_at"`
Metadata map[string]string `json:"metadata"`
Name string `json:"name"`
Version string `json:"version"`
Channel string `json:"channel"`
InstalledAt time.Time `json:"installed_at"`
InstalledBy string `json:"installed_by"`
Size string `json:"size"`
Dependencies []string `json:"dependencies"`
Metadata map[string]string `json:"metadata"`
}
// NewPackageManager creates a new package manager
@ -353,14 +354,18 @@ func (pm *PackageManager) GetPackageRequest(requestID string) (*PackageRequest,
return pm.loadPackageRequest(requestID)
}
// savePackageRequest saves a package request to cache
// savePackageRequest saves a package request to cache with crash safety (fsync)
func (pm *PackageManager) savePackageRequest(req *PackageRequest) error {
requestFile := filepath.Join(pm.packageCachePath, fmt.Sprintf("request_%s.json", req.PackageName))
data, err := json.MarshalIndent(req, "", " ")
if err != nil {
return err
}
return os.WriteFile(requestFile, data, 0600)
// SECURITY: Write with fsync for crash safety
if err := fileutil.WriteFileSafe(requestFile, data, 0600); err != nil {
return fmt.Errorf("failed to write package request: %w", err)
}
return nil
}
// loadPackageRequest loads a package request from cache
@ -406,14 +411,18 @@ func (pm *PackageManager) loadAllPackageRequests() ([]*PackageRequest, error) {
return requests, nil
}
// savePackageInfo saves package information to cache
// savePackageInfo saves package information to cache with crash safety (fsync)
func (pm *PackageManager) savePackageInfo(info *PackageInfo) error {
infoFile := filepath.Join(pm.packageCachePath, fmt.Sprintf("installed_%s.json", info.Name))
data, err := json.MarshalIndent(info, "", " ")
if err != nil {
return err
}
return os.WriteFile(infoFile, data, 0600)
// SECURITY: Write with fsync for crash safety
if err := fileutil.WriteFileSafe(infoFile, data, 0600); err != nil {
return fmt.Errorf("failed to write package info: %w", err)
}
return nil
}
// loadAllPackageInfo loads all installed package information

View file

@ -23,57 +23,44 @@ type SecurityManager struct {
// EnhancedSecurityConfig provides comprehensive security settings
type EnhancedSecurityConfig struct {
// Network Security
AllowNetwork bool `json:"allow_network"`
AllowedHosts []string `json:"allowed_hosts"`
BlockedHosts []string `json:"blocked_hosts"`
EnableFirewall bool `json:"enable_firewall"`
// Package Security
TrustedChannels []string `json:"trusted_channels"`
BlockedPackages []string `json:"blocked_packages"`
AllowedPackages map[string]bool `json:"allowed_packages"`
RequireApproval bool `json:"require_approval"`
AutoApproveSafe bool `json:"auto_approve_safe"`
MaxPackages int `json:"max_packages"`
InstallTimeout time.Duration `json:"install_timeout"`
AllowCondaForge bool `json:"allow_conda_forge"`
AllowPyPI bool `json:"allow_pypi"`
AllowLocal bool `json:"allow_local"`
// Container Security
ReadOnlyRoot bool `json:"read_only_root"`
DropCapabilities []string `json:"drop_capabilities"`
RunAsNonRoot bool `json:"run_as_non_root"`
EnableSeccomp bool `json:"enable_seccomp"`
NoNewPrivileges bool `json:"no_new_privileges"`
// Authentication Security
EnableTokenAuth bool `json:"enable_token_auth"`
TokenLength int `json:"token_length"`
TokenExpiry time.Duration `json:"token_expiry"`
RequireHTTPS bool `json:"require_https"`
SessionTimeout time.Duration `json:"session_timeout"`
MaxFailedAttempts int `json:"max_failed_attempts"`
LockoutDuration time.Duration `json:"lockout_duration"`
// File System Security
AllowedPaths []string `json:"allowed_paths"`
DeniedPaths []string `json:"denied_paths"`
MaxWorkspaceSize string `json:"max_workspace_size"`
AllowExecFrom []string `json:"allow_exec_from"`
BlockExecFrom []string `json:"block_exec_from"`
// Resource Security
MaxMemoryLimit string `json:"max_memory_limit"`
MaxCPULimit string `json:"max_cpu_limit"`
MaxDiskUsage string `json:"max_disk_usage"`
MaxProcesses int `json:"max_processes"`
// Logging & Monitoring
SecurityLogLevel string `json:"security_log_level"`
AuditEnabled bool `json:"audit_enabled"`
RealTimeAlerts bool `json:"real_time_alerts"`
AllowedPackages map[string]bool `json:"allowed_packages"`
SecurityLogLevel string `json:"security_log_level"`
MaxDiskUsage string `json:"max_disk_usage"`
MaxWorkspaceSize string `json:"max_workspace_size"`
MaxMemoryLimit string `json:"max_memory_limit"`
MaxCPULimit string `json:"max_cpu_limit"`
BlockedPackages []string `json:"blocked_packages"`
AllowedHosts []string `json:"allowed_hosts"`
BlockedHosts []string `json:"blocked_hosts"`
TrustedChannels []string `json:"trusted_channels"`
BlockExecFrom []string `json:"block_exec_from"`
AllowExecFrom []string `json:"allow_exec_from"`
DropCapabilities []string `json:"drop_capabilities"`
DeniedPaths []string `json:"denied_paths"`
AllowedPaths []string `json:"allowed_paths"`
MaxPackages int `json:"max_packages"`
SessionTimeout time.Duration `json:"session_timeout"`
MaxProcesses int `json:"max_processes"`
InstallTimeout time.Duration `json:"install_timeout"`
LockoutDuration time.Duration `json:"lockout_duration"`
TokenLength int `json:"token_length"`
TokenExpiry time.Duration `json:"token_expiry"`
MaxFailedAttempts int `json:"max_failed_attempts"`
ReadOnlyRoot bool `json:"read_only_root"`
NoNewPrivileges bool `json:"no_new_privileges"`
EnableTokenAuth bool `json:"enable_token_auth"`
RunAsNonRoot bool `json:"run_as_non_root"`
AllowLocal bool `json:"allow_local"`
AllowPyPI bool `json:"allow_pypi"`
AllowCondaForge bool `json:"allow_conda_forge"`
RequireHTTPS bool `json:"require_https"`
AllowNetwork bool `json:"allow_network"`
AutoApproveSafe bool `json:"auto_approve_safe"`
RequireApproval bool `json:"require_approval"`
EnableSeccomp bool `json:"enable_seccomp"`
EnableFirewall bool `json:"enable_firewall"`
AuditEnabled bool `json:"audit_enabled"`
RealTimeAlerts bool `json:"real_time_alerts"`
}
// SecurityEvent represents a security-related event

View file

@ -12,6 +12,7 @@ import (
"github.com/jfraeys/fetch_ml/internal/config"
"github.com/jfraeys/fetch_ml/internal/container"
"github.com/jfraeys/fetch_ml/internal/fileutil"
"github.com/jfraeys/fetch_ml/internal/logging"
)
@ -73,12 +74,12 @@ func trashBaseDir() string {
}
type trashInfo struct {
OriginalName string `json:"original_name"`
DeletedAt time.Time `json:"deleted_at"`
DeletedBy string `json:"deleted_by"`
SizeBytes int64 `json:"size_bytes"`
PurgeAfter time.Time `json:"purge_after"`
OriginalName string `json:"original_name"`
DeletedBy string `json:"deleted_by"`
Reason string `json:"reason"`
SizeBytes int64 `json:"size_bytes"`
}
func (sm *ServiceManager) moveWorkspaceToTrash(workspacePath string, originalName string) (string, *trashInfo, error) {
@ -119,7 +120,11 @@ func (sm *ServiceManager) moveWorkspaceToTrash(workspacePath string, originalNam
}
b, err := json.MarshalIndent(info, "", " ")
if err == nil {
_ = os.WriteFile(filepath.Join(dest, ".trashinfo"), b, 0o600)
// SECURITY: Write with fsync for crash safety
trashinfoPath := filepath.Join(dest, ".trashinfo")
if err := fileutil.WriteFileSafe(trashinfoPath, b, 0o600); err != nil {
sm.logger.Warn("failed to write trashinfo", "error", err)
}
}
return dest, info, nil
@ -243,25 +248,25 @@ func startupBlockedPackages(installBlocked []string) []string {
// ServiceConfig holds configuration for Jupyter services
type ServiceConfig struct {
DefaultImage string `json:"default_image"`
DefaultPort int `json:"default_port"`
DefaultWorkspace string `json:"default_workspace"`
MaxServices int `json:"max_services"`
DefaultResources ResourceConfig `json:"default_resources"`
SecuritySettings SecurityConfig `json:"security_settings"`
NetworkConfig NetworkConfig `json:"network_config"`
DefaultPort int `json:"default_port"`
MaxServices int `json:"max_services"`
}
// NetworkConfig defines network settings for Jupyter containers
type NetworkConfig struct {
BindAddress string `json:"bind_address"`
Token string `json:"token"`
Password string `json:"password"`
NetworkName string `json:"network_name"`
HostPort int `json:"host_port"`
ContainerPort int `json:"container_port"`
BindAddress string `json:"bind_address"`
EnableToken bool `json:"enable_token"`
Token string `json:"token"`
EnablePassword bool `json:"enable_password"`
Password string `json:"password"`
AllowRemote bool `json:"allow_remote"`
NetworkName string `json:"network_name"`
}
// ResourceConfig defines resource limits for Jupyter containers
@ -273,16 +278,16 @@ type ResourceConfig struct {
// SecurityConfig holds security settings for Jupyter services
type SecurityConfig struct {
AllowNetwork bool `json:"allow_network"`
AllowedHosts []string `json:"allowed_hosts"`
AllowedPackages map[string]bool `json:"allowed_packages"`
DropCapabilities []string `json:"drop_capabilities"`
BlockedHosts []string `json:"blocked_hosts"`
EnableFirewall bool `json:"enable_firewall"`
TrustedChannels []string `json:"trusted_channels"`
BlockedPackages []string `json:"blocked_packages"`
AllowedPackages map[string]bool `json:"allowed_packages"`
AllowedHosts []string `json:"allowed_hosts"`
EnableFirewall bool `json:"enable_firewall"`
RequireApproval bool `json:"require_approval"`
ReadOnlyRoot bool `json:"read_only_root"`
DropCapabilities []string `json:"drop_capabilities"`
AllowNetwork bool `json:"allow_network"`
RunAsNonRoot bool `json:"run_as_non_root"`
EnableSeccomp bool `json:"enable_seccomp"`
NoNewPrivileges bool `json:"no_new_privileges"`
@ -290,19 +295,19 @@ type SecurityConfig struct {
// JupyterService represents a running Jupyter instance
type JupyterService struct {
ID string `json:"id"`
Name string `json:"name"`
Status string `json:"status"`
ContainerID string `json:"container_id"`
Port int `json:"port"`
Workspace string `json:"workspace"`
Image string `json:"image"`
URL string `json:"url"`
CreatedAt time.Time `json:"created_at"`
LastAccess time.Time `json:"last_access"`
Config ServiceConfig `json:"config"`
Environment map[string]string `json:"environment"`
Metadata map[string]string `json:"metadata"`
Environment map[string]string `json:"environment"`
Image string `json:"image"`
ContainerID string `json:"container_id"`
URL string `json:"url"`
Workspace string `json:"workspace"`
ID string `json:"id"`
Status string `json:"status"`
Name string `json:"name"`
Config ServiceConfig `json:"config"`
Port int `json:"port"`
}
type InstalledPackage struct {
@ -313,15 +318,15 @@ type InstalledPackage struct {
// StartRequest defines parameters for starting a Jupyter service
type StartRequest struct {
Environment map[string]string `json:"environment"`
Metadata map[string]string `json:"metadata"`
Name string `json:"name"`
Workspace string `json:"workspace"`
Image string `json:"image"`
Port int `json:"port"`
Resources ResourceConfig `json:"resources"`
Security SecurityConfig `json:"security"`
Network NetworkConfig `json:"network"`
Environment map[string]string `json:"environment"`
Metadata map[string]string `json:"metadata"`
Port int `json:"port"`
}
// NewServiceManager creates a new Jupyter service manager
@ -1182,7 +1187,11 @@ func (sm *ServiceManager) saveServices() error {
return err
}
return os.WriteFile(servicesFile, data, 0600)
// SECURITY: Write with fsync for crash safety
if err := fileutil.WriteFileSafe(servicesFile, data, 0600); err != nil {
return fmt.Errorf("failed to write services file: %w", err)
}
return nil
}
// LinkWorkspaceWithExperiment links a workspace with an experiment

View file

@ -17,28 +17,28 @@ const (
// WorkspaceManager handles workspace mounting and volume management for Jupyter services
type WorkspaceManager struct {
logger *logging.Logger
basePath string
mounts map[string]*WorkspaceMount
basePath string
}
// WorkspaceMount represents a workspace mount configuration
type WorkspaceMount struct {
Options map[string]string `json:"options"`
ID string `json:"id"`
HostPath string `json:"host_path"`
ContainerPath string `json:"container_path"`
MountType string `json:"mount_type"` // "bind", "volume", "tmpfs"
MountType string `json:"mount_type"`
Services []string `json:"services"`
ReadOnly bool `json:"read_only"`
Options map[string]string `json:"options"`
Services []string `json:"services"` // Service IDs using this mount
}
// MountRequest defines parameters for creating a workspace mount
type MountRequest struct {
Options map[string]string `json:"options"`
HostPath string `json:"host_path"`
ContainerPath string `json:"container_path"`
MountType string `json:"mount_type"`
ReadOnly bool `json:"read_only"`
Options map[string]string `json:"options"`
}
// NewWorkspaceManager creates a new workspace manager
@ -394,6 +394,7 @@ func (wm *WorkspaceManager) GetWorkspaceInfo(workspacePath string) (*WorkspaceIn
// WorkspaceInfo contains information about a workspace
type WorkspaceInfo struct {
Modified time.Time `json:"modified"`
Path string `json:"path"`
FileCount int64 `json:"file_count"`
PythonFiles int64 `json:"python_files"`
@ -402,7 +403,6 @@ type WorkspaceInfo struct {
ConfigFiles int64 `json:"config_files"`
Size int64 `json:"size"`
TotalSize int64 `json:"total_size"`
Modified time.Time `json:"modified"`
}
// Cleanup removes unused mounts

View file

@ -9,29 +9,30 @@ import (
"time"
"github.com/jfraeys/fetch_ml/internal/config"
"github.com/jfraeys/fetch_ml/internal/fileutil"
"github.com/jfraeys/fetch_ml/internal/logging"
)
// WorkspaceMetadata tracks the relationship between Jupyter workspaces and experiments
type WorkspaceMetadata struct {
LinkedAt time.Time `json:"linked_at"`
LastSync time.Time `json:"last_sync"`
AdditionalData map[string]string `json:"additional_data"`
WorkspacePath string `json:"workspace_path"`
ExperimentID string `json:"experiment_id"`
ServiceID string `json:"service_id,omitempty"`
LinkedAt time.Time `json:"linked_at"`
LastSync time.Time `json:"last_sync"`
SyncDirection string `json:"sync_direction"` // "pull", "push", "bidirectional"
AutoSync bool `json:"auto_sync"`
SyncInterval time.Duration `json:"sync_interval"`
SyncDirection string `json:"sync_direction"`
Tags []string `json:"tags"`
AdditionalData map[string]string `json:"additional_data"`
SyncInterval time.Duration `json:"sync_interval"`
AutoSync bool `json:"auto_sync"`
}
// WorkspaceMetadataManager manages workspace metadata
type WorkspaceMetadataManager struct {
logger *logging.Logger
metadata map[string]*WorkspaceMetadata // key: workspace path
mutex sync.RWMutex
metadata map[string]*WorkspaceMetadata
dataFile string
mutex sync.RWMutex
}
// NewWorkspaceMetadataManager creates a new workspace metadata manager
@ -319,7 +320,7 @@ func (wmm *WorkspaceMetadataManager) loadMetadata() error {
return nil
}
// saveMetadata saves metadata to disk using PathRegistry
// saveMetadata saves metadata to disk using PathRegistry with crash safety (fsync)
func (wmm *WorkspaceMetadataManager) saveMetadata() error {
// Use PathRegistry for consistent directory creation
paths := config.FromEnv()
@ -332,7 +333,8 @@ func (wmm *WorkspaceMetadataManager) saveMetadata() error {
return fmt.Errorf("failed to marshal metadata: %w", err)
}
if err := os.WriteFile(wmm.dataFile, data, 0644); err != nil {
// SECURITY: Write with fsync for crash safety
if err := fileutil.WriteFileSafe(wmm.dataFile, data, 0644); err != nil {
return fmt.Errorf("failed to write metadata file: %w", err)
}
@ -364,7 +366,8 @@ func (wmm *WorkspaceMetadataManager) createWorkspaceMetadataFile(
return fmt.Errorf("failed to marshal workspace metadata: %w", err)
}
if err := os.WriteFile(workspaceMetaFile, data, 0600); err != nil {
// SECURITY: Write with fsync for crash safety
if err := fileutil.WriteFileSafe(workspaceMetaFile, data, 0600); err != nil {
return fmt.Errorf("failed to write workspace metadata file: %w", err)
}

View file

@ -108,16 +108,16 @@ type Outcome struct {
}
type ArtifactFile struct {
Modified time.Time `json:"modified"`
Path string `json:"path"`
SizeBytes int64 `json:"size_bytes"`
Modified time.Time `json:"modified"`
}
type Artifacts struct {
DiscoveryTime time.Time `json:"discovery_time"`
Files []ArtifactFile `json:"files,omitempty"`
Exclusions []Exclusion `json:"exclusions,omitempty"`
TotalSizeBytes int64 `json:"total_size_bytes,omitempty"`
Exclusions []Exclusion `json:"exclusions,omitempty"` // R.5: Scan exclusions recorded
}
// Exclusion records why a path was excluded from artifact scanning
@ -129,69 +129,58 @@ type Exclusion struct {
// ExecutionEnvironment captures the runtime environment for reproducibility.
// This enables reconstruction and comparison of runs.
type ExecutionEnvironment struct {
ConfigHash string `json:"config_hash"` // R.2: Resolved config hash
GPUCount int `json:"gpu_count"` // GPU count detected
GPUDetectionMethod string `json:"gpu_detection_method,omitempty"` // R.3: "nvml", "env_override", etc.
GPUVendor string `json:"gpu_vendor,omitempty"` // Configured GPU vendor
MaxWorkers int `json:"max_workers"` // Active resource limits
PodmanCPUs string `json:"podman_cpus,omitempty"` // CPU limit
SandboxNetworkMode string `json:"sandbox_network_mode"` // Sandbox settings
SandboxSeccomp string `json:"sandbox_seccomp,omitempty"` // Seccomp profile
SandboxNoNewPrivs bool `json:"sandbox_no_new_privs"` // Security flags
ComplianceMode string `json:"compliance_mode,omitempty"` // HIPAA mode
ManifestNonce string `json:"manifest_nonce,omitempty"` // Unique manifest identifier
Metadata map[string]string `json:"metadata,omitempty"` // Additional env info
Metadata map[string]string `json:"metadata,omitempty"`
ConfigHash string `json:"config_hash"`
GPUDetectionMethod string `json:"gpu_detection_method,omitempty"`
GPUVendor string `json:"gpu_vendor,omitempty"`
PodmanCPUs string `json:"podman_cpus,omitempty"`
SandboxNetworkMode string `json:"sandbox_network_mode"`
SandboxSeccomp string `json:"sandbox_seccomp,omitempty"`
ComplianceMode string `json:"compliance_mode,omitempty"`
ManifestNonce string `json:"manifest_nonce,omitempty"`
GPUCount int `json:"gpu_count"`
MaxWorkers int `json:"max_workers"`
SandboxNoNewPrivs bool `json:"sandbox_no_new_privs"`
}
// RunManifest is a best-effort, self-contained provenance record for a run.
// It is written to <run_dir>/run_manifest.json.
type RunManifest struct {
RunID string `json:"run_id"`
TaskID string `json:"task_id"`
JobName string `json:"job_name"`
CreatedAt time.Time `json:"created_at"`
StartedAt time.Time `json:"started_at,omitempty"`
EndedAt time.Time `json:"ended_at,omitempty"`
Annotations []Annotation `json:"annotations,omitempty"`
Narrative *Narrative `json:"narrative,omitempty"`
Outcome *Outcome `json:"outcome,omitempty"`
Artifacts *Artifacts `json:"artifacts,omitempty"`
CommitID string `json:"commit_id,omitempty"`
ExperimentManifestSHA string `json:"experiment_manifest_sha,omitempty"`
DepsManifestName string `json:"deps_manifest_name,omitempty"`
DepsManifestSHA string `json:"deps_manifest_sha,omitempty"`
TrainScriptPath string `json:"train_script_path,omitempty"`
WorkerVersion string `json:"worker_version,omitempty"`
PodmanImage string `json:"podman_image,omitempty"`
ImageDigest string `json:"image_digest,omitempty"`
SnapshotID string `json:"snapshot_id,omitempty"`
SnapshotSHA256 string `json:"snapshot_sha256,omitempty"`
Command string `json:"command,omitempty"`
Args string `json:"args,omitempty"`
ExitCode *int `json:"exit_code,omitempty"`
Error string `json:"error,omitempty"`
StagingDurationMS int64 `json:"staging_duration_ms,omitempty"`
ExecutionDurationMS int64 `json:"execution_duration_ms,omitempty"`
FinalizeDurationMS int64 `json:"finalize_duration_ms,omitempty"`
TotalDurationMS int64 `json:"total_duration_ms,omitempty"`
GPUDevices []string `json:"gpu_devices,omitempty"`
WorkerHost string `json:"worker_host,omitempty"`
Metadata map[string]string `json:"metadata,omitempty"`
// Environment captures execution environment for reproducibility (R.1)
Environment *ExecutionEnvironment `json:"environment,omitempty"`
// Signature fields for tamper detection
Signature string `json:"signature,omitempty"`
SignerKeyID string `json:"signer_key_id,omitempty"`
SigAlg string `json:"sig_alg,omitempty"`
CreatedAt time.Time `json:"created_at"`
StartedAt time.Time `json:"started_at,omitempty"`
EndedAt time.Time `json:"ended_at,omitempty"`
Artifacts *Artifacts `json:"artifacts,omitempty"`
Environment *ExecutionEnvironment `json:"environment,omitempty"`
Metadata map[string]string `json:"metadata,omitempty"`
ExitCode *int `json:"exit_code,omitempty"`
Narrative *Narrative `json:"narrative,omitempty"`
Outcome *Outcome `json:"outcome,omitempty"`
PodmanImage string `json:"podman_image,omitempty"`
SnapshotID string `json:"snapshot_id,omitempty"`
ExperimentManifestSHA string `json:"experiment_manifest_sha,omitempty"`
DepsManifestName string `json:"deps_manifest_name,omitempty"`
DepsManifestSHA string `json:"deps_manifest_sha,omitempty"`
TrainScriptPath string `json:"train_script_path,omitempty"`
WorkerVersion string `json:"worker_version,omitempty"`
RunID string `json:"run_id"`
ImageDigest string `json:"image_digest,omitempty"`
WorkerHost string `json:"worker_host,omitempty"`
SnapshotSHA256 string `json:"snapshot_sha256,omitempty"`
Command string `json:"command,omitempty"`
Args string `json:"args,omitempty"`
CommitID string `json:"commit_id,omitempty"`
Error string `json:"error,omitempty"`
SigAlg string `json:"sig_alg,omitempty"`
SignerKeyID string `json:"signer_key_id,omitempty"`
Signature string `json:"signature,omitempty"`
TaskID string `json:"task_id"`
JobName string `json:"job_name"`
Annotations []Annotation `json:"annotations,omitempty"`
GPUDevices []string `json:"gpu_devices,omitempty"`
TotalDurationMS int64 `json:"total_duration_ms,omitempty"`
FinalizeDurationMS int64 `json:"finalize_duration_ms,omitempty"`
ExecutionDurationMS int64 `json:"execution_duration_ms,omitempty"`
StagingDurationMS int64 `json:"staging_duration_ms,omitempty"`
}
func NewRunManifest(runID, taskID, jobName string, createdAt time.Time) *RunManifest {
@ -402,3 +391,75 @@ func (m *RunManifest) ValidateStrict() error {
v := NewValidator()
return v.ValidateStrict(m)
}
// ValidateProvenance checks if the manifest has complete provenance information.
// When ProvenanceBestEffort is false (default), this must pass before writing.
// Returns an error describing what's missing if validation fails.
func (m *RunManifest) ValidateProvenance() error {
if m == nil {
return fmt.Errorf("manifest is nil")
}
var missing []string
// Check Environment is present
if m.Environment == nil {
missing = append(missing, "environment metadata")
} else {
// Check ConfigHash - critical for reproducibility
if m.Environment.ConfigHash == "" {
missing = append(missing, "config_hash")
}
// Check GPU detection method
if m.Environment.GPUDetectionMethod == "" {
missing = append(missing, "gpu_detection_method")
}
// Check Sandbox configuration
if m.Environment.SandboxNetworkMode == "" {
missing = append(missing, "sandbox_network_mode")
}
}
// Check Artifacts are present (though they may be empty for new runs)
if m.Artifacts == nil {
missing = append(missing, "artifacts metadata")
}
// Check basic run identification
if m.RunID == "" {
missing = append(missing, "run_id")
}
if m.TaskID == "" {
missing = append(missing, "task_id")
}
if len(missing) > 0 {
return fmt.Errorf("incomplete provenance record: missing %v", missing)
}
return nil
}
// CanWrite checks if the manifest can be written given the provenance requirements.
// When ProvenanceBestEffort is false (default), requires complete environment capture.
// When true, allows partial manifests with warnings.
func (m *RunManifest) CanWrite(provenanceBestEffort bool) error {
// Always validate basic structure
if err := m.Validate(); err != nil {
return fmt.Errorf("manifest validation failed: %w", err)
}
// If best-effort is enabled, allow partial manifests
if provenanceBestEffort {
return nil
}
// Otherwise, require complete provenance (fail-closed default)
if err := m.ValidateProvenance(); err != nil {
return fmt.Errorf("provenance validation failed (set provenance_best_effort: true to allow partial manifests): %w", err)
}
return nil
}

View file

@ -8,8 +8,8 @@ const SchemaVersion = "1.0.0"
type SchemaVersionInfo struct {
Version string
Date string
Breaking bool
Description string
Breaking bool
}
// SchemaChangeHistory documents all schema versions