refactor(jupyter): enhance security and scheduler integration
Update Jupyter integration for security and scheduler support: - Enhanced security configuration with audit logging - Health monitoring with scheduler event integration - Package manager with network policy enforcement - Service manager with lifecycle hooks - Network manager with tenant isolation - Workspace metadata with tenant tags - Config with resource limits - Podman container integration improvements - Experiment manager with tracking integration - Manifest runner with security checks
This commit is contained in:
parent
3fb6902fa1
commit
6b2c377680
12 changed files with 319 additions and 298 deletions
|
|
@ -306,20 +306,20 @@ func (pm *PodmanManager) ExecContainer(ctx context.Context, containerID string,
|
|||
|
||||
// PodmanConfig holds configuration for Podman container execution
|
||||
type PodmanConfig struct {
|
||||
Image string
|
||||
Workspace string
|
||||
Results string
|
||||
ContainerWorkspace string
|
||||
ContainerResults string
|
||||
AppleGPU bool
|
||||
GPUDevices []string
|
||||
Env map[string]string
|
||||
Volumes map[string]string
|
||||
Memory string
|
||||
ContainerWorkspace string
|
||||
ContainerResults string
|
||||
Results string
|
||||
Workspace string
|
||||
Image string
|
||||
CPUs string
|
||||
Privileged bool // Security: must be false
|
||||
Network string // Security: must not be "host"
|
||||
ReadOnlyMounts bool // Security: true for dataset mounts
|
||||
Network string
|
||||
GPUDevices []string
|
||||
AppleGPU bool
|
||||
Privileged bool
|
||||
ReadOnlyMounts bool
|
||||
}
|
||||
|
||||
// PodmanResourceOverrides converts per-task resource requests into Podman-compatible
|
||||
|
|
@ -338,15 +338,22 @@ func PodmanResourceOverrides(cpu int, memoryGB int) (cpus string, memory string)
|
|||
|
||||
// PodmanSecurityConfig holds security configuration for Podman containers
|
||||
type PodmanSecurityConfig struct {
|
||||
NoNewPrivileges bool
|
||||
DropAllCaps bool
|
||||
SeccompProfile string
|
||||
NetworkMode string
|
||||
AllowedCaps []string
|
||||
UserNS bool
|
||||
RunAsUID int
|
||||
RunAsGID int
|
||||
SeccompProfile string
|
||||
NoNewPrivileges bool
|
||||
DropAllCaps bool
|
||||
UserNS bool
|
||||
ReadOnlyRoot bool
|
||||
NetworkMode string
|
||||
// Process Isolation
|
||||
MaxProcesses int
|
||||
MaxOpenFiles int
|
||||
DisableSwap bool
|
||||
OOMScoreAdj int
|
||||
TaskUID int
|
||||
TaskGID int
|
||||
}
|
||||
|
||||
// BuildSecurityArgs builds security-related podman arguments from PodmanSecurityConfig
|
||||
|
|
@ -395,6 +402,27 @@ func BuildSecurityArgs(sandbox PodmanSecurityConfig) []string {
|
|||
}
|
||||
args = append(args, "--network", networkMode)
|
||||
|
||||
// Process Isolation
|
||||
// Fork bomb protection - limit number of processes
|
||||
if sandbox.MaxProcesses > 0 {
|
||||
args = append(args, "--pids-limit", strconv.Itoa(sandbox.MaxProcesses))
|
||||
}
|
||||
|
||||
// File descriptor limits
|
||||
if sandbox.MaxOpenFiles > 0 {
|
||||
args = append(args, "--ulimit", fmt.Sprintf("nofile=%d:%d", sandbox.MaxOpenFiles, sandbox.MaxOpenFiles))
|
||||
}
|
||||
|
||||
// OOM killer score adjustment (lower = less likely to be killed)
|
||||
if sandbox.OOMScoreAdj != 0 {
|
||||
args = append(args, "--oom-score-adj", strconv.Itoa(sandbox.OOMScoreAdj))
|
||||
}
|
||||
|
||||
// Disable swap (memory-swap equals memory means no swap)
|
||||
if sandbox.DisableSwap {
|
||||
args = append(args, "--memory-swap=0")
|
||||
}
|
||||
|
||||
return args
|
||||
}
|
||||
|
||||
|
|
@ -488,84 +516,6 @@ func BuildPodmanCommand(
|
|||
return exec.CommandContext(ctx, "podman", args...)
|
||||
}
|
||||
|
||||
// BuildPodmanCommandLegacy builds a Podman command using legacy security settings
|
||||
// Deprecated: Use BuildPodmanCommand with SandboxConfig instead
|
||||
func BuildPodmanCommandLegacy(
|
||||
ctx context.Context,
|
||||
cfg PodmanConfig,
|
||||
scriptPath, depsPath string,
|
||||
extraArgs []string,
|
||||
) *exec.Cmd {
|
||||
args := []string{
|
||||
"run", "--rm",
|
||||
"--security-opt", "no-new-privileges",
|
||||
"--cap-drop", "ALL",
|
||||
}
|
||||
|
||||
// Add network mode if specified
|
||||
if cfg.Network != "" {
|
||||
args = append(args, "--network", cfg.Network)
|
||||
}
|
||||
|
||||
// Add read-only root filesystem
|
||||
if cfg.ReadOnlyMounts {
|
||||
args = append(args, "--read-only")
|
||||
}
|
||||
|
||||
if cfg.Memory != "" {
|
||||
args = append(args, "--memory", cfg.Memory)
|
||||
} else {
|
||||
args = append(args, "--memory", config.DefaultPodmanMemory)
|
||||
}
|
||||
|
||||
if cfg.CPUs != "" {
|
||||
args = append(args, "--cpus", cfg.CPUs)
|
||||
} else {
|
||||
args = append(args, "--cpus", config.DefaultPodmanCPUs)
|
||||
}
|
||||
|
||||
args = append(args, "--userns", "keep-id")
|
||||
|
||||
// Mount workspace
|
||||
workspaceMount := fmt.Sprintf("%s:%s:rw", cfg.Workspace, cfg.ContainerWorkspace)
|
||||
args = append(args, "-v", workspaceMount)
|
||||
|
||||
// Mount results
|
||||
resultsMount := fmt.Sprintf("%s:%s:rw", cfg.Results, cfg.ContainerResults)
|
||||
args = append(args, "-v", resultsMount)
|
||||
|
||||
// Mount additional volumes
|
||||
for hostPath, containerPath := range cfg.Volumes {
|
||||
mount := fmt.Sprintf("%s:%s", hostPath, containerPath)
|
||||
args = append(args, "-v", mount)
|
||||
}
|
||||
|
||||
// Use injected GPU device paths for Apple GPU or custom configurations
|
||||
for _, device := range cfg.GPUDevices {
|
||||
args = append(args, "--device", device)
|
||||
}
|
||||
|
||||
// Add environment variables
|
||||
for key, value := range cfg.Env {
|
||||
args = append(args, "-e", fmt.Sprintf("%s=%s", key, value))
|
||||
}
|
||||
|
||||
// Image and command
|
||||
args = append(args, cfg.Image,
|
||||
"--workspace", cfg.ContainerWorkspace,
|
||||
"--deps", depsPath,
|
||||
"--script", scriptPath,
|
||||
)
|
||||
|
||||
// Add extra arguments via --args flag
|
||||
if len(extraArgs) > 0 {
|
||||
args = append(args, "--args")
|
||||
args = append(args, extraArgs...)
|
||||
}
|
||||
|
||||
return exec.CommandContext(ctx, "podman", args...)
|
||||
}
|
||||
|
||||
// ValidateSecurityPolicy validates that the container configuration meets security requirements.
|
||||
// Returns an error if the configuration violates security policies.
|
||||
func ValidateSecurityPolicy(cfg PodmanConfig) error {
|
||||
|
|
@ -588,10 +538,10 @@ func ValidateSecurityPolicy(cfg PodmanConfig) error {
|
|||
|
||||
// PodmanSecret represents a secret to be mounted in a container
|
||||
type PodmanSecret struct {
|
||||
Name string // Secret name in Podman
|
||||
Data []byte // Secret data (will be base64 encoded)
|
||||
Target string // Mount path inside container
|
||||
EnvVar string // Environment variable name (optional, if set mounts as env var instead of file)
|
||||
Name string
|
||||
Target string
|
||||
EnvVar string
|
||||
Data []byte
|
||||
}
|
||||
|
||||
// CreateSecret creates a Podman secret from the given data
|
||||
|
|
|
|||
|
|
@ -104,7 +104,7 @@ func (m *Manager) CreateExperiment(commitID string) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
// WriteMetadata writes experiment metadata to meta.bin
|
||||
// WriteMetadata writes experiment metadata to meta.bin with crash safety (fsync)
|
||||
func (m *Manager) WriteMetadata(meta *Metadata) error {
|
||||
path := m.GetMetadataPath(meta.CommitID)
|
||||
|
||||
|
|
@ -134,7 +134,8 @@ func (m *Manager) WriteMetadata(meta *Metadata) error {
|
|||
buf = append(buf, byte(len(meta.User)))
|
||||
buf = append(buf, []byte(meta.User)...)
|
||||
|
||||
return os.WriteFile(path, buf, 0o600)
|
||||
// SECURITY: Write with fsync for crash safety
|
||||
return fileutil.WriteFileSafe(path, buf, 0o600)
|
||||
}
|
||||
|
||||
// ReadMetadata reads experiment metadata from meta.bin
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@ import (
|
|||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/jfraeys/fetch_ml/internal/fileutil"
|
||||
"github.com/jfraeys/fetch_ml/internal/logging"
|
||||
)
|
||||
|
||||
|
|
@ -58,34 +59,34 @@ type ConfigManager struct {
|
|||
type JupyterConfig struct {
|
||||
Version string `json:"version"`
|
||||
Environment string `json:"environment"`
|
||||
Service ServiceConfig `json:"service"`
|
||||
Logging LoggingConfig `json:"logging"`
|
||||
Resources ResourceConfig `json:"resources"`
|
||||
Security SecurityConfig `json:"security"`
|
||||
Workspace WorkspaceConfig `json:"workspace"`
|
||||
Network NetworkConfig `json:"network"`
|
||||
Security SecurityConfig `json:"security"`
|
||||
Resources ResourceConfig `json:"resources"`
|
||||
Health HealthConfig `json:"health"`
|
||||
Logging LoggingConfig `json:"logging"`
|
||||
DefaultSettings DefaultSettingsConfig `json:"default_settings"`
|
||||
Service ServiceConfig `json:"service"`
|
||||
AdvancedSettings AdvancedSettingsConfig `json:"advanced_settings"`
|
||||
Health HealthConfig `json:"health"`
|
||||
}
|
||||
|
||||
// WorkspaceConfig defines workspace configuration
|
||||
type WorkspaceConfig struct {
|
||||
DefaultPath string `json:"default_path"`
|
||||
AutoCreate bool `json:"auto_create"`
|
||||
MountOptions map[string]string `json:"mount_options"`
|
||||
DefaultPath string `json:"default_path"`
|
||||
MaxWorkspaceSize string `json:"max_workspace_size"`
|
||||
AllowedPaths []string `json:"allowed_paths"`
|
||||
DeniedPaths []string `json:"denied_paths"`
|
||||
MaxWorkspaceSize string `json:"max_workspace_size"`
|
||||
AutoCreate bool `json:"auto_create"`
|
||||
}
|
||||
|
||||
// HealthConfig defines health monitoring configuration
|
||||
type HealthConfig struct {
|
||||
Enabled bool `json:"enabled"`
|
||||
CheckInterval time.Duration `json:"check_interval"`
|
||||
Timeout time.Duration `json:"timeout"`
|
||||
RetryAttempts int `json:"retry_attempts"`
|
||||
MaxServiceAge time.Duration `json:"max_service_age"`
|
||||
Enabled bool `json:"enabled"`
|
||||
AutoCleanup bool `json:"auto_cleanup"`
|
||||
MetricsEnabled bool `json:"metrics_enabled"`
|
||||
}
|
||||
|
|
@ -96,31 +97,31 @@ type LoggingConfig struct {
|
|||
Format string `json:"format"`
|
||||
Output string `json:"output"`
|
||||
MaxSize string `json:"max_size"`
|
||||
MaxBackups int `json:"max_backups"`
|
||||
MaxAge string `json:"max_age"`
|
||||
MaxBackups int `json:"max_backups"`
|
||||
}
|
||||
|
||||
// DefaultSettingsConfig defines default settings for new services
|
||||
type DefaultSettingsConfig struct {
|
||||
Image string `json:"default_image"`
|
||||
Port int `json:"default_port"`
|
||||
Workspace string `json:"default_workspace"`
|
||||
Environment map[string]string `json:"environment"`
|
||||
Image string `json:"default_image"`
|
||||
Workspace string `json:"default_workspace"`
|
||||
ShutdownPolicy string `json:"shutdown_policy"`
|
||||
Port int `json:"default_port"`
|
||||
StopTimeout time.Duration `json:"stop_timeout"`
|
||||
AutoStart bool `json:"auto_start"`
|
||||
AutoStop bool `json:"auto_stop"`
|
||||
StopTimeout time.Duration `json:"stop_timeout"`
|
||||
ShutdownPolicy string `json:"shutdown_policy"`
|
||||
}
|
||||
|
||||
// AdvancedSettingsConfig defines advanced configuration options
|
||||
type AdvancedSettingsConfig struct {
|
||||
ExperimentalFeatures []string `json:"experimental_features"`
|
||||
MaxConcurrentServices int `json:"max_concurrent_services"`
|
||||
ServiceTimeout time.Duration `json:"service_timeout"`
|
||||
StartupTimeout time.Duration `json:"startup_timeout"`
|
||||
GracefulShutdown bool `json:"graceful_shutdown"`
|
||||
ForceCleanup bool `json:"force_cleanup"`
|
||||
DebugMode bool `json:"debug_mode"`
|
||||
ExperimentalFeatures []string `json:"experimental_features"`
|
||||
}
|
||||
|
||||
// NewConfigManager creates a new configuration manager
|
||||
|
|
@ -190,8 +191,8 @@ func (cm *ConfigManager) SaveConfig() error {
|
|||
return fmt.Errorf("failed to marshal config: %w", err)
|
||||
}
|
||||
|
||||
// Write configuration file
|
||||
if err := os.WriteFile(cm.configPath, data, 0600); err != nil {
|
||||
// Write configuration file with crash safety (fsync)
|
||||
if err := fileutil.WriteFileSafe(cm.configPath, data, 0600); err != nil {
|
||||
return fmt.Errorf("failed to write config file: %w", err)
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -20,33 +20,33 @@ const (
|
|||
type HealthMonitor struct {
|
||||
logger *logging.Logger
|
||||
services map[string]*JupyterService
|
||||
servicesMutex sync.RWMutex
|
||||
interval time.Duration
|
||||
client *http.Client
|
||||
interval time.Duration
|
||||
servicesMutex sync.RWMutex
|
||||
}
|
||||
|
||||
// HealthStatus represents the health status of a service
|
||||
type HealthStatus struct {
|
||||
LastCheck time.Time `json:"last_check"`
|
||||
Metrics map[string]interface{} `json:"metrics"`
|
||||
ServiceID string `json:"service_id"`
|
||||
ServiceName string `json:"service_name"`
|
||||
Status string `json:"status"`
|
||||
LastCheck time.Time `json:"last_check"`
|
||||
ResponseTime time.Duration `json:"response_time"`
|
||||
URL string `json:"url"`
|
||||
ContainerID string `json:"container_id"`
|
||||
Errors []string `json:"errors"`
|
||||
Metrics map[string]interface{} `json:"metrics"`
|
||||
ResponseTime time.Duration `json:"response_time"`
|
||||
}
|
||||
|
||||
// HealthReport contains a comprehensive health report
|
||||
type HealthReport struct {
|
||||
Timestamp time.Time `json:"timestamp"`
|
||||
Services map[string]*HealthStatus `json:"services"`
|
||||
Summary string `json:"summary"`
|
||||
TotalServices int `json:"total_services"`
|
||||
Healthy int `json:"healthy"`
|
||||
Unhealthy int `json:"unhealthy"`
|
||||
Unknown int `json:"unknown"`
|
||||
Services map[string]*HealthStatus `json:"services"`
|
||||
Summary string `json:"summary"`
|
||||
}
|
||||
|
||||
// NewHealthMonitor creates a new health monitor
|
||||
|
|
|
|||
|
|
@ -18,9 +18,9 @@ type NetworkManager struct {
|
|||
|
||||
// PortAllocator manages port allocation for services
|
||||
type PortAllocator struct {
|
||||
usedPorts map[int]bool
|
||||
startPort int
|
||||
endPort int
|
||||
usedPorts map[int]bool
|
||||
}
|
||||
|
||||
// NewNetworkManager creates a new network manager
|
||||
|
|
@ -313,10 +313,10 @@ func (nm *NetworkManager) GetNetworkStatus() *NetworkStatus {
|
|||
|
||||
// NetworkStatus contains network status information
|
||||
type NetworkStatus struct {
|
||||
PortRange string `json:"port_range"`
|
||||
TotalPorts int `json:"total_ports"`
|
||||
AvailablePorts int `json:"available_ports"`
|
||||
UsedPorts int `json:"used_ports"`
|
||||
PortRange string `json:"port_range"`
|
||||
Services int `json:"services"`
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@ import (
|
|||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/jfraeys/fetch_ml/internal/fileutil"
|
||||
"github.com/jfraeys/fetch_ml/internal/logging"
|
||||
)
|
||||
|
||||
|
|
@ -18,22 +19,22 @@ const (
|
|||
// PackageManager manages package installations in Jupyter workspaces
|
||||
type PackageManager struct {
|
||||
logger *logging.Logger
|
||||
trustedChannels []string
|
||||
allowedPackages map[string]bool
|
||||
blockedPackages []string
|
||||
workspacePath string
|
||||
packageCachePath string
|
||||
trustedChannels []string
|
||||
blockedPackages []string
|
||||
}
|
||||
|
||||
// PackageConfig defines package management configuration
|
||||
type PackageConfig struct {
|
||||
TrustedChannels []string `json:"trusted_channels"`
|
||||
AllowedPackages map[string]bool `json:"allowed_packages"`
|
||||
TrustedChannels []string `json:"trusted_channels"`
|
||||
BlockedPackages []string `json:"blocked_packages"`
|
||||
RequireApproval bool `json:"require_approval"`
|
||||
AutoApproveSafe bool `json:"auto_approve_safe"`
|
||||
MaxPackages int `json:"max_packages"`
|
||||
InstallTimeout time.Duration `json:"install_timeout"`
|
||||
RequireApproval bool `json:"require_approval"`
|
||||
AutoApproveSafe bool `json:"auto_approve_safe"`
|
||||
AllowCondaForge bool `json:"allow_conda_forge"`
|
||||
AllowPyPI bool `json:"allow_pypi"`
|
||||
AllowLocal bool `json:"allow_local"`
|
||||
|
|
@ -41,28 +42,28 @@ type PackageConfig struct {
|
|||
|
||||
// PackageRequest represents a package installation request
|
||||
type PackageRequest struct {
|
||||
Timestamp time.Time `json:"timestamp"`
|
||||
ApprovalTime time.Time `json:"approval_time,omitempty"`
|
||||
PackageName string `json:"package_name"`
|
||||
Version string `json:"version,omitempty"`
|
||||
Channel string `json:"channel,omitempty"`
|
||||
RequestedBy string `json:"requested_by"`
|
||||
WorkspacePath string `json:"workspace_path"`
|
||||
Timestamp time.Time `json:"timestamp"`
|
||||
Status string `json:"status"` // pending, approved, rejected, installed, failed
|
||||
Status string `json:"status"`
|
||||
RejectionReason string `json:"rejection_reason,omitempty"`
|
||||
ApprovalUser string `json:"approval_user,omitempty"`
|
||||
ApprovalTime time.Time `json:"approval_time,omitempty"`
|
||||
}
|
||||
|
||||
// PackageInfo contains information about an installed package
|
||||
type PackageInfo struct {
|
||||
InstalledAt time.Time `json:"installed_at"`
|
||||
Metadata map[string]string `json:"metadata"`
|
||||
Name string `json:"name"`
|
||||
Version string `json:"version"`
|
||||
Channel string `json:"channel"`
|
||||
InstalledAt time.Time `json:"installed_at"`
|
||||
InstalledBy string `json:"installed_by"`
|
||||
Size string `json:"size"`
|
||||
Dependencies []string `json:"dependencies"`
|
||||
Metadata map[string]string `json:"metadata"`
|
||||
}
|
||||
|
||||
// NewPackageManager creates a new package manager
|
||||
|
|
@ -353,14 +354,18 @@ func (pm *PackageManager) GetPackageRequest(requestID string) (*PackageRequest,
|
|||
return pm.loadPackageRequest(requestID)
|
||||
}
|
||||
|
||||
// savePackageRequest saves a package request to cache
|
||||
// savePackageRequest saves a package request to cache with crash safety (fsync)
|
||||
func (pm *PackageManager) savePackageRequest(req *PackageRequest) error {
|
||||
requestFile := filepath.Join(pm.packageCachePath, fmt.Sprintf("request_%s.json", req.PackageName))
|
||||
data, err := json.MarshalIndent(req, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return os.WriteFile(requestFile, data, 0600)
|
||||
// SECURITY: Write with fsync for crash safety
|
||||
if err := fileutil.WriteFileSafe(requestFile, data, 0600); err != nil {
|
||||
return fmt.Errorf("failed to write package request: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// loadPackageRequest loads a package request from cache
|
||||
|
|
@ -406,14 +411,18 @@ func (pm *PackageManager) loadAllPackageRequests() ([]*PackageRequest, error) {
|
|||
return requests, nil
|
||||
}
|
||||
|
||||
// savePackageInfo saves package information to cache
|
||||
// savePackageInfo saves package information to cache with crash safety (fsync)
|
||||
func (pm *PackageManager) savePackageInfo(info *PackageInfo) error {
|
||||
infoFile := filepath.Join(pm.packageCachePath, fmt.Sprintf("installed_%s.json", info.Name))
|
||||
data, err := json.MarshalIndent(info, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return os.WriteFile(infoFile, data, 0600)
|
||||
// SECURITY: Write with fsync for crash safety
|
||||
if err := fileutil.WriteFileSafe(infoFile, data, 0600); err != nil {
|
||||
return fmt.Errorf("failed to write package info: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// loadAllPackageInfo loads all installed package information
|
||||
|
|
|
|||
|
|
@ -23,57 +23,44 @@ type SecurityManager struct {
|
|||
|
||||
// EnhancedSecurityConfig provides comprehensive security settings
|
||||
type EnhancedSecurityConfig struct {
|
||||
// Network Security
|
||||
AllowNetwork bool `json:"allow_network"`
|
||||
AllowedHosts []string `json:"allowed_hosts"`
|
||||
BlockedHosts []string `json:"blocked_hosts"`
|
||||
EnableFirewall bool `json:"enable_firewall"`
|
||||
|
||||
// Package Security
|
||||
TrustedChannels []string `json:"trusted_channels"`
|
||||
BlockedPackages []string `json:"blocked_packages"`
|
||||
AllowedPackages map[string]bool `json:"allowed_packages"`
|
||||
RequireApproval bool `json:"require_approval"`
|
||||
AutoApproveSafe bool `json:"auto_approve_safe"`
|
||||
MaxPackages int `json:"max_packages"`
|
||||
InstallTimeout time.Duration `json:"install_timeout"`
|
||||
AllowCondaForge bool `json:"allow_conda_forge"`
|
||||
AllowPyPI bool `json:"allow_pypi"`
|
||||
AllowLocal bool `json:"allow_local"`
|
||||
|
||||
// Container Security
|
||||
ReadOnlyRoot bool `json:"read_only_root"`
|
||||
DropCapabilities []string `json:"drop_capabilities"`
|
||||
RunAsNonRoot bool `json:"run_as_non_root"`
|
||||
EnableSeccomp bool `json:"enable_seccomp"`
|
||||
NoNewPrivileges bool `json:"no_new_privileges"`
|
||||
|
||||
// Authentication Security
|
||||
EnableTokenAuth bool `json:"enable_token_auth"`
|
||||
TokenLength int `json:"token_length"`
|
||||
TokenExpiry time.Duration `json:"token_expiry"`
|
||||
RequireHTTPS bool `json:"require_https"`
|
||||
SessionTimeout time.Duration `json:"session_timeout"`
|
||||
MaxFailedAttempts int `json:"max_failed_attempts"`
|
||||
LockoutDuration time.Duration `json:"lockout_duration"`
|
||||
|
||||
// File System Security
|
||||
AllowedPaths []string `json:"allowed_paths"`
|
||||
DeniedPaths []string `json:"denied_paths"`
|
||||
MaxWorkspaceSize string `json:"max_workspace_size"`
|
||||
AllowExecFrom []string `json:"allow_exec_from"`
|
||||
BlockExecFrom []string `json:"block_exec_from"`
|
||||
|
||||
// Resource Security
|
||||
MaxMemoryLimit string `json:"max_memory_limit"`
|
||||
MaxCPULimit string `json:"max_cpu_limit"`
|
||||
MaxDiskUsage string `json:"max_disk_usage"`
|
||||
MaxProcesses int `json:"max_processes"`
|
||||
|
||||
// Logging & Monitoring
|
||||
SecurityLogLevel string `json:"security_log_level"`
|
||||
AuditEnabled bool `json:"audit_enabled"`
|
||||
RealTimeAlerts bool `json:"real_time_alerts"`
|
||||
AllowedPackages map[string]bool `json:"allowed_packages"`
|
||||
SecurityLogLevel string `json:"security_log_level"`
|
||||
MaxDiskUsage string `json:"max_disk_usage"`
|
||||
MaxWorkspaceSize string `json:"max_workspace_size"`
|
||||
MaxMemoryLimit string `json:"max_memory_limit"`
|
||||
MaxCPULimit string `json:"max_cpu_limit"`
|
||||
BlockedPackages []string `json:"blocked_packages"`
|
||||
AllowedHosts []string `json:"allowed_hosts"`
|
||||
BlockedHosts []string `json:"blocked_hosts"`
|
||||
TrustedChannels []string `json:"trusted_channels"`
|
||||
BlockExecFrom []string `json:"block_exec_from"`
|
||||
AllowExecFrom []string `json:"allow_exec_from"`
|
||||
DropCapabilities []string `json:"drop_capabilities"`
|
||||
DeniedPaths []string `json:"denied_paths"`
|
||||
AllowedPaths []string `json:"allowed_paths"`
|
||||
MaxPackages int `json:"max_packages"`
|
||||
SessionTimeout time.Duration `json:"session_timeout"`
|
||||
MaxProcesses int `json:"max_processes"`
|
||||
InstallTimeout time.Duration `json:"install_timeout"`
|
||||
LockoutDuration time.Duration `json:"lockout_duration"`
|
||||
TokenLength int `json:"token_length"`
|
||||
TokenExpiry time.Duration `json:"token_expiry"`
|
||||
MaxFailedAttempts int `json:"max_failed_attempts"`
|
||||
ReadOnlyRoot bool `json:"read_only_root"`
|
||||
NoNewPrivileges bool `json:"no_new_privileges"`
|
||||
EnableTokenAuth bool `json:"enable_token_auth"`
|
||||
RunAsNonRoot bool `json:"run_as_non_root"`
|
||||
AllowLocal bool `json:"allow_local"`
|
||||
AllowPyPI bool `json:"allow_pypi"`
|
||||
AllowCondaForge bool `json:"allow_conda_forge"`
|
||||
RequireHTTPS bool `json:"require_https"`
|
||||
AllowNetwork bool `json:"allow_network"`
|
||||
AutoApproveSafe bool `json:"auto_approve_safe"`
|
||||
RequireApproval bool `json:"require_approval"`
|
||||
EnableSeccomp bool `json:"enable_seccomp"`
|
||||
EnableFirewall bool `json:"enable_firewall"`
|
||||
AuditEnabled bool `json:"audit_enabled"`
|
||||
RealTimeAlerts bool `json:"real_time_alerts"`
|
||||
}
|
||||
|
||||
// SecurityEvent represents a security-related event
|
||||
|
|
|
|||
|
|
@ -12,6 +12,7 @@ import (
|
|||
|
||||
"github.com/jfraeys/fetch_ml/internal/config"
|
||||
"github.com/jfraeys/fetch_ml/internal/container"
|
||||
"github.com/jfraeys/fetch_ml/internal/fileutil"
|
||||
"github.com/jfraeys/fetch_ml/internal/logging"
|
||||
)
|
||||
|
||||
|
|
@ -73,12 +74,12 @@ func trashBaseDir() string {
|
|||
}
|
||||
|
||||
type trashInfo struct {
|
||||
OriginalName string `json:"original_name"`
|
||||
DeletedAt time.Time `json:"deleted_at"`
|
||||
DeletedBy string `json:"deleted_by"`
|
||||
SizeBytes int64 `json:"size_bytes"`
|
||||
PurgeAfter time.Time `json:"purge_after"`
|
||||
OriginalName string `json:"original_name"`
|
||||
DeletedBy string `json:"deleted_by"`
|
||||
Reason string `json:"reason"`
|
||||
SizeBytes int64 `json:"size_bytes"`
|
||||
}
|
||||
|
||||
func (sm *ServiceManager) moveWorkspaceToTrash(workspacePath string, originalName string) (string, *trashInfo, error) {
|
||||
|
|
@ -119,7 +120,11 @@ func (sm *ServiceManager) moveWorkspaceToTrash(workspacePath string, originalNam
|
|||
}
|
||||
b, err := json.MarshalIndent(info, "", " ")
|
||||
if err == nil {
|
||||
_ = os.WriteFile(filepath.Join(dest, ".trashinfo"), b, 0o600)
|
||||
// SECURITY: Write with fsync for crash safety
|
||||
trashinfoPath := filepath.Join(dest, ".trashinfo")
|
||||
if err := fileutil.WriteFileSafe(trashinfoPath, b, 0o600); err != nil {
|
||||
sm.logger.Warn("failed to write trashinfo", "error", err)
|
||||
}
|
||||
}
|
||||
|
||||
return dest, info, nil
|
||||
|
|
@ -243,25 +248,25 @@ func startupBlockedPackages(installBlocked []string) []string {
|
|||
// ServiceConfig holds configuration for Jupyter services
|
||||
type ServiceConfig struct {
|
||||
DefaultImage string `json:"default_image"`
|
||||
DefaultPort int `json:"default_port"`
|
||||
DefaultWorkspace string `json:"default_workspace"`
|
||||
MaxServices int `json:"max_services"`
|
||||
DefaultResources ResourceConfig `json:"default_resources"`
|
||||
SecuritySettings SecurityConfig `json:"security_settings"`
|
||||
NetworkConfig NetworkConfig `json:"network_config"`
|
||||
DefaultPort int `json:"default_port"`
|
||||
MaxServices int `json:"max_services"`
|
||||
}
|
||||
|
||||
// NetworkConfig defines network settings for Jupyter containers
|
||||
type NetworkConfig struct {
|
||||
BindAddress string `json:"bind_address"`
|
||||
Token string `json:"token"`
|
||||
Password string `json:"password"`
|
||||
NetworkName string `json:"network_name"`
|
||||
HostPort int `json:"host_port"`
|
||||
ContainerPort int `json:"container_port"`
|
||||
BindAddress string `json:"bind_address"`
|
||||
EnableToken bool `json:"enable_token"`
|
||||
Token string `json:"token"`
|
||||
EnablePassword bool `json:"enable_password"`
|
||||
Password string `json:"password"`
|
||||
AllowRemote bool `json:"allow_remote"`
|
||||
NetworkName string `json:"network_name"`
|
||||
}
|
||||
|
||||
// ResourceConfig defines resource limits for Jupyter containers
|
||||
|
|
@ -273,16 +278,16 @@ type ResourceConfig struct {
|
|||
|
||||
// SecurityConfig holds security settings for Jupyter services
|
||||
type SecurityConfig struct {
|
||||
AllowNetwork bool `json:"allow_network"`
|
||||
AllowedHosts []string `json:"allowed_hosts"`
|
||||
AllowedPackages map[string]bool `json:"allowed_packages"`
|
||||
DropCapabilities []string `json:"drop_capabilities"`
|
||||
BlockedHosts []string `json:"blocked_hosts"`
|
||||
EnableFirewall bool `json:"enable_firewall"`
|
||||
TrustedChannels []string `json:"trusted_channels"`
|
||||
BlockedPackages []string `json:"blocked_packages"`
|
||||
AllowedPackages map[string]bool `json:"allowed_packages"`
|
||||
AllowedHosts []string `json:"allowed_hosts"`
|
||||
EnableFirewall bool `json:"enable_firewall"`
|
||||
RequireApproval bool `json:"require_approval"`
|
||||
ReadOnlyRoot bool `json:"read_only_root"`
|
||||
DropCapabilities []string `json:"drop_capabilities"`
|
||||
AllowNetwork bool `json:"allow_network"`
|
||||
RunAsNonRoot bool `json:"run_as_non_root"`
|
||||
EnableSeccomp bool `json:"enable_seccomp"`
|
||||
NoNewPrivileges bool `json:"no_new_privileges"`
|
||||
|
|
@ -290,19 +295,19 @@ type SecurityConfig struct {
|
|||
|
||||
// JupyterService represents a running Jupyter instance
|
||||
type JupyterService struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
Status string `json:"status"`
|
||||
ContainerID string `json:"container_id"`
|
||||
Port int `json:"port"`
|
||||
Workspace string `json:"workspace"`
|
||||
Image string `json:"image"`
|
||||
URL string `json:"url"`
|
||||
CreatedAt time.Time `json:"created_at"`
|
||||
LastAccess time.Time `json:"last_access"`
|
||||
Config ServiceConfig `json:"config"`
|
||||
Environment map[string]string `json:"environment"`
|
||||
Metadata map[string]string `json:"metadata"`
|
||||
Environment map[string]string `json:"environment"`
|
||||
Image string `json:"image"`
|
||||
ContainerID string `json:"container_id"`
|
||||
URL string `json:"url"`
|
||||
Workspace string `json:"workspace"`
|
||||
ID string `json:"id"`
|
||||
Status string `json:"status"`
|
||||
Name string `json:"name"`
|
||||
Config ServiceConfig `json:"config"`
|
||||
Port int `json:"port"`
|
||||
}
|
||||
|
||||
type InstalledPackage struct {
|
||||
|
|
@ -313,15 +318,15 @@ type InstalledPackage struct {
|
|||
|
||||
// StartRequest defines parameters for starting a Jupyter service
|
||||
type StartRequest struct {
|
||||
Environment map[string]string `json:"environment"`
|
||||
Metadata map[string]string `json:"metadata"`
|
||||
Name string `json:"name"`
|
||||
Workspace string `json:"workspace"`
|
||||
Image string `json:"image"`
|
||||
Port int `json:"port"`
|
||||
Resources ResourceConfig `json:"resources"`
|
||||
Security SecurityConfig `json:"security"`
|
||||
Network NetworkConfig `json:"network"`
|
||||
Environment map[string]string `json:"environment"`
|
||||
Metadata map[string]string `json:"metadata"`
|
||||
Port int `json:"port"`
|
||||
}
|
||||
|
||||
// NewServiceManager creates a new Jupyter service manager
|
||||
|
|
@ -1182,7 +1187,11 @@ func (sm *ServiceManager) saveServices() error {
|
|||
return err
|
||||
}
|
||||
|
||||
return os.WriteFile(servicesFile, data, 0600)
|
||||
// SECURITY: Write with fsync for crash safety
|
||||
if err := fileutil.WriteFileSafe(servicesFile, data, 0600); err != nil {
|
||||
return fmt.Errorf("failed to write services file: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// LinkWorkspaceWithExperiment links a workspace with an experiment
|
||||
|
|
|
|||
|
|
@ -17,28 +17,28 @@ const (
|
|||
// WorkspaceManager handles workspace mounting and volume management for Jupyter services
|
||||
type WorkspaceManager struct {
|
||||
logger *logging.Logger
|
||||
basePath string
|
||||
mounts map[string]*WorkspaceMount
|
||||
basePath string
|
||||
}
|
||||
|
||||
// WorkspaceMount represents a workspace mount configuration
|
||||
type WorkspaceMount struct {
|
||||
Options map[string]string `json:"options"`
|
||||
ID string `json:"id"`
|
||||
HostPath string `json:"host_path"`
|
||||
ContainerPath string `json:"container_path"`
|
||||
MountType string `json:"mount_type"` // "bind", "volume", "tmpfs"
|
||||
MountType string `json:"mount_type"`
|
||||
Services []string `json:"services"`
|
||||
ReadOnly bool `json:"read_only"`
|
||||
Options map[string]string `json:"options"`
|
||||
Services []string `json:"services"` // Service IDs using this mount
|
||||
}
|
||||
|
||||
// MountRequest defines parameters for creating a workspace mount
|
||||
type MountRequest struct {
|
||||
Options map[string]string `json:"options"`
|
||||
HostPath string `json:"host_path"`
|
||||
ContainerPath string `json:"container_path"`
|
||||
MountType string `json:"mount_type"`
|
||||
ReadOnly bool `json:"read_only"`
|
||||
Options map[string]string `json:"options"`
|
||||
}
|
||||
|
||||
// NewWorkspaceManager creates a new workspace manager
|
||||
|
|
@ -394,6 +394,7 @@ func (wm *WorkspaceManager) GetWorkspaceInfo(workspacePath string) (*WorkspaceIn
|
|||
|
||||
// WorkspaceInfo contains information about a workspace
|
||||
type WorkspaceInfo struct {
|
||||
Modified time.Time `json:"modified"`
|
||||
Path string `json:"path"`
|
||||
FileCount int64 `json:"file_count"`
|
||||
PythonFiles int64 `json:"python_files"`
|
||||
|
|
@ -402,7 +403,6 @@ type WorkspaceInfo struct {
|
|||
ConfigFiles int64 `json:"config_files"`
|
||||
Size int64 `json:"size"`
|
||||
TotalSize int64 `json:"total_size"`
|
||||
Modified time.Time `json:"modified"`
|
||||
}
|
||||
|
||||
// Cleanup removes unused mounts
|
||||
|
|
|
|||
|
|
@ -9,29 +9,30 @@ import (
|
|||
"time"
|
||||
|
||||
"github.com/jfraeys/fetch_ml/internal/config"
|
||||
"github.com/jfraeys/fetch_ml/internal/fileutil"
|
||||
"github.com/jfraeys/fetch_ml/internal/logging"
|
||||
)
|
||||
|
||||
// WorkspaceMetadata tracks the relationship between Jupyter workspaces and experiments
|
||||
type WorkspaceMetadata struct {
|
||||
LinkedAt time.Time `json:"linked_at"`
|
||||
LastSync time.Time `json:"last_sync"`
|
||||
AdditionalData map[string]string `json:"additional_data"`
|
||||
WorkspacePath string `json:"workspace_path"`
|
||||
ExperimentID string `json:"experiment_id"`
|
||||
ServiceID string `json:"service_id,omitempty"`
|
||||
LinkedAt time.Time `json:"linked_at"`
|
||||
LastSync time.Time `json:"last_sync"`
|
||||
SyncDirection string `json:"sync_direction"` // "pull", "push", "bidirectional"
|
||||
AutoSync bool `json:"auto_sync"`
|
||||
SyncInterval time.Duration `json:"sync_interval"`
|
||||
SyncDirection string `json:"sync_direction"`
|
||||
Tags []string `json:"tags"`
|
||||
AdditionalData map[string]string `json:"additional_data"`
|
||||
SyncInterval time.Duration `json:"sync_interval"`
|
||||
AutoSync bool `json:"auto_sync"`
|
||||
}
|
||||
|
||||
// WorkspaceMetadataManager manages workspace metadata
|
||||
type WorkspaceMetadataManager struct {
|
||||
logger *logging.Logger
|
||||
metadata map[string]*WorkspaceMetadata // key: workspace path
|
||||
mutex sync.RWMutex
|
||||
metadata map[string]*WorkspaceMetadata
|
||||
dataFile string
|
||||
mutex sync.RWMutex
|
||||
}
|
||||
|
||||
// NewWorkspaceMetadataManager creates a new workspace metadata manager
|
||||
|
|
@ -319,7 +320,7 @@ func (wmm *WorkspaceMetadataManager) loadMetadata() error {
|
|||
return nil
|
||||
}
|
||||
|
||||
// saveMetadata saves metadata to disk using PathRegistry
|
||||
// saveMetadata saves metadata to disk using PathRegistry with crash safety (fsync)
|
||||
func (wmm *WorkspaceMetadataManager) saveMetadata() error {
|
||||
// Use PathRegistry for consistent directory creation
|
||||
paths := config.FromEnv()
|
||||
|
|
@ -332,7 +333,8 @@ func (wmm *WorkspaceMetadataManager) saveMetadata() error {
|
|||
return fmt.Errorf("failed to marshal metadata: %w", err)
|
||||
}
|
||||
|
||||
if err := os.WriteFile(wmm.dataFile, data, 0644); err != nil {
|
||||
// SECURITY: Write with fsync for crash safety
|
||||
if err := fileutil.WriteFileSafe(wmm.dataFile, data, 0644); err != nil {
|
||||
return fmt.Errorf("failed to write metadata file: %w", err)
|
||||
}
|
||||
|
||||
|
|
@ -364,7 +366,8 @@ func (wmm *WorkspaceMetadataManager) createWorkspaceMetadataFile(
|
|||
return fmt.Errorf("failed to marshal workspace metadata: %w", err)
|
||||
}
|
||||
|
||||
if err := os.WriteFile(workspaceMetaFile, data, 0600); err != nil {
|
||||
// SECURITY: Write with fsync for crash safety
|
||||
if err := fileutil.WriteFileSafe(workspaceMetaFile, data, 0600); err != nil {
|
||||
return fmt.Errorf("failed to write workspace metadata file: %w", err)
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -108,16 +108,16 @@ type Outcome struct {
|
|||
}
|
||||
|
||||
type ArtifactFile struct {
|
||||
Modified time.Time `json:"modified"`
|
||||
Path string `json:"path"`
|
||||
SizeBytes int64 `json:"size_bytes"`
|
||||
Modified time.Time `json:"modified"`
|
||||
}
|
||||
|
||||
type Artifacts struct {
|
||||
DiscoveryTime time.Time `json:"discovery_time"`
|
||||
Files []ArtifactFile `json:"files,omitempty"`
|
||||
Exclusions []Exclusion `json:"exclusions,omitempty"`
|
||||
TotalSizeBytes int64 `json:"total_size_bytes,omitempty"`
|
||||
Exclusions []Exclusion `json:"exclusions,omitempty"` // R.5: Scan exclusions recorded
|
||||
}
|
||||
|
||||
// Exclusion records why a path was excluded from artifact scanning
|
||||
|
|
@ -129,69 +129,58 @@ type Exclusion struct {
|
|||
// ExecutionEnvironment captures the runtime environment for reproducibility.
|
||||
// This enables reconstruction and comparison of runs.
|
||||
type ExecutionEnvironment struct {
|
||||
ConfigHash string `json:"config_hash"` // R.2: Resolved config hash
|
||||
GPUCount int `json:"gpu_count"` // GPU count detected
|
||||
GPUDetectionMethod string `json:"gpu_detection_method,omitempty"` // R.3: "nvml", "env_override", etc.
|
||||
GPUVendor string `json:"gpu_vendor,omitempty"` // Configured GPU vendor
|
||||
MaxWorkers int `json:"max_workers"` // Active resource limits
|
||||
PodmanCPUs string `json:"podman_cpus,omitempty"` // CPU limit
|
||||
SandboxNetworkMode string `json:"sandbox_network_mode"` // Sandbox settings
|
||||
SandboxSeccomp string `json:"sandbox_seccomp,omitempty"` // Seccomp profile
|
||||
SandboxNoNewPrivs bool `json:"sandbox_no_new_privs"` // Security flags
|
||||
ComplianceMode string `json:"compliance_mode,omitempty"` // HIPAA mode
|
||||
ManifestNonce string `json:"manifest_nonce,omitempty"` // Unique manifest identifier
|
||||
Metadata map[string]string `json:"metadata,omitempty"` // Additional env info
|
||||
Metadata map[string]string `json:"metadata,omitempty"`
|
||||
ConfigHash string `json:"config_hash"`
|
||||
GPUDetectionMethod string `json:"gpu_detection_method,omitempty"`
|
||||
GPUVendor string `json:"gpu_vendor,omitempty"`
|
||||
PodmanCPUs string `json:"podman_cpus,omitempty"`
|
||||
SandboxNetworkMode string `json:"sandbox_network_mode"`
|
||||
SandboxSeccomp string `json:"sandbox_seccomp,omitempty"`
|
||||
ComplianceMode string `json:"compliance_mode,omitempty"`
|
||||
ManifestNonce string `json:"manifest_nonce,omitempty"`
|
||||
GPUCount int `json:"gpu_count"`
|
||||
MaxWorkers int `json:"max_workers"`
|
||||
SandboxNoNewPrivs bool `json:"sandbox_no_new_privs"`
|
||||
}
|
||||
|
||||
// RunManifest is a best-effort, self-contained provenance record for a run.
|
||||
// It is written to <run_dir>/run_manifest.json.
|
||||
type RunManifest struct {
|
||||
RunID string `json:"run_id"`
|
||||
TaskID string `json:"task_id"`
|
||||
JobName string `json:"job_name"`
|
||||
CreatedAt time.Time `json:"created_at"`
|
||||
StartedAt time.Time `json:"started_at,omitempty"`
|
||||
EndedAt time.Time `json:"ended_at,omitempty"`
|
||||
|
||||
Annotations []Annotation `json:"annotations,omitempty"`
|
||||
Narrative *Narrative `json:"narrative,omitempty"`
|
||||
Outcome *Outcome `json:"outcome,omitempty"`
|
||||
Artifacts *Artifacts `json:"artifacts,omitempty"`
|
||||
|
||||
CommitID string `json:"commit_id,omitempty"`
|
||||
ExperimentManifestSHA string `json:"experiment_manifest_sha,omitempty"`
|
||||
DepsManifestName string `json:"deps_manifest_name,omitempty"`
|
||||
DepsManifestSHA string `json:"deps_manifest_sha,omitempty"`
|
||||
TrainScriptPath string `json:"train_script_path,omitempty"`
|
||||
|
||||
WorkerVersion string `json:"worker_version,omitempty"`
|
||||
PodmanImage string `json:"podman_image,omitempty"`
|
||||
ImageDigest string `json:"image_digest,omitempty"`
|
||||
|
||||
SnapshotID string `json:"snapshot_id,omitempty"`
|
||||
SnapshotSHA256 string `json:"snapshot_sha256,omitempty"`
|
||||
|
||||
Command string `json:"command,omitempty"`
|
||||
Args string `json:"args,omitempty"`
|
||||
ExitCode *int `json:"exit_code,omitempty"`
|
||||
Error string `json:"error,omitempty"`
|
||||
|
||||
StagingDurationMS int64 `json:"staging_duration_ms,omitempty"`
|
||||
ExecutionDurationMS int64 `json:"execution_duration_ms,omitempty"`
|
||||
FinalizeDurationMS int64 `json:"finalize_duration_ms,omitempty"`
|
||||
TotalDurationMS int64 `json:"total_duration_ms,omitempty"`
|
||||
|
||||
GPUDevices []string `json:"gpu_devices,omitempty"`
|
||||
WorkerHost string `json:"worker_host,omitempty"`
|
||||
Metadata map[string]string `json:"metadata,omitempty"`
|
||||
|
||||
// Environment captures execution environment for reproducibility (R.1)
|
||||
Environment *ExecutionEnvironment `json:"environment,omitempty"`
|
||||
|
||||
// Signature fields for tamper detection
|
||||
Signature string `json:"signature,omitempty"`
|
||||
SignerKeyID string `json:"signer_key_id,omitempty"`
|
||||
SigAlg string `json:"sig_alg,omitempty"`
|
||||
CreatedAt time.Time `json:"created_at"`
|
||||
StartedAt time.Time `json:"started_at,omitempty"`
|
||||
EndedAt time.Time `json:"ended_at,omitempty"`
|
||||
Artifacts *Artifacts `json:"artifacts,omitempty"`
|
||||
Environment *ExecutionEnvironment `json:"environment,omitempty"`
|
||||
Metadata map[string]string `json:"metadata,omitempty"`
|
||||
ExitCode *int `json:"exit_code,omitempty"`
|
||||
Narrative *Narrative `json:"narrative,omitempty"`
|
||||
Outcome *Outcome `json:"outcome,omitempty"`
|
||||
PodmanImage string `json:"podman_image,omitempty"`
|
||||
SnapshotID string `json:"snapshot_id,omitempty"`
|
||||
ExperimentManifestSHA string `json:"experiment_manifest_sha,omitempty"`
|
||||
DepsManifestName string `json:"deps_manifest_name,omitempty"`
|
||||
DepsManifestSHA string `json:"deps_manifest_sha,omitempty"`
|
||||
TrainScriptPath string `json:"train_script_path,omitempty"`
|
||||
WorkerVersion string `json:"worker_version,omitempty"`
|
||||
RunID string `json:"run_id"`
|
||||
ImageDigest string `json:"image_digest,omitempty"`
|
||||
WorkerHost string `json:"worker_host,omitempty"`
|
||||
SnapshotSHA256 string `json:"snapshot_sha256,omitempty"`
|
||||
Command string `json:"command,omitempty"`
|
||||
Args string `json:"args,omitempty"`
|
||||
CommitID string `json:"commit_id,omitempty"`
|
||||
Error string `json:"error,omitempty"`
|
||||
SigAlg string `json:"sig_alg,omitempty"`
|
||||
SignerKeyID string `json:"signer_key_id,omitempty"`
|
||||
Signature string `json:"signature,omitempty"`
|
||||
TaskID string `json:"task_id"`
|
||||
JobName string `json:"job_name"`
|
||||
Annotations []Annotation `json:"annotations,omitempty"`
|
||||
GPUDevices []string `json:"gpu_devices,omitempty"`
|
||||
TotalDurationMS int64 `json:"total_duration_ms,omitempty"`
|
||||
FinalizeDurationMS int64 `json:"finalize_duration_ms,omitempty"`
|
||||
ExecutionDurationMS int64 `json:"execution_duration_ms,omitempty"`
|
||||
StagingDurationMS int64 `json:"staging_duration_ms,omitempty"`
|
||||
}
|
||||
|
||||
func NewRunManifest(runID, taskID, jobName string, createdAt time.Time) *RunManifest {
|
||||
|
|
@ -402,3 +391,75 @@ func (m *RunManifest) ValidateStrict() error {
|
|||
v := NewValidator()
|
||||
return v.ValidateStrict(m)
|
||||
}
|
||||
|
||||
// ValidateProvenance checks if the manifest has complete provenance information.
|
||||
// When ProvenanceBestEffort is false (default), this must pass before writing.
|
||||
// Returns an error describing what's missing if validation fails.
|
||||
func (m *RunManifest) ValidateProvenance() error {
|
||||
if m == nil {
|
||||
return fmt.Errorf("manifest is nil")
|
||||
}
|
||||
|
||||
var missing []string
|
||||
|
||||
// Check Environment is present
|
||||
if m.Environment == nil {
|
||||
missing = append(missing, "environment metadata")
|
||||
} else {
|
||||
// Check ConfigHash - critical for reproducibility
|
||||
if m.Environment.ConfigHash == "" {
|
||||
missing = append(missing, "config_hash")
|
||||
}
|
||||
|
||||
// Check GPU detection method
|
||||
if m.Environment.GPUDetectionMethod == "" {
|
||||
missing = append(missing, "gpu_detection_method")
|
||||
}
|
||||
|
||||
// Check Sandbox configuration
|
||||
if m.Environment.SandboxNetworkMode == "" {
|
||||
missing = append(missing, "sandbox_network_mode")
|
||||
}
|
||||
}
|
||||
|
||||
// Check Artifacts are present (though they may be empty for new runs)
|
||||
if m.Artifacts == nil {
|
||||
missing = append(missing, "artifacts metadata")
|
||||
}
|
||||
|
||||
// Check basic run identification
|
||||
if m.RunID == "" {
|
||||
missing = append(missing, "run_id")
|
||||
}
|
||||
if m.TaskID == "" {
|
||||
missing = append(missing, "task_id")
|
||||
}
|
||||
|
||||
if len(missing) > 0 {
|
||||
return fmt.Errorf("incomplete provenance record: missing %v", missing)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// CanWrite checks if the manifest can be written given the provenance requirements.
|
||||
// When ProvenanceBestEffort is false (default), requires complete environment capture.
|
||||
// When true, allows partial manifests with warnings.
|
||||
func (m *RunManifest) CanWrite(provenanceBestEffort bool) error {
|
||||
// Always validate basic structure
|
||||
if err := m.Validate(); err != nil {
|
||||
return fmt.Errorf("manifest validation failed: %w", err)
|
||||
}
|
||||
|
||||
// If best-effort is enabled, allow partial manifests
|
||||
if provenanceBestEffort {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Otherwise, require complete provenance (fail-closed default)
|
||||
if err := m.ValidateProvenance(); err != nil {
|
||||
return fmt.Errorf("provenance validation failed (set provenance_best_effort: true to allow partial manifests): %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
|
|
|||
|
|
@ -8,8 +8,8 @@ const SchemaVersion = "1.0.0"
|
|||
type SchemaVersionInfo struct {
|
||||
Version string
|
||||
Date string
|
||||
Breaking bool
|
||||
Description string
|
||||
Breaking bool
|
||||
}
|
||||
|
||||
// SchemaChangeHistory documents all schema versions
|
||||
|
|
|
|||
Loading…
Reference in a new issue