fetch_ml/internal/jupyter/service_manager.go
Jeremie Fraeys a70d8aad8e
refactor: remove dead code and fix unused variables
**Cleanup:**
- Delete internal/worker/testutil.go (150 lines of unused test utilities)
- Remove unused stateDir() function from internal/jupyter/service_manager.go
- Silence unused variable warning in internal/worker/executor/container.go
2026-02-23 18:03:38 -05:00

1273 lines
37 KiB
Go

package jupyter
import (
"context"
"encoding/json"
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
"time"
"github.com/jfraeys/fetch_ml/internal/config"
"github.com/jfraeys/fetch_ml/internal/container"
"github.com/jfraeys/fetch_ml/internal/logging"
)
// stripTokenFromURL returns url truncated just before its token query
// parameter so the result can be logged without leaking the credential.
// Everything from "?token=" onward is dropped; when that marker is absent the
// same is done for "&token=". A URL containing neither is returned unchanged.
func stripTokenFromURL(url string) string {
	// Order matters: "?token=" is preferred over "&token=" when both occur.
	for _, marker := range []string{"?token=", "&token="} {
		if cut := strings.Index(url, marker); cut != -1 {
			return url[:cut]
		}
	}
	return url
}
const (
// serviceStatusRunning is the status string stored on a JupyterService that
// is up, and the value container state is compared against when deciding
// whether a container is running.
serviceStatusRunning = "running"
)
// workspaceBaseDir reports the directory under which relative workspace paths
// are rooted. A non-blank FETCHML_JUPYTER_WORKSPACE_BASE environment variable
// takes precedence (kept for backward compatibility); otherwise the active
// data directory from the environment-derived path configuration is used.
func workspaceBaseDir() string {
	override := strings.TrimSpace(os.Getenv("FETCHML_JUPYTER_WORKSPACE_BASE"))
	if override == "" {
		// No override: defer to the shared path configuration.
		return config.FromEnv().ActiveDataDir()
	}
	return override
}
// resolveWorkspacePath normalizes a caller-supplied workspace location.
//
// The input is trimmed and cleaned. An empty value, or a cleaned path that is
// ".." or begins with "../", is rejected with an error. Absolute paths are
// returned as cleaned; relative paths are interpreted relative to
// workspaceBaseDir().
func resolveWorkspacePath(workspace string) (string, error) {
	trimmed := strings.TrimSpace(workspace)
	if trimmed == "" {
		return "", fmt.Errorf("workspace is required")
	}

	sep := string(filepath.Separator)
	cleaned := filepath.Clean(trimmed)

	// After Clean, any escape above the starting directory shows up as a
	// leading ".." element.
	escapesParent := cleaned == ".." || strings.HasPrefix(cleaned, ".."+sep)
	if escapesParent {
		return "", fmt.Errorf("invalid workspace path: %s", workspace)
	}

	if filepath.IsAbs(cleaned) {
		return cleaned, nil
	}

	// Relative paths live under the workspace base directory.
	relative := strings.TrimPrefix(cleaned, "."+sep)
	return filepath.Join(workspaceBaseDir(), relative), nil
}
// trashBaseDir reports the directory that holds soft-deleted workspaces. A
// non-blank FETCHML_JUPYTER_TRASH_DIR environment variable takes precedence
// (kept for backward compatibility); otherwise it is the "trash" directory
// inside the Jupyter state directory of the environment-derived path
// configuration.
func trashBaseDir() string {
	override := strings.TrimSpace(os.Getenv("FETCHML_JUPYTER_TRASH_DIR"))
	if override == "" {
		stateDir := config.FromEnv().JupyterStateDir()
		return filepath.Join(stateDir, "trash")
	}
	return override
}
// trashInfo is the metadata record describing a soft-deleted workspace. It is
// serialized as JSON into a ".trashinfo" file inside the trashed directory.
type trashInfo struct {
// OriginalName is the name the workspace was trashed under.
OriginalName string `json:"original_name"`
// DeletedAt is the UTC time at which the workspace was moved to trash.
DeletedAt time.Time `json:"deleted_at"`
// DeletedBy identifies who triggered the deletion.
DeletedBy string `json:"deleted_by"`
// SizeBytes is the total size of the workspace's files, measured before the move.
SizeBytes int64 `json:"size_bytes"`
// PurgeAfter is the time after which the trashed workspace may be purged.
PurgeAfter time.Time `json:"purge_after"`
// Reason records why the workspace was deleted.
Reason string `json:"reason"`
}
// moveWorkspaceToTrash soft-deletes a workspace by renaming its directory to
// "<originalName>_<UTC timestamp>" inside the trash directory and writing a
// ".trashinfo" JSON record into it.
//
// workspacePath is the workspace directory (resolved via resolveWorkspacePath
// when possible). originalName must be a single path element: it becomes the
// first part of the trash entry name, so names containing a path separator, or
// equal to "." or "..", are rejected to keep the destination inside the trash
// directory.
//
// It returns the destination path and the metadata that was recorded. Errors
// are returned for blank or invalid arguments, when the trash directory cannot
// be created, or when the rename fails.
func (sm *ServiceManager) moveWorkspaceToTrash(workspacePath string, originalName string) (string, *trashInfo, error) {
	ws := strings.TrimSpace(workspacePath)
	if ws == "" {
		return "", nil, fmt.Errorf("workspace is required")
	}
	name := strings.TrimSpace(originalName)
	if name == "" {
		return "", nil, fmt.Errorf("original name is required")
	}
	if name == "." || name == ".." ||
		strings.Contains(name, "/") ||
		strings.ContainsRune(name, filepath.Separator) {
		return "", nil, fmt.Errorf("invalid original name: %q", originalName)
	}

	// Best-effort normalization: when the path cannot be resolved the trimmed
	// input is used as-is and the rename below decides the outcome.
	if resolved, err := resolveWorkspacePath(ws); err == nil {
		ws = resolved
	}

	trashDir := trashBaseDir()
	if err := os.MkdirAll(trashDir, 0o750); err != nil {
		return "", nil, fmt.Errorf("failed to create trash directory: %w", err)
	}

	// Read the clock once so the directory timestamp, DeletedAt and PurgeAfter
	// all describe the same instant.
	now := time.Now().UTC()
	dest := filepath.Join(trashDir, fmt.Sprintf("%s_%s", name, now.Format("20060102_150405")))

	// The size is informational only; a failed walk is recorded as zero.
	sizeBytes, _ := dirSizeBytes(ws)
	info := &trashInfo{
		OriginalName: name,
		DeletedAt:    now,
		DeletedBy:    "system",
		SizeBytes:    sizeBytes,
		PurgeAfter:   now.Add(30 * 24 * time.Hour),
		Reason:       "user_request",
	}

	if err := os.Rename(ws, dest); err != nil {
		return "", nil, fmt.Errorf("failed to move workspace to trash: %w", err)
	}

	// The metadata file is best-effort: the workspace has already been moved,
	// so a failure to write it must not fail the operation.
	if b, err := json.MarshalIndent(info, "", " "); err == nil {
		_ = os.WriteFile(filepath.Join(dest, ".trashinfo"), b, 0o600)
	}
	return dest, info, nil
}
// RestoreWorkspace moves the most recently trashed copy of the named workspace
// back under the workspace base directory and returns the restored path.
//
// Trash entries are directories named "<name>_<timestamp>" where the timestamp
// uses the layout 20060102_150405. Only entries whose suffix parses with that
// layout are considered, so restoring "foo" never picks up an entry belonging
// to a different workspace such as "foo_bar_<timestamp>".
//
// An error is returned when name is blank, the trash directory is missing or
// unreadable, no matching entry exists, the destination already exists, or the
// move fails. ctx is currently unused.
func (sm *ServiceManager) RestoreWorkspace(ctx context.Context, name string) (string, error) {
	_ = ctx
	wsName := strings.TrimSpace(name)
	if wsName == "" {
		return "", fmt.Errorf("workspace name is required")
	}

	base := trashBaseDir()
	entries, err := os.ReadDir(base)
	if err != nil {
		if os.IsNotExist(err) {
			return "", fmt.Errorf("no trash directory found")
		}
		return "", fmt.Errorf("failed to read trash directory: %w", err)
	}

	const tsLayout = "20060102_150405"
	prefix := wsName + "_"
	var best, bestTs string
	for _, e := range entries {
		if !e.IsDir() {
			continue
		}
		n := e.Name()
		if !strings.HasPrefix(n, prefix) {
			continue
		}
		ts := strings.TrimPrefix(n, prefix)
		// The remainder must be exactly a trash timestamp; otherwise this
		// entry belongs to a workspace whose name merely starts with prefix.
		if _, perr := time.Parse(tsLayout, ts); perr != nil {
			continue
		}
		// Timestamps are fixed-width digits, so string order is time order.
		if best == "" || ts > bestTs {
			best = n
			bestTs = ts
		}
	}
	if best == "" {
		return "", fmt.Errorf("no trashed workspace found for %q", wsName)
	}

	src := filepath.Join(base, best)
	workspaceBase := workspaceBaseDir()
	dest := filepath.Join(workspaceBase, wsName)
	if _, err := os.Stat(dest); err == nil {
		return "", fmt.Errorf("workspace %q already exists", wsName)
	}
	if err := os.MkdirAll(workspaceBase, 0o750); err != nil {
		return "", fmt.Errorf("failed to create workspace base directory: %w", err)
	}
	if err := os.Rename(src, dest); err != nil {
		return "", fmt.Errorf("failed to restore workspace: %w", err)
	}
	// The trash record is meaningless once restored; removal is best-effort.
	_ = os.Remove(filepath.Join(dest, ".trashinfo"))
	return dest, nil
}
// dirSizeBytes walks the tree rooted at path and returns the summed size, in
// bytes, of every non-directory entry. The first error met during the walk
// aborts it; in that case the total is reported as zero alongside the error.
func dirSizeBytes(path string) (int64, error) {
	var sum int64

	visit := func(_ string, entry os.DirEntry, walkErr error) error {
		if walkErr != nil {
			return walkErr
		}
		if entry.IsDir() {
			return nil
		}
		fi, infoErr := entry.Info()
		if infoErr != nil {
			return infoErr
		}
		sum += fi.Size()
		return nil
	}

	if walkErr := filepath.WalkDir(path, visit); walkErr != nil {
		return 0, walkErr
	}
	return sum, nil
}
// ServiceManager manages standalone Jupyter services: it starts and stops
// their containers through Podman, tracks them in memory, and persists the
// registry to disk.
type ServiceManager struct {
// logger receives operational and warning messages.
logger *logging.Logger
// podman is the container backend used to start, stop, inspect and exec.
podman *container.PodmanManager
// config supplies defaults (image, port, workspace) and the service limit.
config *ServiceConfig
// services is the in-memory registry, keyed by service ID.
services map[string]*JupyterService
// workspaceMetadataMgr stores workspace-to-experiment links.
workspaceMetadataMgr *WorkspaceMetadataManager
// securityMgr is the security manager built from the security config.
securityMgr *SecurityManager
// startupBlockedPkgs lists package names that fail container startup when
// found installed; an empty list disables the startup check.
startupBlockedPkgs []string
}
// splitPackageList parses a comma-separated list of package names. Whitespace
// around the whole value and around each item is removed, and empty items are
// skipped. A blank input yields nil; an input made only of separators yields
// an empty, non-nil slice.
func splitPackageList(value string) []string {
	trimmed := strings.TrimSpace(value)
	if trimmed == "" {
		return nil
	}

	fields := strings.Split(trimmed, ",")
	names := make([]string, 0, len(fields))
	for i := range fields {
		if name := strings.TrimSpace(fields[i]); name != "" {
			names = append(names, name)
		}
	}
	return names
}
// startupBlockedPackages decides which packages are checked for at container
// startup.
//
// When FETCHML_JUPYTER_STARTUP_BLOCKED_PACKAGES is not set at all, a copy of
// installBlocked is returned. When it is set to a blank value or to "off",
// "none" or "disabled" (case-insensitive), the check is disabled by returning
// an empty list. Any other value is parsed as a comma-separated package list.
func startupBlockedPackages(installBlocked []string) []string {
	raw, isSet := os.LookupEnv("FETCHML_JUPYTER_STARTUP_BLOCKED_PACKAGES")
	if !isSet {
		// Copy so later changes to the caller's slice do not leak in.
		inherited := make([]string, len(installBlocked))
		copy(inherited, installBlocked)
		return inherited
	}

	raw = strings.TrimSpace(raw)
	switch {
	case raw == "",
		strings.EqualFold(raw, "off"),
		strings.EqualFold(raw, "none"),
		strings.EqualFold(raw, "disabled"):
		return []string{}
	}
	return splitPackageList(raw)
}
// ServiceConfig holds configuration for Jupyter services: the defaults applied
// to start requests that leave a field unset, and the limit on how many
// services may be registered.
type ServiceConfig struct {
// DefaultImage is used when a start request names no image.
DefaultImage string `json:"default_image"`
// DefaultPort is used as the host port when a start request leaves it zero.
DefaultPort int `json:"default_port"`
// DefaultWorkspace is used when a start request names no workspace.
DefaultWorkspace string `json:"default_workspace"`
// MaxServices caps the number of registered services.
MaxServices int `json:"max_services"`
DefaultResources ResourceConfig `json:"default_resources"`
SecuritySettings SecurityConfig `json:"security_settings"`
NetworkConfig NetworkConfig `json:"network_config"`
}
// NetworkConfig defines network settings for Jupyter containers.
type NetworkConfig struct {
// HostPort is the host-side port mapped to ContainerPort.
HostPort int `json:"host_port"`
// ContainerPort is the port Jupyter listens on inside the container
// (start requests that leave it zero get 8888).
ContainerPort int `json:"container_port"`
BindAddress string `json:"bind_address"`
// EnableToken and Token control token authentication: when enabled with a
// non-empty token, the token is passed to the container and appended to
// the service URL.
EnableToken bool `json:"enable_token"`
Token string `json:"token"`
// EnablePassword and Password control password authentication: when
// enabled with a non-empty password, it is passed to the container.
EnablePassword bool `json:"enable_password"`
Password string `json:"password"`
AllowRemote bool `json:"allow_remote"`
NetworkName string `json:"network_name"`
}
// ResourceConfig defines resource limits for Jupyter containers. The values
// are forwarded unchanged to the container configuration.
type ResourceConfig struct {
// MemoryLimit is the memory limit string handed to the container runtime.
MemoryLimit string `json:"memory_limit"`
// CPULimit is the CPU limit string handed to the container runtime.
CPULimit string `json:"cpu_limit"`
// GPUDevices lists the GPU devices made available to the container.
GPUDevices []string `json:"gpu_devices"`
}
// SecurityConfig holds security settings for Jupyter services.
type SecurityConfig struct {
// AllowNetwork is forwarded to the container's network configuration.
AllowNetwork bool `json:"allow_network"`
AllowedHosts []string `json:"allowed_hosts"`
BlockedHosts []string `json:"blocked_hosts"`
EnableFirewall bool `json:"enable_firewall"`
TrustedChannels []string `json:"trusted_channels"`
// BlockedPackages lists package names that are not permitted.
BlockedPackages []string `json:"blocked_packages"`
// AllowedPackages is a set of package names (name -> true).
AllowedPackages map[string]bool `json:"allowed_packages"`
RequireApproval bool `json:"require_approval"`
// ReadOnlyRoot adds the "--read-only" option to the container.
ReadOnlyRoot bool `json:"read_only_root"`
// DropCapabilities adds one "--cap-drop=<cap>" option per entry.
DropCapabilities []string `json:"drop_capabilities"`
RunAsNonRoot bool `json:"run_as_non_root"`
EnableSeccomp bool `json:"enable_seccomp"`
NoNewPrivileges bool `json:"no_new_privileges"`
}
// JupyterService represents a Jupyter instance tracked by the ServiceManager.
// The whole struct is persisted as JSON in the services file.
type JupyterService struct {
// ID is the unique service identifier and the registry key.
ID string `json:"id"`
// Name is the user-supplied service name.
Name string `json:"name"`
// Status is the last known state, e.g. "running" or "stopped".
Status string `json:"status"`
// ContainerID identifies the backing container.
ContainerID string `json:"container_id"`
// Port is the host port the service is published on.
Port int `json:"port"`
// Workspace is the resolved workspace directory mounted into the container.
Workspace string `json:"workspace"`
// Image is the container image the service was started from.
Image string `json:"image"`
// URL is the access URL; it includes the token when token auth is enabled.
URL string `json:"url"`
CreatedAt time.Time `json:"created_at"`
// LastAccess is refreshed when the service is looked up or its status changes.
LastAccess time.Time `json:"last_access"`
// Config is a copy of the manager configuration at start time.
Config ServiceConfig `json:"config"`
Environment map[string]string `json:"environment"`
Metadata map[string]string `json:"metadata"`
}
// InstalledPackage describes one package found inside a service's container.
type InstalledPackage struct {
// Name is the package name as reported by the package manager.
Name string `json:"name"`
// Version is the installed version string.
Version string `json:"version"`
// Source is the package manager that reported it: "pip" or "conda".
Source string `json:"source"`
}
// StartRequest defines parameters for starting a Jupyter service. Validation
// fills unset fields from the manager's defaults and rewrites Workspace to its
// resolved path, so the request is modified in place.
type StartRequest struct {
// Name is required and must not clash with an existing service name.
Name string `json:"name"`
// Workspace is the workspace path; empty selects the configured default.
Workspace string `json:"workspace"`
// Image is the container image; empty selects the configured default.
Image string `json:"image"`
Port int `json:"port"`
Resources ResourceConfig `json:"resources"`
Security SecurityConfig `json:"security"`
// Network carries port mapping and authentication settings.
Network NetworkConfig `json:"network"`
// Environment holds extra environment variables for the container.
Environment map[string]string `json:"environment"`
// Metadata holds free-form key/value annotations stored on the service.
Metadata map[string]string `json:"metadata"`
}
// NewServiceManager creates a new Jupyter service manager.
//
// It builds the Podman manager, ensures the Jupyter state directory exists,
// wires up the workspace-metadata and security managers, and re-hydrates any
// previously persisted services. A failure to load persisted services is
// logged as a warning and does not fail construction.
//
// Two environment variables override the default security configuration when
// set to a non-empty value:
//   - FETCHML_JUPYTER_BLOCKED_PACKAGES replaces the blocked-package list.
//   - FETCHML_JUPYTER_ALLOWED_PACKAGES replaces the allowed-package set.
//
// Both are comma-separated and parsed with splitPackageList, so whitespace is
// trimmed and empty items (for example from a trailing comma) are dropped
// rather than being stored as an empty package name.
//
// An error is returned when the Podman manager or the state directory cannot
// be created.
func NewServiceManager(logger *logging.Logger, svcConfig *ServiceConfig) (*ServiceManager, error) {
	podman, err := container.NewPodmanManager(logger)
	if err != nil {
		return nil, fmt.Errorf("failed to create podman manager: %w", err)
	}

	// Initialize workspace metadata manager using PathRegistry
	paths := config.FromEnv()
	dataFile := paths.JupyterWorkspacesFile()
	if err := paths.EnsureDir(paths.JupyterStateDir()); err != nil {
		return nil, fmt.Errorf("failed to create jupyter state directory: %w", err)
	}
	workspaceMetadataMgr := NewWorkspaceMetadataManager(logger, dataFile)

	// Initialize security manager with enhanced config
	securityConfig := GetDefaultSecurityConfig()

	// Override blocked packages from environment variable if provided
	if blockedPkgs := os.Getenv("FETCHML_JUPYTER_BLOCKED_PACKAGES"); blockedPkgs != "" {
		securityConfig.BlockedPackages = splitPackageList(blockedPkgs)
	}

	// Override allowed packages from environment variable if provided
	if allowedPkgs := os.Getenv("FETCHML_JUPYTER_ALLOWED_PACKAGES"); allowedPkgs != "" {
		allowed := splitPackageList(allowedPkgs)
		securityConfig.AllowedPackages = make(map[string]bool, len(allowed))
		for _, pkg := range allowed {
			securityConfig.AllowedPackages[pkg] = true
		}
	}

	securityMgr := NewSecurityManager(logger, securityConfig)
	startupBlockedPkgs := startupBlockedPackages(securityConfig.BlockedPackages)

	sm := &ServiceManager{
		logger:               logger,
		podman:               podman,
		config:               svcConfig,
		services:             make(map[string]*JupyterService),
		workspaceMetadataMgr: workspaceMetadataMgr,
		securityMgr:          securityMgr,
		startupBlockedPkgs:   startupBlockedPkgs,
	}

	// Load existing services
	if err := sm.loadServices(); err != nil {
		logger.Warn("failed to load existing services", "error", err)
	}
	return sm, nil
}
// StartService starts a new Jupyter service.
//
// The request is validated (and filled with defaults) first, then the service
// limit is enforced, a container is started, the startup package blacklist is
// checked, and the call waits for Jupyter to come up. If the blacklist check
// or the readiness wait fails, the container is stopped and the error is
// returned. On success the service is registered, annotated with the linked
// experiment when the workspace has one, and the registry is saved (a save
// failure is only logged).
//
// req must not be nil. The returned service shares req.Metadata when the
// request supplied one; a nil map is replaced by a fresh one before
// annotations are written.
func (sm *ServiceManager) StartService(
	ctx context.Context,
	req *StartRequest,
) (*JupyterService, error) {
	if req == nil {
		return nil, fmt.Errorf("start request is required")
	}

	// Validate request
	if err := sm.validateStartRequest(req); err != nil {
		return nil, err
	}

	// Check service limit
	if len(sm.services) >= sm.config.MaxServices {
		return nil, fmt.Errorf("maximum number of services (%d) reached", sm.config.MaxServices)
	}

	// Generate service ID
	serviceID := sm.generateServiceID(req.Name)

	// Prepare container configuration
	containerConfig := sm.prepareContainerConfig(serviceID, req)

	// Start container
	containerID, err := sm.podman.StartContainer(ctx, containerConfig)
	if err != nil {
		return nil, fmt.Errorf("failed to start container: %w", err)
	}

	// Check for blacklisted packages in the container
	if err := sm.checkPackageBlacklist(ctx, containerID); err != nil {
		// Cleanup on blacklist violation (best-effort).
		_ = sm.podman.StopContainer(ctx, containerID)
		return nil, err
	}

	// Wait for Jupyter to be ready
	url, err := sm.waitForJupyterReady(ctx, containerID, req.Network)
	if err != nil {
		// Cleanup on failure (best-effort).
		_ = sm.podman.StopContainer(ctx, containerID)
		return nil, fmt.Errorf("jupyter failed to start: %w", err)
	}

	// Annotations are written below; assigning into a nil map would panic.
	metadata := req.Metadata
	if metadata == nil {
		metadata = make(map[string]string)
	}

	// Create service record
	now := time.Now()
	service := &JupyterService{
		ID:          serviceID,
		Name:        req.Name,
		Status:      serviceStatusRunning,
		ContainerID: containerID,
		Port:        req.Network.HostPort,
		Workspace:   req.Workspace,
		Image:       req.Image,
		URL:         url,
		CreatedAt:   now,
		LastAccess:  now,
		Config:      *sm.config,
		Environment: req.Environment,
		Metadata:    metadata,
	}

	// Store service
	sm.services[serviceID] = service

	// Check if workspace is linked with an experiment
	if sm.workspaceMetadataMgr != nil {
		if workspaceMeta, err := sm.workspaceMetadataMgr.GetWorkspaceMetadata(req.Workspace); err == nil {
			service.Metadata["experiment_id"] = workspaceMeta.ExperimentID
			service.Metadata["linked_at"] = strconv.FormatInt(workspaceMeta.LinkedAt.Unix(), 10)
			sm.logger.Info("service started with linked experiment",
				"service_id", serviceID,
				"experiment_id", workspaceMeta.ExperimentID)
		}
	}

	// Save services to disk
	if err := sm.saveServices(); err != nil {
		sm.logger.Warn("failed to save services", "error", err)
	}

	// The URL may embed the access token; never log it verbatim.
	sm.logger.Info("jupyter service started",
		"service_id", serviceID,
		"name", req.Name,
		"url", stripTokenFromURL(url),
		"workspace", req.Workspace)
	return service, nil
}
// checkPackageBlacklist inspects the packages installed in the container and
// returns an error naming every one that appears (case-insensitively) in the
// startup blacklist. With an empty blacklist the check is skipped. If the
// installed packages cannot be listed, a warning is logged and startup is
// allowed to continue.
func (sm *ServiceManager) checkPackageBlacklist(ctx context.Context, containerID string) error {
	if len(sm.startupBlockedPkgs) == 0 {
		return nil
	}

	installed, err := sm.getInstalledPackages(ctx, containerID)
	if err != nil {
		// Listing failed: report it but do not block startup.
		sm.logger.Warn("failed to get installed packages for blacklist check", "error", err)
		return nil
	}

	isBlocked := func(pkg string) bool {
		for _, banned := range sm.startupBlockedPkgs {
			if strings.EqualFold(banned, pkg) {
				return true
			}
		}
		return false
	}

	var violations []string
	for _, pkg := range installed {
		if isBlocked(pkg) {
			violations = append(violations, pkg)
		}
	}
	if len(violations) == 0 {
		return nil
	}

	return fmt.Errorf("container startup failed: blacklisted packages detected: %v. "+
		"These packages are blocked by security policy. "+
		"Remove them from the image or use FETCHML_JUPYTER_STARTUP_BLOCKED_PACKAGES to configure the startup blacklist",
		violations)
}
// getInstalledPackages retrieves the names of packages installed in the
// container, querying both pip ("pip list --format=freeze") and conda
// ("conda list --export") and merging the results without duplicates, in
// first-seen order.
//
// Either tool may be missing from an image, so a failure of just one of them
// is tolerated. An error is returned only when both commands fail, because in
// that case nothing could be inspected and an empty result would be
// indistinguishable from "no packages installed".
func (sm *ServiceManager) getInstalledPackages(ctx context.Context, containerID string) ([]string, error) {
	var packages []string

	// Try pip list first
	pipOutput, pipErr := sm.podman.ExecContainer(ctx, containerID, []string{"pip", "list", "--format=freeze"})
	if pipErr == nil && pipOutput != "" {
		packages = append(packages, sm.parsePipList(pipOutput)...)
	}

	// Try conda list as well
	condaOutput, condaErr := sm.podman.ExecContainer(ctx, containerID, []string{"conda", "list", "--export"})
	if condaErr == nil && condaOutput != "" {
		packages = append(packages, sm.parseCondaList(condaOutput)...)
	}

	if pipErr != nil && condaErr != nil {
		return nil, fmt.Errorf("failed to list installed packages: pip: %w (conda: %v)", pipErr, condaErr)
	}

	// Remove duplicates
	seen := make(map[string]bool, len(packages))
	var result []string
	for _, pkg := range packages {
		if seen[pkg] {
			continue
		}
		seen[pkg] = true
		result = append(result, pkg)
	}
	return result, nil
}
// parsePipList extracts package names from "pip list --format=freeze" output,
// where each entry has the form "package==version". Blank lines, lines
// starting with "#", and entries with an empty name are skipped. Lines
// without "==" are taken whole as the name.
func (sm *ServiceManager) parsePipList(output string) []string {
	var names []string
	for _, raw := range strings.Split(output, "\n") {
		entry := strings.TrimSpace(raw)
		if entry == "" || strings.HasPrefix(entry, "#") {
			continue
		}
		// Keep only what precedes the first "==".
		if sep := strings.Index(entry, "=="); sep >= 0 {
			entry = entry[:sep]
		}
		if name := strings.TrimSpace(entry); name != "" {
			names = append(names, name)
		}
	}
	return names
}
// serviceByName looks up a registered service by name, ignoring case and
// surrounding whitespace. It returns nil when name is blank or no service
// matches. If several services share the name, which one is returned is
// unspecified because the registry is a map.
func (sm *ServiceManager) serviceByName(name string) *JupyterService {
	wanted := strings.TrimSpace(name)
	if wanted == "" {
		return nil
	}
	for _, candidate := range sm.services {
		if candidate != nil && strings.EqualFold(strings.TrimSpace(candidate.Name), wanted) {
			return candidate
		}
	}
	return nil
}
// listInstalledPackages returns the packages installed in the container with
// their versions, as reported by "pip list --format=json" followed by
// "conda list --json". A tool whose command fails or whose output is not
// valid JSON simply contributes nothing. Entries with a blank name are
// dropped, and duplicates (same name and source, case-insensitive) are
// removed keeping the first occurrence. The returned error is always nil.
func (sm *ServiceManager) listInstalledPackages(ctx context.Context, containerID string) ([]InstalledPackage, error) {
	// Queried in this order; results are appended in the same order.
	queries := []struct {
		source string
		argv   []string
	}{
		{source: "pip", argv: []string{"pip", "list", "--format=json"}},
		{source: "conda", argv: []string{"conda", "list", "--json"}},
	}

	var collected []InstalledPackage
	for _, q := range queries {
		raw, err := sm.podman.ExecContainer(ctx, containerID, q.argv)
		if err != nil {
			continue
		}
		var rows []struct {
			Name    string `json:"name"`
			Version string `json:"version"`
		}
		if json.Unmarshal([]byte(raw), &rows) != nil {
			continue
		}
		for _, row := range rows {
			name := strings.TrimSpace(row.Name)
			if name == "" {
				continue
			}
			collected = append(collected, InstalledPackage{
				Name:    name,
				Version: strings.TrimSpace(row.Version),
				Source:  q.source,
			})
		}
	}

	seen := make(map[string]bool)
	unique := make([]InstalledPackage, 0, len(collected))
	for _, pkg := range collected {
		key := strings.ToLower(strings.TrimSpace(pkg.Name)) + ":" + strings.ToLower(strings.TrimSpace(pkg.Source))
		if !seen[key] {
			seen[key] = true
			unique = append(unique, pkg)
		}
	}
	return unique, nil
}
// ListInstalledPackages returns the packages installed in the container of the
// service with the given name (matched case-insensitively). It fails when the
// manager is nil, the service is unknown, or the service has no container ID.
func (sm *ServiceManager) ListInstalledPackages(ctx context.Context, serviceName string) ([]InstalledPackage, error) {
	if sm == nil {
		return nil, fmt.Errorf("service manager is nil")
	}

	target := sm.serviceByName(serviceName)
	switch {
	case target == nil:
		return nil, fmt.Errorf("service %s not found", strings.TrimSpace(serviceName))
	case strings.TrimSpace(target.ContainerID) == "":
		return nil, fmt.Errorf("service container not available")
	}
	return sm.listInstalledPackages(ctx, target.ContainerID)
}
// parseCondaList extracts package names from "conda list --export" output,
// where each entry has the form "package=version=build". Blank lines, lines
// starting with "#", and entries with an empty name are skipped. Lines
// without "=" are taken whole as the name.
func (sm *ServiceManager) parseCondaList(output string) []string {
	var names []string
	for _, raw := range strings.Split(output, "\n") {
		entry := strings.TrimSpace(raw)
		if entry == "" || strings.HasPrefix(entry, "#") {
			continue
		}
		// Keep only what precedes the first "=".
		if sep := strings.Index(entry, "="); sep >= 0 {
			entry = entry[:sep]
		}
		if name := strings.TrimSpace(entry); name != "" {
			names = append(names, name)
		}
	}
	return names
}
// ParsePipList parses pip list --format=freeze output and returns the package
// names. It is a package-level entry point to the parser that does not
// require a configured ServiceManager.
func ParsePipList(output string) []string {
	return new(ServiceManager).parsePipList(output)
}
// ParseCondaList parses conda list --export output and returns the package
// names. It is a package-level entry point to the parser that does not
// require a configured ServiceManager.
func ParseCondaList(output string) []string {
	return new(ServiceManager).parseCondaList(output)
}
// PrepareContainerConfig builds the Podman container config for a start
// request, using a zero-value ServiceManager so no configured manager is
// needed.
func PrepareContainerConfig(serviceID string, req *StartRequest) *container.ContainerConfig {
	return new(ServiceManager).prepareContainerConfig(serviceID, req)
}
// MoveWorkspaceToTrash moves a workspace directory to the configured trash
// directory and returns the path it was moved to. The trash metadata produced
// by the move is discarded.
func MoveWorkspaceToTrash(workspacePath string, originalName string) (string, error) {
	dest, _, err := new(ServiceManager).moveWorkspaceToTrash(workspacePath, originalName)
	return dest, err
}
// RestoreWorkspace restores the most recently trashed workspace for the given
// name and returns the restored path, using a zero-value ServiceManager so no
// configured manager is needed.
func RestoreWorkspace(ctx context.Context, name string) (string, error) {
	return new(ServiceManager).RestoreWorkspace(ctx, name)
}
// StopService stops a Jupyter service: its container is stopped and removed
// (failures of either step are only logged), the service is dropped from the
// registry, and the registry is saved. An error is returned only when the
// service ID is unknown.
func (sm *ServiceManager) StopService(ctx context.Context, serviceID string) error {
	svc, found := sm.services[serviceID]
	if !found {
		return fmt.Errorf("service %s not found", serviceID)
	}

	containerID := svc.ContainerID
	if stopErr := sm.podman.StopContainer(ctx, containerID); stopErr != nil {
		sm.logger.Warn("failed to stop container", "service_id", serviceID, "error", stopErr)
	}
	if rmErr := sm.podman.RemoveContainer(ctx, containerID); rmErr != nil {
		sm.logger.Warn("failed to remove container", "service_id", serviceID, "error", rmErr)
	}

	// Reflect the new state on the record for anyone still holding it.
	svc.Status = "stopped"
	svc.LastAccess = time.Now()

	// Stopped services are not kept in the registry.
	delete(sm.services, serviceID)

	if saveErr := sm.saveServices(); saveErr != nil {
		sm.logger.Warn("failed to save services", "error", saveErr)
	}

	sm.logger.Info("jupyter service stopped", "service_id", serviceID, "name", svc.Name)
	return nil
}
// RemoveService removes a Jupyter service. If purge is false, it soft-deletes
// the workspace by moving it to trash; if purge is true, the workspace
// directory is deleted outright.
//
// The container is stopped (failure is logged) and removed (failure aborts the
// call). The workspace's experiment link is then dropped on a best-effort
// basis before the workspace itself is handled.
//
// The stored workspace path is re-validated first. If it cannot be resolved
// (for example because it points above the starting directory), no filesystem
// action is taken on it: the problem is logged and the service is still
// removed from the registry. This keeps a bad or tampered path from being
// passed to os.RemoveAll or moved into trash.
//
// An error is returned when the service is unknown, the container cannot be
// removed, or the workspace cannot be deleted or trashed.
func (sm *ServiceManager) RemoveService(ctx context.Context, serviceID string, purge bool) error {
	service, exists := sm.services[serviceID]
	if !exists {
		return fmt.Errorf("service %s not found", serviceID)
	}

	// Stop container first
	if err := sm.podman.StopContainer(ctx, service.ContainerID); err != nil {
		sm.logger.Warn("failed to stop container before removal", "service_id", serviceID, "error", err)
	}

	// Remove container
	if err := sm.podman.RemoveContainer(ctx, service.ContainerID); err != nil {
		sm.logger.Warn("failed to remove container", "service_id", serviceID, "error", err)
		return fmt.Errorf("failed to remove container: %w", err)
	}

	ws := strings.TrimSpace(service.Workspace)

	// Best-effort: unlink workspace metadata.
	if sm.workspaceMetadataMgr != nil && ws != "" {
		_ = sm.workspaceMetadataMgr.UnlinkWorkspace(service.Workspace)
	}

	// Workspace deletion policy.
	if ws != "" {
		wsPath, resolveErr := resolveWorkspacePath(ws)
		switch {
		case resolveErr != nil:
			// Fail closed: never delete or move a path that did not validate.
			sm.logger.Warn("skipping workspace cleanup: workspace path could not be resolved",
				"service_id", serviceID, "workspace", ws, "error", resolveErr)
		case purge:
			if err := os.RemoveAll(wsPath); err != nil && !os.IsNotExist(err) {
				return fmt.Errorf("failed to delete workspace: %w", err)
			}
		default:
			dest, info, err := sm.moveWorkspaceToTrash(wsPath, strings.TrimSpace(service.Name))
			if err != nil {
				return err
			}
			// Record where the workspace went on the service record so
			// callers still holding it can find the trashed copy.
			if service.Metadata == nil {
				service.Metadata = make(map[string]string)
			}
			service.Metadata["trash_path"] = dest
			service.Metadata["purge_after"] = strconv.FormatInt(info.PurgeAfter.Unix(), 10)
		}
	}

	// Remove from active services
	delete(sm.services, serviceID)

	// Save services to disk
	if err := sm.saveServices(); err != nil {
		sm.logger.Warn("failed to save services", "error", err)
	}

	sm.logger.Info("jupyter service removed", "service_id", serviceID, "name", service.Name, "purge", purge)
	return nil
}
// GetService retrieves a service by ID and stamps its LastAccess with the
// current time. An unknown ID yields an error.
func (sm *ServiceManager) GetService(serviceID string) (*JupyterService, error) {
	if svc, found := sm.services[serviceID]; found {
		svc.LastAccess = time.Now()
		return svc, nil
	}
	return nil, fmt.Errorf("service %s not found", serviceID)
}
// ListServices returns every registered service as a new slice. The order is
// unspecified because the registry is a map.
func (sm *ServiceManager) ListServices() []*JupyterService {
	all := make([]*JupyterService, 0, len(sm.services))
	for id := range sm.services {
		all = append(all, sm.services[id])
	}
	return all
}
// GetServiceStatus returns the current status of a service as reported by the
// container backend. When the reported status differs from the stored one,
// the record is updated, its LastAccess refreshed, and the registry saved
// (best-effort). An unknown ID yields an empty status and an error; a backend
// failure yields "unknown" and the backend's error.
func (sm *ServiceManager) GetServiceStatus(ctx context.Context, serviceID string) (string, error) {
	svc, found := sm.services[serviceID]
	if !found {
		return "", fmt.Errorf("service %s not found", serviceID)
	}

	current, err := sm.podman.GetContainerStatus(ctx, svc.ContainerID)
	if err != nil {
		sm.logger.Warn("failed to get container status", "service_id", serviceID, "error", err)
		return "unknown", err
	}

	if svc.Status == current {
		return current, nil
	}

	// The container moved to a new state: record and persist it.
	svc.Status = current
	svc.LastAccess = time.Now()
	_ = sm.saveServices()
	return current, nil
}
// validateStartRequest validates a start request and fills in defaults.
//
// The request is modified in place: an empty Workspace, Image, host port or
// container port is replaced by the configured default (8888 for the
// container port), and Workspace is rewritten to its resolved path. The
// workspace directory is created if it does not exist.
//
// An error is returned when req is nil, the name is blank (whitespace-only
// names count as blank because name comparisons trim whitespace), the
// workspace path is invalid, another service already uses the same name or
// workspace path, the workspace directory cannot be created, or the host port
// is held by a running service.
func (sm *ServiceManager) validateStartRequest(req *StartRequest) error {
	if req == nil {
		return fmt.Errorf("start request is required")
	}
	if strings.TrimSpace(req.Name) == "" {
		return fmt.Errorf("service name is required")
	}
	if req.Workspace == "" {
		req.Workspace = sm.config.DefaultWorkspace
	}

	// Resolve/normalize workspace path before comparing.
	wsPath, err := resolveWorkspacePath(req.Workspace)
	if err != nil {
		return err
	}
	req.Workspace = wsPath

	// Enforce reproducibility: do not allow creating a service/workspace with the same name
	// as any existing service (regardless of status).
	for _, svc := range sm.services {
		if svc == nil {
			continue
		}
		if strings.EqualFold(strings.TrimSpace(svc.Name), strings.TrimSpace(req.Name)) {
			return fmt.Errorf("a jupyter service/workspace named %q already exists", req.Name)
		}
		if svc.Workspace != "" && svc.Workspace == req.Workspace {
			return fmt.Errorf("workspace path %q is already in use by service %q", req.Workspace, svc.Name)
		}
	}

	// Ensure workspace directory exists.
	if err := os.MkdirAll(req.Workspace, 0o750); err != nil {
		return fmt.Errorf("failed to create workspace directory: %w", err)
	}

	if req.Image == "" {
		req.Image = sm.config.DefaultImage
	}
	if req.Network.HostPort == 0 {
		req.Network.HostPort = sm.config.DefaultPort
	}
	if req.Network.ContainerPort == 0 {
		req.Network.ContainerPort = 8888
	}

	// Check for port conflicts. Only running services hold their port.
	for _, svc := range sm.services {
		if svc == nil {
			continue
		}
		if svc.Port == req.Network.HostPort && svc.Status == serviceStatusRunning {
			return fmt.Errorf("port %d is already in use by service %s", req.Network.HostPort, svc.Name)
		}
	}
	return nil
}
// generateServiceID builds a service ID of the form
// "jupyter-<name>-<unix seconds>", where the name is lower-cased and its
// spaces are replaced by hyphens.
func (sm *ServiceManager) generateServiceID(name string) string {
	slug := strings.ToLower(strings.ReplaceAll(name, " ", "-"))
	return fmt.Sprintf("jupyter-%s-%d", slug, time.Now().Unix())
}
// prepareContainerConfig prepares container configuration with secret support.
//
// From the start request it derives: the workspace volume mount, the
// environment (with sensitive values split out as secrets), the host-to-
// container port mapping, the bash start-up command, and the security options.
// Images whose name contains "jupyter/" are treated as public Jupyter images
// and get a different mount point and start command than other images.
//
// The conda environment and kernel name used in the start command come from
// the FETCHML_JUPYTER_CONDA_ENV and FETCHML_JUPYTER_KERNEL_NAME environment
// variables of this process.
// NOTE(review): those values are interpolated into a shell script without
// quoting, so they must be trusted operator input — confirm.
func (sm *ServiceManager) prepareContainerConfig(
serviceID string,
req *StartRequest,
) *container.ContainerConfig {
imageLower := strings.ToLower(strings.TrimSpace(req.Image))
// The second test also matches everything the first one matches.
isPublicJupyter := strings.Contains(imageLower, "quay.io/jupyter/") || strings.Contains(imageLower, "jupyter/")
// Prepare volume mounts
volumes := map[string]string{}
workspaceMount := "/workspace"
if isPublicJupyter {
// Public Jupyter images get the workspace under the jovyan home directory.
workspaceMount = "/home/jovyan/work"
}
volumes[req.Workspace] = workspaceMount
// Prepare environment variables (including sensitive ones)
rawEnv := map[string]string{
"JUPYTER_ENABLE_LAB": "yes",
}
if req.Network.EnableToken && req.Network.Token != "" {
rawEnv["JUPYTER_TOKEN"] = req.Network.Token
} else {
rawEnv["JUPYTER_TOKEN"] = "" // No token for development
}
if req.Network.EnablePassword && req.Network.Password != "" {
rawEnv["JUPYTER_PASSWORD"] = req.Network.Password
}
// Add custom environment variables
// These are applied last, so they override the defaults set above.
for k, v := range req.Environment {
rawEnv[k] = v
}
// Sanitize environment - extract sensitive values as secrets
// NOTE(review): presumably SanitizeContainerEnv moves entries matching
// these keys out of the plain environment — verify against its definition.
sensitiveKeys := []string{"JUPYTER_TOKEN", "JUPYTER_PASSWORD", "SECRET", "PASSWORD", "API_KEY", "TOKEN"}
secrets, cleanEnv := container.SanitizeContainerEnv(rawEnv, sensitiveKeys)
// Prepare port mappings
ports := map[int]int{
req.Network.HostPort: req.Network.ContainerPort,
}
// Prepare container command (uses cleanEnv variables)
var cmd []string
if isPublicJupyter {
// Public Jupyter image: default conda env is "base" and the server is
// launched through start-notebook.sh.
condaEnv := strings.TrimSpace(os.Getenv("FETCHML_JUPYTER_CONDA_ENV"))
if condaEnv == "" {
condaEnv = "base"
}
kernelName := strings.TrimSpace(os.Getenv("FETCHML_JUPYTER_KERNEL_NAME"))
if kernelName == "" {
kernelName = condaEnv
}
displayName := fmt.Sprintf("Python (%s)", kernelName)
// With token auth disabled, pass explicitly empty tokens on the command line.
jupyterTokenArg := ""
if !req.Network.EnableToken {
jupyterTokenArg = " --NotebookApp.token= --ServerApp.token="
}
// The script registers an ipykernel for the conda env when ipykernel is
// importable there, then replaces the shell with the notebook server.
script := fmt.Sprintf(
"set -euo pipefail; "+
"if conda run -n %s python -c 'import ipykernel' >/dev/null 2>&1; then "+
"conda run -n %s python -m ipykernel install --user --name %s --display-name %q; "+
"fi; "+
"exec start-notebook.sh --ip=0.0.0.0 --port=%d --no-browser --notebook-dir=%s%s",
condaEnv,
condaEnv,
kernelName,
displayName,
req.Network.ContainerPort,
workspaceMount,
jupyterTokenArg,
)
cmd = []string{"bash", "-lc", script}
} else {
// Other images: default conda env is "ml_env" and the server is launched
// with "jupyter notebook" inside that env.
condaEnv := strings.TrimSpace(os.Getenv("FETCHML_JUPYTER_CONDA_ENV"))
if condaEnv == "" {
condaEnv = "ml_env"
}
kernelName := strings.TrimSpace(os.Getenv("FETCHML_JUPYTER_KERNEL_NAME"))
if kernelName == "" {
kernelName = condaEnv
}
displayName := fmt.Sprintf("Python (%s)", kernelName)
// With token auth disabled, pass an explicitly empty token on the command line.
jupyterTokenArg := ""
if !req.Network.EnableToken {
jupyterTokenArg = " --NotebookApp.token="
}
script := fmt.Sprintf(
"set -euo pipefail; "+
"if conda run -n %s python -c 'import ipykernel' >/dev/null 2>&1; then "+
"conda run -n %s python -m ipykernel install --user --name %s --display-name %q; "+
"fi; "+
"exec conda run -n %s jupyter notebook --no-browser --ip=0.0.0.0 --port=%d --NotebookApp.allow-root=True --NotebookApp.ip=0.0.0.0%s",
condaEnv,
condaEnv,
kernelName,
displayName,
condaEnv,
req.Network.ContainerPort,
jupyterTokenArg,
)
cmd = []string{"bash", "-lc", script}
}
// Prepare security options
securityOpts := []string{}
if req.Security.ReadOnlyRoot {
securityOpts = append(securityOpts, "--read-only")
}
for _, cap := range req.Security.DropCapabilities {
securityOpts = append(securityOpts, fmt.Sprintf("--cap-drop=%s", cap))
}
// The service ID doubles as the container name.
return &container.ContainerConfig{
Name: serviceID,
Image: req.Image,
Command: cmd,
Env: cleanEnv,
Secrets: secrets,
Volumes: volumes,
Ports: ports,
SecurityOpts: securityOpts,
Resources: container.ResourceConfig{
MemoryLimit: req.Resources.MemoryLimit,
CPULimit: req.Resources.CPULimit,
GPUDevices: req.Resources.GPUDevices,
},
Network: container.NetworkConfig{
AllowNetwork: req.Security.AllowNetwork,
},
}
}
// waitForJupyterReady waits for Jupyter to be ready and returns the URL.
//
// The container state is polled every 2 seconds for up to 60 seconds. Once it
// is running, a further fixed 5-second grace period gives Jupyter time to
// initialize, after which the URL "http://localhost:<HostPort>" is returned,
// with "?token=<Token>" appended when token auth is enabled with a non-empty
// token.
//
// An error is returned when the state cannot be queried, the container reaches
// "exited" or "dead", the container is still not running when the deadline
// passes, or ctx is cancelled during either wait.
func (sm *ServiceManager) waitForJupyterReady(
	ctx context.Context,
	containerID string,
	networkConfig NetworkConfig,
) (string, error) {
	const (
		maxWait     = 60 * time.Second
		interval    = 2 * time.Second
		gracePeriod = 5 * time.Second
	)

	// Wait for container to be running
	running := false
	deadline := time.Now().Add(maxWait)
	for !running && time.Now().Before(deadline) {
		status, err := sm.podman.GetContainerStateStatus(ctx, containerID)
		if err != nil {
			return "", fmt.Errorf("failed to check container status: %w", err)
		}
		switch status {
		case serviceStatusRunning:
			running = true
			continue
		case "exited", "dead":
			return "", fmt.Errorf("container failed to start (status: %s)", status)
		}

		select {
		case <-ctx.Done():
			return "", ctx.Err()
		case <-time.After(interval):
		}
	}
	if !running {
		// Without this check an expired deadline would be reported as success.
		return "", fmt.Errorf("timed out after %s waiting for container to start", maxWait)
	}

	// Wait a bit more for Jupyter to initialize, but stay cancellable.
	select {
	case <-ctx.Done():
		return "", ctx.Err()
	case <-time.After(gracePeriod):
	}

	// Construct URL
	url := fmt.Sprintf("http://localhost:%d", networkConfig.HostPort)
	if networkConfig.EnableToken && networkConfig.Token != "" {
		url += fmt.Sprintf("?token=%s", networkConfig.Token)
	}
	return url, nil
}
// loadServices loads existing services from disk using PathRegistry.
//
// It reads the services file (a JSON object keyed by service ID), refreshes
// each service's status from the container backend, collapses entries that
// share a name into one, replaces the in-memory registry with the result, and
// writes the cleaned registry back. A missing services file is not an error.
// Read and JSON errors are returned; status-check failures are only logged.
func (sm *ServiceManager) loadServices() error {
paths := config.FromEnv()
servicesFile := paths.JupyterServicesFile()
if err := paths.EnsureDir(paths.JupyterStateDir()); err != nil {
return fmt.Errorf("failed to create jupyter state directory: %w", err)
}
data, err := os.ReadFile(servicesFile)
if err != nil {
if os.IsNotExist(err) {
return nil // No existing services
}
return err
}
var services map[string]*JupyterService
if err := json.Unmarshal(data, &services); err != nil {
return err
}
// Reset in-memory map before re-hydrating from disk.
sm.services = make(map[string]*JupyterService)
// Normalize service status + de-dupe by service name.
// Prefer: running service > newest CreatedAt.
byName := make(map[string]*JupyterService)
for id, service := range services {
if service == nil {
continue
}
// Ensure stable ID even if old payloads didn't persist it.
if strings.TrimSpace(service.ID) == "" {
service.ID = id
}
// Each status probe gets its own 5-second budget; cancel is called right
// after the probe rather than deferred because this runs inside a loop.
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
status, err := sm.podman.GetContainerStateStatus(ctx, service.ContainerID)
cancel()
// Anything that is not confirmed running is recorded as stopped.
if err != nil {
sm.logger.Warn("failed to check service status", "service_id", id, "error", err)
service.Status = "stopped"
} else if status == serviceStatusRunning {
service.Status = serviceStatusRunning
} else {
service.Status = "stopped"
}
// Names are compared case-insensitively; unnamed services fall back to
// their ID so they never collide with each other.
nameKey := strings.ToLower(strings.TrimSpace(service.Name))
if nameKey == "" {
nameKey = service.ID
}
if existing, ok := byName[nameKey]; ok {
// prefer running
if existing.Status != serviceStatusRunning && service.Status == serviceStatusRunning {
byName[nameKey] = service
continue
}
if existing.Status == serviceStatusRunning && service.Status != serviceStatusRunning {
continue
}
// otherwise prefer newest
if service.CreatedAt.After(existing.CreatedAt) {
byName[nameKey] = service
}
continue
}
byName[nameKey] = service
}
// Re-key the survivors by service ID for the in-memory registry.
for _, service := range byName {
if service == nil {
continue
}
if strings.TrimSpace(service.ID) == "" {
continue
}
sm.services[service.ID] = service
}
// Best-effort: persist the cleaned registry to avoid accumulating duplicates.
_ = sm.saveServices()
return nil
}
// saveServices serializes sm.services as indented JSON and writes it to the
// registry file whose location comes from the config PathRegistry
// (paths.JupyterServicesFile()), after paths.EnsureDir has been called on the
// Jupyter state directory. The file is created with permission 0600 when it
// does not already exist.
//
// Every failure is returned wrapped with the step that failed, so callers can
// tell a marshalling problem from a filesystem one.
func (sm *ServiceManager) saveServices() error {
	paths := config.FromEnv()
	servicesFile := paths.JupyterServicesFile()
	if err := paths.EnsureDir(paths.JupyterStateDir()); err != nil {
		return fmt.Errorf("failed to create jupyter state directory: %w", err)
	}

	data, err := json.MarshalIndent(sm.services, "", " ")
	if err != nil {
		return fmt.Errorf("failed to marshal jupyter services: %w", err)
	}

	if err := os.WriteFile(servicesFile, data, 0600); err != nil {
		return fmt.Errorf("failed to write jupyter services file: %w", err)
	}
	return nil
}
// LinkWorkspaceWithExperiment associates a workspace with an experiment and a
// service by delegating to the workspace metadata manager's LinkWorkspace.
// The manager's error, if any, is returned unchanged.
func (sm *ServiceManager) LinkWorkspaceWithExperiment(
	workspacePath,
	experimentID,
	serviceID string,
) error {
	err := sm.workspaceMetadataMgr.LinkWorkspace(workspacePath, experimentID, serviceID)
	return err
}
// GetWorkspaceMetadata looks up the metadata for workspacePath by delegating
// to the workspace metadata manager. Both the result and the error are passed
// through unchanged.
func (sm *ServiceManager) GetWorkspaceMetadata(workspacePath string) (*WorkspaceMetadata, error) {
	meta, err := sm.workspaceMetadataMgr.GetWorkspaceMetadata(workspacePath)
	return meta, err
}
// SyncWorkspaceWithExperiment records a sync between a workspace and an
// experiment. At present it only asks the workspace metadata manager to update
// the sync time for the given direction and logs completion; no data is
// transferred yet, and the context parameter is unused. A failure to update
// the metadata is logged as a warning and does not fail the call, so the
// method always returns nil.
func (sm *ServiceManager) SyncWorkspaceWithExperiment(
	_ context.Context,
	workspacePath,
	experimentID,
	direction string,
) error {
	// Stamp the metadata first; a failure here is only worth a warning.
	syncErr := sm.workspaceMetadataMgr.UpdateSyncTime(workspacePath, direction)
	if syncErr != nil {
		sm.logger.Warn("failed to update sync time", "error", syncErr)
	}

	// Placeholder for the real transfer:
	//   - "pull": download experiment data/metrics into the workspace
	//   - "push": upload workspace notebooks/results to the experiment
	sm.logger.Info(
		"workspace sync completed",
		"workspace", workspacePath,
		"experiment_id", experimentID,
		"direction", direction,
	)
	return nil
}
// ListLinkedWorkspaces returns whatever the workspace metadata manager reports
// as its linked workspaces, unchanged.
func (sm *ServiceManager) ListLinkedWorkspaces() []*WorkspaceMetadata {
	linked := sm.workspaceMetadataMgr.ListLinkedWorkspaces()
	return linked
}
// GetWorkspacesForExperiment returns the workspaces the metadata manager
// reports for experimentID, unchanged.
func (sm *ServiceManager) GetWorkspacesForExperiment(experimentID string) []*WorkspaceMetadata {
	matches := sm.workspaceMetadataMgr.GetWorkspacesForExperiment(experimentID)
	return matches
}
// UnlinkWorkspace removes the workspace/experiment link for workspacePath by
// delegating to the workspace metadata manager. The manager's error, if any,
// is returned unchanged.
func (sm *ServiceManager) UnlinkWorkspace(workspacePath string) error {
	err := sm.workspaceMetadataMgr.UnlinkWorkspace(workspacePath)
	return err
}
// ClearAllMetadata asks the workspace metadata manager to drop all workspace
// metadata. It exists for test isolation. The manager's error, if any, is
// returned unchanged.
func (sm *ServiceManager) ClearAllMetadata() error {
	err := sm.workspaceMetadataMgr.ClearAllMetadata()
	return err
}
// SetAutoSync turns auto-sync on or off for workspacePath and forwards the
// requested interval, delegating to the workspace metadata manager. The
// manager's error, if any, is returned unchanged.
func (sm *ServiceManager) SetAutoSync(
	workspacePath string,
	enabled bool,
	interval time.Duration,
) error {
	err := sm.workspaceMetadataMgr.SetAutoSync(workspacePath, enabled, interval)
	return err
}
// AddTag attaches tag to the metadata of workspacePath by delegating to the
// workspace metadata manager. The manager's error, if any, is returned
// unchanged.
func (sm *ServiceManager) AddTag(workspacePath, tag string) error {
	err := sm.workspaceMetadataMgr.AddTag(workspacePath, tag)
	return err
}
// Close shuts the manager down by calling StopService for every entry in
// sm.services whose Status is "running". Stop failures are logged as warnings
// and do not abort the loop; Close always returns nil.
func (sm *ServiceManager) Close(ctx context.Context) error {
	for _, svc := range sm.services {
		// Only running services need to be stopped.
		if svc.Status != serviceStatusRunning {
			continue
		}

		err := sm.StopService(ctx, svc.ID)
		if err == nil {
			continue
		}
		sm.logger.Warn("failed to stop service during cleanup",
			"service_id", svc.ID, "error", err)
	}
	return nil
}