fetch_ml/internal/jupyter/service_manager.go
Jeremie Fraeys cd5640ebd2 Slim and secure: move scripts, clean configs, remove secrets
- Move ci-test.sh and setup.sh to scripts/
- Trim docs/src/zig-cli.md to current structure
- Replace hardcoded secrets with placeholders in configs
- Update .gitignore to block .env*, secrets/, keys, build artifacts
- Slim README.md to reflect current CLI/TUI split
- Add cleanup trap to ci-test.sh
- Ensure no secrets are committed
2025-12-07 13:57:51 -05:00

575 lines
17 KiB
Go

package jupyter
import (
	"context"
	"encoding/json"
	"fmt"
	"net/url"
	"os"
	"path/filepath"
	"strings"
	"time"

	"github.com/jfraeys/fetch_ml/internal/container"
	"github.com/jfraeys/fetch_ml/internal/logging"
)
const (
// serviceStatusRunning is the status string stored on a JupyterService while
// it is active. It is also the value the container status reported by podman
// is compared against when deciding whether a container is up.
serviceStatusRunning = "running"
)
// ServiceManager manages standalone Jupyter services that run as podman
// containers. It keeps an in-memory registry of services keyed by service ID
// and persists that registry as JSON through saveServices.
//
// NOTE(review): the services map is read and written without any locking in
// this file; confirm that callers serialize access before sharing a manager
// between goroutines.
type ServiceManager struct {
// logger receives warnings and lifecycle events.
logger *logging.Logger
// podman starts, stops, removes and inspects the backing containers.
podman *container.PodmanManager
// config supplies the defaults and the service limit applied to start requests.
config *ServiceConfig
// services is the registry of known services keyed by service ID.
services map[string]*JupyterService
// workspaceMetadataMgr stores the links between workspaces and experiments.
workspaceMetadataMgr *WorkspaceMetadataManager
}
// ServiceConfig holds configuration for Jupyter services. The manager reads
// it for request defaults and for the service limit, and a copy of it is
// stored on every JupyterService record when the service is started.
type ServiceConfig struct {
// DefaultImage is the container image used when a start request names none.
DefaultImage string `json:"default_image"`
// DefaultPort is the host port used when a start request sets no host port.
DefaultPort int `json:"default_port"`
// DefaultWorkspace is the workspace path used when a start request names none.
DefaultWorkspace string `json:"default_workspace"`
// MaxServices caps the number of registered services; StartService refuses
// to start another once the registry holds this many entries.
MaxServices int `json:"max_services"`
// DefaultResources is not read by the code visible in this file.
// NOTE(review): presumably fallback resource limits — confirm where applied.
DefaultResources ResourceConfig `json:"default_resources"`
// SecuritySettings is not read by the code visible in this file.
SecuritySettings SecurityConfig `json:"security_settings"`
// NetworkConfig is not read by the code visible in this file.
NetworkConfig NetworkConfig `json:"network_config"`
}
// NetworkConfig defines network settings for Jupyter containers.
type NetworkConfig struct {
// HostPort is the host side of the published port and the port used in the
// service URL. Zero means "use the configured default".
HostPort int `json:"host_port"`
// ContainerPort is the port Jupyter listens on inside the container; request
// validation sets it to 8888 when it is zero.
ContainerPort int `json:"container_port"`
// BindAddress is not read by the code visible in this file.
BindAddress string `json:"bind_address"`
// EnableToken turns token authentication on. When it is false the notebook
// is started with an explicitly empty token.
EnableToken bool `json:"enable_token"`
// Token is passed to the container as JUPYTER_TOKEN and appended to the
// service URL, but only when EnableToken is set and Token is non-empty.
Token string `json:"token"`
// EnablePassword, together with a non-empty Password, causes the password
// to be passed to the container as JUPYTER_PASSWORD.
EnablePassword bool `json:"enable_password"`
// Password is the value exported as JUPYTER_PASSWORD (see EnablePassword).
Password string `json:"password"`
// AllowRemote is not read by the code visible in this file.
AllowRemote bool `json:"allow_remote"`
// NetworkName is not read by the code visible in this file.
NetworkName string `json:"network_name"`
}
// ResourceConfig defines resource limits for Jupyter containers. The values
// are copied unchanged into the container configuration handed to podman.
type ResourceConfig struct {
// MemoryLimit is forwarded as the container memory limit.
// NOTE(review): the expected format (e.g. "2g") is not visible here — confirm.
MemoryLimit string `json:"memory_limit"`
// CPULimit is forwarded as the container CPU limit.
// NOTE(review): the expected format is not visible here — confirm.
CPULimit string `json:"cpu_limit"`
// GPUAccess is forwarded as the container's GPU access flag.
GPUAccess bool `json:"gpu_access"`
}
// SecurityConfig holds security settings for Jupyter services. Only
// AllowNetwork, ReadOnlyRoot and DropCapabilities are consumed by the code
// visible in this file; the remaining fields are carried but not read here.
type SecurityConfig struct {
// AllowNetwork is copied into the container's network configuration.
AllowNetwork bool `json:"allow_network"`
// AllowedHosts is not read by the code visible in this file.
AllowedHosts []string `json:"allowed_hosts"`
// BlockedHosts is not read by the code visible in this file.
BlockedHosts []string `json:"blocked_hosts"`
// EnableFirewall is not read by the code visible in this file.
EnableFirewall bool `json:"enable_firewall"`
// TrustedChannels is not read by the code visible in this file.
TrustedChannels []string `json:"trusted_channels"`
// BlockedPackages is not read by the code visible in this file.
BlockedPackages []string `json:"blocked_packages"`
// AllowedPackages is not read by the code visible in this file.
AllowedPackages map[string]bool `json:"allowed_packages"`
// RequireApproval is not read by the code visible in this file.
RequireApproval bool `json:"require_approval"`
// ReadOnlyRoot adds the "--read-only" security option to the container.
ReadOnlyRoot bool `json:"read_only_root"`
// DropCapabilities adds one "--cap-drop=<name>" security option per entry.
DropCapabilities []string `json:"drop_capabilities"`
// RunAsNonRoot is not read by the code visible in this file.
RunAsNonRoot bool `json:"run_as_non_root"`
// EnableSeccomp is not read by the code visible in this file.
EnableSeccomp bool `json:"enable_seccomp"`
// NoNewPrivileges is not read by the code visible in this file.
NoNewPrivileges bool `json:"no_new_privileges"`
}
// JupyterService represents a Jupyter instance tracked by the ServiceManager.
// Records are kept in the manager's registry and serialized to JSON when the
// registry is saved.
type JupyterService struct {
// ID is the generated service ID; it is the registry key and is also used as
// the container name.
ID string `json:"id"`
// Name is the human-readable name taken from the start request.
Name string `json:"name"`
// Status is "running" when the record is created; it is later set to
// "stopped" or to whatever status podman reports for the container.
Status string `json:"status"`
// ContainerID is the identifier returned by podman when the container started.
ContainerID string `json:"container_id"`
// Port is the host port the service is published on.
Port int `json:"port"`
// Workspace is the host path mounted into the container at /workspace.
Workspace string `json:"workspace"`
// Image is the container image the service was started from.
Image string `json:"image"`
// URL is the address built for the service: http://localhost:<Port>, with a
// token query parameter when token authentication is enabled.
URL string `json:"url"`
// CreatedAt is the time the record was created.
CreatedAt time.Time `json:"created_at"`
// LastAccess is refreshed when the service is looked up, stopped, or its
// status changes.
LastAccess time.Time `json:"last_access"`
// Config is a copy of the manager's ServiceConfig taken at start time.
Config ServiceConfig `json:"config"`
// Environment holds the extra environment variables from the start request.
Environment map[string]string `json:"environment"`
// Metadata holds the request metadata; "experiment_id" and "linked_at" are
// added when the workspace is linked to an experiment.
Metadata map[string]string `json:"metadata"`
}
// StartRequest defines parameters for starting a Jupyter service. Request
// validation fills empty fields in place from the manager's configuration, so
// a request may be modified by the call that consumes it.
type StartRequest struct {
// Name is required; it is used to derive the service ID.
Name string `json:"name"`
// Workspace is the host directory to mount at /workspace. Empty means the
// configured default workspace.
Workspace string `json:"workspace"`
// Image is the container image to run. Empty means the configured default.
Image string `json:"image"`
// Port is a top-level port field.
// NOTE(review): the published port mapping is built from Network.HostPort —
// confirm how this field is meant to interact with it.
Port int `json:"port"`
// Resources is forwarded to the container as its resource limits.
Resources ResourceConfig `json:"resources"`
// Security selects the container security options (see SecurityConfig).
Security SecurityConfig `json:"security"`
// Network selects ports and authentication (see NetworkConfig).
Network NetworkConfig `json:"network"`
// Environment holds extra environment variables. They are applied after the
// built-in ones, so an entry with the same key replaces the built-in value.
Environment map[string]string `json:"environment"`
// Metadata is stored on the resulting service record.
Metadata map[string]string `json:"metadata"`
}
// NewServiceManager creates a Jupyter service manager backed by podman.
//
// It constructs the podman manager, attaches a workspace metadata manager
// whose data file is fetch_ml_jupyter_workspaces.json under os.TempDir(), and
// then tries to restore previously persisted services. A failure to restore is
// logged as a warning and does not fail construction.
//
// A nil config is replaced with an empty ServiceConfig so that later calls
// fail with ordinary errors instead of a nil-pointer panic.
//
// An error is returned only when the podman manager cannot be created.
func NewServiceManager(logger *logging.Logger, config *ServiceConfig) (*ServiceManager, error) {
	podman, err := container.NewPodmanManager(logger)
	if err != nil {
		return nil, fmt.Errorf("failed to create podman manager: %w", err)
	}

	// StartService and validateStartRequest dereference the config
	// unconditionally, so never store a nil pointer.
	if config == nil {
		config = &ServiceConfig{}
	}

	// Initialize workspace metadata manager
	dataFile := filepath.Join(os.TempDir(), "fetch_ml_jupyter_workspaces.json")
	workspaceMetadataMgr := NewWorkspaceMetadataManager(logger, dataFile)

	sm := &ServiceManager{
		logger:               logger,
		podman:               podman,
		config:               config,
		services:             make(map[string]*JupyterService),
		workspaceMetadataMgr: workspaceMetadataMgr,
	}

	// Restoring earlier services is best effort: a missing or unreadable
	// registry must not prevent the manager from being used.
	if err := sm.loadServices(); err != nil {
		logger.Warn("failed to load existing services", "error", err)
	}

	return sm, nil
}
// StartService starts a new Jupyter service.
//
// The request is validated (which also fills in defaults in place), the
// service limit is checked, a container is started through podman, and the
// call then blocks until the container is reported ready. On success the
// service is registered, annotated with experiment metadata when its
// workspace is linked, and the registry is saved (a save failure is only
// logged).
//
// If the container starts but never becomes ready, it is stopped and removed
// before the error is returned. That cleanup uses its own short-lived context
// because ctx is often the very thing that was cancelled.
//
// The returned service owns a private copy of req.Metadata, so annotating it
// neither panics on a nil map nor modifies the caller's request.
func (sm *ServiceManager) StartService(ctx context.Context, req *StartRequest) (*JupyterService, error) {
	if req == nil {
		return nil, fmt.Errorf("start request is required")
	}

	// Validate request
	if err := sm.validateStartRequest(req); err != nil {
		return nil, err
	}

	// Check service limit
	if len(sm.services) >= sm.config.MaxServices {
		return nil, fmt.Errorf("maximum number of services (%d) reached", sm.config.MaxServices)
	}

	serviceID := sm.generateServiceID(req.Name)
	containerConfig := sm.prepareContainerConfig(serviceID, req)

	// Start container
	containerID, err := sm.podman.StartContainer(ctx, containerConfig)
	if err != nil {
		return nil, fmt.Errorf("failed to start container: %w", err)
	}

	// Wait for Jupyter to be ready
	serviceURL, err := sm.waitForJupyterReady(ctx, containerID, req.Network)
	if err != nil {
		// Do not reuse ctx here: if it was cancelled or timed out, the
		// cleanup calls would fail immediately and leak the container.
		cleanupCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
		defer cancel()
		if stopErr := sm.podman.StopContainer(cleanupCtx, containerID); stopErr != nil {
			sm.logger.Warn("failed to stop container after startup failure",
				"container_id", containerID, "error", stopErr)
		}
		if rmErr := sm.podman.RemoveContainer(cleanupCtx, containerID); rmErr != nil {
			sm.logger.Warn("failed to remove container after startup failure",
				"container_id", containerID, "error", rmErr)
		}
		return nil, fmt.Errorf("jupyter failed to start: %w", err)
	}

	// Copy the metadata: it is written to below, and req.Metadata may be nil.
	metadata := make(map[string]string, len(req.Metadata)+2)
	for k, v := range req.Metadata {
		metadata[k] = v
	}

	now := time.Now()
	service := &JupyterService{
		ID:          serviceID,
		Name:        req.Name,
		Status:      serviceStatusRunning,
		ContainerID: containerID,
		Port:        req.Network.HostPort,
		Workspace:   req.Workspace,
		Image:       req.Image,
		URL:         serviceURL,
		CreatedAt:   now,
		LastAccess:  now,
		Config:      *sm.config,
		Environment: req.Environment,
		Metadata:    metadata,
	}

	// Store service
	sm.services[serviceID] = service

	// Check if workspace is linked with an experiment
	if workspaceMeta, err := sm.workspaceMetadataMgr.GetWorkspaceMetadata(req.Workspace); err == nil {
		service.Metadata["experiment_id"] = workspaceMeta.ExperimentID
		service.Metadata["linked_at"] = fmt.Sprintf("%d", workspaceMeta.LinkedAt.Unix())
		sm.logger.Info("service started with linked experiment",
			"service_id", serviceID,
			"experiment_id", workspaceMeta.ExperimentID)
	}

	// Persisting is best effort; the service is already running.
	if err := sm.saveServices(); err != nil {
		sm.logger.Warn("failed to save services", "error", err)
	}

	sm.logger.Info("jupyter service started",
		"service_id", serviceID,
		"name", req.Name,
		"url", serviceURL,
		"workspace", req.Workspace)

	return service, nil
}
// StopService stops the container behind a service, removes it, and drops the
// service from the registry.
//
// Stopping and removing the container are best effort: failures are logged as
// warnings and the service is still unregistered. The only error returned is
// for an unknown service ID.
func (sm *ServiceManager) StopService(ctx context.Context, serviceID string) error {
	svc, ok := sm.services[serviceID]
	if !ok {
		return fmt.Errorf("service %s not found", serviceID)
	}

	stopErr := sm.podman.StopContainer(ctx, svc.ContainerID)
	if stopErr != nil {
		sm.logger.Warn("failed to stop container", "service_id", serviceID, "error", stopErr)
	}

	removeErr := sm.podman.RemoveContainer(ctx, svc.ContainerID)
	if removeErr != nil {
		sm.logger.Warn("failed to remove container", "service_id", serviceID, "error", removeErr)
	}

	// Callers that still hold the record see its final state.
	svc.Status = "stopped"
	svc.LastAccess = time.Now()

	delete(sm.services, serviceID)

	saveErr := sm.saveServices()
	if saveErr != nil {
		sm.logger.Warn("failed to save services", "error", saveErr)
	}

	sm.logger.Info("jupyter service stopped", "service_id", serviceID, "name", svc.Name)
	return nil
}
// GetService looks up a service by ID and returns the registry's own record
// (not a copy). A successful lookup refreshes the record's LastAccess time;
// an unknown ID yields an error.
func (sm *ServiceManager) GetService(serviceID string) (*JupyterService, error) {
	if svc, ok := sm.services[serviceID]; ok {
		svc.LastAccess = time.Now()
		return svc, nil
	}
	return nil, fmt.Errorf("service %s not found", serviceID)
}
// ListServices returns every registered service in a freshly allocated slice.
// The slice holds the registry's own records, and its order is unspecified
// because it follows map iteration.
func (sm *ServiceManager) ListServices() []*JupyterService {
	out := make([]*JupyterService, len(sm.services))
	i := 0
	for _, svc := range sm.services {
		out[i] = svc
		i++
	}
	return out
}
// GetServiceStatus asks podman for the current status of a service's
// container and returns it.
//
// When the reported status differs from the recorded one, the record is
// updated, its LastAccess time is refreshed, and the registry is saved. A
// failure to save is logged as a warning, like every other save in this type,
// and does not affect the result.
//
// It returns an error for an unknown service ID, and ("unknown", err) when
// podman cannot report the status.
func (sm *ServiceManager) GetServiceStatus(ctx context.Context, serviceID string) (string, error) {
	service, exists := sm.services[serviceID]
	if !exists {
		return "", fmt.Errorf("service %s not found", serviceID)
	}

	// Check container status
	status, err := sm.podman.GetContainerStatus(ctx, service.ContainerID)
	if err != nil {
		sm.logger.Warn("failed to get container status", "service_id", serviceID, "error", err)
		return "unknown", err
	}

	// Update service status if different
	if service.Status != status {
		service.Status = status
		service.LastAccess = time.Now()
		if err := sm.saveServices(); err != nil {
			sm.logger.Warn("failed to save services", "error", err)
		}
	}

	return status, nil
}
// validateStartRequest checks a start request and fills in its defaults.
//
// The request is modified in place:
//   - an empty Workspace becomes the configured default workspace;
//   - an empty Image becomes the configured default image;
//   - a zero Network.HostPort becomes req.Port when that is set, otherwise the
//     configured default port;
//   - a zero Network.ContainerPort becomes 8888.
//
// It returns an error when the request is nil, the name is empty, the
// workspace does not exist or cannot be inspected, a port is outside 0-65535,
// or the host port is already used by a running service.
func (sm *ServiceManager) validateStartRequest(req *StartRequest) error {
	if req == nil {
		return fmt.Errorf("start request is required")
	}
	if req.Name == "" {
		return fmt.Errorf("service name is required")
	}

	if req.Workspace == "" {
		req.Workspace = sm.config.DefaultWorkspace
	}

	// Report every stat failure: a permission error, for example, would
	// otherwise surface later as an obscure container mount failure.
	if _, err := os.Stat(req.Workspace); err != nil {
		if os.IsNotExist(err) {
			return fmt.Errorf("workspace %s does not exist", req.Workspace)
		}
		return fmt.Errorf("failed to access workspace %s: %w", req.Workspace, err)
	}

	if req.Image == "" {
		req.Image = sm.config.DefaultImage
	}

	if req.Network.HostPort == 0 {
		// Honour the request's top-level Port before the configured default.
		if req.Port != 0 {
			req.Network.HostPort = req.Port
		} else {
			req.Network.HostPort = sm.config.DefaultPort
		}
	}
	if req.Network.ContainerPort == 0 {
		req.Network.ContainerPort = 8888
	}

	if req.Network.HostPort < 0 || req.Network.HostPort > 65535 {
		return fmt.Errorf("host port %d is out of range", req.Network.HostPort)
	}
	if req.Network.ContainerPort < 0 || req.Network.ContainerPort > 65535 {
		return fmt.Errorf("container port %d is out of range", req.Network.ContainerPort)
	}

	// Only running services hold their port.
	for _, service := range sm.services {
		if service.Port == req.Network.HostPort && service.Status == serviceStatusRunning {
			return fmt.Errorf("port %d is already in use by service %s", req.Network.HostPort, service.Name)
		}
	}

	return nil
}
// generateServiceID builds a service ID of the form
// "jupyter-<sanitized name>-<unix seconds>".
//
// The ID doubles as the container name, so the name is lower-cased and every
// character other than a-z, 0-9, '-', '_' and '.' is replaced with '-'. Names
// made only of letters, digits, spaces and those punctuation characters yield
// the same ID as before.
//
// NOTE(review): the timestamp has one-second resolution, so two services with
// the same name started within the same second receive the same ID.
func (sm *ServiceManager) generateServiceID(name string) string {
	sanitizedName := strings.Map(func(r rune) rune {
		switch {
		case r >= 'a' && r <= 'z', r >= '0' && r <= '9', r == '-', r == '_', r == '.':
			return r
		default:
			return '-'
		}
	}, strings.ToLower(name))

	return fmt.Sprintf("jupyter-%s-%d", sanitizedName, time.Now().Unix())
}
// prepareContainerConfig translates a validated start request into the
// container configuration handed to podman.
//
// The workspace is mounted at /workspace, the host port is mapped to the
// container port, and Jupyter is launched through conda's ml_env environment.
// JUPYTER_TOKEN is always set: to the request token when token auth is enabled,
// otherwise to the empty string. Request-supplied environment variables are
// applied last and therefore override the built-in ones.
func (sm *ServiceManager) prepareContainerConfig(serviceID string, req *StartRequest) *container.ContainerConfig {
	netCfg := req.Network

	// An enabled-but-empty token and a disabled token both export "".
	token := ""
	if netCfg.EnableToken {
		token = netCfg.Token
	}

	env := map[string]string{
		"JUPYTER_ENABLE_LAB": "yes",
		"JUPYTER_TOKEN":      token,
	}
	if netCfg.EnablePassword && netCfg.Password != "" {
		env["JUPYTER_PASSWORD"] = netCfg.Password
	}
	for key, value := range req.Environment {
		env[key] = value
	}

	cmd := []string{
		"conda", "run", "-n", "ml_env", "jupyter", "notebook",
		"--no-browser",
		"--ip=0.0.0.0",
		fmt.Sprintf("--port=%d", netCfg.ContainerPort),
		"--NotebookApp.allow-root=True",
		"--NotebookApp.ip=0.0.0.0",
	}
	if !netCfg.EnableToken {
		// Explicitly blank the token so the notebook does not generate one.
		cmd = append(cmd, "--NotebookApp.token=")
	}

	// Kept non-nil even when no option applies.
	securityOpts := make([]string, 0, len(req.Security.DropCapabilities)+1)
	if req.Security.ReadOnlyRoot {
		securityOpts = append(securityOpts, "--read-only")
	}
	for _, capability := range req.Security.DropCapabilities {
		securityOpts = append(securityOpts, fmt.Sprintf("--cap-drop=%s", capability))
	}

	cfg := &container.ContainerConfig{
		Name:         serviceID,
		Image:        req.Image,
		Command:      cmd,
		Env:          env,
		Volumes:      map[string]string{req.Workspace: "/workspace"},
		Ports:        map[int]int{netCfg.HostPort: netCfg.ContainerPort},
		SecurityOpts: securityOpts,
		Resources: container.ResourceConfig{
			MemoryLimit: req.Resources.MemoryLimit,
			CPULimit:    req.Resources.CPULimit,
			GPUAccess:   req.Resources.GPUAccess,
		},
		Network: container.NetworkConfig{
			AllowNetwork: req.Security.AllowNetwork,
		},
	}
	return cfg
}
// waitForJupyterReady blocks until the container is running and returns the
// URL under which the service can be reached.
//
// The container status is polled every 2 seconds for up to 60 seconds. The
// call fails when podman cannot report the status, when the container reports
// "exited" or "error", when ctx is done, or when the deadline passes without
// the container reaching the running state. Once the container is running, a
// further 5 seconds are allowed for Jupyter to initialize; that wait is also
// abandoned when ctx is done.
//
// The URL is http://localhost:<host port>, with the query-escaped token
// appended when token authentication is enabled and a token is set.
//
// NOTE(review): readiness is inferred from the container state plus a fixed
// delay; the Jupyter HTTP endpoint itself is not probed.
func (sm *ServiceManager) waitForJupyterReady(
	ctx context.Context,
	containerID string,
	networkConfig NetworkConfig,
) (string, error) {
	const (
		maxWait      = 60 * time.Second
		pollInterval = 2 * time.Second
		settleDelay  = 5 * time.Second
	)

	deadline := time.Now().Add(maxWait)
	for {
		status, err := sm.podman.GetContainerStatus(ctx, containerID)
		if err != nil {
			return "", fmt.Errorf("failed to check container status: %w", err)
		}
		if status == serviceStatusRunning {
			break
		}
		if status == "exited" || status == "error" {
			return "", fmt.Errorf("container failed to start (status: %s)", status)
		}
		// Running out of time is a failure, not a reason to assume success.
		if !time.Now().Before(deadline) {
			return "", fmt.Errorf("timed out after %s waiting for container to start (last status: %s)",
				maxWait, status)
		}

		select {
		case <-ctx.Done():
			return "", ctx.Err()
		case <-time.After(pollInterval):
		}
	}

	// Wait a bit more for Jupyter to initialize, but stay cancellable.
	select {
	case <-ctx.Done():
		return "", ctx.Err()
	case <-time.After(settleDelay):
	}

	serviceURL := fmt.Sprintf("http://localhost:%d", networkConfig.HostPort)
	if networkConfig.EnableToken && networkConfig.Token != "" {
		// Escape the token so characters such as '&', '#' or '+' survive.
		serviceURL += "?token=" + url.QueryEscape(networkConfig.Token)
	}

	return serviceURL, nil
}
// loadServices restores the service registry from
// fetch_ml_jupyter_services.json under os.TempDir().
//
// A missing file is not an error. Every record that was persisted as running
// is re-checked against podman with a 5-second timeout; if the check fails or
// the container is no longer running, the record is marked "stopped". Null
// entries in the file are skipped with a warning instead of being
// dereferenced.
//
// It returns an error when the file cannot be read or parsed.
func (sm *ServiceManager) loadServices() error {
	servicesFile := filepath.Join(os.TempDir(), "fetch_ml_jupyter_services.json")

	data, err := os.ReadFile(servicesFile)
	if err != nil {
		if os.IsNotExist(err) {
			return nil // No existing services
		}
		return fmt.Errorf("reading %s: %w", servicesFile, err)
	}

	var services map[string]*JupyterService
	if err := json.Unmarshal(data, &services); err != nil {
		return fmt.Errorf("parsing %s: %w", servicesFile, err)
	}

	if sm.services == nil {
		sm.services = make(map[string]*JupyterService, len(services))
	}

	for id, service := range services {
		// A JSON null value decodes to a nil pointer.
		if service == nil {
			sm.logger.Warn("skipping empty service record", "service_id", id)
			continue
		}

		// Validate services are still running
		if service.Status == serviceStatusRunning {
			ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
			status, err := sm.podman.GetContainerStatus(ctx, service.ContainerID)
			cancel()
			if err != nil || status != serviceStatusRunning {
				service.Status = "stopped"
			}
		}
		sm.services[id] = service
	}

	return nil
}
// saveServices writes the service registry as indented JSON to
// fetch_ml_jupyter_services.json under os.TempDir().
//
// The data is written to a temporary file in the same directory and then
// renamed over the target, so an interrupted write cannot leave a truncated
// registry behind. The temporary file is created by os.CreateTemp, which makes
// it readable and writable by the owner only.
//
// It returns an error when encoding, writing or renaming fails; the temporary
// file is removed in that case.
func (sm *ServiceManager) saveServices() error {
	servicesFile := filepath.Join(os.TempDir(), "fetch_ml_jupyter_services.json")

	data, err := json.MarshalIndent(sm.services, "", " ")
	if err != nil {
		return err
	}

	tmp, err := os.CreateTemp(filepath.Dir(servicesFile), filepath.Base(servicesFile)+".*.tmp")
	if err != nil {
		return err
	}
	tmpName := tmp.Name()

	if _, err := tmp.Write(data); err != nil {
		// Best-effort cleanup; the write error is the one worth reporting.
		_ = tmp.Close()
		_ = os.Remove(tmpName)
		return err
	}
	if err := tmp.Close(); err != nil {
		_ = os.Remove(tmpName)
		return err
	}
	if err := os.Rename(tmpName, servicesFile); err != nil {
		_ = os.Remove(tmpName)
		return err
	}

	return nil
}
// LinkWorkspaceWithExperiment links a workspace with an experiment. It is a
// thin delegate: the workspace path, experiment ID and service ID are passed
// unchanged to the workspace metadata manager's LinkWorkspace, and that call's
// error is returned as is.
func (sm *ServiceManager) LinkWorkspaceWithExperiment(workspacePath, experimentID, serviceID string) error {
return sm.workspaceMetadataMgr.LinkWorkspace(workspacePath, experimentID, serviceID)
}
// GetWorkspaceMetadata retrieves metadata for a workspace by delegating to the
// workspace metadata manager; both the metadata and the error are returned
// exactly as that manager produced them.
func (sm *ServiceManager) GetWorkspaceMetadata(workspacePath string) (*WorkspaceMetadata, error) {
return sm.workspaceMetadataMgr.GetWorkspaceMetadata(workspacePath)
}
// SyncWorkspaceWithExperiment records a synchronization between a workspace
// and an experiment.
//
// This is currently a placeholder: it only stamps the sync time in the
// workspace metadata and logs completion. No data is transferred, the context
// is unused, and the function always returns nil; a failure to update the
// sync time is logged as a warning.
func (sm *ServiceManager) SyncWorkspaceWithExperiment(
	_ context.Context,
	workspacePath,
	experimentID,
	direction string,
) error {
	updateErr := sm.workspaceMetadataMgr.UpdateSyncTime(workspacePath, direction)
	if updateErr != nil {
		sm.logger.Warn("failed to update sync time", "error", updateErr)
	}

	// A real implementation would move data here:
	//   "pull" - download experiment data/metrics into the workspace
	//   "push" - upload workspace notebooks/results to the experiment
	sm.logger.Info(
		"workspace sync completed",
		"workspace", workspacePath,
		"experiment_id", experimentID,
		"direction", direction,
	)

	return nil
}
// ListLinkedWorkspaces returns all linked workspaces, exactly as reported by
// the workspace metadata manager.
func (sm *ServiceManager) ListLinkedWorkspaces() []*WorkspaceMetadata {
return sm.workspaceMetadataMgr.ListLinkedWorkspaces()
}
// GetWorkspacesForExperiment returns all workspaces linked to the given
// experiment ID, as reported by the workspace metadata manager.
func (sm *ServiceManager) GetWorkspacesForExperiment(experimentID string) []*WorkspaceMetadata {
return sm.workspaceMetadataMgr.GetWorkspacesForExperiment(experimentID)
}
// UnlinkWorkspace removes the link between a workspace and its experiment by
// delegating to the workspace metadata manager and returning its error.
func (sm *ServiceManager) UnlinkWorkspace(workspacePath string) error {
return sm.workspaceMetadataMgr.UnlinkWorkspace(workspacePath)
}
// ClearAllMetadata clears all workspace metadata (used for test isolation).
// It delegates to the workspace metadata manager and returns its error; the
// service registry itself is not touched.
func (sm *ServiceManager) ClearAllMetadata() error {
return sm.workspaceMetadataMgr.ClearAllMetadata()
}
// SetAutoSync enables or disables auto-sync for a workspace. The enabled flag
// and interval are passed unchanged to the workspace metadata manager, whose
// error is returned.
// NOTE(review): nothing in this file schedules a sync based on the interval —
// confirm where it is consumed.
func (sm *ServiceManager) SetAutoSync(workspacePath string, enabled bool, interval time.Duration) error {
return sm.workspaceMetadataMgr.SetAutoSync(workspacePath, enabled, interval)
}
// AddTag adds a tag to a workspace's metadata by delegating to the workspace
// metadata manager and returning its error.
func (sm *ServiceManager) AddTag(workspacePath, tag string) error {
return sm.workspaceMetadataMgr.AddTag(workspacePath, tag)
}
// Close shuts the manager down by stopping every service that is currently
// recorded as running. Services in any other state are left alone. A failure
// to stop a service is logged as a warning, and Close always returns nil.
func (sm *ServiceManager) Close(ctx context.Context) error {
	// StopService deletes from the map being ranged over; Go permits deleting
	// entries during iteration.
	for _, svc := range sm.services {
		if svc.Status != serviceStatusRunning {
			continue
		}
		err := sm.StopService(ctx, svc.ID)
		if err == nil {
			continue
		}
		sm.logger.Warn("failed to stop service during cleanup",
			"service_id", svc.ID, "error", err)
	}
	return nil
}