- Move ci-test.sh and setup.sh to scripts/ - Trim docs/src/zig-cli.md to current structure - Replace hardcoded secrets with placeholders in configs - Update .gitignore to block .env*, secrets/, keys, build artifacts - Slim README.md to reflect current CLI/TUI split - Add cleanup trap to ci-test.sh - Ensure no secrets are committed
575 lines
17 KiB
Go
575 lines
17 KiB
Go
package jupyter
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/jfraeys/fetch_ml/internal/container"
|
|
"github.com/jfraeys/fetch_ml/internal/logging"
|
|
)
|
|
|
|
const (
	// serviceStatusRunning is the status string this file treats as "running":
	// StartService stores it on new service records, and it is compared against
	// the status reported by podman.GetContainerStatus.
	serviceStatusRunning = "running"
)
|
|
|
|
// ServiceManager manages standalone Jupyter services. It starts and stops
// containers through podman, keeps an in-memory registry of services keyed by
// service ID, and persists that registry as JSON (see saveServices and
// loadServices).
//
// NOTE(review): the struct holds no mutex and the methods in this file read and
// write the services map without locking — confirm that callers serialize access.
type ServiceManager struct {
	// logger receives the Info/Warn messages emitted by the manager.
	logger *logging.Logger
	// podman starts, stops, removes and inspects the backing containers.
	podman *container.PodmanManager
	// config supplies request defaults and the MaxServices limit.
	config *ServiceConfig
	// services is the registry of known services, keyed by service ID.
	services map[string]*JupyterService
	// workspaceMetadataMgr backs the workspace/experiment linking methods.
	workspaceMetadataMgr *WorkspaceMetadataManager
}
|
|
|
|
// ServiceConfig holds configuration for Jupyter services. validateStartRequest
// uses the Default* scalar fields to fill in blank request values, and
// StartService enforces MaxServices.
type ServiceConfig struct {
	// DefaultImage is used when a start request leaves Image empty.
	DefaultImage string `json:"default_image"`
	// DefaultPort is used as the host port when a request's Network.HostPort is 0.
	DefaultPort int `json:"default_port"`
	// DefaultWorkspace is used when a start request leaves Workspace empty.
	DefaultWorkspace string `json:"default_workspace"`
	// MaxServices caps the number of entries in the manager's service registry;
	// StartService refuses to start once len(services) reaches it.
	MaxServices int `json:"max_services"`
	// The following nested settings are not read by the visible code in this
	// file — presumably consumed elsewhere; TODO confirm.
	DefaultResources ResourceConfig `json:"default_resources"`
	SecuritySettings SecurityConfig `json:"security_settings"`
	NetworkConfig    NetworkConfig  `json:"network_config"`
}
|
|
|
|
// NetworkConfig defines network settings for Jupyter containers.
type NetworkConfig struct {
	// HostPort is the host side of the published port mapping; it is also
	// recorded as the service's Port and used to build the service URL.
	HostPort int `json:"host_port"`
	// ContainerPort is the port Jupyter listens on inside the container;
	// validateStartRequest defaults it to 8888 when 0.
	ContainerPort int `json:"container_port"`
	// BindAddress is not read by the visible code in this file.
	BindAddress string `json:"bind_address"`
	// EnableToken turns token authentication on; when false the container
	// command gets an explicit empty "--NotebookApp.token=" argument.
	EnableToken bool `json:"enable_token"`
	// Token is exported as JUPYTER_TOKEN and appended to the service URL when
	// EnableToken is set and Token is non-empty.
	Token string `json:"token"`
	// EnablePassword, together with a non-empty Password, exports
	// JUPYTER_PASSWORD into the container environment.
	EnablePassword bool `json:"enable_password"`
	// Password is the value exported as JUPYTER_PASSWORD (see EnablePassword).
	Password string `json:"password"`
	// AllowRemote is not read by the visible code in this file.
	AllowRemote bool `json:"allow_remote"`
	// NetworkName is not read by the visible code in this file.
	NetworkName string `json:"network_name"`
}
|
|
|
|
// ResourceConfig defines resource limits for Jupyter containers. The three
// fields are copied one-to-one into container.ResourceConfig by
// prepareContainerConfig; the accepted formats of the limit strings are
// defined by the container package, not here.
type ResourceConfig struct {
	MemoryLimit string `json:"memory_limit"`
	CPULimit    string `json:"cpu_limit"`
	GPUAccess   bool   `json:"gpu_access"`
}
|
|
|
|
// SecurityConfig holds security settings for Jupyter services.
//
// Only AllowNetwork, ReadOnlyRoot and DropCapabilities are read by the visible
// code in this file (in prepareContainerConfig). The other fields are carried
// and serialized but not acted on here — NOTE(review): confirm where, if
// anywhere, they are enforced.
type SecurityConfig struct {
	// AllowNetwork is forwarded to container.NetworkConfig.AllowNetwork.
	AllowNetwork    bool            `json:"allow_network"`
	AllowedHosts    []string        `json:"allowed_hosts"`
	BlockedHosts    []string        `json:"blocked_hosts"`
	EnableFirewall  bool            `json:"enable_firewall"`
	TrustedChannels []string        `json:"trusted_channels"`
	BlockedPackages []string        `json:"blocked_packages"`
	AllowedPackages map[string]bool `json:"allowed_packages"`
	RequireApproval bool            `json:"require_approval"`
	// ReadOnlyRoot adds "--read-only" to the container's security options.
	ReadOnlyRoot bool `json:"read_only_root"`
	// DropCapabilities adds one "--cap-drop=<name>" security option per entry.
	DropCapabilities []string `json:"drop_capabilities"`
	RunAsNonRoot     bool     `json:"run_as_non_root"`
	EnableSeccomp    bool     `json:"enable_seccomp"`
	NoNewPrivileges  bool     `json:"no_new_privileges"`
}
|
|
|
|
// JupyterService represents a running Jupyter instance. Records are created by
// StartService, kept in the manager's registry and serialized to JSON.
type JupyterService struct {
	// ID is the registry key produced by generateServiceID; it is also used as
	// the container name.
	ID string `json:"id"`
	// Name is the caller-supplied service name from the start request.
	Name string `json:"name"`
	// Status is serviceStatusRunning after a successful start; it is later
	// overwritten with the podman-reported status or "stopped".
	Status string `json:"status"`
	// ContainerID is the identifier returned by podman.StartContainer.
	ContainerID string `json:"container_id"`
	// Port is the host port the service was published on.
	Port int `json:"port"`
	// Workspace is the host path mounted at /workspace in the container.
	Workspace string `json:"workspace"`
	// Image is the container image the service was started from.
	Image string `json:"image"`
	// URL is the address returned by waitForJupyterReady (includes the token
	// query parameter when token auth is enabled).
	URL string `json:"url"`
	// CreatedAt is the time the record was created.
	CreatedAt time.Time `json:"created_at"`
	// LastAccess is refreshed by GetService and whenever the status changes.
	LastAccess time.Time `json:"last_access"`
	// Config is a copy of the manager's configuration taken at start time.
	Config ServiceConfig `json:"config"`
	// Environment holds the custom environment variables from the request.
	Environment map[string]string `json:"environment"`
	// Metadata holds the request metadata; StartService adds "experiment_id"
	// and "linked_at" when the workspace is linked to an experiment.
	Metadata map[string]string `json:"metadata"`
}
|
|
|
|
// StartRequest defines parameters for starting a Jupyter service.
// validateStartRequest fills blank fields in place, so a request may be
// modified by the call that consumes it.
type StartRequest struct {
	// Name is required; it is embedded in the generated service ID.
	Name string `json:"name"`
	// Workspace is the host directory to mount; empty means the configured
	// default. The path must exist.
	Workspace string `json:"workspace"`
	// Image is the container image; empty means the configured default.
	Image string `json:"image"`
	// Port is not read by the visible code in this file; the published host
	// port comes from Network.HostPort. NOTE(review): confirm whether this
	// field is still needed.
	Port int `json:"port"`
	// Resources are the container resource limits.
	Resources ResourceConfig `json:"resources"`
	// Security selects the container security options.
	Security SecurityConfig `json:"security"`
	// Network holds port mapping and authentication settings.
	Network NetworkConfig `json:"network"`
	// Environment is merged into the container environment after the built-in
	// variables, so its entries win on key collisions.
	Environment map[string]string `json:"environment"`
	// Metadata is attached to the resulting service record.
	Metadata map[string]string `json:"metadata"`
}
|
|
|
|
// NewServiceManager creates a new Jupyter service manager
|
|
func NewServiceManager(logger *logging.Logger, config *ServiceConfig) (*ServiceManager, error) {
|
|
podman, err := container.NewPodmanManager(logger)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to create podman manager: %w", err)
|
|
}
|
|
|
|
// Initialize workspace metadata manager
|
|
dataFile := filepath.Join(os.TempDir(), "fetch_ml_jupyter_workspaces.json")
|
|
workspaceMetadataMgr := NewWorkspaceMetadataManager(logger, dataFile)
|
|
|
|
sm := &ServiceManager{
|
|
logger: logger,
|
|
podman: podman,
|
|
config: config,
|
|
services: make(map[string]*JupyterService),
|
|
workspaceMetadataMgr: workspaceMetadataMgr,
|
|
}
|
|
|
|
// Load existing services
|
|
if err := sm.loadServices(); err != nil {
|
|
logger.Warn("failed to load existing services", "error", err)
|
|
}
|
|
|
|
return sm, nil
|
|
}
|
|
|
|
// StartService starts a new Jupyter service
|
|
func (sm *ServiceManager) StartService(ctx context.Context, req *StartRequest) (*JupyterService, error) {
|
|
// Validate request
|
|
if err := sm.validateStartRequest(req); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Check service limit
|
|
if len(sm.services) >= sm.config.MaxServices {
|
|
return nil, fmt.Errorf("maximum number of services (%d) reached", sm.config.MaxServices)
|
|
}
|
|
|
|
// Generate service ID
|
|
serviceID := sm.generateServiceID(req.Name)
|
|
|
|
// Prepare container configuration
|
|
containerConfig := sm.prepareContainerConfig(serviceID, req)
|
|
|
|
// Start container
|
|
containerID, err := sm.podman.StartContainer(ctx, containerConfig)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to start container: %w", err)
|
|
}
|
|
|
|
// Wait for Jupyter to be ready
|
|
url, err := sm.waitForJupyterReady(ctx, containerID, req.Network)
|
|
if err != nil {
|
|
// Cleanup on failure
|
|
_ = sm.podman.StopContainer(ctx, containerID)
|
|
return nil, fmt.Errorf("jupyter failed to start: %w", err)
|
|
}
|
|
|
|
// Create service record
|
|
service := &JupyterService{
|
|
ID: serviceID,
|
|
Name: req.Name,
|
|
Status: serviceStatusRunning,
|
|
ContainerID: containerID,
|
|
Port: req.Network.HostPort,
|
|
Workspace: req.Workspace,
|
|
Image: req.Image,
|
|
URL: url,
|
|
CreatedAt: time.Now(),
|
|
LastAccess: time.Now(),
|
|
Config: *sm.config,
|
|
Environment: req.Environment,
|
|
Metadata: req.Metadata,
|
|
}
|
|
|
|
// Store service
|
|
sm.services[serviceID] = service
|
|
|
|
// Check if workspace is linked with an experiment
|
|
if workspaceMeta, err := sm.workspaceMetadataMgr.GetWorkspaceMetadata(req.Workspace); err == nil {
|
|
service.Metadata["experiment_id"] = workspaceMeta.ExperimentID
|
|
service.Metadata["linked_at"] = fmt.Sprintf("%d", workspaceMeta.LinkedAt.Unix())
|
|
sm.logger.Info("service started with linked experiment",
|
|
"service_id", serviceID,
|
|
"experiment_id", workspaceMeta.ExperimentID)
|
|
}
|
|
|
|
// Save services to disk
|
|
if err := sm.saveServices(); err != nil {
|
|
sm.logger.Warn("failed to save services", "error", err)
|
|
}
|
|
|
|
sm.logger.Info("jupyter service started",
|
|
"service_id", serviceID,
|
|
"name", req.Name,
|
|
"url", url,
|
|
"workspace", req.Workspace)
|
|
|
|
return service, nil
|
|
}
|
|
|
|
// StopService stops a Jupyter service
|
|
func (sm *ServiceManager) StopService(ctx context.Context, serviceID string) error {
|
|
service, exists := sm.services[serviceID]
|
|
if !exists {
|
|
return fmt.Errorf("service %s not found", serviceID)
|
|
}
|
|
|
|
// Stop container
|
|
if err := sm.podman.StopContainer(ctx, service.ContainerID); err != nil {
|
|
sm.logger.Warn("failed to stop container", "service_id", serviceID, "error", err)
|
|
}
|
|
|
|
// Remove container
|
|
if err := sm.podman.RemoveContainer(ctx, service.ContainerID); err != nil {
|
|
sm.logger.Warn("failed to remove container", "service_id", serviceID, "error", err)
|
|
}
|
|
|
|
// Update service status
|
|
service.Status = "stopped"
|
|
service.LastAccess = time.Now()
|
|
|
|
// Remove from active services
|
|
delete(sm.services, serviceID)
|
|
|
|
// Save services to disk
|
|
if err := sm.saveServices(); err != nil {
|
|
sm.logger.Warn("failed to save services", "error", err)
|
|
}
|
|
|
|
sm.logger.Info("jupyter service stopped", "service_id", serviceID, "name", service.Name)
|
|
|
|
return nil
|
|
}
|
|
|
|
// GetService retrieves a service by ID
|
|
func (sm *ServiceManager) GetService(serviceID string) (*JupyterService, error) {
|
|
service, exists := sm.services[serviceID]
|
|
if !exists {
|
|
return nil, fmt.Errorf("service %s not found", serviceID)
|
|
}
|
|
|
|
// Update last access time
|
|
service.LastAccess = time.Now()
|
|
|
|
return service, nil
|
|
}
|
|
|
|
// ListServices returns all services
|
|
func (sm *ServiceManager) ListServices() []*JupyterService {
|
|
services := make([]*JupyterService, 0, len(sm.services))
|
|
for _, service := range sm.services {
|
|
services = append(services, service)
|
|
}
|
|
return services
|
|
}
|
|
|
|
// GetServiceStatus returns the current status of a service
|
|
func (sm *ServiceManager) GetServiceStatus(ctx context.Context, serviceID string) (string, error) {
|
|
service, exists := sm.services[serviceID]
|
|
if !exists {
|
|
return "", fmt.Errorf("service %s not found", serviceID)
|
|
}
|
|
|
|
// Check container status
|
|
status, err := sm.podman.GetContainerStatus(ctx, service.ContainerID)
|
|
if err != nil {
|
|
sm.logger.Warn("failed to get container status", "service_id", serviceID, "error", err)
|
|
return "unknown", err
|
|
}
|
|
|
|
// Update service status if different
|
|
if service.Status != status {
|
|
service.Status = status
|
|
service.LastAccess = time.Now()
|
|
_ = sm.saveServices()
|
|
}
|
|
|
|
return status, nil
|
|
}
|
|
|
|
// validateStartRequest validates a start request
|
|
func (sm *ServiceManager) validateStartRequest(req *StartRequest) error {
|
|
if req.Name == "" {
|
|
return fmt.Errorf("service name is required")
|
|
}
|
|
|
|
if req.Workspace == "" {
|
|
req.Workspace = sm.config.DefaultWorkspace
|
|
}
|
|
|
|
// Check if workspace exists
|
|
if _, err := os.Stat(req.Workspace); os.IsNotExist(err) {
|
|
return fmt.Errorf("workspace %s does not exist", req.Workspace)
|
|
}
|
|
|
|
if req.Image == "" {
|
|
req.Image = sm.config.DefaultImage
|
|
}
|
|
|
|
if req.Network.HostPort == 0 {
|
|
req.Network.HostPort = sm.config.DefaultPort
|
|
}
|
|
|
|
if req.Network.ContainerPort == 0 {
|
|
req.Network.ContainerPort = 8888
|
|
}
|
|
|
|
// Check for port conflicts
|
|
for _, service := range sm.services {
|
|
if service.Port == req.Network.HostPort && service.Status == serviceStatusRunning {
|
|
return fmt.Errorf("port %d is already in use by service %s", req.Network.HostPort, service.Name)
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// generateServiceID generates a unique service ID
|
|
func (sm *ServiceManager) generateServiceID(name string) string {
|
|
timestamp := time.Now().Unix()
|
|
sanitizedName := strings.ToLower(strings.ReplaceAll(name, " ", "-"))
|
|
return fmt.Sprintf("jupyter-%s-%d", sanitizedName, timestamp)
|
|
}
|
|
|
|
// prepareContainerConfig prepares container configuration
|
|
func (sm *ServiceManager) prepareContainerConfig(serviceID string, req *StartRequest) *container.ContainerConfig {
|
|
// Prepare volume mounts
|
|
volumes := map[string]string{
|
|
req.Workspace: "/workspace",
|
|
}
|
|
|
|
// Prepare environment variables
|
|
env := map[string]string{
|
|
"JUPYTER_ENABLE_LAB": "yes",
|
|
}
|
|
|
|
if req.Network.EnableToken && req.Network.Token != "" {
|
|
env["JUPYTER_TOKEN"] = req.Network.Token
|
|
} else {
|
|
env["JUPYTER_TOKEN"] = "" // No token for development
|
|
}
|
|
|
|
if req.Network.EnablePassword && req.Network.Password != "" {
|
|
env["JUPYTER_PASSWORD"] = req.Network.Password
|
|
}
|
|
|
|
// Add custom environment variables
|
|
for k, v := range req.Environment {
|
|
env[k] = v
|
|
}
|
|
|
|
// Prepare port mappings
|
|
ports := map[int]int{
|
|
req.Network.HostPort: req.Network.ContainerPort,
|
|
}
|
|
|
|
// Prepare container command
|
|
cmd := []string{
|
|
"conda", "run", "-n", "ml_env", "jupyter", "notebook",
|
|
"--no-browser",
|
|
"--ip=0.0.0.0",
|
|
fmt.Sprintf("--port=%d", req.Network.ContainerPort),
|
|
"--NotebookApp.allow-root=True",
|
|
"--NotebookApp.ip=0.0.0.0",
|
|
}
|
|
|
|
if !req.Network.EnableToken {
|
|
cmd = append(cmd, "--NotebookApp.token=")
|
|
}
|
|
|
|
// Prepare security options
|
|
securityOpts := []string{}
|
|
if req.Security.ReadOnlyRoot {
|
|
securityOpts = append(securityOpts, "--read-only")
|
|
}
|
|
|
|
for _, cap := range req.Security.DropCapabilities {
|
|
securityOpts = append(securityOpts, fmt.Sprintf("--cap-drop=%s", cap))
|
|
}
|
|
|
|
return &container.ContainerConfig{
|
|
Name: serviceID,
|
|
Image: req.Image,
|
|
Command: cmd,
|
|
Env: env,
|
|
Volumes: volumes,
|
|
Ports: ports,
|
|
SecurityOpts: securityOpts,
|
|
Resources: container.ResourceConfig{
|
|
MemoryLimit: req.Resources.MemoryLimit,
|
|
CPULimit: req.Resources.CPULimit,
|
|
GPUAccess: req.Resources.GPUAccess,
|
|
},
|
|
Network: container.NetworkConfig{
|
|
AllowNetwork: req.Security.AllowNetwork,
|
|
},
|
|
}
|
|
}
|
|
|
|
// waitForJupyterReady waits for Jupyter to be ready and returns the URL
|
|
func (sm *ServiceManager) waitForJupyterReady(
|
|
ctx context.Context,
|
|
containerID string,
|
|
networkConfig NetworkConfig,
|
|
) (string, error) {
|
|
// Wait for container to be running
|
|
maxWait := 60 * time.Second
|
|
interval := 2 * time.Second
|
|
deadline := time.Now().Add(maxWait)
|
|
|
|
for time.Now().Before(deadline) {
|
|
status, err := sm.podman.GetContainerStatus(ctx, containerID)
|
|
if err != nil {
|
|
return "", fmt.Errorf("failed to check container status: %w", err)
|
|
}
|
|
|
|
if status == serviceStatusRunning {
|
|
break
|
|
}
|
|
|
|
if status == "exited" || status == "error" {
|
|
return "", fmt.Errorf("container failed to start (status: %s)", status)
|
|
}
|
|
|
|
select {
|
|
case <-ctx.Done():
|
|
return "", ctx.Err()
|
|
case <-time.After(interval):
|
|
}
|
|
}
|
|
|
|
// Wait a bit more for Jupyter to initialize
|
|
time.Sleep(5 * time.Second)
|
|
|
|
// Construct URL
|
|
url := fmt.Sprintf("http://localhost:%d", networkConfig.HostPort)
|
|
if networkConfig.EnableToken && networkConfig.Token != "" {
|
|
url += fmt.Sprintf("?token=%s", networkConfig.Token)
|
|
}
|
|
|
|
return url, nil
|
|
}
|
|
|
|
// loadServices loads existing services from disk
|
|
func (sm *ServiceManager) loadServices() error {
|
|
servicesFile := filepath.Join(os.TempDir(), "fetch_ml_jupyter_services.json")
|
|
|
|
data, err := os.ReadFile(servicesFile)
|
|
if err != nil {
|
|
if os.IsNotExist(err) {
|
|
return nil // No existing services
|
|
}
|
|
return err
|
|
}
|
|
|
|
var services map[string]*JupyterService
|
|
if err := json.Unmarshal(data, &services); err != nil {
|
|
return err
|
|
}
|
|
|
|
// Validate services are still running
|
|
for id, service := range services {
|
|
if service.Status == serviceStatusRunning {
|
|
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
|
status, err := sm.podman.GetContainerStatus(ctx, service.ContainerID)
|
|
cancel()
|
|
|
|
if err != nil || status != "running" {
|
|
service.Status = "stopped"
|
|
}
|
|
}
|
|
sm.services[id] = service
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// saveServices saves services to disk
|
|
func (sm *ServiceManager) saveServices() error {
|
|
servicesFile := filepath.Join(os.TempDir(), "fetch_ml_jupyter_services.json")
|
|
|
|
data, err := json.MarshalIndent(sm.services, "", " ")
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
return os.WriteFile(servicesFile, data, 0600)
|
|
}
|
|
|
|
// LinkWorkspaceWithExperiment links a workspace with an experiment
|
|
func (sm *ServiceManager) LinkWorkspaceWithExperiment(workspacePath, experimentID, serviceID string) error {
|
|
return sm.workspaceMetadataMgr.LinkWorkspace(workspacePath, experimentID, serviceID)
|
|
}
|
|
|
|
// GetWorkspaceMetadata retrieves metadata for a workspace
|
|
func (sm *ServiceManager) GetWorkspaceMetadata(workspacePath string) (*WorkspaceMetadata, error) {
|
|
return sm.workspaceMetadataMgr.GetWorkspaceMetadata(workspacePath)
|
|
}
|
|
|
|
// SyncWorkspaceWithExperiment synchronizes a workspace with an experiment
|
|
func (sm *ServiceManager) SyncWorkspaceWithExperiment(
|
|
_ context.Context,
|
|
workspacePath,
|
|
experimentID,
|
|
direction string,
|
|
) error {
|
|
// Update sync time in metadata
|
|
if err := sm.workspaceMetadataMgr.UpdateSyncTime(workspacePath, direction); err != nil {
|
|
sm.logger.Warn("failed to update sync time", "error", err)
|
|
}
|
|
|
|
// In a real implementation, this would perform actual synchronization:
|
|
// - For "pull": Download experiment data/metrics to workspace
|
|
// - For "push": Upload workspace notebooks/results to experiment
|
|
|
|
sm.logger.Info("workspace sync completed",
|
|
"workspace", workspacePath,
|
|
"experiment_id", experimentID,
|
|
"direction", direction)
|
|
|
|
return nil
|
|
}
|
|
|
|
// ListLinkedWorkspaces returns all linked workspaces
|
|
func (sm *ServiceManager) ListLinkedWorkspaces() []*WorkspaceMetadata {
|
|
return sm.workspaceMetadataMgr.ListLinkedWorkspaces()
|
|
}
|
|
|
|
// GetWorkspacesForExperiment returns all workspaces linked to an experiment
|
|
func (sm *ServiceManager) GetWorkspacesForExperiment(experimentID string) []*WorkspaceMetadata {
|
|
return sm.workspaceMetadataMgr.GetWorkspacesForExperiment(experimentID)
|
|
}
|
|
|
|
// UnlinkWorkspace removes the link between workspace and experiment
|
|
func (sm *ServiceManager) UnlinkWorkspace(workspacePath string) error {
|
|
return sm.workspaceMetadataMgr.UnlinkWorkspace(workspacePath)
|
|
}
|
|
|
|
// ClearAllMetadata clears all workspace metadata (used for test isolation)
|
|
func (sm *ServiceManager) ClearAllMetadata() error {
|
|
return sm.workspaceMetadataMgr.ClearAllMetadata()
|
|
}
|
|
|
|
// SetAutoSync enables or disables auto-sync for a workspace
|
|
func (sm *ServiceManager) SetAutoSync(workspacePath string, enabled bool, interval time.Duration) error {
|
|
return sm.workspaceMetadataMgr.SetAutoSync(workspacePath, enabled, interval)
|
|
}
|
|
|
|
// AddTag adds a tag to workspace metadata
|
|
func (sm *ServiceManager) AddTag(workspacePath, tag string) error {
|
|
return sm.workspaceMetadataMgr.AddTag(workspacePath, tag)
|
|
}
|
|
|
|
// Close cleans up the service manager
|
|
func (sm *ServiceManager) Close(ctx context.Context) error {
|
|
// Stop all running services
|
|
for _, service := range sm.services {
|
|
if service.Status == serviceStatusRunning {
|
|
if err := sm.StopService(ctx, service.ID); err != nil {
|
|
sm.logger.Warn("failed to stop service during cleanup",
|
|
"service_id", service.ID, "error", err)
|
|
}
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|