fetch_ml/internal/scheduler/service_templates.go
Jeremie Fraeys 43e6446587
feat(scheduler): implement multi-tenant job scheduler with gang scheduling
Add new scheduler component for distributed ML workload orchestration:
- Hub-based coordination for multi-worker clusters
- Pacing controller for rate limiting job submissions
- Priority queue with preemption support
- Port allocator for dynamic service discovery
- Protocol handlers for worker-scheduler communication
- Service manager with OS-specific implementations
- Connection management and state persistence
- Template system for service deployment

Includes comprehensive test suite:
- Unit tests for all core components
- Integration tests for distributed scenarios
- Benchmark tests for performance validation
- Mock fixtures for isolated testing

Refs: scheduler-architecture.md
2026-02-26 12:03:23 -05:00

145 lines
4.3 KiB
Go

// Package scheduler provides service plugin templates for fetch_ml.
// These templates define how long-running services like Jupyter are configured.
package scheduler
// ServiceTemplate defines a service job that runs indefinitely until stopped.
// This is used for Jupyter, vLLM, and similar interactive services.
//
// String fields (Command, Env, HealthCheck URLs, Mounts) may contain template
// variables such as {{SERVICE_PORT}} that are substituted by the scheduler
// before launch.
type ServiceTemplate struct {
	// JobType identifies this as a service job; always "service".
	JobType string `json:"job_type"` // Always "service"
	// SlotPool specifies which slot pool to use ("batch" or "service").
	SlotPool string `json:"slot_pool"`
	// GPUCount is the number of GPUs required (can be 0 for CPU-only services).
	GPUCount int `json:"gpu_count"`
	// Command is the service command and its arguments; entries may contain
	// template variables.
	Command []string `json:"command"`
	// Env defines environment variables for the service process; values may
	// contain template variables.
	Env map[string]string `json:"env"`
	// HealthCheck defines how to verify the service is alive and ready.
	HealthCheck ServiceHealthCheck `json:"health_check"`
	// Mounts defines volume mounts for the service. Optional.
	Mounts []ServiceMount `json:"mounts,omitempty"`
	// Ports lists fixed ports to expose when dynamic port allocation
	// ({{SERVICE_PORT}}) is not used. Optional.
	Ports []int `json:"ports,omitempty"`
}
// ServiceHealthCheck defines liveness and readiness probes for a service.
// Both probe fields are URLs and may contain template variables such as
// {{SERVICE_PORT}}.
type ServiceHealthCheck struct {
	// Liveness is the endpoint polled to check the service is still running.
	Liveness string `json:"liveness"`
	// Readiness is the endpoint polled to check the service can accept traffic.
	Readiness string `json:"readiness"`
	// Interval is the delay between successive health checks, in seconds.
	Interval int `json:"interval"`
	// Timeout is the per-check deadline, in seconds.
	Timeout int `json:"timeout"`
}
// ServiceMount defines a volume mount for a service container/process.
type ServiceMount struct {
	// Source is the host-side path to mount; may contain template
	// variables such as {{WORKSPACE}}.
	Source string `json:"source"`
	// Destination is the path inside the service where Source is mounted.
	Destination string `json:"destination"`
	// ReadOnly mounts the volume read-only when true. Optional;
	// defaults to read-write.
	ReadOnly bool `json:"readonly,omitempty"`
}
// Template variables available in ServiceTemplate:
// {{SERVICE_PORT}} - Dynamically allocated port for the service
// {{WORKER_ID}} - ID of the worker running the service
// {{TASK_ID}} - Unique task ID for this service instance
// {{WORKSPACE}} - Workspace path mounted into the service (used as a mount source)
// {{MODEL_NAME}} - Model identifier for inference templates (e.g. vLLM)
// {{SECRET:xxx}} - Secret value from scheduler's secret store
// JupyterLabTemplate is the default JupyterLab service configuration.
// Sysadmins can disable Jupyter by setting service_slots: 0 in worker config,
// or by not registering this template with the scheduler.
//
// The service binds to a dynamically allocated {{SERVICE_PORT}} and is
// authenticated via the jupyter_token secret from the scheduler's secret store.
var JupyterLabTemplate = ServiceTemplate{
	// Scheduling: service slot pool (not batch), no GPUs needed.
	JobType:  "service",
	SlotPool: "service",
	GPUCount: 0,

	// Launch command: JupyterLab on the allocated port, token auth only.
	Command: []string{
		"jupyter", "lab",
		"--ip=0.0.0.0",
		"--port={{SERVICE_PORT}}",
		"--no-browser",
		"--allow-root",
		"--NotebookApp.token='{{SECRET:jupyter_token}}'",
		"--NotebookApp.password=''",
	},

	// Environment: token mirrored into the process, config kept in the workspace.
	Env: map[string]string{
		"JUPYTER_TOKEN":      "{{SECRET:jupyter_token}}",
		"JUPYTER_CONFIG_DIR": "/workspace/.jupyter",
	},

	// Workspace volume mounted at /workspace inside the service.
	Mounts: []ServiceMount{
		{Source: "{{WORKSPACE}}", Destination: "/workspace"},
	},

	// Probes: /api proves the server is up; /api/kernels proves it can serve.
	HealthCheck: ServiceHealthCheck{
		Liveness:  "http://localhost:{{SERVICE_PORT}}/api",
		Readiness: "http://localhost:{{SERVICE_PORT}}/api/kernels",
		Interval:  15,
		Timeout:   5,
	},
}
// JupyterNotebookTemplate is an alternative using classic Jupyter Notebook.
// It mirrors JupyterLabTemplate's scheduling, mounts, and health checks but
// launches the classic notebook server instead of Lab.
var JupyterNotebookTemplate = ServiceTemplate{
	// Scheduling: service slot pool, CPU-only.
	JobType:  "service",
	SlotPool: "service",
	GPUCount: 0,

	// Launch command: classic notebook on the allocated port, token auth.
	Command: []string{
		"jupyter", "notebook",
		"--ip=0.0.0.0",
		"--port={{SERVICE_PORT}}",
		"--no-browser",
		"--allow-root",
		"--NotebookApp.token='{{SECRET:jupyter_token}}'",
	},

	// Token exposed to the process environment.
	Env: map[string]string{
		"JUPYTER_TOKEN": "{{SECRET:jupyter_token}}",
	},

	// Workspace volume mounted at /workspace inside the service.
	Mounts: []ServiceMount{
		{Source: "{{WORKSPACE}}", Destination: "/workspace"},
	},

	// Same probes as Lab: /api for liveness, /api/kernels for readiness.
	HealthCheck: ServiceHealthCheck{
		Liveness:  "http://localhost:{{SERVICE_PORT}}/api",
		Readiness: "http://localhost:{{SERVICE_PORT}}/api/kernels",
		Interval:  15,
		Timeout:   5,
	},
}
// VLLMTemplate is an example vLLM inference server template (future).
// Unlike the Jupyter templates it requires a GPU and expects {{MODEL_NAME}}
// to be substituted with the model to serve.
var VLLMTemplate = ServiceTemplate{
	// Scheduling: service slot pool, one GPU for inference.
	JobType:  "service",
	SlotPool: "service",
	GPUCount: 1,

	// Launch command: OpenAI-compatible vLLM API server.
	Command: []string{
		"python", "-m", "vllm.entrypoints.openai.api_server",
		"--model", "{{MODEL_NAME}}",
		"--port", "{{SERVICE_PORT}}",
	},

	// vLLM exposes a single /health endpoint; use it for both probes,
	// with longer spacing since model load/inference is heavyweight.
	HealthCheck: ServiceHealthCheck{
		Liveness:  "http://localhost:{{SERVICE_PORT}}/health",
		Readiness: "http://localhost:{{SERVICE_PORT}}/health",
		Interval:  30,
		Timeout:   10,
	},
}