- Refactor worker configuration management - Improve container executor lifecycle handling - Update runloop and worker core logic - Enhance scheduler service template generation - Remove obsolete 'scheduler' symlink/directory
145 lines
4.3 KiB
Go
145 lines
4.3 KiB
Go
// Package scheduler provides service plugin templates for fetch_ml.
//
// These templates define how long-running services like Jupyter are configured.
package scheduler
|
|
|
|
// ServiceTemplate defines a service job that runs indefinitely until stopped.
// This is used for Jupyter, vLLM, and similar interactive services.
//
// String fields and Command/Env values may contain template variables such as
// {{SERVICE_PORT}} or {{SECRET:xxx}}; see the template-variable reference
// comment in this file for the full list.
type ServiceTemplate struct {
	// JobType identifies this as a service job.
	JobType string `json:"job_type"` // Always "service"

	// SlotPool specifies which slot pool to use ("batch" or "service").
	SlotPool string `json:"slot_pool"`

	// GPUCount is the number of GPUs required (can be 0 for CPU-only services).
	GPUCount int `json:"gpu_count"`

	// Command is the service command with template variables.
	// NOTE(review): this looks like an exec-form argv (no shell expansion) —
	// confirm against the container executor before relying on shell quoting.
	Command []string `json:"command"`

	// Env defines environment variables with template variables.
	Env map[string]string `json:"env"`

	// HealthCheck defines how to verify the service is healthy.
	HealthCheck ServiceHealthCheck `json:"health_check"`

	// Mounts defines volume mounts for the service.
	Mounts []ServiceMount `json:"mounts,omitempty"`

	// Ports to expose (if not using dynamic allocation).
	Ports []int `json:"ports,omitempty"`
}
|
|
|
|
// ServiceHealthCheck defines liveness and readiness probes for a service.
type ServiceHealthCheck struct {
	// Liveness endpoint - checks if service is running.
	Liveness string `json:"liveness"`

	// Readiness endpoint - checks if service is ready for traffic.
	Readiness string `json:"readiness"`

	// Interval between health checks in seconds.
	Interval int `json:"interval"`

	// Timeout for each health check in seconds.
	Timeout int `json:"timeout"`
}
|
|
|
|
// ServiceMount defines a volume mount for a service container.
type ServiceMount struct {
	// Source is the host-side path or template variable (e.g. {{WORKSPACE}}).
	Source string `json:"source"`
	// Destination is the mount point inside the service container.
	Destination string `json:"destination"`
	// ReadOnly mounts the volume read-only when true.
	ReadOnly bool `json:"readonly,omitempty"`
}
|
|
|
|
// Template variables available in ServiceTemplate:
//
//	{{SERVICE_PORT}} - Dynamically allocated port for the service
//	{{WORKER_ID}}    - ID of the worker running the service
//	{{TASK_ID}}      - Unique task ID for this service instance
//	{{SECRET:xxx}}   - Secret value from the scheduler's secret store
|
|
|
|
// JupyterLabTemplate is the default JupyterLab service configuration.
|
|
// Users can disable Jupyter by setting service_slots: 0 in worker config,
|
|
// or by not registering this template with the scheduler.
|
|
var JupyterLabTemplate = ServiceTemplate{
|
|
JobType: "service",
|
|
SlotPool: "service", // Uses service slot pool, not batch
|
|
GPUCount: 0, // Jupyter typically runs CPU-only
|
|
|
|
Command: []string{
|
|
"jupyter", "lab",
|
|
"--ip=0.0.0.0",
|
|
"--port={{SERVICE_PORT}}",
|
|
"--no-browser",
|
|
"--allow-root",
|
|
"--NotebookApp.token='{{SECRET:jupyter_token}}'",
|
|
"--NotebookApp.password=''",
|
|
},
|
|
|
|
Env: map[string]string{
|
|
"JUPYTER_TOKEN": "{{SECRET:jupyter_token}}",
|
|
"JUPYTER_CONFIG_DIR": "/workspace/.jupyter",
|
|
},
|
|
|
|
HealthCheck: ServiceHealthCheck{
|
|
Liveness: "http://localhost:{{SERVICE_PORT}}/api",
|
|
Readiness: "http://localhost:{{SERVICE_PORT}}/api/kernels",
|
|
Interval: 15,
|
|
Timeout: 5,
|
|
},
|
|
|
|
Mounts: []ServiceMount{
|
|
{Source: "{{WORKSPACE}}", Destination: "/workspace"},
|
|
},
|
|
}
|
|
|
|
// JupyterNotebookTemplate is an alternative using classic Jupyter Notebook.
|
|
var JupyterNotebookTemplate = ServiceTemplate{
|
|
JobType: "service",
|
|
SlotPool: "service",
|
|
GPUCount: 0,
|
|
|
|
Command: []string{
|
|
"jupyter", "notebook",
|
|
"--ip=0.0.0.0",
|
|
"--port={{SERVICE_PORT}}",
|
|
"--no-browser",
|
|
"--allow-root",
|
|
"--NotebookApp.token='{{SECRET:jupyter_token}}'",
|
|
},
|
|
|
|
Env: map[string]string{
|
|
"JUPYTER_TOKEN": "{{SECRET:jupyter_token}}",
|
|
},
|
|
|
|
HealthCheck: ServiceHealthCheck{
|
|
Liveness: "http://localhost:{{SERVICE_PORT}}/api",
|
|
Readiness: "http://localhost:{{SERVICE_PORT}}/api/kernels",
|
|
Interval: 15,
|
|
Timeout: 5,
|
|
},
|
|
|
|
Mounts: []ServiceMount{
|
|
{Source: "{{WORKSPACE}}", Destination: "/workspace"},
|
|
},
|
|
}
|
|
|
|
// VLLMTemplate is an example vLLM inference server template (future)
|
|
var VLLMTemplate = ServiceTemplate{
|
|
JobType: "service",
|
|
SlotPool: "service",
|
|
GPUCount: 1, // Requires GPU for inference
|
|
|
|
Command: []string{
|
|
"python", "-m", "vllm.entrypoints.openai.api_server",
|
|
"--model", "{{MODEL_NAME}}",
|
|
"--port", "{{SERVICE_PORT}}",
|
|
},
|
|
|
|
HealthCheck: ServiceHealthCheck{
|
|
Liveness: "http://localhost:{{SERVICE_PORT}}/health",
|
|
Readiness: "http://localhost:{{SERVICE_PORT}}/health",
|
|
Interval: 30,
|
|
Timeout: 10,
|
|
},
|
|
}
|