fetch_ml/internal/scheduler/protocol.go
Jeremie Fraeys 43e6446587
feat(scheduler): implement multi-tenant job scheduler with gang scheduling
Add new scheduler component for distributed ML workload orchestration:
- Hub-based coordination for multi-worker clusters
- Pacing controller for rate limiting job submissions
- Priority queue with preemption support
- Port allocator for dynamic service discovery
- Protocol handlers for worker-scheduler communication
- Service manager with OS-specific implementations
- Connection management and state persistence
- Template system for service deployment

Includes comprehensive test suite:
- Unit tests for all core components
- Integration tests for distributed scenarios
- Benchmark tests for performance validation
- Mock fixtures for isolated testing

Refs: scheduler-architecture.md
2026-02-26 12:03:23 -05:00

137 lines
4 KiB
Go

package scheduler
import (
"encoding/json"
"time"
)
// Message is the JSON envelope exchanged between workers and the scheduler.
// Type selects the message kind; Payload and Error are left out of the
// encoded JSON when they are empty.
type Message struct {
Type MessageType `json:"type"`
// Payload is kept as raw JSON so that decoding can be deferred.
// NOTE(review): presumably the receiver picks the concrete payload struct
// based on Type — confirm against the protocol handlers.
Payload json.RawMessage `json:"payload,omitempty"`
// Error is optional error text.
Error string `json:"error,omitempty"`
}
// MessageType is the string discriminator carried in Message.Type.
type MessageType string

// Message types sent from a worker to the scheduler.
const (
	MsgRegister       = MessageType("register")
	MsgHeartbeat      = MessageType("heartbeat") // slots only, every 10s
	MsgReadyForWork   = MessageType("ready_for_work")
	MsgJobAccepted    = MessageType("job_accepted")
	MsgJobResult      = MessageType("job_result")
	MsgServiceHealth  = MessageType("service_health")
	MsgMetricsRequest = MessageType("metrics_request") // WSS metrics request
)

// Message types sent from the scheduler to a worker.
const (
	MsgJobAssign       = MessageType("job_assign")
	MsgNoWork          = MessageType("no_work") // nothing available right now
	MsgJobCancel       = MessageType("job_cancel")
	MsgPrewarmHint     = MessageType("prewarm_hint")
	MsgAck             = MessageType("ack")
	MsgMetricsResponse = MessageType("metrics_response") // WSS metrics response
)
// HeartbeatPayload combines liveness and slot status in a single message; it
// carries no CPU/memory load figures.
// NOTE(review): presumably the payload of a MsgHeartbeat message — confirm
// against the protocol handlers.
type HeartbeatPayload struct {
WorkerID string `json:"worker_id"`
Slots SlotStatus `json:"slots"`
}
// ReadyPayload carries a worker's ID, its current slot status, and a reason
// string.
// NOTE(review): presumably sent with MsgReadyForWork; the expected values of
// Reason are not defined in this file — confirm with the sender.
type ReadyPayload struct {
WorkerID string `json:"worker_id"`
Slots SlotStatus `json:"slots"`
Reason string `json:"reason"`
}
// JobResultPayload reports the outcome of a task: its state, its exit code,
// and optional error text (left out of the JSON when empty).
// NOTE(review): presumably sent with MsgJobResult; the set of valid State
// values is not defined in this file.
type JobResultPayload struct {
TaskID string `json:"task_id"`
State string `json:"state"`
ExitCode int `json:"exit_code"`
Error string `json:"error,omitempty"`
}
// PrewarmHintPayload names a task and the snapshot associated with it.
// SnapshotSHA is left out of the JSON when empty.
// NOTE(review): presumably sent with MsgPrewarmHint so that a worker can
// prepare the snapshot ahead of a job assignment — confirm.
type PrewarmHintPayload struct {
TaskID string `json:"task_id"`
SnapshotID string `json:"snapshot_id"`
SnapshotSHA string `json:"snapshot_sha,omitempty"`
}
// WorkerRegistration carries a worker's ID, its capabilities, and a report of
// the tasks it lists as active.
// NOTE(review): presumably the payload of a MsgRegister message — confirm.
type WorkerRegistration struct {
ID string `json:"id"`
Capabilities WorkerCapabilities `json:"capabilities"`
// ActiveTasks has no omitempty option, so a nil slice is encoded as JSON
// null rather than being left out.
ActiveTasks []ActiveTaskReport `json:"active_tasks"`
}
// ActiveTaskReport describes one task that a worker reports as active, as
// listed in WorkerRegistration.ActiveTasks.
type ActiveTaskReport struct {
	// TaskID identifies the reported task.
	TaskID string `json:"task_id"`
	// State is the task state as a plain string; the set of valid values is
	// not defined in this file.
	State string `json:"state"`
	// StartedAt is the task's start time. The tag uses omitzero rather than
	// omitempty: encoding/json never considers a struct value such as
	// time.Time to be "empty", so omitempty still emitted
	// "0001-01-01T00:00:00Z" for an unset start time. omitzero (Go 1.24+)
	// consults time.Time.IsZero and drops the field instead; older toolchains
	// ignore the unknown option and keep emitting the zero time as before.
	StartedAt time.Time `json:"started_at,omitzero"`
}
// SlotStatus holds total and in-use slot counts for the batch and service
// pools.
type SlotStatus struct {
	BatchTotal   int `json:"batch_total"`
	BatchInUse   int `json:"batch_in_use"`
	ServiceTotal int `json:"service_total"`
	ServiceInUse int `json:"service_in_use"`
}

// BatchAvailable returns BatchTotal minus BatchInUse. The result is not
// clamped, so it is negative when more slots are in use than the total.
func (s SlotStatus) BatchAvailable() int {
	free := s.BatchTotal
	free -= s.BatchInUse
	return free
}

// ServiceAvailable returns ServiceTotal minus ServiceInUse. The result is not
// clamped, so it is negative when more slots are in use than the total.
func (s SlotStatus) ServiceAvailable() int {
	free := s.ServiceTotal
	free -= s.ServiceInUse
	return free
}
// WorkerCapabilities describes a worker's hardware: GPU details, CPU count,
// memory in GB, and hostname.
// NOTE(review): GPUCount and GPUType overlap by name with GPUInfo.Count and
// GPUInfo.GPUType; which one is authoritative is not visible in this file.
type WorkerCapabilities struct {
GPUInfo GPUDetectionInfo `json:"gpu_info"`
GPUCount int `json:"gpu_count"`
GPUType string `json:"gpu_type"`
CPUCount int `json:"cpu_count"`
MemoryGB float64 `json:"memory_gb"`
Hostname string `json:"hostname"`
}
// GPUDetectionInfo holds GPU detection results: type, count, and optional
// device list, driver string, and total memory. Devices, Driver and MemTotal
// are left out of the JSON when empty or zero.
type GPUDetectionInfo struct {
GPUType string `json:"gpu_type"`
Count int `json:"count"`
Devices []string `json:"devices,omitempty"`
Driver string `json:"driver,omitempty"`
// NOTE(review): the unit of MemTotal (bytes? MiB?) is not stated in this
// file — confirm with the code that fills it in.
MemTotal uint64 `json:"mem_total,omitempty"`
}
// JobSpec is the JSON description of a job: its identity and type, resource
// requests, the command and environment to run, and optional prolog/epilog
// commands, snapshot reference, health check and metadata.
// NOTE(review): presumably the payload of a MsgJobAssign message — confirm.
type JobSpec struct {
ID string `json:"id"`
Type JobType `json:"type"` // "batch" | "service"
// NOTE(review): SlotPool presumably selects the batch or service slot pool
// counted in SlotStatus; its valid values are not defined in this file.
SlotPool string `json:"slot_pool"`
GPUCount int `json:"gpu_count"`
GPUType string `json:"gpu_type,omitempty"`
// NOTE(review): NodeCount presumably is the number of nodes the job needs
// at once (gang scheduling) — confirm with the scheduler logic.
NodeCount int `json:"node_count"`
// Command and Env have no omitempty option, so nil values are encoded as
// JSON null rather than being left out.
Command []string `json:"command"`
Env map[string]string `json:"env"`
Prolog []string `json:"prolog,omitempty"`
Epilog []string `json:"epilog,omitempty"`
SnapshotID string `json:"snapshot_id,omitempty"`
SnapshotSHA string `json:"snapshot_sha,omitempty"`
// HealthCheck is a pointer so that a job without a health check leaves
// the field out of the JSON entirely.
HealthCheck *HealthCheck `json:"health_check,omitempty"`
Metadata map[string]string `json:"metadata,omitempty"`
}
// JobType is the string kind of a job, as carried in JobSpec.Type.
type JobType string

// Known job types.
const (
	JobTypeBatch   = JobType("batch")
	JobTypeService = JobType("service")
)
// HealthCheck configures health probing for a job; JobSpec.HealthCheck points
// to one. The endpoints are encoded under the JSON keys "liveness" and
// "readiness".
// NOTE(review): the endpoint format (URL, path, host:port?) is not defined in
// this file, and IntervalSecs is presumably the number of seconds between
// probes — confirm with the code that performs the checks.
type HealthCheck struct {
LivenessEndpoint string `json:"liveness"`
ReadinessEndpoint string `json:"readiness"`
IntervalSecs int `json:"interval_secs"`
}
// ServiceHealthPayload reports whether a task is healthy, with an optional
// message that is left out of the JSON when empty.
// NOTE(review): presumably sent with MsgServiceHealth — confirm.
type ServiceHealthPayload struct {
TaskID string `json:"task_id"`
Healthy bool `json:"healthy"`
Message string `json:"message,omitempty"`
}