package scheduler

import (
	"encoding/json"
	"time"
)

// Message is the envelope used in both directions: a type tag, an optional
// raw JSON payload whose decoding is deferred, and an optional error string.
type Message struct {
	Type    MessageType     `json:"type"`
	Payload json.RawMessage `json:"payload,omitempty"`
	Error   string          `json:"error,omitempty"`
}

// MessageType is the string tag carried in Message.Type.
type MessageType string

// Message types, grouped by direction of travel.
const (
	// Worker → Scheduler
	MsgRegister       MessageType = "register"
	MsgHeartbeat      MessageType = "heartbeat" // slots only, every 10s
	MsgReadyForWork   MessageType = "ready_for_work"
	MsgJobAccepted    MessageType = "job_accepted"
	MsgJobResult      MessageType = "job_result"
	MsgServiceHealth  MessageType = "service_health"
	MsgMetricsRequest MessageType = "metrics_request" // WSS metrics request

	// Scheduler → Worker
	MsgJobAssign       MessageType = "job_assign"
	MsgNoWork          MessageType = "no_work" // nothing available right now
	MsgJobCancel       MessageType = "job_cancel"
	MsgPrewarmHint     MessageType = "prewarm_hint"
	MsgAck             MessageType = "ack"
	MsgMetricsResponse MessageType = "metrics_response" // WSS metrics response
)

// HeartbeatPayload combines liveness and slot status in one message; it
// carries no CPU or memory load figures.
type HeartbeatPayload struct {
	WorkerID string     `json:"worker_id"`
	Slots    SlotStatus `json:"slots"`
}

// ReadyPayload reports a worker's ID and slot status together with a
// free-form reason string. Presumably the payload of MsgReadyForWork; confirm
// against the sender.
type ReadyPayload struct {
	WorkerID string     `json:"worker_id"`
	Slots    SlotStatus `json:"slots"`
	Reason   string     `json:"reason"`
}

// JobResultPayload reports a task's state and exit code, plus an optional
// error string.
type JobResultPayload struct {
	TaskID   string `json:"task_id"`
	State    string `json:"state"`
	ExitCode int    `json:"exit_code"`
	Error    string `json:"error,omitempty"`
}

// PrewarmHintPayload names a task and the snapshot associated with it; the
// snapshot SHA is optional.
type PrewarmHintPayload struct {
	TaskID      string `json:"task_id"`
	SnapshotID  string `json:"snapshot_id"`
	SnapshotSHA string `json:"snapshot_sha,omitempty"`
}

// WorkerRegistration describes a worker: its ID, its capabilities, and the
// tasks it reports as active.
type WorkerRegistration struct {
	ID           string             `json:"id"`
	Capabilities WorkerCapabilities `json:"capabilities"`
	ActiveTasks  []ActiveTaskReport `json:"active_tasks"`
}

// ActiveTaskReport is one entry of WorkerRegistration.ActiveTasks.
type ActiveTaskReport struct {
	TaskID string `json:"task_id"`
	State  string `json:"state"`

	// StartedAt is dropped from the JSON when it is the zero time. The tag
	// uses omitzero (honoured from Go 1.24) because omitempty never omits a
	// struct value such as time.Time. Older toolchains ignore the unknown
	// option and keep emitting the field, exactly as they did with omitempty.
	StartedAt time.Time `json:"started_at,omitzero"`
}

// SlotStatus holds total and in-use slot counts for the batch and service
// pools.
type SlotStatus struct {
	BatchTotal   int `json:"batch_total"`
	BatchInUse   int `json:"batch_in_use"`
	ServiceTotal int `json:"service_total"`
	ServiceInUse int `json:"service_in_use"`
}

// BatchAvailable returns the number of free batch slots. It returns 0 rather
// than a negative number when BatchInUse exceeds BatchTotal.
func (s SlotStatus) BatchAvailable() int {
	return freeSlots(s.BatchTotal, s.BatchInUse)
}

// ServiceAvailable returns the number of free service slots. It returns 0
// rather than a negative number when ServiceInUse exceeds ServiceTotal.
func (s SlotStatus) ServiceAvailable() int {
	return freeSlots(s.ServiceTotal, s.ServiceInUse)
}

// freeSlots returns total-inUse, clamped at zero so that an inconsistent
// report can never yield a negative slot count.
func freeSlots(total, inUse int) int {
	if free := total - inUse; free > 0 {
		return free
	}
	return 0
}

// WorkerCapabilities describes a worker's hardware and hostname. GPU type and
// count appear both here and inside GPUInfo.
type WorkerCapabilities struct {
	GPUInfo  GPUDetectionInfo `json:"gpu_info"`
	GPUCount int              `json:"gpu_count"`
	GPUType  string           `json:"gpu_type"`
	CPUCount int              `json:"cpu_count"`
	MemoryGB float64          `json:"memory_gb"`
	Hostname string           `json:"hostname"`
}

// GPUDetectionInfo holds GPU details: type, count, and optionally the device
// list, driver, and total memory.
type GPUDetectionInfo struct {
	GPUType  string   `json:"gpu_type"`
	Count    int      `json:"count"`
	Devices  []string `json:"devices,omitempty"`
	Driver   string   `json:"driver,omitempty"`
	MemTotal uint64   `json:"mem_total,omitempty"`
}

// JobSpec describes a job: its identity, resource request, runtime limit,
// commands, and optional snapshot, health check, and metadata.
type JobSpec struct {
	ID string `json:"id"`

	// Type is "batch" or "service".
	Type     JobType `json:"type"`
	SlotPool string  `json:"slot_pool"`

	// UserID is used for per-user quota tracking.
	UserID    string `json:"user_id,omitempty"`
	GPUCount  int    `json:"gpu_count"`
	GPUType   string `json:"gpu_type,omitempty"`
	NodeCount int    `json:"node_count"`

	// MaxRuntimeHours is the maximum wall-clock time for this job.
	// 0 = default (24h), capped at 168h (7d) by the scheduler.
	MaxRuntimeHours int `json:"max_runtime_hours,omitempty"`

	Command     []string          `json:"command"`
	Env         map[string]string `json:"env"`
	Prolog      []string          `json:"prolog,omitempty"`
	Epilog      []string          `json:"epilog,omitempty"`
	SnapshotID  string            `json:"snapshot_id,omitempty"`
	SnapshotSHA string            `json:"snapshot_sha,omitempty"`
	HealthCheck *HealthCheck      `json:"health_check,omitempty"`
	Metadata    map[string]string `json:"metadata,omitempty"`
}

// JobType is the kind of job carried in JobSpec.Type.
type JobType string

// Known job types.
const (
	JobTypeBatch   JobType = "batch"
	JobTypeService JobType = "service"
)

// HealthCheck holds the liveness and readiness endpoints and the check
// interval in seconds.
type HealthCheck struct {
	LivenessEndpoint  string `json:"liveness"`
	ReadinessEndpoint string `json:"readiness"`
	IntervalSecs      int    `json:"interval_secs"`
}

// ServiceHealthPayload reports whether a task is healthy, with an optional
// message.
type ServiceHealthPayload struct {
	TaskID  string `json:"task_id"`
	Healthy bool   `json:"healthy"`
	Message string `json:"message,omitempty"`
}

// JobAssignPayload is sent from scheduler to worker when assigning a task.
type JobAssignPayload struct { Spec JobSpec `json:"spec"` RemainingTime time.Duration `json:"remaining_time"` // Wall-clock budget left }