- Update task domain model - Improve scheduler hub and priority queue - Enhance protocol definitions - Update manifest schema and run handling
148 lines
4.4 KiB
Go
148 lines
4.4 KiB
Go
package scheduler
|
|
|
|
import (
|
|
"encoding/json"
|
|
"time"
|
|
)
|
|
|
|
type Message struct {
|
|
Type MessageType `json:"type"`
|
|
Payload json.RawMessage `json:"payload,omitempty"`
|
|
Error string `json:"error,omitempty"`
|
|
}
|
|
|
|
// MessageType identifies the kind of a protocol message. Its string value is
// what appears in the "type" field on the wire.
type MessageType string

// Message kinds, grouped by the direction in which they travel.
const (
	// Worker → Scheduler
	MsgRegister       MessageType = "register"
	MsgHeartbeat      MessageType = "heartbeat" // slots only, every 10s
	MsgReadyForWork   MessageType = "ready_for_work"
	MsgJobAccepted    MessageType = "job_accepted"
	MsgJobResult      MessageType = "job_result"
	MsgServiceHealth  MessageType = "service_health"
	MsgMetricsRequest MessageType = "metrics_request" // WSS metrics request

	// Scheduler → Worker
	MsgJobAssign       MessageType = "job_assign"
	MsgNoWork          MessageType = "no_work" // nothing available right now
	MsgJobCancel       MessageType = "job_cancel"
	MsgPrewarmHint     MessageType = "prewarm_hint"
	MsgAck             MessageType = "ack"
	MsgMetricsResponse MessageType = "metrics_response" // WSS metrics response
)
|
|
|
|
// Heartbeat — liveness and slot status combined, no CPU/mem load
|
|
type HeartbeatPayload struct {
|
|
WorkerID string `json:"worker_id"`
|
|
Slots SlotStatus `json:"slots"`
|
|
}
|
|
|
|
type ReadyPayload struct {
|
|
WorkerID string `json:"worker_id"`
|
|
Slots SlotStatus `json:"slots"`
|
|
Reason string `json:"reason"`
|
|
}
|
|
|
|
// JobResultPayload describes the outcome of a task: its state, its exit
// code, and an optional error string that is omitted from JSON when empty.
//
// NOTE(review): presumably the body of a MsgJobResult message; the set of
// valid State values is not visible here.
type JobResultPayload struct {
	TaskID   string `json:"task_id"`
	State    string `json:"state"`
	ExitCode int    `json:"exit_code"`
	Error    string `json:"error,omitempty"`
}
|
|
|
|
// PrewarmHintPayload names a task and the snapshot associated with it.
// SnapshotSHA is optional and omitted from JSON when empty.
//
// NOTE(review): presumably the body of a MsgPrewarmHint message; confirm
// against the sender.
type PrewarmHintPayload struct {
	TaskID      string `json:"task_id"`
	SnapshotID  string `json:"snapshot_id"`
	SnapshotSHA string `json:"snapshot_sha,omitempty"`
}
|
|
|
|
type WorkerRegistration struct {
|
|
ID string `json:"id"`
|
|
Capabilities WorkerCapabilities `json:"capabilities"`
|
|
ActiveTasks []ActiveTaskReport `json:"active_tasks"`
|
|
}
|
|
|
|
// ActiveTaskReport describes one task by ID, state, and start time.
//
// NOTE(review): encoding/json's omitempty option never treats a struct value
// as empty, so it has no effect on StartedAt: a zero time.Time is still
// written to the JSON output. Receivers should use StartedAt.IsZero() to
// detect "not set".
type ActiveTaskReport struct {
	TaskID    string    `json:"task_id"`
	State     string    `json:"state"`
	StartedAt time.Time `json:"started_at,omitempty"`
}
|
|
|
|
// SlotStatus holds total and in-use slot counts for two pools, batch and
// service.
type SlotStatus struct {
	BatchTotal   int `json:"batch_total"`
	BatchInUse   int `json:"batch_in_use"`
	ServiceTotal int `json:"service_total"`
	ServiceInUse int `json:"service_in_use"`
}

// BatchAvailable returns BatchTotal minus BatchInUse. The result is not
// clamped: it is negative when BatchInUse exceeds BatchTotal.
func (s SlotStatus) BatchAvailable() int { return s.BatchTotal - s.BatchInUse }

// ServiceAvailable returns ServiceTotal minus ServiceInUse. The result is
// not clamped: it is negative when ServiceInUse exceeds ServiceTotal.
func (s SlotStatus) ServiceAvailable() int { return s.ServiceTotal - s.ServiceInUse }
|
|
|
|
type WorkerCapabilities struct {
|
|
GPUInfo GPUDetectionInfo `json:"gpu_info"`
|
|
GPUCount int `json:"gpu_count"`
|
|
GPUType string `json:"gpu_type"`
|
|
CPUCount int `json:"cpu_count"`
|
|
MemoryGB float64 `json:"memory_gb"`
|
|
Hostname string `json:"hostname"`
|
|
}
|
|
|
|
// GPUDetectionInfo records GPU type and count plus optional detail fields.
// Devices, Driver and MemTotal are omitted from JSON when empty or zero.
//
// NOTE(review): the unit of MemTotal and the format of the Devices entries
// are not visible here; confirm against the code that fills them in.
type GPUDetectionInfo struct {
	GPUType  string   `json:"gpu_type"`
	Count    int      `json:"count"`
	Devices  []string `json:"devices,omitempty"`
	Driver   string   `json:"driver,omitempty"`
	MemTotal uint64   `json:"mem_total,omitempty"`
}
|
|
|
|
type JobSpec struct {
|
|
ID string `json:"id"`
|
|
Type JobType `json:"type"` // "batch" | "service"
|
|
SlotPool string `json:"slot_pool"`
|
|
UserID string `json:"user_id,omitempty"` // NEW: for per-user quota tracking
|
|
|
|
GPUCount int `json:"gpu_count"`
|
|
GPUType string `json:"gpu_type,omitempty"`
|
|
NodeCount int `json:"node_count"`
|
|
|
|
// MaxRuntimeHours is the maximum wall-clock time for this job.
|
|
// 0 = default (24h), capped at 168h (7d) by the scheduler.
|
|
MaxRuntimeHours int `json:"max_runtime_hours,omitempty"`
|
|
|
|
Command []string `json:"command"`
|
|
Env map[string]string `json:"env"`
|
|
|
|
Prolog []string `json:"prolog,omitempty"`
|
|
Epilog []string `json:"epilog,omitempty"`
|
|
|
|
SnapshotID string `json:"snapshot_id,omitempty"`
|
|
SnapshotSHA string `json:"snapshot_sha,omitempty"`
|
|
HealthCheck *HealthCheck `json:"health_check,omitempty"`
|
|
Metadata map[string]string `json:"metadata,omitempty"`
|
|
}
|
|
|
|
// JobType classifies a job. Its string value is what appears in the "type"
// field of an encoded JobSpec.
type JobType string

// Known job types.
const (
	JobTypeBatch   JobType = "batch"
	JobTypeService JobType = "service"
)
|
|
|
|
// HealthCheck configures liveness and readiness endpoints and a check
// interval. Note that the JSON keys for the two endpoints are "liveness"
// and "readiness", not the Go field names.
//
// NOTE(review): IntervalSecs is presumably in seconds, and the endpoint
// format (path vs. full URL) is not visible here; confirm with the prober.
type HealthCheck struct {
	LivenessEndpoint  string `json:"liveness"`
	ReadinessEndpoint string `json:"readiness"`
	IntervalSecs      int    `json:"interval_secs"`
}
|
|
|
|
// ServiceHealthPayload reports whether a task is healthy, with an optional
// message that is omitted from JSON when empty.
//
// NOTE(review): presumably the body of a MsgServiceHealth message; confirm
// against the sender.
type ServiceHealthPayload struct {
	TaskID  string `json:"task_id"`
	Healthy bool   `json:"healthy"`
	Message string `json:"message,omitempty"`
}
|
|
|
|
// JobAssignPayload is sent from scheduler to worker when assigning a task.
|
|
type JobAssignPayload struct {
|
|
Spec JobSpec `json:"spec"`
|
|
RemainingTime time.Duration `json:"remaining_time"` // Wall-clock budget left
|
|
}
|