fetch_ml/internal/scheduler/protocol.go
Jeremie Fraeys da104367d6
Some checks failed
Build Pipeline / Build Binaries (push) Failing after 1m59s
Build Pipeline / Build Docker Images (push) Has been skipped
Build Pipeline / Sign HIPAA Config (push) Has been skipped
Build Pipeline / Generate SLSA Provenance (push) Has been skipped
Checkout test / test (push) Successful in 5s
CI Pipeline / Test (ubuntu-latest on self-hosted) (push) Failing after 1s
CI Pipeline / Dev Compose Smoke Test (push) Has been skipped
CI Pipeline / Security Scan (push) Has been skipped
CI Pipeline / Test Scripts (push) Has been skipped
CI Pipeline / Test Native Libraries (push) Has been skipped
CI Pipeline / Native Library Build Matrix (push) Has been skipped
Documentation / build-and-publish (push) Failing after 35s
CI Pipeline / Trigger Build Workflow (push) Failing after 0s
Security Scan / Security Analysis (push) Has been cancelled
Security Scan / Native Library Security (push) Has been cancelled
Verification & Maintenance / V.1 - Schema Drift Detection (push) Has been cancelled
Verification & Maintenance / V.4 - Custom Go Vet Analyzers (push) Has been cancelled
Verification & Maintenance / V.7 - Audit Chain Integrity (push) Has been cancelled
Verification & Maintenance / V.6 - Extended Security Scanning (push) Has been cancelled
Verification & Maintenance / V.10 - OpenSSF Scorecard (push) Has been cancelled
Verification & Maintenance / Verification Summary (push) Has been cancelled
feat: add Plugin GPU Quota implementation and tests
- Add plugin_quota.go with GPU quota management for scheduler

- Update scheduler hub and protocol for plugin support

- Add comprehensive plugin quota unit tests

- Update gang service and WebSocket queue integration tests
2026-02-26 14:35:05 -05:00

138 lines
4 KiB
Go

package scheduler
import (
"encoding/json"
"time"
)
// Message is the JSON envelope of the scheduler protocol: a type tag plus an
// optional raw payload and an optional error string.
type Message struct {
// Type is the message's type tag (see the Msg* constants).
Type MessageType `json:"type"`
// Payload is kept as undecoded JSON so it can be decoded later; it is
// omitted from the encoded message when empty.
Payload json.RawMessage `json:"payload,omitempty"`
// Error is an optional error string, omitted from the encoded message when empty.
Error string `json:"error,omitempty"`
}
// MessageType is the string tag stored in Message.Type.
type MessageType string
// Message type tags, grouped by the direction in which they travel.
const (
// Worker → Scheduler
MsgRegister MessageType = "register"
MsgHeartbeat MessageType = "heartbeat" // slots only, every 10s
MsgReadyForWork MessageType = "ready_for_work"
MsgJobAccepted MessageType = "job_accepted"
MsgJobResult MessageType = "job_result"
MsgServiceHealth MessageType = "service_health"
MsgMetricsRequest MessageType = "metrics_request" // WSS metrics request
// Scheduler → Worker
MsgJobAssign MessageType = "job_assign"
MsgNoWork MessageType = "no_work" // nothing available right now
MsgJobCancel MessageType = "job_cancel"
MsgPrewarmHint MessageType = "prewarm_hint"
MsgAck MessageType = "ack"
MsgMetricsResponse MessageType = "metrics_response" // WSS metrics response
)
// HeartbeatPayload combines liveness and slot status in a single message; it
// carries no CPU/memory load figures.
// NOTE(review): presumably the payload of a MsgHeartbeat message — confirm
// against the code that decodes it.
type HeartbeatPayload struct {
// WorkerID identifies the worker the heartbeat refers to.
WorkerID string `json:"worker_id"`
// Slots is the worker's slot totals and usage.
Slots SlotStatus `json:"slots"`
}
// ReadyPayload carries a worker ID, that worker's slot status, and a reason string.
// NOTE(review): presumably the payload of a MsgReadyForWork message — confirm
// against the code that decodes it.
type ReadyPayload struct {
WorkerID string `json:"worker_id"`
Slots SlotStatus `json:"slots"`
// Reason is a plain string; its allowed values are not defined in this file.
Reason string `json:"reason"`
}
// JobResultPayload reports the outcome of a task: its ID, a state string, an
// exit code, and an optional error string.
// NOTE(review): presumably the payload of a MsgJobResult message — confirm
// against the code that decodes it.
type JobResultPayload struct {
TaskID string `json:"task_id"`
// State is a plain string; its allowed values are not defined in this file.
State string `json:"state"`
// ExitCode is always encoded, including when it is 0.
ExitCode int `json:"exit_code"`
// Error is omitted from the encoded message when empty.
Error string `json:"error,omitempty"`
}
// PrewarmHintPayload names a task and the snapshot associated with it.
// NOTE(review): presumably the payload of a MsgPrewarmHint message — confirm
// against the code that sends it.
type PrewarmHintPayload struct {
TaskID string `json:"task_id"`
SnapshotID string `json:"snapshot_id"`
// SnapshotSHA is optional and omitted from the encoded message when empty.
SnapshotSHA string `json:"snapshot_sha,omitempty"`
}
// WorkerRegistration describes a worker: its ID, its capabilities, and the
// tasks it reports as active.
// NOTE(review): presumably the payload of a MsgRegister message — confirm
// against the code that decodes it.
type WorkerRegistration struct {
ID string `json:"id"`
Capabilities WorkerCapabilities `json:"capabilities"`
// ActiveTasks has no omitempty option, so a nil slice is encoded as JSON null.
ActiveTasks []ActiveTaskReport `json:"active_tasks"`
}
// ActiveTaskReport describes one task that a worker reports as active; it is
// the element type of WorkerRegistration.ActiveTasks.
type ActiveTaskReport struct {
	// TaskID identifies the reported task.
	TaskID string `json:"task_id"`
	// State is a plain string; its allowed values are not defined in this file.
	State string `json:"state"`
	// StartedAt is the task's start time and is left out of the JSON when unset.
	//
	// The tag uses omitzero instead of omitempty: encoding/json never treats a
	// struct value such as time.Time as "empty", so omitempty had no effect and
	// an unset time was always sent as "0001-01-01T00:00:00Z". omitzero
	// (Go 1.24+) drops the field when StartedAt.IsZero() is true; older
	// toolchains ignore the unknown option and keep the previous output.
	StartedAt time.Time `json:"started_at,omitzero"`
}
// SlotStatus holds total and in-use slot counts for batch and service work.
// The struct itself does not validate the counts.
type SlotStatus struct {
	BatchTotal   int `json:"batch_total"`
	BatchInUse   int `json:"batch_in_use"`
	ServiceTotal int `json:"service_total"`
	ServiceInUse int `json:"service_in_use"`
}

// BatchAvailable returns the number of free batch slots
// (BatchTotal - BatchInUse), clamped at zero so that an inconsistent report
// with more slots in use than exist never yields a negative availability.
func (s SlotStatus) BatchAvailable() int {
	if free := s.BatchTotal - s.BatchInUse; free > 0 {
		return free
	}
	return 0
}

// ServiceAvailable returns the number of free service slots
// (ServiceTotal - ServiceInUse), clamped at zero for the same reason as
// BatchAvailable.
func (s SlotStatus) ServiceAvailable() int {
	if free := s.ServiceTotal - s.ServiceInUse; free > 0 {
		return free
	}
	return 0
}
// WorkerCapabilities describes a worker's hardware: GPU details, CPU count,
// memory, and hostname.
// NOTE(review): GPUInfo also carries a GPU type and count; which of the two
// sets of GPU fields is authoritative is not visible in this file.
type WorkerCapabilities struct {
GPUInfo GPUDetectionInfo `json:"gpu_info"`
GPUCount int `json:"gpu_count"`
GPUType string `json:"gpu_type"`
CPUCount int `json:"cpu_count"`
// MemoryGB is memory in gigabytes, per the field name.
MemoryGB float64 `json:"memory_gb"`
Hostname string `json:"hostname"`
}
// GPUDetectionInfo holds GPU details: type, count, and optional device list,
// driver string, and total memory.
type GPUDetectionInfo struct {
GPUType string `json:"gpu_type"`
Count int `json:"count"`
// Devices, Driver and MemTotal are omitted from the encoded message when
// empty or zero.
Devices []string `json:"devices,omitempty"`
Driver string `json:"driver,omitempty"`
// NOTE(review): the unit of MemTotal is not stated in this file — confirm
// against the code that fills it in.
MemTotal uint64 `json:"mem_total,omitempty"`
}
// JobSpec describes a job: its identity and type, resource requests, the
// command and environment to run, and optional snapshot, health-check and
// metadata information.
type JobSpec struct {
ID string `json:"id"`
Type JobType `json:"type"` // "batch" | "service"
SlotPool string `json:"slot_pool"`
UserID string `json:"user_id,omitempty"` // for per-user quota tracking
GPUCount int `json:"gpu_count"`
GPUType string `json:"gpu_type,omitempty"`
NodeCount int `json:"node_count"`
// Command and Env have no omitempty option, so nil values are encoded as
// JSON null.
Command []string `json:"command"`
Env map[string]string `json:"env"`
// NOTE(review): Prolog and Epilog are presumably commands run before and
// after Command — confirm against the worker that executes the job.
Prolog []string `json:"prolog,omitempty"`
Epilog []string `json:"epilog,omitempty"`
SnapshotID string `json:"snapshot_id,omitempty"`
SnapshotSHA string `json:"snapshot_sha,omitempty"`
// HealthCheck is optional; a nil pointer is omitted from the encoded message.
HealthCheck *HealthCheck `json:"health_check,omitempty"`
Metadata map[string]string `json:"metadata,omitempty"`
}
// JobType is the string tag stored in JobSpec.Type.
type JobType string
// The job types declared by this file.
const (
JobTypeBatch JobType = "batch"
JobTypeService JobType = "service"
)
// HealthCheck holds a liveness endpoint, a readiness endpoint, and a check
// interval. It is referenced from JobSpec.HealthCheck.
type HealthCheck struct {
// LivenessEndpoint is encoded under the JSON key "liveness".
LivenessEndpoint string `json:"liveness"`
// ReadinessEndpoint is encoded under the JSON key "readiness".
ReadinessEndpoint string `json:"readiness"`
// IntervalSecs is the interval in seconds, per the field name.
IntervalSecs int `json:"interval_secs"`
}
// ServiceHealthPayload reports whether a task is healthy, with an optional message.
// NOTE(review): presumably the payload of a MsgServiceHealth message —
// confirm against the code that decodes it.
type ServiceHealthPayload struct {
TaskID string `json:"task_id"`
// Healthy is always encoded, including when false.
Healthy bool `json:"healthy"`
// Message is omitted from the encoded message when empty.
Message string `json:"message,omitempty"`
}