Some checks failed
Build Pipeline / Build Binaries (push) Failing after 1m59s
Build Pipeline / Build Docker Images (push) Has been skipped
Build Pipeline / Sign HIPAA Config (push) Has been skipped
Build Pipeline / Generate SLSA Provenance (push) Has been skipped
Checkout test / test (push) Successful in 5s
CI Pipeline / Test (ubuntu-latest on self-hosted) (push) Failing after 1s
CI Pipeline / Dev Compose Smoke Test (push) Has been skipped
CI Pipeline / Security Scan (push) Has been skipped
CI Pipeline / Test Scripts (push) Has been skipped
CI Pipeline / Test Native Libraries (push) Has been skipped
CI Pipeline / Native Library Build Matrix (push) Has been skipped
Documentation / build-and-publish (push) Failing after 35s
CI Pipeline / Trigger Build Workflow (push) Failing after 0s
Security Scan / Security Analysis (push) Has been cancelled
Security Scan / Native Library Security (push) Has been cancelled
Verification & Maintenance / V.1 - Schema Drift Detection (push) Has been cancelled
Verification & Maintenance / V.4 - Custom Go Vet Analyzers (push) Has been cancelled
Verification & Maintenance / V.7 - Audit Chain Integrity (push) Has been cancelled
Verification & Maintenance / V.6 - Extended Security Scanning (push) Has been cancelled
Verification & Maintenance / V.10 - OpenSSF Scorecard (push) Has been cancelled
Verification & Maintenance / Verification Summary (push) Has been cancelled
- Add plugin_quota.go with GPU quota management for scheduler
- Update scheduler hub and protocol for plugin support
- Add comprehensive plugin quota unit tests
- Update gang service and WebSocket queue integration tests
138 lines
4 KiB
Go
138 lines
4 KiB
Go
package scheduler
|
|
|
|
import (
|
|
"encoding/json"
|
|
"time"
|
|
)
|
|
|
|
type Message struct {
|
|
Type MessageType `json:"type"`
|
|
Payload json.RawMessage `json:"payload,omitempty"`
|
|
Error string `json:"error,omitempty"`
|
|
}
|
|
|
|
// MessageType is the string discriminator that names a kind of message.
type MessageType string

// Message kinds, grouped by the direction in which they travel.
const (
	// Worker → Scheduler
	MsgRegister       MessageType = "register"
	MsgHeartbeat      MessageType = "heartbeat" // slots only, every 10s
	MsgReadyForWork   MessageType = "ready_for_work"
	MsgJobAccepted    MessageType = "job_accepted"
	MsgJobResult      MessageType = "job_result"
	MsgServiceHealth  MessageType = "service_health"
	MsgMetricsRequest MessageType = "metrics_request" // WSS metrics request

	// Scheduler → Worker
	MsgJobAssign       MessageType = "job_assign"
	MsgNoWork          MessageType = "no_work" // nothing available right now
	MsgJobCancel       MessageType = "job_cancel"
	MsgPrewarmHint     MessageType = "prewarm_hint"
	MsgAck             MessageType = "ack"
	MsgMetricsResponse MessageType = "metrics_response" // WSS metrics response
)
|
|
|
|
// Heartbeat — liveness and slot status combined, no CPU/mem load
|
|
type HeartbeatPayload struct {
|
|
WorkerID string `json:"worker_id"`
|
|
Slots SlotStatus `json:"slots"`
|
|
}
|
|
|
|
type ReadyPayload struct {
|
|
WorkerID string `json:"worker_id"`
|
|
Slots SlotStatus `json:"slots"`
|
|
Reason string `json:"reason"`
|
|
}
|
|
|
|
// JobResultPayload reports the outcome of a task: the task's ID, a state
// string, an exit code, and an optional error description. Error is omitted
// from the JSON encoding when empty; ExitCode is always written, including 0.
type JobResultPayload struct {
	TaskID   string `json:"task_id"`
	State    string `json:"state"`
	ExitCode int    `json:"exit_code"`
	Error    string `json:"error,omitempty"`
}
|
|
|
|
// PrewarmHintPayload identifies a task and the snapshot associated with it.
// SnapshotSHA is optional and is omitted from the JSON encoding when empty.
//
// NOTE(review): presumably the body of a "prewarm_hint" message — confirm
// against the code that sends it.
type PrewarmHintPayload struct {
	TaskID      string `json:"task_id"`
	SnapshotID  string `json:"snapshot_id"`
	SnapshotSHA string `json:"snapshot_sha,omitempty"`
}
|
|
|
|
type WorkerRegistration struct {
|
|
ID string `json:"id"`
|
|
Capabilities WorkerCapabilities `json:"capabilities"`
|
|
ActiveTasks []ActiveTaskReport `json:"active_tasks"`
|
|
}
|
|
|
|
// ActiveTaskReport describes one task a worker reports as active: its ID, a
// state string, and the time it started.
//
// encoding/json never treats a struct value as empty, so the omitempty option
// on StartedAt has no effect by itself: an unset time would be written as
// "0001-01-01T00:00:00Z". The MarshalJSON method restores the intended
// behavior of leaving started_at out when the time is unset.
type ActiveTaskReport struct {
	TaskID    string    `json:"task_id"`
	State     string    `json:"state"`
	StartedAt time.Time `json:"started_at,omitempty"`
}

// MarshalJSON encodes the report under the same keys as the struct tags, but
// omits started_at when StartedAt is the zero time. Decoding is unaffected: a
// missing started_at leaves StartedAt at its zero value.
func (r ActiveTaskReport) MarshalJSON() ([]byte, error) {
	// A pointer field is used here because omitempty does apply to nil pointers.
	wire := struct {
		TaskID    string     `json:"task_id"`
		State     string     `json:"state"`
		StartedAt *time.Time `json:"started_at,omitempty"`
	}{TaskID: r.TaskID, State: r.State}
	if !r.StartedAt.IsZero() {
		wire.StartedAt = &r.StartedAt
	}
	return json.Marshal(wire)
}
|
|
|
|
// SlotStatus holds a worker's slot counters for two pools, batch and service:
// the total number of slots in each pool and how many are currently in use.
type SlotStatus struct {
	BatchTotal   int `json:"batch_total"`
	BatchInUse   int `json:"batch_in_use"`
	ServiceTotal int `json:"service_total"`
	ServiceInUse int `json:"service_in_use"`
}

// BatchAvailable returns the number of free batch slots, BatchTotal minus
// BatchInUse. The result is clamped to 0 so that an inconsistent report
// (in-use exceeding total) never yields a negative count.
func (s SlotStatus) BatchAvailable() int {
	if free := s.BatchTotal - s.BatchInUse; free > 0 {
		return free
	}
	return 0
}

// ServiceAvailable returns the number of free service slots, ServiceTotal
// minus ServiceInUse. The result is clamped to 0 so that an inconsistent
// report (in-use exceeding total) never yields a negative count.
func (s SlotStatus) ServiceAvailable() int {
	if free := s.ServiceTotal - s.ServiceInUse; free > 0 {
		return free
	}
	return 0
}
|
|
|
|
type WorkerCapabilities struct {
|
|
GPUInfo GPUDetectionInfo `json:"gpu_info"`
|
|
GPUCount int `json:"gpu_count"`
|
|
GPUType string `json:"gpu_type"`
|
|
CPUCount int `json:"cpu_count"`
|
|
MemoryGB float64 `json:"memory_gb"`
|
|
Hostname string `json:"hostname"`
|
|
}
|
|
|
|
// GPUDetectionInfo holds GPU details: the GPU type, how many were found, and
// optionally a list of device strings, a driver string, and a total memory
// figure. Devices, Driver, and MemTotal are omitted from the JSON encoding
// when empty or zero.
//
// NOTE(review): the unit of MemTotal is not stated in this file — confirm
// with the detection code.
type GPUDetectionInfo struct {
	GPUType  string   `json:"gpu_type"`
	Count    int      `json:"count"`
	Devices  []string `json:"devices,omitempty"`
	Driver   string   `json:"driver,omitempty"`
	MemTotal uint64   `json:"mem_total,omitempty"`
}
|
|
|
|
type JobSpec struct {
|
|
ID string `json:"id"`
|
|
Type JobType `json:"type"` // "batch" | "service"
|
|
SlotPool string `json:"slot_pool"`
|
|
UserID string `json:"user_id,omitempty"` // NEW: for per-user quota tracking
|
|
|
|
GPUCount int `json:"gpu_count"`
|
|
GPUType string `json:"gpu_type,omitempty"`
|
|
NodeCount int `json:"node_count"`
|
|
|
|
Command []string `json:"command"`
|
|
Env map[string]string `json:"env"`
|
|
|
|
Prolog []string `json:"prolog,omitempty"`
|
|
Epilog []string `json:"epilog,omitempty"`
|
|
|
|
SnapshotID string `json:"snapshot_id,omitempty"`
|
|
SnapshotSHA string `json:"snapshot_sha,omitempty"`
|
|
HealthCheck *HealthCheck `json:"health_check,omitempty"`
|
|
Metadata map[string]string `json:"metadata,omitempty"`
|
|
}
|
|
|
|
// JobType is the string that names a kind of job.
type JobType string

// The job kinds defined in this file.
const (
	JobTypeBatch   JobType = "batch"
	JobTypeService JobType = "service"
)
|
|
|
|
// HealthCheck holds health-probe settings: a liveness endpoint, a readiness
// endpoint, and a probe interval in seconds. Note that the JSON keys for the
// two endpoints are the short forms "liveness" and "readiness".
type HealthCheck struct {
	LivenessEndpoint  string `json:"liveness"`
	ReadinessEndpoint string `json:"readiness"`
	IntervalSecs      int    `json:"interval_secs"`
}
|
|
|
|
// ServiceHealthPayload reports whether a task is healthy, with an optional
// human-readable message that is omitted from the JSON encoding when empty.
//
// NOTE(review): presumably the body of a "service_health" message — confirm
// against the code that sends it.
type ServiceHealthPayload struct {
	TaskID  string `json:"task_id"`
	Healthy bool   `json:"healthy"`
	Message string `json:"message,omitempty"`
}
|