// fetch_ml/internal/scheduler/protocol.go

package scheduler

import (
	"encoding/json"
	"time"
)

// Message is the JSON envelope shared by every protocol message.
//
// Type names the kind of message. Payload holds the message body as raw,
// still-encoded JSON (json.RawMessage), so decoding into a concrete payload
// struct is left to whoever inspects Type. Error carries an error string.
// Payload and Error are omitted from the encoded JSON when empty.
type Message struct {
	Type    MessageType     `json:"type"`
	Payload json.RawMessage `json:"payload,omitempty"`
	Error   string          `json:"error,omitempty"`
}

// MessageType is the string discriminator carried in Message.Type.
type MessageType string

// Message types sent from a worker to the scheduler.
const (
	MsgRegister       MessageType = "register"
	MsgHeartbeat      MessageType = "heartbeat" // slots only, every 10s
	MsgReadyForWork   MessageType = "ready_for_work"
	MsgJobAccepted    MessageType = "job_accepted"
	MsgJobResult      MessageType = "job_result"
	MsgServiceHealth  MessageType = "service_health"
	MsgMetricsRequest MessageType = "metrics_request" // WSS metrics request
)

// Message types sent from the scheduler to a worker.
const (
	MsgJobAssign       MessageType = "job_assign"
	MsgNoWork          MessageType = "no_work" // nothing available right now
	MsgJobCancel       MessageType = "job_cancel"
	MsgPrewarmHint     MessageType = "prewarm_hint"
	MsgAck             MessageType = "ack"
	MsgMetricsResponse MessageType = "metrics_response" // WSS metrics response
)

// HeartbeatPayload is the heartbeat body: it identifies the worker and reports
// its slot usage. Liveness and slot status are combined in one message; no
// CPU or memory load figures are included.
type HeartbeatPayload struct {
	WorkerID string     `json:"worker_id"`
	Slots    SlotStatus `json:"slots"`
}

// ReadyPayload carries a worker's ID, its current slot status, and a
// free-form Reason string.
//
// NOTE(review): presumably the body of a MsgReadyForWork message, with Reason
// explaining why the worker is asking for work — confirm against the sender.
type ReadyPayload struct {
	WorkerID string     `json:"worker_id"`
	Slots    SlotStatus `json:"slots"`
	Reason   string     `json:"reason"`
}

// JobResultPayload reports the outcome of a task: its ID, a State string, the
// exit code, and an optional Error string that is omitted from the JSON when
// empty.
//
// NOTE(review): presumably the body of a MsgJobResult message; the set of
// valid State values is not defined in this file — confirm with the producer.
type JobResultPayload struct {
	TaskID   string `json:"task_id"`
	State    string `json:"state"`
	ExitCode int    `json:"exit_code"`
	Error    string `json:"error,omitempty"`
}

// PrewarmHintPayload names a task and the snapshot associated with it.
// SnapshotSHA is optional and omitted from the JSON when empty.
//
// NOTE(review): presumably the body of a MsgPrewarmHint message, letting a
// worker fetch the snapshot ahead of assignment — confirm against the handler.
type PrewarmHintPayload struct {
	TaskID      string `json:"task_id"`
	SnapshotID  string `json:"snapshot_id"`
	SnapshotSHA string `json:"snapshot_sha,omitempty"`
}

// WorkerRegistration describes a worker: its ID, its hardware capabilities,
// and the tasks it reports as active.
//
// ActiveTasks has no omitempty option, so a nil slice is encoded as JSON null
// rather than being dropped.
//
// NOTE(review): presumably the body of a MsgRegister message — confirm.
type WorkerRegistration struct {
	ID           string             `json:"id"`
	Capabilities WorkerCapabilities `json:"capabilities"`
	ActiveTasks  []ActiveTaskReport `json:"active_tasks"`
}

// ActiveTaskReport describes a single task a worker reports as active: the
// task ID, a State string, and the time the task started.
//
// StartedAt is optional on the wire: when it is the zero time the
// "started_at" key is left out of the encoded JSON (see MarshalJSON).
type ActiveTaskReport struct {
	TaskID    string    `json:"task_id"`
	State     string    `json:"state"`
	StartedAt time.Time `json:"started_at,omitempty"`
}

// MarshalJSON encodes the report, omitting "started_at" when StartedAt is the
// zero time.
//
// encoding/json's omitempty option never omits struct values such as
// time.Time, so without this method an unset StartedAt would be sent as
// "0001-01-01T00:00:00Z". Decoding is unaffected: a missing "started_at"
// leaves StartedAt at its zero value.
func (r ActiveTaskReport) MarshalJSON() ([]byte, error) {
	// plain has the same fields as ActiveTaskReport but none of its methods,
	// so marshaling it cannot recurse back into this MarshalJSON.
	type plain ActiveTaskReport

	// The outer StartedAt shadows the embedded one (shallower fields win in
	// encoding/json), and as a pointer it honours omitempty when nil.
	wire := struct {
		plain
		StartedAt *time.Time `json:"started_at,omitempty"`
	}{plain: plain(r)}

	if !r.StartedAt.IsZero() {
		started := r.StartedAt
		wire.StartedAt = &started
	}
	return json.Marshal(wire)
}

// SlotStatus counts a worker's slots in two pools, batch and service, as a
// total and an in-use figure for each.
type SlotStatus struct {
	BatchTotal   int `json:"batch_total"`
	BatchInUse   int `json:"batch_in_use"`
	ServiceTotal int `json:"service_total"`
	ServiceInUse int `json:"service_in_use"`
}

// BatchAvailable returns the number of batch slots not in use, computed as
// BatchTotal minus BatchInUse. The result is not clamped, so it is negative
// when BatchInUse exceeds BatchTotal.
func (s SlotStatus) BatchAvailable() int {
	free := s.BatchTotal - s.BatchInUse
	return free
}

// ServiceAvailable returns the number of service slots not in use, computed
// as ServiceTotal minus ServiceInUse. The result is not clamped, so it is
// negative when ServiceInUse exceeds ServiceTotal.
func (s SlotStatus) ServiceAvailable() int {
	free := s.ServiceTotal - s.ServiceInUse
	return free
}

// WorkerCapabilities describes a worker's hardware: GPU details, CPU count,
// memory in gigabytes, and hostname.
//
// NOTE(review): GPUCount and GPUType sit alongside GPUInfo, which has its own
// Count and GPUType fields; which one is authoritative when they disagree is
// not visible in this file — confirm with the code that fills them in.
type WorkerCapabilities struct {
	GPUInfo  GPUDetectionInfo `json:"gpu_info"`
	GPUCount int              `json:"gpu_count"`
	GPUType  string           `json:"gpu_type"`
	CPUCount int              `json:"cpu_count"`
	MemoryGB float64          `json:"memory_gb"`
	Hostname string           `json:"hostname"`
}

// GPUDetectionInfo holds GPU details: a type string, a device count, and
// optionally a list of device strings, a driver string, and a total memory
// figure. Devices, Driver, and MemTotal are omitted from the JSON when empty
// or zero.
//
// NOTE(review): the unit of MemTotal (bytes? MiB?) and the format of the
// Devices entries are not stated here — confirm with the detection code.
type GPUDetectionInfo struct {
	GPUType  string   `json:"gpu_type"`
	Count    int      `json:"count"`
	Devices  []string `json:"devices,omitempty"`
	Driver   string   `json:"driver,omitempty"`
	MemTotal uint64   `json:"mem_total,omitempty"`
}

// JobSpec describes a job: its identity and kind, the resources it asks for,
// what to run, and optional snapshot, health-check, and metadata settings.
//
// Field groups:
//   - ID, Type, SlotPool: job identity, job kind (batch or service), and a
//     slot pool name.
//   - GPUCount, GPUType, NodeCount: requested resources; GPUType is optional.
//   - Command, Env: the command line and environment variables. Env has no
//     omitempty option, so a nil map is encoded as JSON null.
//   - Prolog, Epilog: optional command lists. NOTE(review): presumably run
//     before and after Command — confirm with the executor.
//   - SnapshotID, SnapshotSHA, HealthCheck, Metadata: optional; each is
//     omitted from the JSON when empty or nil.
//
// NOTE(review): how SlotPool relates to Type is not visible in this file.
type JobSpec struct {
	ID       string  `json:"id"`
	Type     JobType `json:"type"` // "batch" | "service"
	SlotPool string  `json:"slot_pool"`

	GPUCount  int    `json:"gpu_count"`
	GPUType   string `json:"gpu_type,omitempty"`
	NodeCount int    `json:"node_count"`

	Command []string          `json:"command"`
	Env     map[string]string `json:"env"`

	Prolog []string `json:"prolog,omitempty"`
	Epilog []string `json:"epilog,omitempty"`

	SnapshotID  string            `json:"snapshot_id,omitempty"`
	SnapshotSHA string            `json:"snapshot_sha,omitempty"`
	HealthCheck *HealthCheck      `json:"health_check,omitempty"`
	Metadata    map[string]string `json:"metadata,omitempty"`
}

// JobType is the string that names a job's kind.
type JobType string

// The job kinds defined by this package.
const (
	JobTypeBatch   = JobType("batch")
	JobTypeService = JobType("service")
)

// HealthCheck holds a liveness endpoint, a readiness endpoint, and a check
// interval. On the wire the endpoints use the shortened keys "liveness" and
// "readiness".
//
// NOTE(review): IntervalSecs is presumably a number of seconds, going by its
// name; the endpoint format (URL, path, host:port) is not stated here —
// confirm with the code that performs the checks.
type HealthCheck struct {
	LivenessEndpoint  string `json:"liveness"`
	ReadinessEndpoint string `json:"readiness"`
	IntervalSecs      int    `json:"interval_secs"`
}

// ServiceHealthPayload reports whether a task is healthy, with an optional
// Message that is omitted from the JSON when empty.
//
// NOTE(review): presumably the body of a MsgServiceHealth message — confirm.
type ServiceHealthPayload struct {
	TaskID  string `json:"task_id"`
	Healthy bool   `json:"healthy"`
	Message string `json:"message,omitempty"`
}