fetch_ml/internal/domain/task.go
Jeremie Fraeys a4f2c36069
feat: enhance task domain and scheduler protocol
- Update task domain model
- Improve scheduler hub and priority queue
- Enhance protocol definitions
- Update manifest schema and run handling
2026-03-04 13:23:38 -05:00

67 lines
3 KiB
Go

// Package domain provides core domain types for fetch_ml.
// These types have zero internal dependencies and are used across all packages.
package domain
import (
"time"
)
// Task represents an ML experiment task.
//
// The declaration order of the fields is also the key order that
// encoding/json produces when a Task is marshalled, so keep it stable.
// NOTE(review): the ordering looks alignment-driven — confirm before regrouping.
type Task struct {
	// Timestamps, metadata and tracking settings. Every field in this group
	// except CreatedAt is dropped from the JSON encoding while nil or empty.
	CreatedAt   time.Time         `json:"created_at"`
	Metadata    map[string]string `json:"metadata,omitempty"`
	EndedAt     *time.Time        `json:"ended_at,omitempty"`
	Tracking    *TrackingConfig   `json:"tracking,omitempty"`
	NextRetry   *time.Time        `json:"next_retry,omitempty"`
	LeaseExpiry *time.Time        `json:"lease_expiry,omitempty"`
	StartedAt   *time.Time        `json:"started_at,omitempty"`
	// String attributes. Those tagged omitempty are left out of the JSON
	// encoding when they hold the empty string; the others are always emitted.
	Username   string `json:"username"`
	LeasedBy   string `json:"leased_by,omitempty"`
	Error      string `json:"error,omitempty"`
	Output     string `json:"output,omitempty"`
	SnapshotID string `json:"snapshot_id,omitempty"`
	Status     string `json:"status"`
	LastError  string `json:"last_error,omitempty"`
	ID         string `json:"id"`
	Args       string `json:"args"`
	WorkerID   string `json:"worker_id,omitempty"`
	JobName    string `json:"job_name"`
	GPUMemory  string `json:"gpu_memory,omitempty"`
	UserID     string `json:"user_id"`
	CreatedBy  string `json:"created_by"`
	// Slice-valued fields; each one is omitted from JSON when it is empty.
	Datasets     []string      `json:"datasets,omitempty"`
	Attempts     []Attempt     `json:"attempts,omitempty"`
	DatasetSpecs []DatasetSpec `json:"dataset_specs,omitempty"`
	// Integer fields. MemoryGB, CPU and GPU are omitted from JSON when zero;
	// RetryCount, MaxRetries and Priority are always emitted.
	MemoryGB   int   `json:"memory_gb,omitempty"`
	CPU        int   `json:"cpu,omitempty"`
	GPU        int   `json:"gpu,omitempty"`
	RetryCount int   `json:"retry_count"`
	MaxRetries int   `json:"max_retries"`
	Priority   int64 `json:"priority"`
	// FirstAssignedAt is set once when the task is first assigned to a worker.
	// It never changes, even on re-queue after worker failure.
	//
	// The tag uses omitzero rather than omitempty: encoding/json never treats
	// a struct value such as time.Time as "empty", so omitempty had no effect
	// and an unassigned task serialized the year-1 zero timestamp. omitzero
	// (Go 1.24+) consults time.Time.IsZero; older toolchains ignore the
	// unknown option and keep the previous output.
	FirstAssignedAt time.Time `json:"first_assigned_at,omitzero"`
	// MaxRuntime is the cached computed value from JobSpec.MaxRuntimeHours.
	// 0 means use default (24h), capped at 168h (7d).
	MaxRuntime time.Duration `json:"max_runtime,omitempty"`
	// RemainingTime is the wall-clock budget left when assigned to a worker.
	// Set by the scheduler on assignment.
	RemainingTime time.Duration `json:"remaining_time,omitempty"`
}
// Attempt represents a single execution attempt of a task.
//
// Fields are marshalled to JSON in declaration order. Fields tagged
// omitempty are left out of the encoding when they hold their empty value.
type Attempt struct {
	// Time bounds of the attempt. StartedAt is always emitted; EndedAt is
	// omitted from JSON while it is nil.
	StartedAt time.Time  `json:"started_at"`
	EndedAt   *time.Time `json:"ended_at,omitempty"`
	// Descriptive fields. Only Status is always emitted; the remaining ones
	// are optional in the JSON form.
	// NOTE(review): FailureClass is declared elsewhere; whether omitempty can
	// ever drop it depends on its underlying type — confirm.
	WorkerID     string       `json:"worker_id,omitempty"`
	Status       string       `json:"status"`
	FailureClass FailureClass `json:"failure_class,omitempty"`
	Signal       string       `json:"signal,omitempty"`
	Error        string       `json:"error,omitempty"`
	LogTail      string       `json:"log_tail,omitempty"`
	// Attempt is always emitted (presumably the ordinal of this attempt —
	// confirm whether it is 0- or 1-based). ExitCode is omitted from JSON
	// when it is 0, so a zero exit code and an unset one encode identically.
	Attempt  int `json:"attempt"`
	ExitCode int `json:"exit_code,omitempty"`
}