fetch_ml/internal/queue/task.go
Jeremie Fraeys a93b6715fd
feat: add native library bridge and queue integration
- Add native_queue.go with CGO bindings for queue operations
- Add native_queue_stub.go for non-CGO builds
- Add hash_selector to choose between Go and native implementations
- Add native_bridge_libs.go for CGO builds with native_libs tag
- Add native_bridge_nocgo.go stub for non-CGO builds
- Update queue errors and task handling for native integration
- Update worker config and runloop for native library support
2026-02-16 20:38:30 -05:00

118 lines
5 KiB
Go

package queue
import (
"time"
"github.com/jfraeys/fetch_ml/internal/config"
)
// DatasetSpec describes one dataset input to a task.
//
// Only Name is always present in the JSON encoding; the remaining provenance
// fields are optional and are omitted from the encoding when empty.
type DatasetSpec struct {
	// Name is the dataset name.
	Name string `json:"name"`

	// Version is an optional dataset version.
	Version string `json:"version,omitempty"`

	// Checksum is an optional checksum recorded for the dataset.
	Checksum string `json:"checksum,omitempty"`

	// URI is an optional location of the dataset.
	URI string `json:"uri,omitempty"`
}
// Task represents an ML experiment task.
//
// Every field carries a JSON tag. Fields tagged omitempty are dropped from
// the encoded form when they hold their zero value (empty string, 0, nil
// pointer, or empty slice/map).
type Task struct {
	ID      string `json:"id"`
	JobName string `json:"job_name"`
	Args    string `json:"args"`

	// Status holds the task state: queued, running, completed, failed.
	Status string `json:"status"`

	Priority  int64      `json:"priority"`
	CreatedAt time.Time  `json:"created_at"`
	StartedAt *time.Time `json:"started_at,omitempty"`
	EndedAt   *time.Time `json:"ended_at,omitempty"`
	WorkerID  string     `json:"worker_id,omitempty"`
	Error     string     `json:"error,omitempty"`
	Output    string     `json:"output,omitempty"`

	// SnapshotID names the snapshot associated with the task.
	// TODO(phase1): SnapshotID is an opaque identifier only.
	// TODO(phase2): Resolve SnapshotID and verify its checksum/digest before execution.
	SnapshotID string `json:"snapshot_id,omitempty"`

	// DatasetSpecs is the preferred, structured dataset input and should be
	// treated as authoritative.
	DatasetSpecs []DatasetSpec `json:"dataset_specs,omitempty"`

	// Datasets is retained for backward compatibility with legacy callers.
	Datasets []string `json:"datasets,omitempty"`

	Metadata map[string]string `json:"metadata,omitempty"`

	// Resource requests. All are optional; 0 means unspecified.
	CPU       int    `json:"cpu,omitempty"`
	MemoryGB  int    `json:"memory_gb,omitempty"`
	GPU       int    `json:"gpu,omitempty"`
	GPUMemory string `json:"gpu_memory,omitempty"`

	// UserID is the user who owns this task.
	UserID string `json:"user_id"`

	// Username is the username used for display.
	Username string `json:"username"`

	// CreatedBy is the user who submitted the task.
	CreatedBy string `json:"created_by"`

	// LeaseExpiry is when the task lease expires (lease management for task
	// resilience).
	LeaseExpiry *time.Time `json:"lease_expiry,omitempty"`

	// LeasedBy is the ID of the worker holding the lease.
	LeasedBy string `json:"leased_by,omitempty"`

	// RetryCount is the number of retry attempts made so far.
	RetryCount int `json:"retry_count"`

	// MaxRetries is the maximum retry limit. The intended default is 3.
	// NOTE(review): the default is not applied in this file; confirm where
	// it is set.
	MaxRetries int `json:"max_retries"`

	// LastError is the last error encountered.
	LastError string `json:"last_error,omitempty"`

	// NextRetry is when the task should next be retried (exponential
	// backoff; the backoff computation lives outside this file).
	NextRetry *time.Time `json:"next_retry,omitempty"`

	// Attempts is the complete history of all execution attempts.
	Attempts []Attempt `json:"attempts,omitempty"`

	// Tracking is the optional tracking configuration for this task.
	Tracking *TrackingConfig `json:"tracking,omitempty"`
}
// Attempt represents a single execution attempt of a task.
//
// Fields tagged omitempty are left out of the JSON encoding when they hold
// their zero value, so an ExitCode of 0 is not emitted.
type Attempt struct {
	// Attempt is the attempt number (1-indexed).
	Attempt int `json:"attempt"`

	// StartedAt is when the attempt started.
	StartedAt time.Time `json:"started_at"`

	// EndedAt is when the attempt ended, if it has completed.
	EndedAt *time.Time `json:"ended_at,omitempty"`

	// WorkerID is the worker that ran this attempt.
	WorkerID string `json:"worker_id,omitempty"`

	// Status holds the attempt state: running, completed, failed.
	Status string `json:"status"`

	// FailureClass is the failure classification, if the attempt failed.
	FailureClass FailureClass `json:"failure_class,omitempty"`

	// ExitCode is the process exit code.
	ExitCode int `json:"exit_code,omitempty"`

	// Signal is the termination signal, if any.
	Signal string `json:"signal,omitempty"`

	// Error is the error message, if the attempt failed.
	Error string `json:"error,omitempty"`

	// LogTail is the last N lines of log output.
	LogTail string `json:"log_tail,omitempty"`
}
// TrackingConfig specifies which experiment tracking tools to enable for a
// task. Each tool has its own optional sub-configuration; a nil pointer is
// omitted from the JSON encoding.
type TrackingConfig struct {
	// MLflow is the MLflow integration configuration.
	MLflow *MLflowTrackingConfig `json:"mlflow,omitempty"`

	// TensorBoard is the TensorBoard integration configuration.
	TensorBoard *TensorBoardTrackingConfig `json:"tensorboard,omitempty"`

	// Wandb is the Weights & Biases integration configuration.
	Wandb *WandbTrackingConfig `json:"wandb,omitempty"`
}
// MLflowTrackingConfig controls MLflow integration.
type MLflowTrackingConfig struct {
	// Enabled turns the integration on or off; it is always present in the
	// JSON encoding.
	Enabled bool `json:"enabled"`

	// Mode is one of "sidecar", "remote" or "disabled".
	Mode string `json:"mode,omitempty"`

	// TrackingURI is the explicit tracking URI for remote mode.
	TrackingURI string `json:"tracking_uri,omitempty"`
}
// TensorBoardTrackingConfig controls TensorBoard integration.
type TensorBoardTrackingConfig struct {
	// Enabled turns the integration on or off; it is always present in the
	// JSON encoding.
	Enabled bool `json:"enabled"`

	// Mode is one of "sidecar" or "disabled".
	Mode string `json:"mode,omitempty"`
}
// WandbTrackingConfig controls Weights & Biases integration.
type WandbTrackingConfig struct {
	// Enabled turns the integration on or off; it is always present in the
	// JSON encoding.
	Enabled bool `json:"enabled"`

	// Mode is one of "remote" or "disabled".
	Mode string `json:"mode,omitempty"`

	// APIKey is written to the JSON encoding under "api_key" whenever it is
	// non-empty.
	// NOTE(review): this means the key travels with the serialized config;
	// confirm the encoded form is never logged or exposed.
	APIKey string `json:"api_key,omitempty"`

	// Project is the optional project name.
	Project string `json:"project,omitempty"`

	// Entity is the optional entity name.
	Entity string `json:"entity,omitempty"`
}
// Redis key and prefix names, aliased from the config package so that queue
// code can refer to them without the config qualifier. They are declared as
// package-level variables, each initialized from its config counterpart.
var (
	// Task-related keys and prefixes.
	TaskQueueKey     = config.RedisTaskQueueKey
	TaskPrefix       = config.RedisTaskPrefix
	TaskStatusPrefix = config.RedisTaskStatusPrefix

	// Worker-related keys.
	WorkerHeartbeat  = config.RedisWorkerHeartbeat
	WorkerPrewarmKey = config.RedisWorkerPrewarmKey

	// Job metrics prefix.
	JobMetricsPrefix = config.RedisJobMetricsPrefix
)