101 lines
4 KiB
Go
101 lines
4 KiB
Go
package queue
|
|
|
|
import (
|
|
"time"
|
|
|
|
"github.com/jfraeys/fetch_ml/internal/config"
|
|
)
|
|
|
|
// DatasetSpec describes a single dataset input with optional provenance
// fields. Only Name is always serialized; the other fields are dropped from the
// JSON encoding when they are empty.
type DatasetSpec struct {
	// Name identifies the dataset and is always emitted as "name".
	Name string `json:"name"`
	// Version is an optional version label for the dataset.
	Version string `json:"version,omitempty"`
	// Checksum is an optional checksum string. The expected algorithm and
	// encoding are not defined by this type.
	Checksum string `json:"checksum,omitempty"`
	// URI is an optional location for the dataset.
	URI string `json:"uri,omitempty"`
}
|
|
|
|
// Task represents an ML experiment task
|
|
type Task struct {
|
|
ID string `json:"id"`
|
|
JobName string `json:"job_name"`
|
|
Args string `json:"args"`
|
|
Status string `json:"status"` // queued, running, completed, failed
|
|
Priority int64 `json:"priority"`
|
|
CreatedAt time.Time `json:"created_at"`
|
|
StartedAt *time.Time `json:"started_at,omitempty"`
|
|
EndedAt *time.Time `json:"ended_at,omitempty"`
|
|
WorkerID string `json:"worker_id,omitempty"`
|
|
Error string `json:"error,omitempty"`
|
|
Output string `json:"output,omitempty"`
|
|
// TODO(phase1): SnapshotID is an opaque identifier only.
|
|
// TODO(phase2): Resolve SnapshotID and verify its checksum/digest before execution.
|
|
SnapshotID string `json:"snapshot_id,omitempty"`
|
|
// DatasetSpecs is the preferred structured dataset input and should be authoritative.
|
|
DatasetSpecs []DatasetSpec `json:"dataset_specs,omitempty"`
|
|
// Datasets is kept for backward compatibility (legacy callers).
|
|
Datasets []string `json:"datasets,omitempty"`
|
|
Metadata map[string]string `json:"metadata,omitempty"`
|
|
|
|
// Resource requests (optional, 0 means unspecified)
|
|
CPU int `json:"cpu,omitempty"`
|
|
MemoryGB int `json:"memory_gb,omitempty"`
|
|
GPU int `json:"gpu,omitempty"`
|
|
GPUMemory string `json:"gpu_memory,omitempty"`
|
|
|
|
// User ownership and permissions
|
|
UserID string `json:"user_id"` // User who owns this task
|
|
Username string `json:"username"` // Username for display
|
|
CreatedBy string `json:"created_by"` // User who submitted the task
|
|
|
|
// Lease management for task resilience
|
|
LeaseExpiry *time.Time `json:"lease_expiry,omitempty"` // When task lease expires
|
|
LeasedBy string `json:"leased_by,omitempty"` // Worker ID holding lease
|
|
|
|
// Retry management
|
|
RetryCount int `json:"retry_count"` // Number of retry attempts made
|
|
MaxRetries int `json:"max_retries"` // Maximum retry limit (default 3)
|
|
LastError string `json:"last_error,omitempty"` // Last error encountered
|
|
NextRetry *time.Time `json:"next_retry,omitempty"` // When to retry next (exponential backoff)
|
|
|
|
// Optional tracking configuration for this task
|
|
Tracking *TrackingConfig `json:"tracking,omitempty"`
|
|
}
|
|
|
|
// TrackingConfig specifies experiment tracking tools to enable for a task.
|
|
type TrackingConfig struct {
|
|
MLflow *MLflowTrackingConfig `json:"mlflow,omitempty"`
|
|
TensorBoard *TensorBoardTrackingConfig `json:"tensorboard,omitempty"`
|
|
Wandb *WandbTrackingConfig `json:"wandb,omitempty"`
|
|
}
|
|
|
|
// MLflowTrackingConfig controls MLflow integration.
type MLflowTrackingConfig struct {
	// Enabled turns the integration on or off. It is always serialized, so a
	// false value appears explicitly as "enabled":false.
	Enabled bool `json:"enabled"`
	// Mode is one of "sidecar", "remote" or "disabled". The value is not
	// validated by this type.
	Mode string `json:"mode,omitempty"`
	// TrackingURI is the explicit tracking URI for remote mode.
	TrackingURI string `json:"tracking_uri,omitempty"`
}
|
|
|
|
// TensorBoardTrackingConfig controls TensorBoard integration.
type TensorBoardTrackingConfig struct {
	// Enabled turns the integration on or off. It is always serialized, so a
	// false value appears explicitly as "enabled":false.
	Enabled bool `json:"enabled"`
	// Mode is one of "sidecar" or "disabled". The value is not validated by
	// this type.
	Mode string `json:"mode,omitempty"`
}
|
|
|
|
// WandbTrackingConfig controls Weights & Biases integration.
type WandbTrackingConfig struct {
	// Enabled turns the integration on or off. It is always serialized, so a
	// false value appears explicitly as "enabled":false.
	Enabled bool `json:"enabled"`
	// Mode is one of "remote" or "disabled". The value is not validated by
	// this type.
	Mode string `json:"mode,omitempty"`
	// APIKey is the Weights & Biases API key.
	// NOTE(review): a non-empty key is written in plain text under "api_key"
	// whenever this struct is JSON-encoded — confirm every place the encoded
	// task is stored or logged is trusted with this secret.
	APIKey string `json:"api_key,omitempty"`
	// Project is the Weights & Biases project name.
	Project string `json:"project,omitempty"`
	// Entity is the Weights & Biases entity name.
	Entity string `json:"entity,omitempty"`
}
|
|
|
|
// Redis key constants
|
|
var (
|
|
TaskQueueKey = config.RedisTaskQueueKey
|
|
TaskPrefix = config.RedisTaskPrefix
|
|
TaskStatusPrefix = config.RedisTaskStatusPrefix
|
|
WorkerHeartbeat = config.RedisWorkerHeartbeat
|
|
WorkerPrewarmKey = config.RedisWorkerPrewarmKey
|
|
JobMetricsPrefix = config.RedisJobMetricsPrefix
|
|
)
|