// Phase 1: Event Sourcing
//   - Add TaskEvent types (queued, started, completed, failed, etc.)
//   - Create EventStore with Redis Streams (append-only)
//   - Support event querying by task ID and time range
//
// Phase 3: Diagnosable Failures
//   - Enhance TaskExecutionError with Context map, Timestamp, Recoverable flag
//   - Update container.go to populate error context (image, GPU, duration)
//   - Add WithContext helper for building error context
//   - Create cmd/errors CLI for querying task errors
//
// Phase 4: Testable Security
//   - Add security fields to PodmanConfig (Privileged, Network, ReadOnlyMounts)
//   - Create ValidateSecurityPolicy() with ErrSecurityViolation
//   - Add security contract tests (privileged rejection, host network rejection)
//   - Tests serve as executable security documentation
//
// Phase 7: Reproducible Builds
//   - Add BuildHash and BuildTime ldflags to Makefile
//   - Create verify-build target for reproducibility testing
//   - Add -version and -verify flags to api-server
//
// All tests pass:
//   - go test ./internal/errtypes/...
//   - go test ./internal/container/... -run Security
//   - go test ./internal/queue/...
//   - go build ./cmd/api-server/...
97 lines
3 KiB
Go
97 lines
3 KiB
Go
package domain
|
|
|
|
import (
|
|
"encoding/json"
|
|
"time"
|
|
)
|
|
|
|
// TaskEventType represents the type of task event. It is a string type, so
// each value is written to JSON as the plain string shown on its constant.
type TaskEventType string

// Task lifecycle event types.
const (
	// TaskEventQueued is fired when a task is added to the queue.
	TaskEventQueued TaskEventType = "queued"
	// TaskEventStarted is fired when a task begins execution.
	TaskEventStarted TaskEventType = "started"
	// TaskEventCompleted is fired when a task finishes successfully.
	TaskEventCompleted TaskEventType = "completed"
	// TaskEventFailed is fired when a task fails.
	TaskEventFailed TaskEventType = "failed"
	// TaskEventCancelled is fired when a task is cancelled.
	TaskEventCancelled TaskEventType = "cancelled"
	// TaskEventRetrying is fired when a task is being retried.
	TaskEventRetrying TaskEventType = "retrying"
	// TaskEventCheckpointSaved is fired when a checkpoint is saved.
	TaskEventCheckpointSaved TaskEventType = "checkpoint_saved"
	// TaskEventGPUAssigned is fired when a GPU is assigned.
	TaskEventGPUAssigned TaskEventType = "gpu_assigned"
)
|
|
|
|
// TaskEvent represents an event in a task's lifecycle.
|
|
// Events are stored in Redis Streams for append-only audit trails.
|
|
type TaskEvent struct {
|
|
// TaskID is the unique identifier of the task.
|
|
TaskID string `json:"task_id"`
|
|
|
|
// EventType indicates what happened (queued, started, completed, etc.).
|
|
EventType TaskEventType `json:"event_type"`
|
|
|
|
// Timestamp when the event occurred.
|
|
Timestamp time.Time `json:"timestamp"`
|
|
|
|
// Data contains event-specific data (JSON-encoded).
|
|
// For "started": {"worker_id": "worker-1", "image": "pytorch:latest"}
|
|
// For "failed": {"error": "OOM", "phase": "execution"}
|
|
Data json.RawMessage `json:"data,omitempty"`
|
|
|
|
// Who triggered this event (worker ID, user ID, or system).
|
|
Who string `json:"who"`
|
|
}
|
|
|
|
// EventDataStarted contains data for the "started" event.
type EventDataStarted struct {
	// WorkerID is always encoded, under the "worker_id" key.
	WorkerID string `json:"worker_id"`
	// Image is omitted from the JSON when empty.
	// NOTE(review): presumably the container image the task runs in — confirm.
	Image string `json:"image,omitempty"`
	// GPUDevices is omitted from the JSON when the slice is empty.
	GPUDevices []string `json:"gpu_devices,omitempty"`
}
|
|
|
|
// EventDataFailed contains data for the "failed" event. None of its fields
// use omitempty, so all three keys are always present in the encoded JSON.
type EventDataFailed struct {
	// Error is the failure description, e.g. "OOM".
	Error string `json:"error"`
	// Phase names the stage in which the failure happened, e.g. "execution".
	Phase string `json:"phase"`
	// Recoverable is encoded under the "recoverable" key.
	// NOTE(review): presumably signals that a retry may succeed — confirm.
	Recoverable bool `json:"recoverable"`
}
|
|
|
|
// EventDataGPUAssigned contains data for the "gpu_assigned" event.
type EventDataGPUAssigned struct {
	// GPUDevices is always encoded; a nil slice is written as JSON null
	// because the tag carries no omitempty.
	GPUDevices []string `json:"gpu_devices"`
	// GPUEnvVar is omitted from the JSON when empty.
	// NOTE(review): presumably the environment variable that exposes the
	// assigned devices to the task — confirm with the producer.
	GPUEnvVar string `json:"gpu_env_var,omitempty"`
}
|
|
|
|
// NewTaskEvent creates a new task event with the current timestamp.
|
|
func NewTaskEvent(taskID string, eventType TaskEventType, who string) TaskEvent {
|
|
return TaskEvent{
|
|
TaskID: taskID,
|
|
EventType: eventType,
|
|
Timestamp: time.Now().UTC(),
|
|
Who: who,
|
|
}
|
|
}
|
|
|
|
// WithData adds data to the event.
|
|
func (e TaskEvent) WithData(data any) (TaskEvent, error) {
|
|
encoded, err := json.Marshal(data)
|
|
if err != nil {
|
|
return e, err
|
|
}
|
|
e.Data = encoded
|
|
return e, nil
|
|
}
|
|
|
|
// ParseData parses the event data into the provided type.
|
|
func (e TaskEvent) ParseData(out any) error {
|
|
if len(e.Data) == 0 {
|
|
return nil
|
|
}
|
|
return json.Unmarshal(e.Data, out)
|
|
}
|