fetch_ml/internal/domain/events.go
Jeremie Fraeys 7194826871
feat: implement research-grade maintainability phases 1,3,4,7
Phase 1: Event Sourcing
- Add TaskEvent types (queued, started, completed, failed, etc.)
- Create EventStore with Redis Streams (append-only)
- Support event querying by task ID and time range

Phase 3: Diagnosable Failures
- Enhance TaskExecutionError with Context map, Timestamp, Recoverable flag
- Update container.go to populate error context (image, GPU, duration)
- Add WithContext helper for building error context
- Create cmd/errors CLI for querying task errors

Phase 4: Testable Security
- Add security fields to PodmanConfig (Privileged, Network, ReadOnlyMounts)
- Create ValidateSecurityPolicy() with ErrSecurityViolation
- Add security contract tests (privileged rejection, host network rejection)
- Tests serve as executable security documentation

Phase 7: Reproducible Builds
- Add BuildHash and BuildTime ldflags to Makefile
- Create verify-build target for reproducibility testing
- Add -version and -verify flags to api-server

All tests pass:
- go test ./internal/errtypes/...
- go test ./internal/container/... -run Security
- go test ./internal/queue/...
- go build ./cmd/api-server/...
2026-02-18 15:27:50 -05:00

97 lines
3 KiB
Go

package domain
import (
"encoding/json"
"time"
)
// TaskEventType names the kind of lifecycle event recorded for a task.
// It is a string type, so a value appears as its literal text (for example
// "queued") when a TaskEvent is JSON-encoded.
type TaskEventType string
// Known TaskEventType values. The string literals are the serialized form of
// each event type, so changing one changes what is emitted whenever a
// TaskEvent is JSON-encoded.
const (
	// TaskEventQueued is fired when a task is added to the queue.
	TaskEventQueued TaskEventType = "queued"
	// TaskEventStarted is fired when a task begins execution.
	// EventDataStarted describes the payload for this event.
	TaskEventStarted TaskEventType = "started"
	// TaskEventCompleted is fired when a task finishes successfully.
	TaskEventCompleted TaskEventType = "completed"
	// TaskEventFailed is fired when a task fails.
	// EventDataFailed describes the payload for this event.
	TaskEventFailed TaskEventType = "failed"
	// TaskEventCancelled is fired when a task is cancelled.
	TaskEventCancelled TaskEventType = "cancelled"
	// TaskEventRetrying is fired when a task is being retried.
	TaskEventRetrying TaskEventType = "retrying"
	// TaskEventCheckpointSaved is fired when a checkpoint is saved.
	TaskEventCheckpointSaved TaskEventType = "checkpoint_saved"
	// TaskEventGPUAssigned is fired when a GPU is assigned.
	// EventDataGPUAssigned describes the payload for this event.
	TaskEventGPUAssigned TaskEventType = "gpu_assigned"
)
// TaskEvent represents an event in a task's lifecycle.
// Events are stored in Redis Streams for append-only audit trails.
//
// Use NewTaskEvent to build one with the timestamp filled in, WithData to
// attach a payload, and ParseData to decode the payload again.
type TaskEvent struct {
	// TaskID is the unique identifier of the task.
	TaskID string `json:"task_id"`
	// EventType indicates what happened (queued, started, completed, etc.).
	EventType TaskEventType `json:"event_type"`
	// Timestamp when the event occurred. NewTaskEvent sets it in UTC.
	Timestamp time.Time `json:"timestamp"`
	// Data contains event-specific data (JSON-encoded).
	// For "started": {"worker_id": "worker-1", "image": "pytorch:latest"}
	// For "failed": {"error": "OOM", "phase": "execution"}
	// Kept as raw JSON so decoding is deferred until ParseData is called with
	// the matching payload type; the field is omitted from output when empty.
	Data json.RawMessage `json:"data,omitempty"`
	// Who triggered this event (worker ID, user ID, or system).
	Who string `json:"who"`
}
// EventDataStarted contains data for the "started" event.
// It is the payload shape carried in TaskEvent.Data, e.g.
// {"worker_id": "worker-1", "image": "pytorch:latest"}.
type EventDataStarted struct {
	// WorkerID identifies the worker reported in the event; always emitted.
	WorkerID string `json:"worker_id"`
	// Image is the image name (e.g. "pytorch:latest"); omitted when empty.
	Image string `json:"image,omitempty"`
	// GPUDevices lists GPU device identifiers; omitted when the slice is empty.
	// NOTE(review): the identifier format is not visible here; confirm with producers.
	GPUDevices []string `json:"gpu_devices,omitempty"`
}
// EventDataFailed contains data for the "failed" event.
// It is the payload shape carried in TaskEvent.Data, e.g.
// {"error": "OOM", "phase": "execution"}. No field uses omitempty, so all
// three keys are always present in the encoded payload.
type EventDataFailed struct {
	// Error is the failure description (e.g. "OOM").
	Error string `json:"error"`
	// Phase names the stage in which the failure occurred (e.g. "execution").
	Phase string `json:"phase"`
	// Recoverable flags the failure as recoverable.
	// NOTE(review): presumably this means a retry may succeed; confirm with producers.
	Recoverable bool `json:"recoverable"`
}
// EventDataGPUAssigned contains data for the "gpu_assigned" event.
// It is the payload shape carried in TaskEvent.Data.
type EventDataGPUAssigned struct {
	// GPUDevices lists the assigned GPU device identifiers. There is no
	// omitempty, so a nil slice is encoded as JSON null.
	GPUDevices []string `json:"gpu_devices"`
	// GPUEnvVar is omitted from the payload when empty.
	// NOTE(review): presumably the environment variable used to expose the
	// devices to the task; confirm against the code that emits this event.
	GPUEnvVar string `json:"gpu_env_var,omitempty"`
}
// NewTaskEvent builds a TaskEvent of the given eventType for taskID,
// attributed to who. Timestamp is set to the current time in UTC. Data is
// left empty; attach a payload afterwards with WithData.
func NewTaskEvent(taskID string, eventType TaskEventType, who string) TaskEvent {
	// Capture the clock first so the timestamp reflects the moment of the call.
	now := time.Now().UTC()

	var ev TaskEvent
	ev.TaskID = taskID
	ev.EventType = eventType
	ev.Timestamp = now
	ev.Who = who
	return ev
}
// WithData returns a copy of the event whose Data field holds the JSON
// encoding of data. The receiver is a value, so the caller's event is never
// modified. If data cannot be marshaled, the event is returned with its Data
// untouched together with the error from json.Marshal.
func (e TaskEvent) WithData(data any) (TaskEvent, error) {
	payload, err := json.Marshal(data)
	if err == nil {
		// Only overwrite Data once encoding has succeeded.
		e.Data = json.RawMessage(payload)
	}
	return e, err
}
// ParseData decodes the event's JSON payload into out, which must be a
// non-nil pointer as required by json.Unmarshal. When the event carries no
// data, it returns nil without touching (or validating) out, so the target
// keeps whatever value it already had.
func (e TaskEvent) ParseData(out any) error {
	if raw := e.Data; len(raw) > 0 {
		return json.Unmarshal(raw, out)
	}
	// Nothing was attached to this event; treat that as success.
	return nil
}