fetch_ml/internal/experiment/manager.go
Jeremie Fraeys ea15af1833 Fix multi-user authentication and clean up debug code
- Fix YAML tags in auth config struct (json -> yaml)
- Update CLI configs to use pre-hashed API keys
- Remove double hashing in WebSocket client
- Fix port mapping (9102 -> 9103) in CLI commands
- Update permission keys to use jobs:read, jobs:create, etc.
- Clean up all debug logging from CLI and server
- All user roles now authenticate correctly:
  * Admin: Can queue jobs and see all jobs
  * Researcher: Can queue jobs and see own jobs
  * Analyst: Can see status (read-only access)

Multi-user authentication is now fully functional.
2025-12-06 12:35:32 -05:00

347 lines
8.1 KiB
Go

// Package experiment provides ML experiment management
package experiment
import (
	"encoding/binary"
	"errors"
	"fmt"
	"math"
	"os"
	"path/filepath"
	"sort"
	"time"

	"github.com/jfraeys/fetch_ml/internal/fileutil"
)
// Metadata describes a single experiment and is persisted in the
// experiment's meta.bin file (written by WriteMetadata, read by ReadMetadata).
type Metadata struct {
// CommitID identifies the experiment; it also names the experiment's
// directory under the manager's base path.
CommitID string
// Timestamp is compared against Unix-second cutoffs by PruneExperiments,
// so it is expected to hold seconds since the Unix epoch.
Timestamp int64
// JobName is stored as a length-prefixed string whose length occupies
// a single byte in meta.bin.
JobName string
// User is stored as a length-prefixed string whose length occupies
// a single byte in meta.bin.
User string
}
// Manager handles experiment storage and metadata on the local filesystem.
type Manager struct {
// basePath is the root directory; each experiment lives in a subdirectory
// of it named after the experiment's commit ID.
basePath string
}
// NewManager returns a Manager that stores experiments beneath basePath.
// It does not touch the filesystem.
func NewManager(basePath string) *Manager {
	mgr := new(Manager)
	mgr.basePath = basePath
	return mgr
}
// Initialize creates the base directory, including any missing parents,
// with mode 0750. It returns a wrapped error when the directory cannot be
// created.
func (m *Manager) Initialize() error {
	err := os.MkdirAll(m.basePath, 0750)
	if err == nil {
		return nil
	}
	return fmt.Errorf("failed to create experiment base directory: %w", err)
}
// GetExperimentPath returns the directory for commitID: the manager's base
// path joined with the commit ID. The path is not checked for existence.
func (m *Manager) GetExperimentPath(commitID string) string {
	elems := []string{m.basePath, commitID}
	return filepath.Join(elems...)
}
// GetFilesPath returns the location of the "files" directory inside the
// experiment directory for commitID.
func (m *Manager) GetFilesPath(commitID string) string {
	expDir := m.GetExperimentPath(commitID)
	return filepath.Join(expDir, "files")
}
// GetMetadataPath returns the location of the meta.bin file inside the
// experiment directory for commitID.
func (m *Manager) GetMetadataPath(commitID string) string {
	expDir := m.GetExperimentPath(commitID)
	return filepath.Join(expDir, "meta.bin")
}
// ExperimentExists reports whether the experiment path for commitID exists
// and is a directory. Any error from os.Stat counts as "does not exist".
func (m *Manager) ExperimentExists(commitID string) bool {
	info, statErr := os.Stat(m.GetExperimentPath(commitID))
	if statErr != nil {
		return false
	}
	return info.IsDir()
}
// CreateExperiment creates the "files" directory for commitID, together
// with any missing parent directories, using mode 0750. It returns a wrapped
// error when the directories cannot be created.
func (m *Manager) CreateExperiment(commitID string) error {
	err := os.MkdirAll(m.GetFilesPath(commitID), 0750)
	if err == nil {
		return nil
	}
	return fmt.Errorf("failed to create experiment directory: %w", err)
}
// WriteMetadata serializes meta and writes it to the experiment's meta.bin
// (mode 0600), replacing any existing file. The target path is derived from
// meta.CommitID.
//
// Binary layout (integers are big-endian):
//
//	[version:1][timestamp:8][commit_id_len:1][commit_id:var]
//	[job_name_len:1][job_name:var][user_len:1][user:var]
//
// Each string length is stored in a single byte, so every string field is
// limited to 255 bytes. WriteMetadata returns an error, and writes nothing,
// when meta is nil or a field exceeds that limit; otherwise it returns the
// result of os.WriteFile.
func (m *Manager) WriteMetadata(meta *Metadata) error {
	if meta == nil {
		return fmt.Errorf("metadata is nil")
	}

	// Largest length representable by the one-byte length prefix.
	const maxFieldLen = 255

	fields := []struct{ name, value string }{
		{"commit ID", meta.CommitID},
		{"job name", meta.JobName},
		{"user", meta.User},
	}

	// Validate before encoding: byte(len(s)) would silently wrap for longer
	// strings and yield a file whose lengths do not match its contents.
	size := 1 + 8 // version + timestamp
	for _, f := range fields {
		if len(f.value) > maxFieldLen {
			return fmt.Errorf("metadata %s too long: %d bytes (max %d)", f.name, len(f.value), maxFieldLen)
		}
		size += 1 + len(f.value)
	}

	buf := make([]byte, 0, size)
	buf = append(buf, 0x01) // format version

	var ts [8]byte
	binary.BigEndian.PutUint64(ts[:], uint64(meta.Timestamp))
	buf = append(buf, ts[:]...)

	for _, f := range fields {
		buf = append(buf, byte(len(f.value)))
		buf = append(buf, f.value...)
	}

	return os.WriteFile(m.GetMetadataPath(meta.CommitID), buf, 0600)
}
// ReadMetadata reads and decodes the meta.bin file of the experiment
// identified by commitID.
//
// Expected layout (integers are big-endian):
//
//	[version:1][timestamp:8][commit_id_len:1][commit_id:var]
//	[job_name_len:1][job_name:var][user_len:1][user:var]
//
// A file that ends right after the commit ID or after the job name is
// accepted; the missing trailing fields are left empty. An error (and a nil
// Metadata) is returned when the file cannot be read, is shorter than the
// fixed header, carries a version other than 1, or declares a string length
// that runs past the end of the file.
func (m *Manager) ReadMetadata(commitID string) (*Metadata, error) {
	data, err := fileutil.SecureFileRead(m.GetMetadataPath(commitID))
	if err != nil {
		return nil, fmt.Errorf("failed to read metadata: %w", err)
	}

	// version(1) + timestamp(8) + commit_id_len(1)
	const headerLen = 10
	if len(data) < headerLen {
		return nil, fmt.Errorf("metadata file too short")
	}
	if version := data[0]; version != 0x01 {
		return nil, fmt.Errorf("unsupported metadata version: %d", version)
	}

	meta := &Metadata{
		Timestamp: int64(binary.BigEndian.Uint64(data[1:9])),
	}
	offset := 9

	// readString decodes one [len:1][bytes:len] field at offset and advances
	// offset past it. The caller guarantees offset < len(data). The declared
	// length is checked against the remaining bytes so that a truncated or
	// corrupt file produces an error instead of a slice-bounds panic.
	readString := func(field string) (string, error) {
		n := int(data[offset])
		offset++
		if remaining := len(data) - offset; n > remaining {
			return "", fmt.Errorf("metadata file truncated: %s needs %d bytes, %d remain", field, n, remaining)
		}
		s := string(data[offset : offset+n])
		offset += n
		return s, nil
	}

	if meta.CommitID, err = readString("commit ID"); err != nil {
		return nil, err
	}

	// The job name and user are optional trailing fields.
	if offset >= len(data) {
		return meta, nil
	}
	if meta.JobName, err = readString("job name"); err != nil {
		return nil, err
	}

	if offset >= len(data) {
		return meta, nil
	}
	if meta.User, err = readString("user"); err != nil {
		return nil, err
	}

	return meta, nil
}
// ListExperiments returns the names of all directories found directly under
// the base path; each name is treated as an experiment commit ID. Entries
// that are not directories are ignored. The result is nil when there are
// no directories.
func (m *Manager) ListExperiments() ([]string, error) {
	dirEntries, readErr := os.ReadDir(m.basePath)
	if readErr != nil {
		return nil, fmt.Errorf("failed to read experiments directory: %w", readErr)
	}

	var ids []string
	for i := range dirEntries {
		if !dirEntries[i].IsDir() {
			continue
		}
		ids = append(ids, dirEntries[i].Name())
	}
	return ids, nil
}
// PruneExperiments deletes experiment directories according to a retention
// policy and returns the commit IDs of the experiments that were removed.
//
// Experiments are ranked newest-first by their metadata timestamp; ties keep
// the order in which ListExperiments returned them. An experiment is pruned
// when either condition holds:
//
//   - its rank is at or beyond keepCount (only the newest keepCount are
//     inside the keep window), or
//   - olderThanDays > 0 and its timestamp is earlier than olderThanDays days
//     before now.
//
// Consequently keepCount <= 0 prunes every experiment with readable
// metadata, regardless of olderThanDays.
// NOTE(review): a caller that wants age-only pruning must pass a keepCount
// at least as large as the number of experiments — confirm callers do so.
//
// Experiments whose metadata cannot be read are skipped and never pruned.
// Directories that fail to delete are skipped and omitted from the result.
// An error is returned only when the experiments cannot be listed.
func (m *Manager) PruneExperiments(keepCount int, olderThanDays int) ([]string, error) {
	commitIDs, err := m.ListExperiments()
	if err != nil {
		return nil, err
	}

	type experiment struct {
		commitID  string
		timestamp int64
	}

	experiments := make([]experiment, 0, len(commitIDs))
	for _, commitID := range commitIDs {
		meta, err := m.ReadMetadata(commitID)
		if err != nil {
			continue // Skip experiments with invalid metadata.
		}
		experiments = append(experiments, experiment{
			commitID:  commitID,
			timestamp: meta.Timestamp,
		})
	}

	// Newest first. The stable sort keeps equal timestamps in listing order,
	// so which of two tied experiments falls outside the keep window is
	// deterministic.
	sort.SliceStable(experiments, func(i, j int) bool {
		return experiments[i].timestamp > experiments[j].timestamp
	})

	cutoffTime := time.Now().AddDate(0, 0, -olderThanDays).Unix()

	var pruned []string
	for i, exp := range experiments {
		beyondKeepWindow := i >= keepCount
		tooOld := olderThanDays > 0 && exp.timestamp < cutoffTime
		if !beyondKeepWindow && !tooOld {
			continue
		}
		if err := os.RemoveAll(m.GetExperimentPath(exp.commitID)); err != nil {
			// Best effort: one directory that cannot be removed must not
			// stop the remaining experiments from being pruned.
			continue
		}
		pruned = append(pruned, exp.commitID)
	}
	return pruned, nil
}
// Metric represents a single data point recorded for an experiment.
// Records are appended to metrics.bin by LogMetric and decoded by GetMetrics.
type Metric struct {
// Name identifies the metric; LogMetric truncates it to 255 bytes.
Name string `json:"name"`
// Value is the measurement; on disk it is stored as its IEEE 754 bit pattern.
Value float64 `json:"value"`
// Step is supplied by the caller; on disk it is stored as an unsigned
// 32-bit integer.
Step int `json:"step"`
// Timestamp is the Unix time, in seconds, at which LogMetric recorded the point.
Timestamp int64 `json:"timestamp"`
}
// GetMetricsPath returns the location of the metrics.bin file inside the
// experiment directory for commitID.
func (m *Manager) GetMetricsPath(commitID string) string {
	expDir := m.GetExperimentPath(commitID)
	return filepath.Join(expDir, "metrics.bin")
}
// LogMetric appends one metric record, stamped with the current Unix time,
// to the experiment's metrics.bin, creating the file (mode 0600) if needed.
//
// Record layout (integers are big-endian):
//
//	[timestamp:8][step:4][value:8][name_len:1][name:var]
//
// name is truncated to 255 bytes because its length is stored in one byte.
// step is stored as a uint32, so values outside that range wrap.
// NOTE(review): negative steps therefore do not round-trip through
// GetMetrics — confirm callers never pass them.
//
// An error is returned when the file cannot be opened, written, or closed.
func (m *Manager) LogMetric(commitID string, name string, value float64, step int) error {
	const (
		maxNameLen = 255
		// timestamp(8) + step(4) + value(8) + name_len(1)
		headerLen = 21
	)
	if len(name) > maxNameLen {
		name = name[:maxNameLen]
	}

	// Build the whole record in one buffer so it reaches the file in a
	// single Write call.
	buf := make([]byte, headerLen, headerLen+len(name))
	binary.BigEndian.PutUint64(buf[0:8], uint64(time.Now().Unix()))
	binary.BigEndian.PutUint32(buf[8:12], uint32(step))
	binary.BigEndian.PutUint64(buf[12:20], math.Float64bits(value))
	buf[20] = byte(len(name))
	buf = append(buf, name...)

	f, err := fileutil.SecureOpenFile(m.GetMetricsPath(commitID), os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0600)
	if err != nil {
		return fmt.Errorf("failed to open metrics file: %w", err)
	}
	if _, err := f.Write(buf); err != nil {
		// The write error is the one worth reporting; a close failure here
		// would add nothing.
		_ = f.Close()
		return fmt.Errorf("failed to write metric: %w", err)
	}
	// Close can report a deferred write failure, so it must not be ignored
	// on a file that was just written.
	if err := f.Close(); err != nil {
		return fmt.Errorf("failed to close metrics file: %w", err)
	}
	return nil
}
// GetMetrics reads and decodes every metric record in the experiment's
// metrics.bin, in file order.
//
// Record layout (integers are big-endian):
//
//	[timestamp:8][step:4][value:8][name_len:1][name:var]
//
// A missing file is not an error: it yields an empty, non-nil slice, as does
// a file that contains no complete record. Decoding stops at the first
// incomplete record (for example a partially written tail); the records
// before it are returned. Any other read failure is returned wrapped.
func (m *Manager) GetMetrics(commitID string) ([]Metric, error) {
	data, err := fileutil.SecureFileRead(m.GetMetricsPath(commitID))
	if err != nil {
		// errors.Is follows the error chain, so a not-found error is still
		// recognized when the read helper wraps it; os.IsNotExist is not.
		if errors.Is(err, os.ErrNotExist) {
			return []Metric{}, nil
		}
		return nil, fmt.Errorf("failed to read metrics file: %w", err)
	}

	// timestamp(8) + step(4) + value(8) + name_len(1)
	const headerLen = 21

	metrics := []Metric{}
	for offset := 0; len(data)-offset >= headerLen; {
		nameLen := int(data[offset+20])
		end := offset + headerLen + nameLen
		if end > len(data) {
			break // name runs past the end of the file
		}
		metrics = append(metrics, Metric{
			Timestamp: int64(binary.BigEndian.Uint64(data[offset : offset+8])),
			Step:      int(binary.BigEndian.Uint32(data[offset+8 : offset+12])),
			Value:     math.Float64frombits(binary.BigEndian.Uint64(data[offset+12 : offset+20])),
			Name:      string(data[offset+headerLen : end]),
		})
		offset = end
	}
	return metrics, nil
}