fetch_ml/internal/manifest/run_manifest.go
Jeremie Fraeys 6b2c377680
refactor(jupyter): enhance security and scheduler integration
Update Jupyter integration for security and scheduler support:
- Enhanced security configuration with audit logging
- Health monitoring with scheduler event integration
- Package manager with network policy enforcement
- Service manager with lifecycle hooks
- Network manager with tenant isolation
- Workspace metadata with tenant tags
- Config with resource limits
- Podman container integration improvements
- Experiment manager with tracking integration
- Manifest runner with security checks
2026-02-26 12:06:35 -05:00

465 lines
15 KiB
Go

package manifest
import (
"crypto/rand"
"encoding/hex"
"encoding/json"
"fmt"
"os"
"path/filepath"
"strings"
"time"
"github.com/jfraeys/fetch_ml/internal/crypto"
"github.com/jfraeys/fetch_ml/internal/fileutil"
)
// Manifest filename settings.
const (
// runManifestFilename is the fixed manifest filename. ManifestPath always uses
// it, and ManifestPathWithNonce falls back to it when no nonce is supplied.
runManifestFilename = "run_manifest.json"
// manifestNonceLength is the number of random bytes GenerateManifestNonce
// reads; hex encoding doubles that to the character count noted below.
manifestNonceLength = 16 // 32 hex chars
)
// GenerateManifestNonce generates a cryptographically secure nonce for manifest filenames.
// This prevents information disclosure in multi-tenant environments where predictable
// filenames could be enumerated.
func GenerateManifestNonce() (string, error) {
nonce := make([]byte, manifestNonceLength)
if _, err := rand.Read(nonce); err != nil {
return "", fmt.Errorf("failed to generate manifest nonce: %w", err)
}
return hex.EncodeToString(nonce), nil
}
// GenerateManifestFilename builds a unique manifest filename around a newly
// generated nonce.
//
// The result has the form run_manifest_<nonce>.json. Any error from
// GenerateManifestNonce is returned unchanged together with an empty name.
func GenerateManifestFilename() (string, error) {
	token, genErr := GenerateManifestNonce()
	if genErr != nil {
		return "", genErr
	}
	return "run_manifest_" + token + ".json", nil
}
// ParseManifestFilename returns the nonce embedded in a manifest filename of
// the form run_manifest_<nonce>.json.
//
// It returns the empty string when the name lacks the "run_manifest_" prefix
// or the ".json" suffix (which includes the fixed name run_manifest.json), and
// also when the nonce portion itself is empty. The nonce is not validated.
func ParseManifestFilename(filename string) string {
	const (
		prefix = "run_manifest_"
		suffix = ".json"
	)
	if !strings.HasPrefix(filename, prefix) {
		return ""
	}
	if !strings.HasSuffix(filename, suffix) {
		return ""
	}
	return strings.TrimSuffix(strings.TrimPrefix(filename, prefix), suffix)
}
// Annotation is a timestamped note attached to a run, optionally attributed
// to an author.
type Annotation struct {
	Timestamp time.Time `json:"timestamp"`
	Author    string    `json:"author,omitempty"`
	Note      string    `json:"note"`
}

// UnmarshalJSON decodes an annotation, accepting the timestamp under either
// the "timestamp" key or the alternate "ts" key.
//
// When both keys are present "timestamp" wins; when neither is present the
// receiver's Timestamp is left as it was. Author and Note are always
// overwritten with the decoded values. A decoding error is returned as-is and
// leaves the receiver untouched.
func (a *Annotation) UnmarshalJSON(data []byte) error {
	// Decode into a plain local struct so this method is not re-entered.
	var wire struct {
		Timestamp *time.Time `json:"timestamp,omitempty"`
		TS        *time.Time `json:"ts,omitempty"`
		Author    string     `json:"author,omitempty"`
		Note      string     `json:"note"`
	}
	if err := json.Unmarshal(data, &wire); err != nil {
		return err
	}

	switch {
	case wire.Timestamp != nil:
		a.Timestamp = *wire.Timestamp
	case wire.TS != nil:
		a.Timestamp = *wire.TS
	}
	a.Author, a.Note = wire.Author, wire.Note
	return nil
}
// Narrative holds the free-text context recorded for a run.
// Every field is optional and is omitted from the JSON output when empty.
// RunManifest.ApplyNarrativePatch is the in-file writer of these fields.
type Narrative struct {
Hypothesis string `json:"hypothesis,omitempty"`
Context string `json:"context,omitempty"`
Intent string `json:"intent,omitempty"`
ExpectedOutcome string `json:"expected_outcome,omitempty"`
// ParentRun presumably references the run this one derives from — TODO confirm the identifier format.
ParentRun string `json:"parent_run,omitempty"`
ExperimentGroup string `json:"experiment_group,omitempty"`
Tags []string `json:"tags,omitempty"`
}
// NarrativePatch is a partial update for a Narrative.
// Each field is a pointer: nil means "leave the current value alone", while a
// non-nil pointer supplies the replacement (see RunManifest.ApplyNarrativePatch,
// which also trims whitespace and drops blank tags).
type NarrativePatch struct {
Hypothesis *string `json:"hypothesis,omitempty"`
Context *string `json:"context,omitempty"`
Intent *string `json:"intent,omitempty"`
ExpectedOutcome *string `json:"expected_outcome,omitempty"`
ParentRun *string `json:"parent_run,omitempty"`
ExperimentGroup *string `json:"experiment_group,omitempty"`
// Tags, when non-nil, replaces the whole tag list rather than appending to it.
Tags *[]string `json:"tags,omitempty"`
}
// Outcome represents the documented result of a run.
// All fields are optional and omitted from the JSON output when empty.
// NOTE(review): the limits and allowed values mentioned in the field comments
// are not enforced anywhere in this file — confirm where they are validated.
type Outcome struct {
Status string `json:"status,omitempty"` // validated, invalidated, inconclusive, partial
Summary string `json:"summary,omitempty"` // Brief description
KeyLearnings []string `json:"key_learnings,omitempty"` // 3-5 bullet points max
FollowUpRuns []string `json:"follow_up_runs,omitempty"` // References to related runs
ArtifactsUsed []string `json:"artifacts_used,omitempty"` // e.g., ["model.pt", "metrics.json"]
}
// ArtifactFile describes a single artifact file: its path, its size in bytes
// and its modification time.
type ArtifactFile struct {
Modified time.Time `json:"modified"`
// Path is presumably relative to the run directory — TODO confirm against the artifact scanner.
Path string `json:"path"`
SizeBytes int64 `json:"size_bytes"`
}
// Artifacts is the artifact section of a run manifest: the files found, the
// paths that were excluded, and when discovery took place.
// RunManifest.ValidateProvenance only requires this section to be non-nil;
// the lists inside it may be empty.
type Artifacts struct {
DiscoveryTime time.Time `json:"discovery_time"`
Files []ArtifactFile `json:"files,omitempty"`
Exclusions []Exclusion `json:"exclusions,omitempty"`
// TotalSizeBytes is presumably the sum of Files[i].SizeBytes — TODO confirm against the scanner.
TotalSizeBytes int64 `json:"total_size_bytes,omitempty"`
}
// Exclusion records why a path was excluded from artifact scanning.
// Both fields are always serialized (neither JSON tag uses omitempty).
type Exclusion struct {
Path string `json:"path"`
Reason string `json:"reason"`
}
// ExecutionEnvironment captures the runtime environment for reproducibility.
// This enables reconstruction and comparison of runs.
//
// ConfigHash, GPUDetectionMethod and SandboxNetworkMode are the fields that
// RunManifest.ValidateProvenance requires to be non-empty.
type ExecutionEnvironment struct {
// Metadata holds additional free-form key/value pairs.
Metadata map[string]string `json:"metadata,omitempty"`
ConfigHash string `json:"config_hash"`
GPUDetectionMethod string `json:"gpu_detection_method,omitempty"`
GPUVendor string `json:"gpu_vendor,omitempty"`
// PodmanCPUs is kept as a string; presumably the CPU limit passed to Podman — TODO confirm.
PodmanCPUs string `json:"podman_cpus,omitempty"`
SandboxNetworkMode string `json:"sandbox_network_mode"`
SandboxSeccomp string `json:"sandbox_seccomp,omitempty"`
ComplianceMode string `json:"compliance_mode,omitempty"`
// ManifestNonce, when non-empty, makes RunManifest.WriteToDir write to
// run_manifest_<nonce>.json instead of the fixed default filename.
ManifestNonce string `json:"manifest_nonce,omitempty"`
GPUCount int `json:"gpu_count"`
MaxWorkers int `json:"max_workers"`
SandboxNoNewPrivs bool `json:"sandbox_no_new_privs"`
}
// RunManifest is a best-effort, self-contained provenance record for a run.
// WriteToDir stores it as <run_dir>/run_manifest.json, or as
// <run_dir>/run_manifest_<nonce>.json when Environment.ManifestNonce is set.
//
// NOTE(review): encoding/json's omitempty does not omit struct values such as
// time.Time, so started_at and ended_at are presumably emitted even when zero —
// confirm whether consumers rely on that.
type RunManifest struct {
// CreatedAt is supplied to NewRunManifest; StartedAt is set by MarkStarted
// and EndedAt by MarkFinished.
CreatedAt time.Time `json:"created_at"`
StartedAt time.Time `json:"started_at,omitempty"`
EndedAt time.Time `json:"ended_at,omitempty"`
// Artifacts and Environment must both be non-nil for ValidateProvenance to pass.
Artifacts *Artifacts `json:"artifacts,omitempty"`
Environment *ExecutionEnvironment `json:"environment,omitempty"`
// Metadata is initialized to an empty map by NewRunManifest.
Metadata map[string]string `json:"metadata,omitempty"`
// ExitCode is a pointer so that "no exit code recorded" (nil) is
// distinguishable from exit code 0; MarkFinished stores it as given.
ExitCode *int `json:"exit_code,omitempty"`
// Narrative is created on demand by ApplyNarrativePatch.
Narrative *Narrative `json:"narrative,omitempty"`
Outcome *Outcome `json:"outcome,omitempty"`
PodmanImage string `json:"podman_image,omitempty"`
SnapshotID string `json:"snapshot_id,omitempty"`
ExperimentManifestSHA string `json:"experiment_manifest_sha,omitempty"`
DepsManifestName string `json:"deps_manifest_name,omitempty"`
DepsManifestSHA string `json:"deps_manifest_sha,omitempty"`
TrainScriptPath string `json:"train_script_path,omitempty"`
WorkerVersion string `json:"worker_version,omitempty"`
// RunID and TaskID are required by ValidateProvenance.
RunID string `json:"run_id"`
ImageDigest string `json:"image_digest,omitempty"`
WorkerHost string `json:"worker_host,omitempty"`
SnapshotSHA256 string `json:"snapshot_sha256,omitempty"`
Command string `json:"command,omitempty"`
Args string `json:"args,omitempty"`
CommitID string `json:"commit_id,omitempty"`
// Error holds the execution error text recorded by MarkFinished; it is
// cleared when the run finishes without an error.
Error string `json:"error,omitempty"`
// SigAlg, SignerKeyID and Signature are filled in by Sign and read back by
// Verify; an empty Signature means the manifest is unsigned (see IsSigned).
SigAlg string `json:"sig_alg,omitempty"`
SignerKeyID string `json:"signer_key_id,omitempty"`
Signature string `json:"signature,omitempty"`
TaskID string `json:"task_id"`
JobName string `json:"job_name"`
// Annotations is appended to by AddAnnotation.
Annotations []Annotation `json:"annotations,omitempty"`
GPUDevices []string `json:"gpu_devices,omitempty"`
// TotalDurationMS is computed by MarkFinished as EndedAt minus StartedAt in
// milliseconds. The other duration fields are not written in this file.
TotalDurationMS int64 `json:"total_duration_ms,omitempty"`
FinalizeDurationMS int64 `json:"finalize_duration_ms,omitempty"`
ExecutionDurationMS int64 `json:"execution_duration_ms,omitempty"`
StagingDurationMS int64 `json:"staging_duration_ms,omitempty"`
}
// NewRunManifest creates a manifest carrying the given run, task and job
// identifiers and creation time.
//
// Metadata starts as an empty, non-nil map so callers can add keys right away;
// every other field is left at its zero value.
func NewRunManifest(runID, taskID, jobName string, createdAt time.Time) *RunManifest {
	return &RunManifest{
		CreatedAt: createdAt,
		Metadata:  map[string]string{},
		RunID:     runID,
		TaskID:    taskID,
		JobName:   jobName,
	}
}
// ManifestPath returns the location of the fixed-name (legacy) manifest
// inside dir, i.e. dir joined with run_manifest.json.
//
// Deprecated: Use ManifestPathWithNonce for new code to support unique filenames.
func ManifestPath(dir string) string {
	legacyPath := filepath.Join(dir, runManifestFilename)
	return legacyPath
}
// ManifestPathWithNonce returns the manifest location inside dir for the
// given nonce: dir/run_manifest_<nonce>.json.
//
// An empty nonce selects the fixed default filename instead, giving the same
// path as ManifestPath. The nonce is inserted into the filename as-is.
func ManifestPathWithNonce(dir, nonce string) string {
	name := runManifestFilename
	if nonce != "" {
		name = "run_manifest_" + nonce + ".json"
	}
	return filepath.Join(dir, name)
}
// WriteToDir serializes the manifest as indented JSON and writes it into dir
// via fileutil.SecureFileWrite with mode 0640.
//
// The target is run_manifest_<nonce>.json when Environment.ManifestNonce is
// set, and the fixed run_manifest.json otherwise. It returns an error for a
// nil manifest, a marshalling failure, or a write failure.
func (m *RunManifest) WriteToDir(dir string) error {
	if m == nil {
		return fmt.Errorf("run manifest is nil")
	}

	encoded, marshalErr := json.MarshalIndent(m, "", " ")
	if marshalErr != nil {
		return fmt.Errorf("marshal run manifest: %w", marshalErr)
	}

	// Start from the fixed filename and switch to the nonce-based one only
	// when the environment section carries a nonce.
	target := ManifestPath(dir)
	if env := m.Environment; env != nil && env.ManifestNonce != "" {
		target = ManifestPathWithNonce(dir, env.ManifestNonce)
	}

	if writeErr := fileutil.SecureFileWrite(target, encoded, 0640); writeErr != nil {
		return fmt.Errorf("write run manifest: %w", writeErr)
	}
	return nil
}
// LoadFromDir reads and parses the run manifest stored in dir.
//
// The fixed filename (run_manifest.json) is tried first. If that read fails
// for any reason, the directory is listed and each entry named
// run_manifest_*.json is tried in listing order until one can be read. When
// the directory cannot be listed, the error from the first read is reported;
// when candidates exist but none is readable, the error from the last attempt
// is reported. Read failures are wrapped as "read run manifest", JSON failures
// as "parse run manifest".
func LoadFromDir(dir string) (*RunManifest, error) {
	raw, readErr := fileutil.SecureFileRead(ManifestPath(dir))
	if readErr != nil {
		listing, listErr := os.ReadDir(dir)
		if listErr != nil {
			// Report the original read failure, not the listing failure.
			return nil, fmt.Errorf("read run manifest: %w", readErr)
		}
		for _, item := range listing {
			name := item.Name()
			if !strings.HasPrefix(name, "run_manifest_") || !strings.HasSuffix(name, ".json") {
				continue
			}
			raw, readErr = fileutil.SecureFileRead(filepath.Join(dir, name))
			if readErr == nil {
				break
			}
		}
		if readErr != nil {
			return nil, fmt.Errorf("read run manifest: %w", readErr)
		}
	}

	loaded := new(RunManifest)
	if parseErr := json.Unmarshal(raw, loaded); parseErr != nil {
		return nil, fmt.Errorf("parse run manifest: %w", parseErr)
	}
	return loaded, nil
}
// MarkStarted records t as the time the run started.
//
// Calling it on a nil manifest is a no-op, matching the nil-tolerant behavior
// of the other mutators on RunManifest (AddAnnotation, ApplyNarrativePatch)
// instead of panicking on a nil pointer dereference.
func (m *RunManifest) MarkStarted(t time.Time) {
	if m == nil {
		return
	}
	m.StartedAt = t
}
// MarkFinished records the end of a run.
//
// It stores t as EndedAt and exitCode as given (nil means no exit code was
// recorded). Error is set to execErr's message, or cleared when execErr is
// nil. TotalDurationMS is recomputed as t minus StartedAt, in milliseconds,
// only when StartedAt has been set; otherwise its existing value is kept.
//
// Calling it on a nil manifest is a no-op, matching the nil-tolerant behavior
// of the other mutators on RunManifest (AddAnnotation, ApplyNarrativePatch)
// instead of panicking on a nil pointer dereference.
func (m *RunManifest) MarkFinished(t time.Time, exitCode *int, execErr error) {
	if m == nil {
		return
	}

	m.EndedAt = t
	m.ExitCode = exitCode

	m.Error = ""
	if execErr != nil {
		m.Error = execErr.Error()
	}

	// Without a recorded start there is nothing meaningful to measure.
	if !m.StartedAt.IsZero() {
		m.TotalDurationMS = m.EndedAt.Sub(m.StartedAt).Milliseconds()
	}
}
// AddAnnotation appends a note to the manifest's annotation list.
//
// The note and author are whitespace-trimmed first. Nothing is added when the
// manifest is nil or the trimmed note is empty; an empty author is allowed.
func (m *RunManifest) AddAnnotation(ts time.Time, author, note string) {
	if m == nil {
		return
	}
	if note = strings.TrimSpace(note); note == "" {
		return
	}
	m.Annotations = append(m.Annotations, Annotation{
		Timestamp: ts,
		Author:    strings.TrimSpace(author),
		Note:      note,
	})
}
// ApplyNarrativePatch merges p into the manifest's narrative, creating the
// narrative first if the manifest has none.
//
// Only non-nil patch fields are applied. Text fields are stored
// whitespace-trimmed. A non-nil Tags replaces the whole tag list with the
// trimmed, non-blank entries of the patch, which may leave an empty list.
// A nil manifest is left alone.
func (m *RunManifest) ApplyNarrativePatch(p NarrativePatch) {
	if m == nil {
		return
	}
	if m.Narrative == nil {
		m.Narrative = new(Narrative)
	}
	target := m.Narrative

	// Destination/source pairs, applied in declaration order.
	textFields := []struct {
		dst *string
		src *string
	}{
		{&target.Hypothesis, p.Hypothesis},
		{&target.Context, p.Context},
		{&target.Intent, p.Intent},
		{&target.ExpectedOutcome, p.ExpectedOutcome},
		{&target.ParentRun, p.ParentRun},
		{&target.ExperimentGroup, p.ExperimentGroup},
	}
	for _, field := range textFields {
		if field.src != nil {
			*field.dst = strings.TrimSpace(*field.src)
		}
	}

	if p.Tags == nil {
		return
	}
	kept := make([]string, 0, len(*p.Tags))
	for _, raw := range *p.Tags {
		if tag := strings.TrimSpace(raw); tag != "" {
			kept = append(kept, tag)
		}
	}
	target.Tags = kept
}
// Sign signs the manifest using the provided signer and stores the resulting
// signature, key ID and algorithm on the manifest.
//
// It returns an error when the manifest or the signer is nil, or when the
// signer fails. On error the signature fields are left unchanged.
func (m *RunManifest) Sign(signer *crypto.ManifestSigner) error {
	if m == nil {
		return fmt.Errorf("cannot sign nil manifest")
	}
	// Reject a nil signer up front rather than invoking a method on a nil
	// pointer and risking a panic.
	if signer == nil {
		return fmt.Errorf("cannot sign manifest with nil signer")
	}
	result, err := signer.SignManifest(m)
	if err != nil {
		return fmt.Errorf("failed to sign manifest: %w", err)
	}
	m.Signature = result.Signature
	m.SignerKeyID = result.KeyID
	m.SigAlg = result.Algorithm
	return nil
}
// Verify checks the manifest's stored signature against publicKey.
//
// It fails early with an error when the manifest is nil or carries no
// signature. Otherwise the stored Signature, SignerKeyID and SigAlg are
// repackaged as a crypto.SigningResult and the verdict of
// crypto.VerifyManifest is returned unchanged.
func (m *RunManifest) Verify(publicKey []byte) (bool, error) {
	switch {
	case m == nil:
		return false, fmt.Errorf("cannot verify nil manifest")
	case m.Signature == "":
		return false, fmt.Errorf("manifest has no signature")
	}

	stored := &crypto.SigningResult{
		Algorithm: m.SigAlg,
		KeyID:     m.SignerKeyID,
		Signature: m.Signature,
	}
	return crypto.VerifyManifest(m, stored, publicKey)
}
// IsSigned reports whether the manifest carries a non-empty signature.
// A nil manifest is reported as unsigned.
func (m *RunManifest) IsSigned() bool {
	if m == nil {
		return false
	}
	return m.Signature != ""
}
// Validate checks manifest completeness by running a freshly constructed
// standard Validator over m and returning its result unchanged.
func (m *RunManifest) Validate() error {
	return NewValidator().Validate(m)
}
// ValidateStrict performs strict validation, including optional provenance
// fields, by running a freshly constructed standard Validator's ValidateStrict
// over m and returning its result unchanged.
func (m *RunManifest) ValidateStrict() error {
	return NewValidator().ValidateStrict(m)
}
// ValidateProvenance checks that the manifest carries complete provenance
// information. CanWrite enforces this check unless best-effort mode is
// requested.
//
// Required: an Environment section with non-empty ConfigHash,
// GPUDetectionMethod and SandboxNetworkMode; a non-nil Artifacts section
// (its contents may be empty); and non-empty RunID and TaskID. All gaps are
// collected and reported together in a single error. A nil manifest is
// reported as its own error.
func (m *RunManifest) ValidateProvenance() error {
	if m == nil {
		return fmt.Errorf("manifest is nil")
	}

	var gaps []string
	require := func(absent bool, label string) {
		if absent {
			gaps = append(gaps, label)
		}
	}

	// Environment: the individual fields can only be inspected when the
	// section exists at all.
	if env := m.Environment; env == nil {
		gaps = append(gaps, "environment metadata")
	} else {
		require(env.ConfigHash == "", "config_hash")
		require(env.GPUDetectionMethod == "", "gpu_detection_method")
		require(env.SandboxNetworkMode == "", "sandbox_network_mode")
	}

	require(m.Artifacts == nil, "artifacts metadata")
	require(m.RunID == "", "run_id")
	require(m.TaskID == "", "task_id")

	if len(gaps) == 0 {
		return nil
	}
	return fmt.Errorf("incomplete provenance record: missing %v", gaps)
}
// CanWrite reports whether the manifest may be written under the given
// provenance policy.
//
// Structural validation (Validate) always runs. When provenanceBestEffort is
// false, the manifest must additionally pass ValidateProvenance. When it is
// true, that check is skipped and partial manifests are accepted. Failures
// from either step are returned wrapped.
func (m *RunManifest) CanWrite(provenanceBestEffort bool) error {
	if structErr := m.Validate(); structErr != nil {
		return fmt.Errorf("manifest validation failed: %w", structErr)
	}

	// Complete provenance is required unless best-effort mode is requested.
	if !provenanceBestEffort {
		if provErr := m.ValidateProvenance(); provErr != nil {
			return fmt.Errorf("provenance validation failed (set provenance_best_effort: true to allow partial manifests): %w", provErr)
		}
	}
	return nil
}