fetch_ml/internal/manifest/run_manifest.go
Jeremie Fraeys 8f9bcef754
test(phase-3): prerequisite security and reproducibility tests
Implement 4 prerequisite test requirements:

- TestConfigIntegrityVerification: Config signing, tamper detection, hash stability
- TestManifestFilenameNonce: Cryptographic nonce generation and filename patterns
- TestGPUDetectionAudit: Structured logging of GPU detection at startup
- TestResourceEnvVarParsing: Resource env var parsing and override behavior

Also update manifest run_manifest.go:
- Add nonce-based filename support to WriteToDir
- Add nonce-based file detection to LoadFromDir
2026-02-23 20:25:26 -05:00

404 lines
13 KiB
Go

package manifest
import (
"crypto/rand"
"encoding/hex"
"encoding/json"
"fmt"
"os"
"path/filepath"
"strings"
"time"
"github.com/jfraeys/fetch_ml/internal/crypto"
"github.com/jfraeys/fetch_ml/internal/fileutil"
)
// Manifest filename settings.
const (
// runManifestFilename is the fixed (legacy) manifest filename, used whenever
// no nonce is supplied.
runManifestFilename = "run_manifest.json"
// manifestNonceLength is the number of random bytes in a manifest nonce;
// hex encoding doubles that to the character count noted below.
manifestNonceLength = 16 // 32 hex chars
)
// GenerateManifestNonce generates a cryptographically secure nonce for manifest filenames.
// This prevents information disclosure in multi-tenant environments where predictable
// filenames could be enumerated.
func GenerateManifestNonce() (string, error) {
nonce := make([]byte, manifestNonceLength)
if _, err := rand.Read(nonce); err != nil {
return "", fmt.Errorf("failed to generate manifest nonce: %w", err)
}
return hex.EncodeToString(nonce), nil
}
// GenerateManifestFilename builds a unique manifest filename around a newly
// generated nonce.
//
// The result has the form run_manifest_<nonce>.json. Any error from
// GenerateManifestNonce is returned unchanged together with an empty name.
func GenerateManifestFilename() (string, error) {
	nonce, err := GenerateManifestNonce()
	if err == nil {
		return "run_manifest_" + nonce + ".json", nil
	}
	return "", err
}
// ParseManifestFilename returns the nonce embedded in a manifest filename of
// the form run_manifest_<nonce>.json.
//
// It returns the empty string when filename does not have both the
// "run_manifest_" prefix and the ".json" suffix (this includes the legacy
// name run_manifest.json). The extracted text is not validated, so it is
// whatever sits between prefix and suffix, possibly empty.
func ParseManifestFilename(filename string) string {
	const (
		prefix = "run_manifest_"
		suffix = ".json"
	)
	if strings.HasPrefix(filename, prefix) && strings.HasSuffix(filename, suffix) {
		return strings.TrimSuffix(strings.TrimPrefix(filename, prefix), suffix)
	}
	return ""
}
// Annotation is a timestamped note, with an optional author, attached to a
// run manifest.
type Annotation struct {
	Timestamp time.Time `json:"timestamp"`
	Author    string    `json:"author,omitempty"`
	Note      string    `json:"note"`
}

// UnmarshalJSON decodes an annotation, accepting the time under either the
// "timestamp" key or the "ts" key.
//
// "timestamp" takes precedence when both are present. When neither key is
// present (or both are null) the receiver's Timestamp is left as it was.
// Author and Note are always overwritten with the decoded values.
func (a *Annotation) UnmarshalJSON(data []byte) error {
	// Anonymous wire shape: pointers let us tell "absent" from "zero time".
	var wire struct {
		Timestamp *time.Time `json:"timestamp,omitempty"`
		TS        *time.Time `json:"ts,omitempty"`
		Author    string     `json:"author,omitempty"`
		Note      string     `json:"note"`
	}
	if err := json.Unmarshal(data, &wire); err != nil {
		return err
	}

	switch {
	case wire.Timestamp != nil:
		a.Timestamp = *wire.Timestamp
	case wire.TS != nil:
		a.Timestamp = *wire.TS
	}
	a.Author, a.Note = wire.Author, wire.Note
	return nil
}
// Narrative groups the descriptive text fields of a run (hypothesis, context,
// intent, expected outcome), links to a parent run and experiment group, and
// a list of tags. Every field is optional and is omitted from the JSON output
// when empty.
type Narrative struct {
Hypothesis string `json:"hypothesis,omitempty"`
Context string `json:"context,omitempty"`
Intent string `json:"intent,omitempty"`
ExpectedOutcome string `json:"expected_outcome,omitempty"`
ParentRun string `json:"parent_run,omitempty"`
ExperimentGroup string `json:"experiment_group,omitempty"`
Tags []string `json:"tags,omitempty"`
}
// NarrativePatch is a partial update for a Narrative. Each field is a pointer
// so that "not supplied" (nil) can be told apart from "set to empty":
// RunManifest.ApplyNarrativePatch leaves a field untouched when its pointer is
// nil and overwrites it otherwise.
type NarrativePatch struct {
Hypothesis *string `json:"hypothesis,omitempty"`
Context *string `json:"context,omitempty"`
Intent *string `json:"intent,omitempty"`
ExpectedOutcome *string `json:"expected_outcome,omitempty"`
ParentRun *string `json:"parent_run,omitempty"`
ExperimentGroup *string `json:"experiment_group,omitempty"`
Tags *[]string `json:"tags,omitempty"`
}
// Outcome represents the documented result of a run.
// All fields are optional and are omitted from the JSON output when empty.
// NOTE(review): the allowed Status values and the "3-5" limit on KeyLearnings
// are stated only in the field comments; nothing in this file enforces them.
type Outcome struct {
Status string `json:"status,omitempty"` // validated, invalidated, inconclusive, partial
Summary string `json:"summary,omitempty"` // Brief description
KeyLearnings []string `json:"key_learnings,omitempty"` // 3-5 bullet points max
FollowUpRuns []string `json:"follow_up_runs,omitempty"` // References to related runs
ArtifactsUsed []string `json:"artifacts_used,omitempty"` // e.g., ["model.pt", "metrics.json"]
}
// ArtifactFile describes a single file listed in Artifacts.Files: its path,
// its size in bytes, and its modification time.
// NOTE(review): what Path is relative to is not visible in this file; confirm
// against the code that populates it.
type ArtifactFile struct {
Path string `json:"path"`
SizeBytes int64 `json:"size_bytes"`
Modified time.Time `json:"modified"`
}
// Artifacts is the artifact listing stored in a run manifest: when the
// listing was taken, the files found, their total size, and any paths that
// were excluded from the scan.
// NOTE(review): TotalSizeBytes is presumably the sum of Files[i].SizeBytes;
// it is not computed in this file, so confirm against the producer.
type Artifacts struct {
DiscoveryTime time.Time `json:"discovery_time"`
Files []ArtifactFile `json:"files,omitempty"`
TotalSizeBytes int64 `json:"total_size_bytes,omitempty"`
Exclusions []Exclusion `json:"exclusions,omitempty"` // R.5: Scan exclusions recorded
}
// Exclusion records why a path was excluded from artifact scanning.
// Both fields are always written to JSON (neither is tagged omitempty).
type Exclusion struct {
Path string `json:"path"`
Reason string `json:"reason"`
}
// ExecutionEnvironment captures the runtime environment for reproducibility.
// This enables reconstruction and comparison of runs.
//
// ManifestNonce also drives file naming: when it is non-empty,
// RunManifest.WriteToDir writes to run_manifest_<nonce>.json, not
// the fixed run_manifest.json.
type ExecutionEnvironment struct {
ConfigHash string `json:"config_hash"` // R.2: Resolved config hash
GPUCount int `json:"gpu_count"` // GPU count detected
GPUDetectionMethod string `json:"gpu_detection_method,omitempty"` // R.3: "nvml", "env_override", etc.
GPUVendor string `json:"gpu_vendor,omitempty"` // Configured GPU vendor
MaxWorkers int `json:"max_workers"` // Active resource limits
PodmanCPUs string `json:"podman_cpus,omitempty"` // CPU limit
SandboxNetworkMode string `json:"sandbox_network_mode"` // Sandbox settings
SandboxSeccomp string `json:"sandbox_seccomp,omitempty"` // Seccomp profile
SandboxNoNewPrivs bool `json:"sandbox_no_new_privs"` // Security flags
ComplianceMode string `json:"compliance_mode,omitempty"` // HIPAA mode
ManifestNonce string `json:"manifest_nonce,omitempty"` // Unique manifest identifier
Metadata map[string]string `json:"metadata,omitempty"` // Additional env info
}
// RunManifest is a best-effort, self-contained provenance record for a run.
// WriteToDir stores it as <run_dir>/run_manifest.json, or as
// <run_dir>/run_manifest_<nonce>.json when Environment.ManifestNonce is set.
//
// NOTE(review): StartedAt and EndedAt are time.Time values tagged omitempty.
// encoding/json never treats a struct value as empty, so a zero time is still
// written out and is not omitted.
type RunManifest struct {
// Identity of the run and its creation time; always present in the JSON.
RunID string `json:"run_id"`
TaskID string `json:"task_id"`
JobName string `json:"job_name"`
CreatedAt time.Time `json:"created_at"`
// Lifecycle timestamps, set by MarkStarted and MarkFinished.
StartedAt time.Time `json:"started_at,omitempty"`
EndedAt time.Time `json:"ended_at,omitempty"`
// Optional descriptive sections; nil/empty values are omitted from the JSON.
Annotations []Annotation `json:"annotations,omitempty"`
Narrative *Narrative `json:"narrative,omitempty"`
Outcome *Outcome `json:"outcome,omitempty"`
Artifacts *Artifacts `json:"artifacts,omitempty"`
// Provenance identifiers for code, dependencies, image and data snapshot.
CommitID string `json:"commit_id,omitempty"`
ExperimentManifestSHA string `json:"experiment_manifest_sha,omitempty"`
DepsManifestName string `json:"deps_manifest_name,omitempty"`
DepsManifestSHA string `json:"deps_manifest_sha,omitempty"`
TrainScriptPath string `json:"train_script_path,omitempty"`
WorkerVersion string `json:"worker_version,omitempty"`
PodmanImage string `json:"podman_image,omitempty"`
ImageDigest string `json:"image_digest,omitempty"`
SnapshotID string `json:"snapshot_id,omitempty"`
SnapshotSHA256 string `json:"snapshot_sha256,omitempty"`
// What was executed and how it ended. ExitCode is a pointer so that a
// missing exit code (nil, omitted) is distinguishable from exit code 0.
// Error holds the text of the execution error recorded by MarkFinished.
Command string `json:"command,omitempty"`
Args string `json:"args,omitempty"`
ExitCode *int `json:"exit_code,omitempty"`
Error string `json:"error,omitempty"`
// Phase durations in milliseconds. Within this file only TotalDurationMS is
// computed (by MarkFinished, as EndedAt minus StartedAt).
StagingDurationMS int64 `json:"staging_duration_ms,omitempty"`
ExecutionDurationMS int64 `json:"execution_duration_ms,omitempty"`
FinalizeDurationMS int64 `json:"finalize_duration_ms,omitempty"`
TotalDurationMS int64 `json:"total_duration_ms,omitempty"`
// Where the run executed, plus free-form key/value metadata
// (NewRunManifest initializes Metadata to an empty, non-nil map).
GPUDevices []string `json:"gpu_devices,omitempty"`
WorkerHost string `json:"worker_host,omitempty"`
Metadata map[string]string `json:"metadata,omitempty"`
// Environment captures execution environment for reproducibility (R.1)
Environment *ExecutionEnvironment `json:"environment,omitempty"`
// Signature fields for tamper detection
// (populated by Sign and consumed by Verify).
Signature string `json:"signature,omitempty"`
SignerKeyID string `json:"signer_key_id,omitempty"`
SigAlg string `json:"sig_alg,omitempty"`
}
// NewRunManifest returns a manifest populated with the given run, task and
// job identifiers and creation time.
//
// Metadata is initialized to an empty, non-nil map so callers can assign
// keys right away; every other field keeps its zero value.
func NewRunManifest(runID, taskID, jobName string, createdAt time.Time) *RunManifest {
	manifest := new(RunManifest)
	manifest.RunID = runID
	manifest.TaskID = taskID
	manifest.JobName = jobName
	manifest.CreatedAt = createdAt
	manifest.Metadata = map[string]string{}
	return manifest
}
// ManifestPath returns the path of the legacy, fixed-name manifest
// (run_manifest.json) inside dir.
//
// Deprecated: Use ManifestPathWithNonce for new code to support unique filenames.
func ManifestPath(dir string) string {
	legacyPath := filepath.Join(dir, runManifestFilename)
	return legacyPath
}
// ManifestPathWithNonce returns the manifest path inside dir for the given
// nonce, i.e. <dir>/run_manifest_<nonce>.json.
//
// An empty nonce selects the default fixed filename. The nonce is
// inserted into the filename verbatim and is not validated here, so callers
// must make sure it contains no path separators.
func ManifestPathWithNonce(dir, nonce string) string {
	name := runManifestFilename
	if nonce != "" {
		name = "run_manifest_" + nonce + ".json"
	}
	return filepath.Join(dir, name)
}
// WriteToDir serializes the manifest as indented JSON and writes it into dir
// with file mode 0640.
//
// The filename is run_manifest_<nonce>.json when Environment.ManifestNonce is
// set, and the legacy run_manifest.json otherwise.
//
// The nonce becomes part of the filename, and it may originate from a
// manifest that was loaded from disk, so it is rejected when it contains a
// path separator: otherwise filepath.Join would resolve "../" segments and
// the write could land outside dir.
//
// It returns an error when m is nil, when the nonce is invalid, when
// marshaling fails, or when the underlying secure write fails.
func (m *RunManifest) WriteToDir(dir string) error {
	if m == nil {
		return fmt.Errorf("run manifest is nil")
	}

	var nonce string
	if m.Environment != nil {
		nonce = m.Environment.ManifestNonce
	}
	// Both separator styles are refused regardless of platform; generated
	// nonces are plain hex and never contain either.
	if strings.ContainsAny(nonce, "/\\") {
		return fmt.Errorf("write run manifest: manifest nonce %q contains a path separator", nonce)
	}

	data, err := json.MarshalIndent(m, "", " ")
	if err != nil {
		return fmt.Errorf("marshal run manifest: %w", err)
	}

	// ManifestPathWithNonce falls back to the legacy filename for an empty nonce.
	manifestPath := ManifestPathWithNonce(dir, nonce)
	if err := fileutil.SecureFileWrite(manifestPath, data, 0640); err != nil {
		return fmt.Errorf("write run manifest: %w", err)
	}
	return nil
}
// LoadFromDir reads and parses the run manifest stored in dir.
//
// The legacy fixed filename is tried first. If that read fails for any
// reason, the directory is listed and each entry named
// run_manifest_<something>.json is tried in listing order until one can be
// read. The returned error wraps the legacy read error when the directory
// cannot be listed or holds no candidate, and the last candidate's read error
// otherwise. A JSON decoding failure is reported as a parse error.
func LoadFromDir(dir string) (*RunManifest, error) {
	payload, err := fileutil.SecureFileRead(ManifestPath(dir))
	if err != nil {
		listing, listErr := os.ReadDir(dir)
		if listErr != nil {
			// Report the original read failure, not the listing failure.
			return nil, fmt.Errorf("read run manifest: %w", err)
		}
		for _, item := range listing {
			name := item.Name()
			if !strings.HasPrefix(name, "run_manifest_") || !strings.HasSuffix(name, ".json") {
				continue
			}
			if payload, err = fileutil.SecureFileRead(filepath.Join(dir, name)); err == nil {
				break
			}
		}
		if err != nil {
			return nil, fmt.Errorf("read run manifest: %w", err)
		}
	}

	manifest := new(RunManifest)
	if err := json.Unmarshal(payload, manifest); err != nil {
		return nil, fmt.Errorf("parse run manifest: %w", err)
	}
	return manifest, nil
}
// MarkStarted stores t as the manifest's StartedAt time.
//
// Calling it on a nil manifest is a no-op, consistent with AddAnnotation and
// ApplyNarrativePatch, so a nil receiver no longer causes a panic.
func (m *RunManifest) MarkStarted(t time.Time) {
	if m == nil {
		return
	}
	m.StartedAt = t
}
// MarkFinished records the end of a run.
//
// It stores t as EndedAt and exitCode as ExitCode (nil is kept as nil), and
// sets Error to execErr's message, or clears it when execErr is nil.
// TotalDurationMS is recomputed as EndedAt minus StartedAt only when
// StartedAt has been set; otherwise it is left unchanged.
//
// Calling it on a nil manifest is a no-op, consistent with AddAnnotation and
// ApplyNarrativePatch, so a nil receiver no longer causes a panic.
func (m *RunManifest) MarkFinished(t time.Time, exitCode *int, execErr error) {
	if m == nil {
		return
	}
	m.EndedAt = t
	m.ExitCode = exitCode

	m.Error = ""
	if execErr != nil {
		m.Error = execErr.Error()
	}

	// Without a recorded start there is nothing meaningful to subtract from.
	if !m.StartedAt.IsZero() {
		m.TotalDurationMS = m.EndedAt.Sub(m.StartedAt).Milliseconds()
	}
}
// AddAnnotation appends a note to the manifest's annotations.
//
// Both note and author are trimmed of surrounding whitespace. Nothing is
// appended when the manifest is nil or the trimmed note is empty.
func (m *RunManifest) AddAnnotation(ts time.Time, author, note string) {
	text := strings.TrimSpace(note)
	if m == nil || text == "" {
		return
	}
	m.Annotations = append(m.Annotations, Annotation{
		Timestamp: ts,
		Author:    strings.TrimSpace(author),
		Note:      text,
	})
}
// ApplyNarrativePatch merges p into the manifest's Narrative.
//
// A Narrative is allocated if the manifest has none. Each non-nil string
// field of p replaces the corresponding Narrative field after trimming
// surrounding whitespace; nil fields leave the current value untouched. A
// non-nil Tags replaces the tag list with the trimmed, non-empty entries
// (an empty but non-nil slice if none survive). A nil manifest is a no-op.
func (m *RunManifest) ApplyNarrativePatch(p NarrativePatch) {
	if m == nil {
		return
	}
	if m.Narrative == nil {
		m.Narrative = new(Narrative)
	}
	nar := m.Narrative

	// Source pointer from the patch paired with the field it overwrites.
	textFields := []struct {
		from *string
		to   *string
	}{
		{p.Hypothesis, &nar.Hypothesis},
		{p.Context, &nar.Context},
		{p.Intent, &nar.Intent},
		{p.ExpectedOutcome, &nar.ExpectedOutcome},
		{p.ParentRun, &nar.ParentRun},
		{p.ExperimentGroup, &nar.ExperimentGroup},
	}
	for _, f := range textFields {
		if f.from != nil {
			*f.to = strings.TrimSpace(*f.from)
		}
	}

	if p.Tags == nil {
		return
	}
	kept := make([]string, 0, len(*p.Tags))
	for _, raw := range *p.Tags {
		if tag := strings.TrimSpace(raw); tag != "" {
			kept = append(kept, tag)
		}
	}
	nar.Tags = kept
}
// Sign signs the manifest using the provided signer and stores the resulting
// signature, key ID and algorithm in Signature, SignerKeyID and SigAlg.
//
// It returns an error when the manifest or the signer is nil, or when the
// signer fails; in those cases the signature fields are left unchanged.
func (m *RunManifest) Sign(signer *crypto.ManifestSigner) error {
	if m == nil {
		return fmt.Errorf("cannot sign nil manifest")
	}
	// Report a missing signer as an error; do not call a method on a nil pointer.
	if signer == nil {
		return fmt.Errorf("cannot sign manifest with nil signer")
	}

	result, err := signer.SignManifest(m)
	if err != nil {
		return fmt.Errorf("failed to sign manifest: %w", err)
	}

	m.Signature = result.Signature
	m.SignerKeyID = result.KeyID
	m.SigAlg = result.Algorithm
	return nil
}
// Verify checks the manifest's stored signature against publicKey.
//
// It fails with an error when the manifest is nil or carries no signature.
// Otherwise the stored Signature, SignerKeyID and SigAlg are packed into a
// crypto.SigningResult and the decision is delegated to
// crypto.VerifyManifest, whose result is returned as-is.
func (m *RunManifest) Verify(publicKey []byte) (bool, error) {
	switch {
	case m == nil:
		return false, fmt.Errorf("cannot verify nil manifest")
	case m.Signature == "":
		return false, fmt.Errorf("manifest has no signature")
	}

	stored := crypto.SigningResult{
		Signature: m.Signature,
		KeyID:     m.SignerKeyID,
		Algorithm: m.SigAlg,
	}
	return crypto.VerifyManifest(m, &stored, publicKey)
}
// IsSigned reports whether the manifest carries a non-empty signature.
// A nil manifest is reported as unsigned.
func (m *RunManifest) IsSigned() bool {
	if m == nil {
		return false
	}
	return m.Signature != ""
}
// Validate checks the manifest with a freshly constructed standard Validator.
// It is a thin wrapper around Validator.Validate, kept so both entry points
// apply the same rules.
func (m *RunManifest) Validate() error {
	return NewValidator().Validate(m)
}
// ValidateStrict runs the strict validation, which also covers the optional
// provenance fields, using a freshly constructed standard Validator.
// It is a thin wrapper around Validator.ValidateStrict.
func (m *RunManifest) ValidateStrict() error {
	return NewValidator().ValidateStrict(m)
}