Update Jupyter integration for security and scheduler support: - Enhanced security configuration with audit logging - Health monitoring with scheduler event integration - Package manager with network policy enforcement - Service manager with lifecycle hooks - Network manager with tenant isolation - Workspace metadata with tenant tags - Config with resource limits - Podman container integration improvements - Experiment manager with tracking integration - Manifest runner with security checks
465 lines
15 KiB
Go
465 lines
15 KiB
Go
package manifest
|
|
|
|
import (
|
|
"crypto/rand"
|
|
"encoding/hex"
|
|
"encoding/json"
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/jfraeys/fetch_ml/internal/crypto"
|
|
"github.com/jfraeys/fetch_ml/internal/fileutil"
|
|
)
|
|
|
|
const (
	// runManifestFilename is the legacy fixed manifest filename, used when no nonce is supplied.
	runManifestFilename = "run_manifest.json"
	// manifestNonceLength is the number of random bytes drawn for a manifest nonce.
	manifestNonceLength = 16 // 32 hex chars
)
|
|
|
|
// GenerateManifestNonce generates a cryptographically secure nonce for manifest filenames.
|
|
// This prevents information disclosure in multi-tenant environments where predictable
|
|
// filenames could be enumerated.
|
|
func GenerateManifestNonce() (string, error) {
|
|
nonce := make([]byte, manifestNonceLength)
|
|
if _, err := rand.Read(nonce); err != nil {
|
|
return "", fmt.Errorf("failed to generate manifest nonce: %w", err)
|
|
}
|
|
return hex.EncodeToString(nonce), nil
|
|
}
|
|
|
|
// GenerateManifestFilename creates a unique manifest filename with a cryptographic nonce.
|
|
// Format: run_manifest_<nonce>.json
|
|
func GenerateManifestFilename() (string, error) {
|
|
nonce, err := GenerateManifestNonce()
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
return fmt.Sprintf("run_manifest_%s.json", nonce), nil
|
|
}
|
|
|
|
// ParseManifestFilename returns the nonce embedded in a filename of the form
// run_manifest_<nonce>.json.
// It returns the empty string when the filename does not have that shape, and
// also when the nonce portion itself is empty.
func ParseManifestFilename(filename string) string {
	const (
		prefix = "run_manifest_"
		suffix = ".json"
	)
	if strings.HasPrefix(filename, prefix) && strings.HasSuffix(filename, suffix) {
		return strings.TrimSuffix(strings.TrimPrefix(filename, prefix), suffix)
	}
	return ""
}
|
|
|
|
// Annotation is a timestamped free-text note attached to a run manifest
// (see RunManifest.Annotations). Author is optional and omitted from JSON
// when empty.
type Annotation struct {
	Timestamp time.Time `json:"timestamp"`
	Author    string    `json:"author,omitempty"`
	Note      string    `json:"note"`
}
|
|
|
|
func (a *Annotation) UnmarshalJSON(data []byte) error {
|
|
type annotationWire struct {
|
|
Timestamp *time.Time `json:"timestamp,omitempty"`
|
|
TS *time.Time `json:"ts,omitempty"`
|
|
Author string `json:"author,omitempty"`
|
|
Note string `json:"note"`
|
|
}
|
|
var w annotationWire
|
|
if err := json.Unmarshal(data, &w); err != nil {
|
|
return err
|
|
}
|
|
if w.Timestamp != nil {
|
|
a.Timestamp = *w.Timestamp
|
|
} else if w.TS != nil {
|
|
a.Timestamp = *w.TS
|
|
}
|
|
a.Author = w.Author
|
|
a.Note = w.Note
|
|
return nil
|
|
}
|
|
|
|
// Narrative holds the human-authored description of a run: the hypothesis,
// context, intent and expected outcome, plus links to a parent run and an
// experiment group, and free-form tags. Every field is optional and omitted
// from JSON when empty.
type Narrative struct {
	Hypothesis      string   `json:"hypothesis,omitempty"`
	Context         string   `json:"context,omitempty"`
	Intent          string   `json:"intent,omitempty"`
	ExpectedOutcome string   `json:"expected_outcome,omitempty"`
	ParentRun       string   `json:"parent_run,omitempty"`
	ExperimentGroup string   `json:"experiment_group,omitempty"`
	Tags            []string `json:"tags,omitempty"`
}
|
|
|
|
// NarrativePatch is a partial update to a Narrative. Each field is a pointer:
// a nil field leaves the corresponding Narrative field untouched, while a
// non-nil field replaces it (see RunManifest.ApplyNarrativePatch).
type NarrativePatch struct {
	Hypothesis      *string   `json:"hypothesis,omitempty"`
	Context         *string   `json:"context,omitempty"`
	Intent          *string   `json:"intent,omitempty"`
	ExpectedOutcome *string   `json:"expected_outcome,omitempty"`
	ParentRun       *string   `json:"parent_run,omitempty"`
	ExperimentGroup *string   `json:"experiment_group,omitempty"`
	Tags            *[]string `json:"tags,omitempty"`
}
|
|
|
|
// Outcome represents the documented result of a run.
// All fields are optional and omitted from JSON when empty.
type Outcome struct {
	Status        string   `json:"status,omitempty"`         // validated, invalidated, inconclusive, partial
	Summary       string   `json:"summary,omitempty"`        // Brief description
	KeyLearnings  []string `json:"key_learnings,omitempty"`  // 3-5 bullet points max
	FollowUpRuns  []string `json:"follow_up_runs,omitempty"` // References to related runs
	ArtifactsUsed []string `json:"artifacts_used,omitempty"` // e.g., ["model.pt", "metrics.json"]
}
|
|
|
|
// ArtifactFile describes a single file recorded in Artifacts.Files: its path,
// size in bytes, and modification time.
// NOTE(review): whether Path is relative to the run directory is not visible
// here — confirm against the artifact scanner.
type ArtifactFile struct {
	Modified  time.Time `json:"modified"`
	Path      string    `json:"path"`
	SizeBytes int64     `json:"size_bytes"`
}
|
|
|
|
// Artifacts is the artifact listing stored on a RunManifest: the files found,
// the paths that were excluded (with reasons), the time of discovery, and a
// total size in bytes.
// NOTE(review): TotalSizeBytes is presumably the sum of Files[i].SizeBytes —
// confirm against the code that populates it.
type Artifacts struct {
	DiscoveryTime  time.Time      `json:"discovery_time"`
	Files          []ArtifactFile `json:"files,omitempty"`
	Exclusions     []Exclusion    `json:"exclusions,omitempty"`
	TotalSizeBytes int64          `json:"total_size_bytes,omitempty"`
}
|
|
|
|
// Exclusion records why a path was excluded from artifact scanning.
// Both fields are always emitted in JSON.
type Exclusion struct {
	Path   string `json:"path"`
	Reason string `json:"reason"`
}
|
|
|
|
// ExecutionEnvironment captures the runtime environment for reproducibility.
// This enables reconstruction and comparison of runs.
//
// ConfigHash, GPUDetectionMethod and SandboxNetworkMode are required by
// RunManifest.ValidateProvenance. ManifestNonce, when non-empty, selects the
// nonce-based manifest filename in RunManifest.WriteToDir.
type ExecutionEnvironment struct {
	Metadata           map[string]string `json:"metadata,omitempty"`
	ConfigHash         string            `json:"config_hash"`
	GPUDetectionMethod string            `json:"gpu_detection_method,omitempty"`
	GPUVendor          string            `json:"gpu_vendor,omitempty"`
	PodmanCPUs         string            `json:"podman_cpus,omitempty"`
	SandboxNetworkMode string            `json:"sandbox_network_mode"`
	SandboxSeccomp     string            `json:"sandbox_seccomp,omitempty"`
	ComplianceMode     string            `json:"compliance_mode,omitempty"`
	ManifestNonce      string            `json:"manifest_nonce,omitempty"`
	GPUCount           int               `json:"gpu_count"`
	MaxWorkers         int               `json:"max_workers"`
	SandboxNoNewPrivs  bool              `json:"sandbox_no_new_privs"`
}
|
|
|
|
// RunManifest is a best-effort, self-contained provenance record for a run.
// It is written to <run_dir>/run_manifest.json, or to
// <run_dir>/run_manifest_<nonce>.json when Environment.ManifestNonce is set
// (see WriteToDir).
//
// RunID, TaskID, JobName and CreatedAt are always emitted in JSON; the other
// fields carry omitempty. SigAlg, SignerKeyID and Signature are filled in by
// Sign and read back by Verify.
//
// NOTE(review): StartedAt and EndedAt are time.Time values tagged omitempty;
// encoding/json does not treat a zero struct value as empty, so they are
// presumably still emitted when unset — confirm this is intended.
type RunManifest struct {
	CreatedAt             time.Time             `json:"created_at"`
	StartedAt             time.Time             `json:"started_at,omitempty"`
	EndedAt               time.Time             `json:"ended_at,omitempty"`
	Artifacts             *Artifacts            `json:"artifacts,omitempty"`
	Environment           *ExecutionEnvironment `json:"environment,omitempty"`
	Metadata              map[string]string     `json:"metadata,omitempty"`
	ExitCode              *int                  `json:"exit_code,omitempty"`
	Narrative             *Narrative            `json:"narrative,omitempty"`
	Outcome               *Outcome              `json:"outcome,omitempty"`
	PodmanImage           string                `json:"podman_image,omitempty"`
	SnapshotID            string                `json:"snapshot_id,omitempty"`
	ExperimentManifestSHA string                `json:"experiment_manifest_sha,omitempty"`
	DepsManifestName      string                `json:"deps_manifest_name,omitempty"`
	DepsManifestSHA       string                `json:"deps_manifest_sha,omitempty"`
	TrainScriptPath       string                `json:"train_script_path,omitempty"`
	WorkerVersion         string                `json:"worker_version,omitempty"`
	RunID                 string                `json:"run_id"`
	ImageDigest           string                `json:"image_digest,omitempty"`
	WorkerHost            string                `json:"worker_host,omitempty"`
	SnapshotSHA256        string                `json:"snapshot_sha256,omitempty"`
	Command               string                `json:"command,omitempty"`
	Args                  string                `json:"args,omitempty"`
	CommitID              string                `json:"commit_id,omitempty"`
	Error                 string                `json:"error,omitempty"`
	SigAlg                string                `json:"sig_alg,omitempty"`
	SignerKeyID           string                `json:"signer_key_id,omitempty"`
	Signature             string                `json:"signature,omitempty"`
	TaskID                string                `json:"task_id"`
	JobName               string                `json:"job_name"`
	Annotations           []Annotation          `json:"annotations,omitempty"`
	GPUDevices            []string              `json:"gpu_devices,omitempty"`
	TotalDurationMS       int64                 `json:"total_duration_ms,omitempty"`
	FinalizeDurationMS    int64                 `json:"finalize_duration_ms,omitempty"`
	ExecutionDurationMS   int64                 `json:"execution_duration_ms,omitempty"`
	StagingDurationMS     int64                 `json:"staging_duration_ms,omitempty"`
}
|
|
|
|
func NewRunManifest(runID, taskID, jobName string, createdAt time.Time) *RunManifest {
|
|
m := &RunManifest{
|
|
RunID: runID,
|
|
TaskID: taskID,
|
|
JobName: jobName,
|
|
CreatedAt: createdAt,
|
|
Metadata: make(map[string]string),
|
|
}
|
|
return m
|
|
}
|
|
|
|
// ManifestPath returns the default manifest path (legacy fixed filename).
|
|
// Deprecated: Use ManifestPathWithNonce for new code to support unique filenames.
|
|
func ManifestPath(dir string) string {
|
|
return filepath.Join(dir, runManifestFilename)
|
|
}
|
|
|
|
// ManifestPathWithNonce returns the manifest path with a unique nonce.
|
|
// If nonce is empty, falls back to the default filename.
|
|
func ManifestPathWithNonce(dir, nonce string) string {
|
|
if nonce == "" {
|
|
return filepath.Join(dir, runManifestFilename)
|
|
}
|
|
filename := fmt.Sprintf("run_manifest_%s.json", nonce)
|
|
return filepath.Join(dir, filename)
|
|
}
|
|
|
|
func (m *RunManifest) WriteToDir(dir string) error {
|
|
if m == nil {
|
|
return fmt.Errorf("run manifest is nil")
|
|
}
|
|
data, err := json.MarshalIndent(m, "", " ")
|
|
if err != nil {
|
|
return fmt.Errorf("marshal run manifest: %w", err)
|
|
}
|
|
|
|
// Use nonce-based filename if Environment.ManifestNonce is set
|
|
var manifestPath string
|
|
if m.Environment != nil && m.Environment.ManifestNonce != "" {
|
|
manifestPath = ManifestPathWithNonce(dir, m.Environment.ManifestNonce)
|
|
} else {
|
|
manifestPath = ManifestPath(dir)
|
|
}
|
|
|
|
if err := fileutil.SecureFileWrite(manifestPath, data, 0640); err != nil {
|
|
return fmt.Errorf("write run manifest: %w", err)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func LoadFromDir(dir string) (*RunManifest, error) {
|
|
// Try standard filename first
|
|
data, err := fileutil.SecureFileRead(ManifestPath(dir))
|
|
if err != nil {
|
|
// If not found, look for nonce-based filename
|
|
entries, readErr := os.ReadDir(dir)
|
|
if readErr != nil {
|
|
return nil, fmt.Errorf("read run manifest: %w", err)
|
|
}
|
|
|
|
for _, entry := range entries {
|
|
if strings.HasPrefix(entry.Name(), "run_manifest_") && strings.HasSuffix(entry.Name(), ".json") {
|
|
data, err = fileutil.SecureFileRead(filepath.Join(dir, entry.Name()))
|
|
if err == nil {
|
|
break
|
|
}
|
|
}
|
|
}
|
|
|
|
if err != nil {
|
|
return nil, fmt.Errorf("read run manifest: %w", err)
|
|
}
|
|
}
|
|
|
|
var m RunManifest
|
|
if err := json.Unmarshal(data, &m); err != nil {
|
|
return nil, fmt.Errorf("parse run manifest: %w", err)
|
|
}
|
|
return &m, nil
|
|
}
|
|
|
|
func (m *RunManifest) MarkStarted(t time.Time) {
|
|
m.StartedAt = t
|
|
}
|
|
|
|
func (m *RunManifest) MarkFinished(t time.Time, exitCode *int, execErr error) {
|
|
m.EndedAt = t
|
|
m.ExitCode = exitCode
|
|
if execErr != nil {
|
|
m.Error = execErr.Error()
|
|
} else {
|
|
m.Error = ""
|
|
}
|
|
if !m.StartedAt.IsZero() {
|
|
m.TotalDurationMS = m.EndedAt.Sub(m.StartedAt).Milliseconds()
|
|
}
|
|
}
|
|
|
|
func (m *RunManifest) AddAnnotation(ts time.Time, author, note string) {
|
|
if m == nil {
|
|
return
|
|
}
|
|
n := strings.TrimSpace(note)
|
|
if n == "" {
|
|
return
|
|
}
|
|
a := Annotation{
|
|
Timestamp: ts,
|
|
Author: strings.TrimSpace(author),
|
|
Note: n,
|
|
}
|
|
m.Annotations = append(m.Annotations, a)
|
|
}
|
|
|
|
func (m *RunManifest) ApplyNarrativePatch(p NarrativePatch) {
|
|
if m == nil {
|
|
return
|
|
}
|
|
if m.Narrative == nil {
|
|
m.Narrative = &Narrative{}
|
|
}
|
|
if p.Hypothesis != nil {
|
|
m.Narrative.Hypothesis = strings.TrimSpace(*p.Hypothesis)
|
|
}
|
|
if p.Context != nil {
|
|
m.Narrative.Context = strings.TrimSpace(*p.Context)
|
|
}
|
|
if p.Intent != nil {
|
|
m.Narrative.Intent = strings.TrimSpace(*p.Intent)
|
|
}
|
|
if p.ExpectedOutcome != nil {
|
|
m.Narrative.ExpectedOutcome = strings.TrimSpace(*p.ExpectedOutcome)
|
|
}
|
|
if p.ParentRun != nil {
|
|
m.Narrative.ParentRun = strings.TrimSpace(*p.ParentRun)
|
|
}
|
|
if p.ExperimentGroup != nil {
|
|
m.Narrative.ExperimentGroup = strings.TrimSpace(*p.ExperimentGroup)
|
|
}
|
|
if p.Tags != nil {
|
|
clean := make([]string, 0, len(*p.Tags))
|
|
for _, t := range *p.Tags {
|
|
t = strings.TrimSpace(t)
|
|
if t == "" {
|
|
continue
|
|
}
|
|
clean = append(clean, t)
|
|
}
|
|
m.Narrative.Tags = clean
|
|
}
|
|
}
|
|
|
|
// Sign signs the manifest using the provided signer
|
|
func (m *RunManifest) Sign(signer *crypto.ManifestSigner) error {
|
|
if m == nil {
|
|
return fmt.Errorf("cannot sign nil manifest")
|
|
}
|
|
|
|
result, err := signer.SignManifest(m)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to sign manifest: %w", err)
|
|
}
|
|
|
|
m.Signature = result.Signature
|
|
m.SignerKeyID = result.KeyID
|
|
m.SigAlg = result.Algorithm
|
|
return nil
|
|
}
|
|
|
|
// Verify verifies the manifest signature using the provided public key
|
|
func (m *RunManifest) Verify(publicKey []byte) (bool, error) {
|
|
if m == nil {
|
|
return false, fmt.Errorf("cannot verify nil manifest")
|
|
}
|
|
|
|
if m.Signature == "" {
|
|
return false, fmt.Errorf("manifest has no signature")
|
|
}
|
|
|
|
// Build signing result from manifest fields
|
|
result := &crypto.SigningResult{
|
|
Signature: m.Signature,
|
|
KeyID: m.SignerKeyID,
|
|
Algorithm: m.SigAlg,
|
|
}
|
|
|
|
// Call crypto package to verify
|
|
return crypto.VerifyManifest(m, result, publicKey)
|
|
}
|
|
|
|
// IsSigned returns true if the manifest has a signature
|
|
func (m *RunManifest) IsSigned() bool {
|
|
return m != nil && m.Signature != ""
|
|
}
|
|
|
|
// Validate checks manifest completeness using the standard Validator.
|
|
// This delegates to Validator.Validate() for consistency.
|
|
func (m *RunManifest) Validate() error {
|
|
v := NewValidator()
|
|
return v.Validate(m)
|
|
}
|
|
|
|
// ValidateStrict performs strict validation including optional provenance fields.
|
|
// This delegates to Validator.ValidateStrict() for consistency.
|
|
func (m *RunManifest) ValidateStrict() error {
|
|
v := NewValidator()
|
|
return v.ValidateStrict(m)
|
|
}
|
|
|
|
// ValidateProvenance checks if the manifest has complete provenance information.
|
|
// When ProvenanceBestEffort is false (default), this must pass before writing.
|
|
// Returns an error describing what's missing if validation fails.
|
|
func (m *RunManifest) ValidateProvenance() error {
|
|
if m == nil {
|
|
return fmt.Errorf("manifest is nil")
|
|
}
|
|
|
|
var missing []string
|
|
|
|
// Check Environment is present
|
|
if m.Environment == nil {
|
|
missing = append(missing, "environment metadata")
|
|
} else {
|
|
// Check ConfigHash - critical for reproducibility
|
|
if m.Environment.ConfigHash == "" {
|
|
missing = append(missing, "config_hash")
|
|
}
|
|
|
|
// Check GPU detection method
|
|
if m.Environment.GPUDetectionMethod == "" {
|
|
missing = append(missing, "gpu_detection_method")
|
|
}
|
|
|
|
// Check Sandbox configuration
|
|
if m.Environment.SandboxNetworkMode == "" {
|
|
missing = append(missing, "sandbox_network_mode")
|
|
}
|
|
}
|
|
|
|
// Check Artifacts are present (though they may be empty for new runs)
|
|
if m.Artifacts == nil {
|
|
missing = append(missing, "artifacts metadata")
|
|
}
|
|
|
|
// Check basic run identification
|
|
if m.RunID == "" {
|
|
missing = append(missing, "run_id")
|
|
}
|
|
if m.TaskID == "" {
|
|
missing = append(missing, "task_id")
|
|
}
|
|
|
|
if len(missing) > 0 {
|
|
return fmt.Errorf("incomplete provenance record: missing %v", missing)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// CanWrite checks if the manifest can be written given the provenance requirements.
|
|
// When ProvenanceBestEffort is false (default), requires complete environment capture.
|
|
// When true, allows partial manifests with warnings.
|
|
func (m *RunManifest) CanWrite(provenanceBestEffort bool) error {
|
|
// Always validate basic structure
|
|
if err := m.Validate(); err != nil {
|
|
return fmt.Errorf("manifest validation failed: %w", err)
|
|
}
|
|
|
|
// If best-effort is enabled, allow partial manifests
|
|
if provenanceBestEffort {
|
|
return nil
|
|
}
|
|
|
|
// Otherwise, require complete provenance (fail-closed default)
|
|
if err := m.ValidateProvenance(); err != nil {
|
|
return fmt.Errorf("provenance validation failed (set provenance_best_effort: true to allow partial manifests): %w", err)
|
|
}
|
|
|
|
return nil
|
|
}
|