Created integrity package with extracted data utilities: 1. internal/worker/integrity/hash.go (113 lines) - FileSHA256Hex() - SHA256 hash of single file - NormalizeSHA256ChecksumHex() - Checksum normalization - DirOverallSHA256Hex() - Directory hash (sequential) - DirOverallSHA256HexParallel() - Directory hash (parallel workers) 2. internal/worker/integrity/validate.go (76 lines) - DatasetVerifier type for dataset validation - VerifyDatasetSpecs() method for checksum validation - ProvenanceCalculator type for provenance computation - ComputeProvenance() method for task provenance Note: Used 'integrity' instead of 'data' due to .gitignore conflict (data/ directory is ignored for experiment artifacts) Functions extracted from data_integrity.go: - fileSHA256Hex → FileSHA256Hex - normalizeSHA256ChecksumHex → NormalizeSHA256ChecksumHex - dirOverallSHA256HexGo → DirOverallSHA256Hex - dirOverallSHA256HexParallel → DirOverallSHA256HexParallel - verifyDatasetSpecs logic → DatasetVerifier - computeTaskProvenance logic → ProvenanceCalculator Build status: Compiles successfully
121 lines
2.9 KiB
Go
// Package integrity provides data integrity and validation utilities
|
|
package integrity
|
|
|
|
import (
|
|
"fmt"
|
|
"path/filepath"
|
|
"strings"
|
|
|
|
"github.com/jfraeys/fetch_ml/internal/container"
|
|
"github.com/jfraeys/fetch_ml/internal/queue"
|
|
)
|
|
|
|
// DatasetVerifier validates dataset specifications
type DatasetVerifier struct {
	// dataDir is the root directory under which named dataset
	// directories are expected to live; dataset names are joined
	// onto it when verifying checksums.
	dataDir string
}
|
|
|
|
// NewDatasetVerifier creates a new dataset verifier
|
|
func NewDatasetVerifier(dataDir string) *DatasetVerifier {
|
|
return &DatasetVerifier{dataDir: dataDir}
|
|
}
|
|
|
|
// VerifyDatasetSpecs validates dataset checksums
|
|
func (v *DatasetVerifier) VerifyDatasetSpecs(task *queue.Task) error {
|
|
if task == nil {
|
|
return fmt.Errorf("task is nil")
|
|
}
|
|
if len(task.DatasetSpecs) == 0 {
|
|
return nil
|
|
}
|
|
|
|
for _, ds := range task.DatasetSpecs {
|
|
want, err := NormalizeSHA256ChecksumHex(ds.Checksum)
|
|
if err != nil {
|
|
return fmt.Errorf("dataset %q: invalid checksum: %w", ds.Name, err)
|
|
}
|
|
if want == "" {
|
|
continue
|
|
}
|
|
if err := container.ValidateJobName(ds.Name); err != nil {
|
|
return fmt.Errorf("dataset %q: invalid name: %w", ds.Name, err)
|
|
}
|
|
path := filepath.Join(v.dataDir, ds.Name)
|
|
got, err := DirOverallSHA256Hex(path)
|
|
if err != nil {
|
|
return fmt.Errorf("dataset %q: checksum verification failed: %w", ds.Name, err)
|
|
}
|
|
if got != want {
|
|
return fmt.Errorf("dataset %q: checksum mismatch: expected %s, got %s", ds.Name, want, got)
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// ProvenanceCalculator computes task provenance information
type ProvenanceCalculator struct {
	// basePath is retained for future provenance lookups; the current
	// ComputeProvenance implementation does not read it.
	// NOTE(review): confirm whether basePath is still needed.
	basePath string
}
|
|
|
|
// NewProvenanceCalculator creates a new provenance calculator
|
|
func NewProvenanceCalculator(basePath string) *ProvenanceCalculator {
|
|
return &ProvenanceCalculator{basePath: basePath}
|
|
}
|
|
|
|
// ComputeProvenance calculates provenance for a task
|
|
func (pc *ProvenanceCalculator) ComputeProvenance(task *queue.Task) (map[string]string, error) {
|
|
if task == nil {
|
|
return nil, fmt.Errorf("task is nil")
|
|
}
|
|
out := map[string]string{}
|
|
|
|
if task.SnapshotID != "" {
|
|
out["snapshot_id"] = task.SnapshotID
|
|
}
|
|
|
|
datasets := pc.resolveDatasets(task)
|
|
if len(datasets) > 0 {
|
|
out["datasets"] = strings.Join(datasets, ",")
|
|
}
|
|
|
|
// Note: Additional provenance fields would require access to experiment manager
|
|
// This is kept minimal to avoid tight coupling
|
|
|
|
return out, nil
|
|
}
|
|
|
|
func (pc *ProvenanceCalculator) resolveDatasets(task *queue.Task) []string {
|
|
if task == nil {
|
|
return nil
|
|
}
|
|
if len(task.DatasetSpecs) > 0 {
|
|
out := make([]string, 0, len(task.DatasetSpecs))
|
|
for _, ds := range task.DatasetSpecs {
|
|
if ds.Name != "" {
|
|
out = append(out, ds.Name)
|
|
}
|
|
}
|
|
if len(out) > 0 {
|
|
return out
|
|
}
|
|
}
|
|
if len(task.Datasets) > 0 {
|
|
return task.Datasets
|
|
}
|
|
return parseDatasetsFromArgs(task.Args)
|
|
}
|
|
|
|
// parseDatasetsFromArgs extracts a dataset list from a raw argument string.
//
// It recognizes both the space-separated form ("--datasets a,b") and the
// equals form ("--datasets=a,b") and returns the comma-separated dataset
// names as a slice. It returns nil when no --datasets flag is present or
// the flag has no value. Previously the equals form passed the Contains
// pre-check but never matched the exact-field comparison, so it was
// silently dropped.
func parseDatasetsFromArgs(args string) []string {
	// Cheap pre-check so the common no-flag case skips tokenization.
	if !strings.Contains(args, "--datasets") {
		return nil
	}

	const eqPrefix = "--datasets="
	parts := strings.Fields(args)
	for i, part := range parts {
		// Space-separated form: "--datasets a,b"
		if part == "--datasets" && i+1 < len(parts) {
			return strings.Split(parts[i+1], ",")
		}
		// Equals form: "--datasets=a,b" (value must be non-empty)
		if strings.HasPrefix(part, eqPrefix) && len(part) > len(eqPrefix) {
			return strings.Split(part[len(eqPrefix):], ",")
		}
	}

	return nil
}
|