fetch_ml/internal/worker/artifacts.go
Jeremie Fraeys f71352202e
test(phase-1-2): naming alignment and partial test completion
Rename and enhance existing tests to align with coverage map:
- TestGPUDetectorAMDVendorAlias -> TestAMDAliasManifestRecord
- TestScanArtifacts_SkipsKnownPathsAndLogs -> TestScanExclusionsRecorded
- Add env var expansion verification to TestHIPAAValidation_InlineCredentials
- Record exclusions in manifest.Artifacts for audit trail
2026-02-23 20:25:07 -05:00

169 lines
4.3 KiB
Go

package worker
import (
"fmt"
"io/fs"
"path/filepath"
"sort"
"strings"
"time"
"github.com/jfraeys/fetch_ml/internal/fileutil"
"github.com/jfraeys/fetch_ml/internal/manifest"
)
// scanArtifacts walks runDir and builds the artifact listing recorded in the
// run manifest.
//
// Parameters:
//   - runDir: run directory to scan; surrounding whitespace is trimmed and an
//     empty value is rejected.
//   - includeAll: when false, code/, snapshot/, *.log files and symlinks are
//     left out of Files and recorded in Exclusions instead. The manifest file
//     itself is always excluded, regardless of includeAll.
//   - caps: optional limits; a nil value or a non-positive limit disables the
//     corresponding check.
//
// The result lists files sorted by slash-separated path relative to the run
// directory, their combined size, and the exclusions in walk order. An error
// is returned when runDir is empty or fails validation, when the walk itself
// fails, or when a file-count or total-size cap is exceeded.
func scanArtifacts(runDir string, includeAll bool, caps *SandboxConfig) (*manifest.Artifacts, error) {
	runDir = strings.TrimSpace(runDir)
	if runDir == "" {
		return nil, fmt.Errorf("run dir is empty")
	}

	// Validate the run directory before touching the filesystem; the walk
	// below uses the path returned by the validator, not the raw input.
	validator := fileutil.NewSecurePathValidator(runDir)
	validatedRunDir, err := validator.ValidatePath("")
	if err != nil {
		return nil, fmt.Errorf("invalid run directory: %w", err)
	}

	var (
		files      []manifest.ArtifactFile
		exclusions []manifest.Exclusion
		total      int64
		fileCount  int
	)
	now := time.Now().UTC()

	// exclude records a skipped path together with the reason, for the audit trail.
	exclude := func(rel, reason string) {
		exclusions = append(exclusions, manifest.Exclusion{
			Path:   rel,
			Reason: reason,
		})
	}
	// excludeTree records the exclusion and, for directories, prunes the
	// whole subtree so its children are neither listed nor recorded.
	excludeTree := func(d fs.DirEntry, rel, reason string) error {
		exclude(rel, reason)
		if d.IsDir() {
			return fs.SkipDir
		}
		return nil
	}

	err = filepath.WalkDir(validatedRunDir, func(path string, d fs.DirEntry, err error) error {
		if err != nil {
			return err
		}
		if path == validatedRunDir {
			return nil
		}

		// Security: make sure the visited path is lexically inside the run dir.
		rel, err := filepath.Rel(validatedRunDir, path)
		if err != nil {
			return fmt.Errorf("path escape detected during artifact scan: %w", err)
		}
		rel = filepath.ToSlash(rel)

		// filepath.Rel returns a cleaned path, so an escape can only show up as
		// a leading ".." segment. Matching on segments (rather than on any
		// occurrence of "..") keeps ordinary names such as "weights..bin" from
		// aborting the scan.
		if rel == ".." || strings.HasPrefix(rel, "../") {
			return fmt.Errorf("path traversal attempt detected: %s", rel)
		}

		// Standard exclusions (always apply): the manifest file at any depth.
		if rel == manifestFilename || strings.HasSuffix(rel, "/"+manifestFilename) {
			exclude(rel, "manifest file excluded")
			return nil
		}

		// Optional exclusions (skipped when includeAll is true)
		if !includeAll {
			switch {
			case rel == "code" || strings.HasPrefix(rel, "code/"):
				return excludeTree(d, rel, "source directory excluded")
			case rel == "snapshot" || strings.HasPrefix(rel, "snapshot/"):
				return excludeTree(d, rel, "snapshot directory excluded")
			case strings.HasSuffix(rel, ".log"):
				exclude(rel, "log files excluded")
				return nil
			case d.Type()&fs.ModeSymlink != 0:
				// Skip symlinks - they could point outside the directory
				exclude(rel, "symlink excluded for security")
				return nil
			}
		}

		// Directories are only traversed; they are never listed as artifacts.
		if d.IsDir() {
			return nil
		}

		info, err := d.Info()
		if err != nil {
			return err
		}

		// Check artifact caps before adding
		fileCount++
		if caps != nil && caps.MaxArtifactFiles > 0 && fileCount > caps.MaxArtifactFiles {
			return fmt.Errorf("artifact file count cap exceeded: %d files (max %d)", fileCount, caps.MaxArtifactFiles)
		}
		total += info.Size()
		if caps != nil && caps.MaxArtifactTotalBytes > 0 && total > caps.MaxArtifactTotalBytes {
			return fmt.Errorf("artifact total size cap exceeded: %d bytes (max %d)", total, caps.MaxArtifactTotalBytes)
		}

		files = append(files, manifest.ArtifactFile{
			Path:      rel,
			SizeBytes: info.Size(),
			Modified:  info.ModTime().UTC(),
		})
		return nil
	})
	if err != nil {
		return nil, err
	}

	// Sort by relative path so the manifest listing is deterministic.
	sort.Slice(files, func(i, j int) bool {
		return files[i].Path < files[j].Path
	})

	return &manifest.Artifacts{
		DiscoveryTime:  now,
		Files:          files,
		TotalSizeBytes: total,
		Exclusions:     exclusions,
	}, nil
}
// manifestFilename is the file name scanArtifacts always excludes from the
// artifact listing, whether it sits at the run directory root or in any
// subdirectory.
const manifestFilename = "run_manifest.json"
// ScanArtifacts exposes scanArtifacts for tests and benchmarks; every argument
// is forwarded unchanged.
// With includeAll set to false, code/, snapshot/, *.log files, and symlinks
// are excluded from the result.
func ScanArtifacts(runDir string, includeAll bool, caps *SandboxConfig) (*manifest.Artifacts, error) {
	artifacts, err := scanArtifacts(runDir, includeAll, caps)
	return artifacts, err
}