package worker

import (
	"fmt"
	"io/fs"
	"path/filepath"
	"sort"
	"strings"
	"time"

	"github.com/jfraeys/fetch_ml/internal/fileutil"
	"github.com/jfraeys/fetch_ml/internal/manifest"
)

// scanArtifacts walks runDir and builds the artifact listing for a run.
//
// Parameters:
//   - runDir: directory to scan; surrounding whitespace is trimmed and an
//     empty value is rejected.
//   - includeAll: when false, the code/ and snapshot/ trees, files ending in
//     ".log", and symlinks are recorded as exclusions instead of artifacts.
//     When true, none of those optional filters (including the symlink
//     filter) are applied.
//   - caps: optional limits. A nil pointer, or a non-positive limit, disables
//     the corresponding check.
//
// The manifest file (run_manifest.json, at any depth) is always excluded.
//
// On success the returned Artifacts holds the files sorted by slash-separated
// relative path, the sum of their sizes, every recorded exclusion, and a
// discovery timestamp taken (in UTC) before the walk started.
//
// An error is returned when runDir is empty or fails validation, when the walk
// itself reports an error, when a relative path contains a ".." segment, or
// when either cap is exceeded. No partial result is returned on error.
func scanArtifacts(runDir string, includeAll bool, caps *SandboxConfig) (*manifest.Artifacts, error) {
	runDir = strings.TrimSpace(runDir)
	if runDir == "" {
		return nil, fmt.Errorf("run dir is empty")
	}

	// Validate the run directory before any filesystem operations.
	// NOTE(review): ValidatePath("") is assumed to return the canonical form
	// of runDir itself — confirm against fileutil.
	validator := fileutil.NewSecurePathValidator(runDir)
	validatedRunDir, err := validator.ValidatePath("")
	if err != nil {
		return nil, fmt.Errorf("invalid run directory: %w", err)
	}

	var (
		files      []manifest.ArtifactFile
		exclusions []manifest.Exclusion
		total      int64
		fileCount  int
	)
	now := time.Now().UTC()

	// exclude records why a path was left out of the artifact list.
	exclude := func(rel, reason string) {
		exclusions = append(exclusions, manifest.Exclusion{Path: rel, Reason: reason})
	}

	err = filepath.WalkDir(validatedRunDir, func(path string, d fs.DirEntry, err error) error {
		if err != nil {
			return err
		}
		if path == validatedRunDir {
			return nil
		}

		// Security: every visited path must resolve to a location inside the
		// run directory.
		rel, err := filepath.Rel(validatedRunDir, path)
		if err != nil {
			return fmt.Errorf("path escape detected during artifact scan: %w", err)
		}
		rel = filepath.ToSlash(rel)

		// Only a whole ".." segment climbs out of the run directory. A plain
		// substring match would wrongly reject legitimate names such as
		// "model..v2.bin" and abort the entire scan.
		if hasDotDotSegment(rel) {
			return fmt.Errorf("path traversal attempt detected: %s", rel)
		}

		// Standard exclusions (always apply).
		if rel == manifestFilename || strings.HasSuffix(rel, "/"+manifestFilename) {
			exclude(rel, "manifest file excluded")
			return nil
		}

		// Optional exclusions (skipped when includeAll is true).
		if !includeAll {
			if rel == "code" || strings.HasPrefix(rel, "code/") {
				exclude(rel, "source directory excluded")
				if d.IsDir() {
					// One exclusion entry covers the whole subtree.
					return fs.SkipDir
				}
				return nil
			}
			if rel == "snapshot" || strings.HasPrefix(rel, "snapshot/") {
				exclude(rel, "snapshot directory excluded")
				if d.IsDir() {
					return fs.SkipDir
				}
				return nil
			}
			// NOTE(review): a directory whose name ends in ".log" is recorded
			// here but still descended into — confirm that is intended.
			if strings.HasSuffix(rel, ".log") {
				exclude(rel, "log files excluded")
				return nil
			}
			if d.Type()&fs.ModeSymlink != 0 {
				// Skip symlinks - they could point outside the directory.
				exclude(rel, "symlink excluded for security")
				return nil
			}
		}

		// Directories are traversed but never listed as artifacts themselves.
		if d.IsDir() {
			return nil
		}

		info, err := d.Info()
		if err != nil {
			return err
		}

		// Check artifact caps before adding; the walk aborts on the first
		// entry that pushes either counter past its limit.
		fileCount++
		if caps != nil && caps.MaxArtifactFiles > 0 && fileCount > caps.MaxArtifactFiles {
			return fmt.Errorf("artifact file count cap exceeded: %d files (max %d)", fileCount, caps.MaxArtifactFiles)
		}

		total += info.Size()
		if caps != nil && caps.MaxArtifactTotalBytes > 0 && total > caps.MaxArtifactTotalBytes {
			return fmt.Errorf("artifact total size cap exceeded: %d bytes (max %d)", total, caps.MaxArtifactTotalBytes)
		}

		files = append(files, manifest.ArtifactFile{
			Path:      rel,
			SizeBytes: info.Size(),
			Modified:  info.ModTime().UTC(),
		})
		return nil
	})
	if err != nil {
		return nil, err
	}

	// Sort by path so the listing is deterministic.
	sort.Slice(files, func(i, j int) bool {
		return files[i].Path < files[j].Path
	})

	return &manifest.Artifacts{
		DiscoveryTime:  now,
		Files:          files,
		TotalSizeBytes: total,
		Exclusions:     exclusions,
	}, nil
}

// hasDotDotSegment reports whether the slash-separated relative path rel
// contains a path segment that is exactly "..".
func hasDotDotSegment(rel string) bool {
	for _, segment := range strings.Split(rel, "/") {
		if segment == ".." {
			return true
		}
	}
	return false
}

// manifestFilename is the name of the run manifest file, which is always left
// out of the artifact listing.
const manifestFilename = "run_manifest.json"

// ScanArtifacts is an exported wrapper for testing/benchmarking.
// When includeAll is false, excludes code/, snapshot/, *.log files, and symlinks.
func ScanArtifacts(runDir string, includeAll bool, caps *SandboxConfig) (*manifest.Artifacts, error) {
	// Pure pass-through: the arguments and both results are forwarded unchanged
	// to the unexported implementation.
	artifacts, err := scanArtifacts(runDir, includeAll, caps)
	return artifacts, err
}