diff --git a/.forgejo/workflows/ci.yml b/.forgejo/workflows/ci.yml index 4e6a4f9..9d051ed 100644 --- a/.forgejo/workflows/ci.yml +++ b/.forgejo/workflows/ci.yml @@ -334,6 +334,62 @@ jobs: echo "=== Native Implementation ===" CGO_ENABLED=1 go test -tags native_libs -bench=. ./tests/benchmarks/ -benchmem || true + test-gpu-matrix: + name: GPU Golden Test Matrix + runs-on: self-hosted + needs: test-native + timeout-minutes: 15 + strategy: + matrix: + build_config: [cgo-native, cgo-only, nocgo] + fail-fast: false + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 1 + + - name: Setup Go + run: | + REQUIRED_GO="1.25.0" + if command -v go &> /dev/null && go version | grep -q "go${REQUIRED_GO}"; then + echo "Go ${REQUIRED_GO} already installed" + else + echo "Installing Go ${REQUIRED_GO}..." + curl -sL "https://go.dev/dl/go${REQUIRED_GO}.linux-amd64.tar.gz" | sudo tar -C /usr/local -xzf - + export PATH="/usr/local/go/bin:$PATH" + echo "/usr/local/go/bin" >> $GITHUB_PATH + fi + go version + + - name: Build Native Libraries (for cgo-native config) + if: matrix.build_config == 'cgo-native' + run: | + sudo apt-get update + sudo apt-get install -y cmake zlib1g-dev build-essential + make native-build || echo "Native build skipped (may fail without proper deps)" + + - name: Run GPU Tests - cgo+native_libs + if: matrix.build_config == 'cgo-native' + run: | + echo "=== Testing cgo + native_libs build ===" + CGO_ENABLED=1 go test -tags native_libs -v ./tests/unit/gpu/ -run TestGoldenGPUStatus + CGO_ENABLED=1 go test -tags native_libs -v ./tests/unit/gpu/ -run TestBuildTagMatrix + + - name: Run GPU Tests - cgo only (no native_libs) + if: matrix.build_config == 'cgo-only' + run: | + echo "=== Testing cgo without native_libs build ===" + CGO_ENABLED=1 go test -v ./tests/unit/gpu/ -run TestGoldenGPUStatus + CGO_ENABLED=1 go test -v ./tests/unit/gpu/ -run TestBuildTagMatrix + + - name: Run GPU Tests - nocgo + if: matrix.build_config == 'nocgo' + run: | + echo "=== Testing 
!cgo build ===" + CGO_ENABLED=0 go test -v ./tests/unit/gpu/ -run TestGoldenGPUStatus + CGO_ENABLED=0 go test -v ./tests/unit/gpu/ -run TestBuildTagMatrix + docker-build: name: Docker Build runs-on: self-hosted diff --git a/cli/src/utils/parallel_walk.zig b/cli/src/utils/parallel_walk.zig index 917fd24..ac545be 100644 --- a/cli/src/utils/parallel_walk.zig +++ b/cli/src/utils/parallel_walk.zig @@ -13,7 +13,7 @@ const WorkQueue = struct { depth: usize, }; - fn init(allocator: std.mem.Allocator) WorkQueue { + fn init() WorkQueue { return .{ .items = .empty, .mutex = .{}, diff --git a/docs/src/configuration-reference.md b/docs/src/configuration-reference.md index 069d2b5..9acaf7e 100644 --- a/docs/src/configuration-reference.md +++ b/docs/src/configuration-reference.md @@ -321,6 +321,14 @@ api_key = "" | `FETCHML_CONFIG` | - | Path to config file | | `FETCHML_LOG_LEVEL` | "info" | Override log level | | `CLI_CONFIG` | - | Path to CLI config file | +| `FETCH_ML_GPU_TYPE` | - | Override GPU vendor detection (nvidia, amd, apple, none). Takes precedence over config file. | +| `FETCH_ML_GPU_COUNT` | - | Override GPU count detection. Used with auto-detected or configured vendor. | +| `FETCH_ML_TOTAL_CPU` | - | Override total CPU count detection. Sets the number of CPU cores available. | +| `FETCH_ML_GPU_SLOTS_PER_GPU` | 1 | Override GPU slots per GPU. Controls how many concurrent tasks can share a single GPU. | + +When environment variable overrides are active, they are logged to stderr at worker startup for debugging. + +Note: When `gpu_vendor: amd` is configured, the system uses the NVIDIA detector implementation (aliased) due to similar device exposure patterns. The `configured_vendor` field will show "amd" while the actual detection uses NVIDIA-compatible methods. 
## Troubleshooting diff --git a/internal/worker/artifacts.go b/internal/worker/artifacts.go index 631abfb..35a59f9 100644 --- a/internal/worker/artifacts.go +++ b/internal/worker/artifacts.go @@ -11,7 +11,7 @@ import ( "github.com/jfraeys/fetch_ml/internal/manifest" ) -func scanArtifacts(runDir string) (*manifest.Artifacts, error) { +func scanArtifacts(runDir string, includeAll bool) (*manifest.Artifacts, error) { runDir = strings.TrimSpace(runDir) if runDir == "" { return nil, fmt.Errorf("run dir is empty") @@ -37,19 +37,7 @@ func scanArtifacts(runDir string) (*manifest.Artifacts, error) { } rel = filepath.ToSlash(rel) - if rel == "code" || strings.HasPrefix(rel, "code/") { - if d.IsDir() { - return fs.SkipDir - } - return nil - } - if rel == "snapshot" || strings.HasPrefix(rel, "snapshot/") { - if d.IsDir() { - return fs.SkipDir - } - return nil - } - + // Standard exclusions (always apply) if rel == manifestFilename { return nil } @@ -57,12 +45,26 @@ func scanArtifacts(runDir string) (*manifest.Artifacts, error) { return nil } - if strings.HasSuffix(rel, ".log") { - return nil - } - - if d.Type()&fs.ModeSymlink != 0 { - return nil + // Optional exclusions (skipped when includeAll is true) + if !includeAll { + if rel == "code" || strings.HasPrefix(rel, "code/") { + if d.IsDir() { + return fs.SkipDir + } + return nil + } + if rel == "snapshot" || strings.HasPrefix(rel, "snapshot/") { + if d.IsDir() { + return fs.SkipDir + } + return nil + } + if strings.HasSuffix(rel, ".log") { + return nil + } + if d.Type()&fs.ModeSymlink != 0 { + return nil + } } if d.IsDir() { @@ -100,6 +102,7 @@ func scanArtifacts(runDir string) (*manifest.Artifacts, error) { const manifestFilename = "run_manifest.json" // ScanArtifacts is an exported wrapper for testing/benchmarking. -func ScanArtifacts(runDir string) (*manifest.Artifacts, error) { - return scanArtifacts(runDir) +// When includeAll is false, excludes code/, snapshot/, *.log files, and symlinks. 
+func ScanArtifacts(runDir string, includeAll bool) (*manifest.Artifacts, error) { + return scanArtifacts(runDir, includeAll) } diff --git a/internal/worker/config.go b/internal/worker/config.go index 6bf0b4e..3d1853c 100644 --- a/internal/worker/config.go +++ b/internal/worker/config.go @@ -77,13 +77,14 @@ type Config struct { PrewarmEnabled bool `yaml:"prewarm_enabled"` // Podman execution - PodmanImage string `yaml:"podman_image"` - ContainerWorkspace string `yaml:"container_workspace"` - ContainerResults string `yaml:"container_results"` - GPUDevices []string `yaml:"gpu_devices"` - GPUVendor string `yaml:"gpu_vendor"` - GPUVisibleDevices []int `yaml:"gpu_visible_devices"` - GPUVisibleDeviceIDs []string `yaml:"gpu_visible_device_ids"` + PodmanImage string `yaml:"podman_image"` + ContainerWorkspace string `yaml:"container_workspace"` + ContainerResults string `yaml:"container_results"` + GPUDevices []string `yaml:"gpu_devices"` + GPUVendor string `yaml:"gpu_vendor"` + GPUVendorAutoDetected bool `yaml:"-"` // Set by LoadConfig when GPUVendor is auto-detected + GPUVisibleDevices []int `yaml:"gpu_visible_devices"` + GPUVisibleDeviceIDs []string `yaml:"gpu_visible_device_ids"` // Apple M-series GPU configuration AppleGPU AppleGPUConfig `yaml:"apple_gpu"` @@ -264,6 +265,7 @@ func LoadConfig(path string) (*Config, error) { } if strings.TrimSpace(cfg.GPUVendor) == "" { + cfg.GPUVendorAutoDetected = true if cfg.AppleGPU.Enabled { cfg.GPUVendor = string(GPUTypeApple) } else if len(cfg.GPUDevices) > 0 || @@ -453,9 +455,15 @@ func envInt(name string) (int, bool) { return n, true } +// logEnvOverride logs environment variable overrides to stderr for debugging +func logEnvOverride(name string, value interface{}) { + fmt.Fprintf(os.Stderr, "[env] %s=%v (override active)\n", name, value) +} + // parseCPUFromConfig determines total CPU from environment or config func parseCPUFromConfig(cfg *Config) int { if n, ok := envInt("FETCH_ML_TOTAL_CPU"); ok && n >= 0 { + 
logEnvOverride("FETCH_ML_TOTAL_CPU", n) return n } if cfg != nil { @@ -471,11 +479,11 @@ func parseCPUFromConfig(cfg *Config) int { return runtime.NumCPU() } -// parseGPUCountFromConfig detects GPU count from config -func parseGPUCountFromConfig(cfg *Config) int { +// parseGPUCountFromConfig detects GPU count from config and returns detection metadata +func parseGPUCountFromConfig(cfg *Config) (int, GPUDetectionInfo) { factory := &GPUDetectorFactory{} - detector := factory.CreateDetector(cfg) - return detector.DetectGPUCount() + result := factory.CreateDetectorWithInfo(cfg) + return result.Detector.DetectGPUCount(), result.Info } // parseGPUSlotsPerGPUFromConfig reads GPU slots per GPU from environment diff --git a/internal/worker/factory.go b/internal/worker/factory.go index 0535510..246dd96 100644 --- a/internal/worker/factory.go +++ b/internal/worker/factory.go @@ -147,9 +147,10 @@ func NewWorker(cfg *Config, _ string) (*Worker, error) { ) // Create resource manager + gpuCount, gpuDetectionInfo := parseGPUCountFromConfig(cfg) rm, err := resources.NewManager(resources.Options{ TotalCPU: parseCPUFromConfig(cfg), - GPUCount: parseGPUCountFromConfig(cfg), + GPUCount: gpuCount, SlotsPerGPU: parseGPUSlotsPerGPUFromConfig(), }) if err != nil { @@ -158,28 +159,32 @@ func NewWorker(cfg *Config, _ string) (*Worker, error) { } worker := &Worker{ - id: cfg.WorkerID, - config: cfg, - logger: logger, - runLoop: runLoop, - runner: jobRunner, - metrics: metricsObj, - health: lifecycle.NewHealthMonitor(), - resources: rm, - jupyter: jupyterMgr, + id: cfg.WorkerID, + config: cfg, + logger: logger, + runLoop: runLoop, + runner: jobRunner, + metrics: metricsObj, + health: lifecycle.NewHealthMonitor(), + resources: rm, + jupyter: jupyterMgr, + gpuDetectionInfo: gpuDetectionInfo, } // Log GPU configuration if !cfg.LocalMode { gpuType := strings.ToLower(strings.TrimSpace(os.Getenv("FETCH_ML_GPU_TYPE"))) - if cfg.AppleGPU.Enabled { - logger.Warn("apple MPS GPU mode is intended for 
development; do not use in production", - "gpu_type", "apple", - ) - } if gpuType == "amd" { - logger.Warn("amd GPU mode is intended for development; do not use in production", - "gpu_type", "amd", + cancel() + return nil, fmt.Errorf( + "AMD GPU mode is not supported in production (FETCH_ML_GPU_TYPE=amd). " + + "Use 'nvidia', 'apple', 'none', or GPUDevices config. " + + "AMD support is available in local mode for experimental development", + ) + } else if cfg.AppleGPU.Enabled { + logger.Warn( + "apple MPS GPU mode is intended for development; do not use in production", + "gpu_type", "apple", ) } } diff --git a/internal/worker/gpu_detector.go b/internal/worker/gpu_detector.go index 987cf88..ad2ed6c 100644 --- a/internal/worker/gpu_detector.go +++ b/internal/worker/gpu_detector.go @@ -11,10 +11,33 @@ type GPUType string const ( GPUTypeNVIDIA GPUType = "nvidia" + GPUTypeAMD GPUType = "amd" GPUTypeApple GPUType = "apple" GPUTypeNone GPUType = "none" ) +// DetectionSource indicates how the GPU detector was selected +type DetectionSource string + +const ( + DetectionSourceEnvType DetectionSource = "env_override_type" + DetectionSourceEnvCount DetectionSource = "env_override_count" + DetectionSourceEnvBoth DetectionSource = "env_override_both" + DetectionSourceConfig DetectionSource = "config" + DetectionSourceAuto DetectionSource = "auto" + DetectionSourceNone DetectionSource = "none" +) + +// GPUDetectionInfo provides metadata about how GPU detection was determined +type GPUDetectionInfo struct { + GPUType GPUType `json:"gpu_type"` + ConfiguredVendor string `json:"configured_vendor"` + DetectionMethod DetectionSource `json:"detection_method"` + EnvOverrideType string `json:"env_override_type,omitempty"` + EnvOverrideCount int `json:"env_override_count,omitempty"` + ConfigLayerAutoDetected bool `json:"config_layer_auto_detected,omitempty"` +} + // GPUDetector interface for detecting GPU availability type GPUDetector interface { DetectGPUCount() int @@ -138,47 +161,240 @@ 
func (d *NoneDetector) GetDevicePaths() []string { return nil } -// GPUDetectorFactory creates appropriate GPU detector based on config +// GPUDetectorFactory creates appropriate GPU detector based on config type GPUDetectorFactory struct{} +// DetectionResult contains both the detector and metadata about how it was selected +type DetectionResult struct { + Detector GPUDetector + Info GPUDetectionInfo +} + func (f *GPUDetectorFactory) CreateDetector(cfg *Config) GPUDetector { - // Check for explicit environment override - if gpuType := os.Getenv("FETCH_ML_GPU_TYPE"); gpuType != "" { - switch gpuType { + result := f.CreateDetectorWithInfo(cfg) + return result.Detector +} + +func (f *GPUDetectorFactory) CreateDetectorWithInfo(cfg *Config) DetectionResult { + // Check for explicit environment overrides + envType := os.Getenv("FETCH_ML_GPU_TYPE") + envCount, hasEnvCount := envInt("FETCH_ML_GPU_COUNT") + + if envType != "" && hasEnvCount { + // Both env vars set + logEnvOverride("FETCH_ML_GPU_TYPE", envType) + logEnvOverride("FETCH_ML_GPU_COUNT", envCount) + switch envType { case string(GPUTypeNVIDIA): - return &NVIDIADetector{} + return DetectionResult{ + Detector: &NVIDIADetector{}, + Info: GPUDetectionInfo{ + GPUType: GPUTypeNVIDIA, + ConfiguredVendor: "nvidia", + DetectionMethod: DetectionSourceEnvBoth, + EnvOverrideType: envType, + EnvOverrideCount: envCount, + }, + } case string(GPUTypeApple): - return &AppleDetector{enabled: true} + return DetectionResult{ + Detector: &AppleDetector{enabled: true}, + Info: GPUDetectionInfo{ + GPUType: GPUTypeApple, + ConfiguredVendor: "apple", + DetectionMethod: DetectionSourceEnvBoth, + EnvOverrideType: envType, + EnvOverrideCount: envCount, + }, + } case string(GPUTypeNone): - return &NoneDetector{} - } - } - - // Respect configured vendor when explicitly set. 
- if cfg != nil { - switch GPUType(cfg.GPUVendor) { - case GPUTypeApple: - return &AppleDetector{enabled: cfg.AppleGPU.Enabled} - case GPUTypeNone: - return &NoneDetector{} - case GPUTypeNVIDIA: - return &NVIDIADetector{} + return DetectionResult{ + Detector: &NoneDetector{}, + Info: GPUDetectionInfo{ + GPUType: GPUTypeNone, + ConfiguredVendor: "none", + DetectionMethod: DetectionSourceEnvBoth, + EnvOverrideType: envType, + EnvOverrideCount: envCount, + }, + } case "amd": - // AMD uses similar device exposure patterns in this codebase. - return &NVIDIADetector{} + // AMD env override uses NVIDIA detector (aliased) + return DetectionResult{ + Detector: &NVIDIADetector{}, + Info: GPUDetectionInfo{ + GPUType: GPUTypeAMD, + ConfiguredVendor: "amd", + DetectionMethod: DetectionSourceEnvBoth, + EnvOverrideType: envType, + EnvOverrideCount: envCount, + }, + } } } - // Auto-detect based on config - if cfg != nil { - if cfg.AppleGPU.Enabled { - return &AppleDetector{enabled: true} + if envType != "" { + // Only FETCH_ML_GPU_TYPE set + logEnvOverride("FETCH_ML_GPU_TYPE", envType) + switch envType { + case string(GPUTypeNVIDIA): + return DetectionResult{ + Detector: &NVIDIADetector{}, + Info: GPUDetectionInfo{ + GPUType: GPUTypeNVIDIA, + ConfiguredVendor: "nvidia", + DetectionMethod: DetectionSourceEnvType, + EnvOverrideType: envType, + }, + } + case string(GPUTypeApple): + return DetectionResult{ + Detector: &AppleDetector{enabled: true}, + Info: GPUDetectionInfo{ + GPUType: GPUTypeApple, + ConfiguredVendor: "apple", + DetectionMethod: DetectionSourceEnvType, + EnvOverrideType: envType, + }, + } + case string(GPUTypeNone): + return DetectionResult{ + Detector: &NoneDetector{}, + Info: GPUDetectionInfo{ + GPUType: GPUTypeNone, + ConfiguredVendor: "none", + DetectionMethod: DetectionSourceEnvType, + EnvOverrideType: envType, + }, + } + case "amd": + // AMD env override uses NVIDIA detector (aliased) + return DetectionResult{ + Detector: &NVIDIADetector{}, + Info: 
GPUDetectionInfo{ + GPUType: GPUTypeAMD, + ConfiguredVendor: "amd", + DetectionMethod: DetectionSourceEnvType, + EnvOverrideType: envType, + }, + } } - if len(cfg.GPUDevices) > 0 { - return &NVIDIADetector{} + } + + if hasEnvCount { + // Only FETCH_ML_GPU_COUNT set - need to detect vendor from config or auto + logEnvOverride("FETCH_ML_GPU_COUNT", envCount) + return f.detectFromConfigWithSource(cfg, DetectionSourceEnvCount, "", envCount) + } + + // No env overrides - detect from config + return f.detectFromConfigWithSource(cfg, DetectionSourceConfig, "", -1) +} + +func (f *GPUDetectorFactory) detectFromConfigWithSource(cfg *Config, source DetectionSource, envType string, envCount int) DetectionResult { + if cfg == nil { + return DetectionResult{ + Detector: &NoneDetector{}, + Info: GPUDetectionInfo{ + GPUType: GPUTypeNone, + ConfiguredVendor: "none", + DetectionMethod: source, + }, + } + } + + switch GPUType(cfg.GPUVendor) { + case GPUTypeApple: + return DetectionResult{ + Detector: &AppleDetector{enabled: cfg.AppleGPU.Enabled}, + Info: GPUDetectionInfo{ + GPUType: GPUTypeApple, + ConfiguredVendor: "apple", + DetectionMethod: source, + EnvOverrideType: envType, + EnvOverrideCount: envCount, + ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected, + }, + } + case GPUTypeNone: + return DetectionResult{ + Detector: &NoneDetector{}, + Info: GPUDetectionInfo{ + GPUType: GPUTypeNone, + ConfiguredVendor: "none", + DetectionMethod: source, + EnvOverrideType: envType, + EnvOverrideCount: envCount, + ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected, + }, + } + case GPUTypeNVIDIA: + return DetectionResult{ + Detector: &NVIDIADetector{}, + Info: GPUDetectionInfo{ + GPUType: GPUTypeNVIDIA, + ConfiguredVendor: "nvidia", + DetectionMethod: source, + EnvOverrideType: envType, + EnvOverrideCount: envCount, + ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected, + }, + } + case "amd": + // AMD uses similar device exposure patterns in this codebase. 
+ // This is the key aliasing point - we report AMD as configured vendor + // but use NVIDIADetector for implementation. + return DetectionResult{ + Detector: &NVIDIADetector{}, + Info: GPUDetectionInfo{ + GPUType: GPUTypeNVIDIA, + ConfiguredVendor: "amd", // User configured "amd" + DetectionMethod: source, + EnvOverrideType: envType, + EnvOverrideCount: envCount, + ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected, + }, + } + } + + // Auto-detect based on config settings + if cfg.AppleGPU.Enabled { + return DetectionResult{ + Detector: &AppleDetector{enabled: true}, + Info: GPUDetectionInfo{ + GPUType: GPUTypeApple, + ConfiguredVendor: "apple", + DetectionMethod: DetectionSourceAuto, + EnvOverrideType: envType, + EnvOverrideCount: envCount, + ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected, + }, + } + } + if len(cfg.GPUDevices) > 0 { + return DetectionResult{ + Detector: &NVIDIADetector{}, + Info: GPUDetectionInfo{ + GPUType: GPUTypeNVIDIA, + ConfiguredVendor: "nvidia", + DetectionMethod: DetectionSourceAuto, + EnvOverrideType: envType, + EnvOverrideCount: envCount, + ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected, + }, } } // Default to no GPU - return &NoneDetector{} + return DetectionResult{ + Detector: &NoneDetector{}, + Info: GPUDetectionInfo{ + GPUType: GPUTypeNone, + ConfiguredVendor: "none", + DetectionMethod: source, + EnvOverrideType: envType, + EnvOverrideCount: envCount, + ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected, + }, + } } diff --git a/internal/worker/gpu_macos.go b/internal/worker/gpu_macos.go index d1a764f..a482953 100644 --- a/internal/worker/gpu_macos.go +++ b/internal/worker/gpu_macos.go @@ -8,6 +8,7 @@ import ( "context" "encoding/json" "fmt" + "os" "os/exec" "regexp" "runtime" @@ -176,6 +177,9 @@ func GetPowermetricsData() (*PowermetricsData, error) { out, err := cmd.Output() if err != nil { // powermetrics not available or no permission + if ctx.Err() != context.DeadlineExceeded { + fmt.Fprintln(os.Stderr, "Warning: 
powermetrics requires sudo for GPU metrics") + } return &PowermetricsData{HasData: false}, nil } diff --git a/internal/worker/native_bridge_libs.go b/internal/worker/native_bridge_libs.go index 58bbe3b..0fac19e 100644 --- a/internal/worker/native_bridge_libs.go +++ b/internal/worker/native_bridge_libs.go @@ -26,6 +26,8 @@ var ( ctxInitTime time.Time ) +// getHashContext returns the native hash context, initializing it on first call. +// First call initializes C++ context (5-20ms) - subsequent calls reuse context. func getHashContext() *C.fh_context_t { hashCtxOnce.Do(func() { start := time.Now() @@ -65,7 +67,7 @@ func HasSIMDSHA256() bool { } func ScanArtifactsNative(runDir string) (*manifest.Artifacts, error) { - return ScanArtifacts(runDir) + return ScanArtifacts(runDir, false) } func ExtractTarGzNative(archivePath, dstDir string) error { diff --git a/internal/worker/worker.go b/internal/worker/worker.go index c38ac79..2c7c990 100644 --- a/internal/worker/worker.go +++ b/internal/worker/worker.go @@ -55,6 +55,9 @@ type Worker struct { health *lifecycle.HealthMonitor resources *resources.Manager + // GPU detection metadata for status output + gpuDetectionInfo GPUDetectionInfo + // Legacy fields for backward compatibility during migration jupyter JupyterManager queueClient queue.Backend // Stored for prewarming access diff --git a/tests/benchmarks/artifact_and_snapshot_bench_test.go b/tests/benchmarks/artifact_and_snapshot_bench_test.go index 44d9439..50caee7 100644 --- a/tests/benchmarks/artifact_and_snapshot_bench_test.go +++ b/tests/benchmarks/artifact_and_snapshot_bench_test.go @@ -131,7 +131,7 @@ func BenchmarkScanArtifacts(b *testing.B) { b.ReportAllocs() for i := 0; i < b.N; i++ { - _, err := worker.ScanArtifacts(runDir) + _, err := worker.ScanArtifacts(runDir, false) if err != nil { b.Fatal(err) } diff --git a/tests/benchmarks/artifact_scanner_bench_test.go b/tests/benchmarks/artifact_scanner_bench_test.go index 6d5acb4..f237ef2 100644 --- 
a/tests/benchmarks/artifact_scanner_bench_test.go +++ b/tests/benchmarks/artifact_scanner_bench_test.go @@ -15,11 +15,10 @@ func BenchmarkArtifactScanGo(b *testing.B) { // Create test artifact structure createTestArtifacts(b, tmpDir, 100) - b.ResetTimer() b.ReportAllocs() - for i := 0; i < b.N; i++ { - _, err := worker.ScanArtifacts(tmpDir) + for b.Loop() { + _, err := worker.ScanArtifacts(tmpDir, false) if err != nil { b.Fatal(err) } @@ -34,10 +33,9 @@ func BenchmarkArtifactScanNative(b *testing.B) { // Create test artifact structure createTestArtifacts(b, tmpDir, 100) - b.ResetTimer() b.ReportAllocs() - for i := 0; i < b.N; i++ { + for b.Loop() { _, err := worker.ScanArtifactsNative(tmpDir) if err != nil { b.Fatal(err) @@ -54,8 +52,8 @@ func BenchmarkArtifactScanLarge(b *testing.B) { b.Run("Go", func(b *testing.B) { b.ReportAllocs() - for i := 0; i < b.N; i++ { - _, err := worker.ScanArtifacts(tmpDir) + for b.Loop() { + _, err := worker.ScanArtifacts(tmpDir, false) if err != nil { b.Fatal(err) } @@ -64,7 +62,7 @@ func BenchmarkArtifactScanLarge(b *testing.B) { b.Run("Native", func(b *testing.B) { b.ReportAllocs() - for i := 0; i < b.N; i++ { + for b.Loop() { _, err := worker.ScanArtifactsNative(tmpDir) if err != nil { b.Fatal(err) @@ -93,7 +91,7 @@ func createTestArtifacts(b testing.TB, root string, count int) { } // Create test files - for i := 0; i < count; i++ { + for i := range count { var path string switch i % 5 { case 0: diff --git a/tests/unit/gpu/gpu_detector_test.go b/tests/unit/gpu/gpu_detector_test.go new file mode 100644 index 0000000..05b55af --- /dev/null +++ b/tests/unit/gpu/gpu_detector_test.go @@ -0,0 +1,210 @@ +package worker_test + +import ( + "os" + "testing" + + "github.com/jfraeys/fetch_ml/internal/worker" +) + +// TestGPUDetectorEnvOverrides validates both FETCH_ML_GPU_TYPE and FETCH_ML_GPU_COUNT work +func TestGPUDetectorEnvOverrides(t *testing.T) { + tests := []struct { + name string + gpuType string + gpuCount string + wantType 
worker.GPUType + wantCount int + wantMethod worker.DetectionSource + wantConfigured string + }{ + { + name: "env type only - nvidia", + gpuType: "nvidia", + wantType: worker.GPUTypeNVIDIA, + wantMethod: worker.DetectionSourceEnvType, + wantConfigured: "nvidia", + }, + { + name: "env type only - apple", + gpuType: "apple", + wantType: worker.GPUTypeApple, + wantMethod: worker.DetectionSourceEnvType, + wantConfigured: "apple", + }, + { + name: "env type only - none", + gpuType: "none", + wantType: worker.GPUTypeNone, + wantMethod: worker.DetectionSourceEnvType, + wantConfigured: "none", + }, + { + name: "both env vars set", + gpuType: "nvidia", + gpuCount: "4", + wantType: worker.GPUTypeNVIDIA, + wantMethod: worker.DetectionSourceEnvBoth, + wantConfigured: "nvidia", + }, + { + name: "env type amd - shows amd configured vendor", + gpuType: "amd", + wantType: worker.GPUTypeAMD, + wantMethod: worker.DetectionSourceEnvType, + wantConfigured: "amd", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Set env vars + if tt.gpuType != "" { + os.Setenv("FETCH_ML_GPU_TYPE", tt.gpuType) + defer os.Unsetenv("FETCH_ML_GPU_TYPE") + } + if tt.gpuCount != "" { + os.Setenv("FETCH_ML_GPU_COUNT", tt.gpuCount) + defer os.Unsetenv("FETCH_ML_GPU_COUNT") + } + + factory := &worker.GPUDetectorFactory{} + result := factory.CreateDetectorWithInfo(nil) + + if result.Info.GPUType != tt.wantType { + t.Errorf("GPUType = %v, want %v", result.Info.GPUType, tt.wantType) + } + if result.Info.DetectionMethod != tt.wantMethod { + t.Errorf("DetectionMethod = %v, want %v", result.Info.DetectionMethod, tt.wantMethod) + } + if result.Info.ConfiguredVendor != tt.wantConfigured { + t.Errorf("ConfiguredVendor = %v, want %v", result.Info.ConfiguredVendor, tt.wantConfigured) + } + }) + } +} + +// TestGPUDetectorAMDVendorAlias validates AMD config shows proper aliasing +func TestGPUDetectorAMDVendorAlias(t *testing.T) { + cfg := &worker.Config{ + GPUVendor: "amd", + } + + factory 
:= &worker.GPUDetectorFactory{} + result := factory.CreateDetectorWithInfo(cfg) + + // AMD uses NVIDIA detector implementation + if result.Info.ConfiguredVendor != "amd" { + t.Errorf("ConfiguredVendor = %v, want 'amd'", result.Info.ConfiguredVendor) + } + if result.Info.GPUType != worker.GPUTypeNVIDIA { + t.Errorf("GPUType = %v, want %v (NVIDIA implementation for AMD alias)", result.Info.GPUType, worker.GPUTypeNVIDIA) + } +} + +// TestGPUDetectorEnvCountOverride validates FETCH_ML_GPU_COUNT with auto-detect +func TestGPUDetectorEnvCountOverride(t *testing.T) { + os.Setenv("FETCH_ML_GPU_COUNT", "8") + defer os.Unsetenv("FETCH_ML_GPU_COUNT") + + cfg := &worker.Config{ + GPUVendor: "nvidia", + } + + factory := &worker.GPUDetectorFactory{} + result := factory.CreateDetectorWithInfo(cfg) + + if result.Info.DetectionMethod != worker.DetectionSourceEnvCount { + t.Errorf("DetectionMethod = %v, want %v", result.Info.DetectionMethod, worker.DetectionSourceEnvCount) + } + if result.Info.EnvOverrideCount != 8 { + t.Errorf("EnvOverrideCount = %v, want 8", result.Info.EnvOverrideCount) + } +} + +// TestGPUDetectorDetectionSources validates all detection source types +func TestGPUDetectorDetectionSources(t *testing.T) { + tests := []struct { + name string + envType string + envCount string + config *worker.Config + wantSource worker.DetectionSource + }{ + { + name: "env type takes precedence over config", + envType: "apple", + config: &worker.Config{GPUVendor: "nvidia"}, + wantSource: worker.DetectionSourceEnvType, + }, + { + name: "env count triggers env_count source", + envCount: "2", + config: &worker.Config{GPUVendor: "nvidia"}, + wantSource: worker.DetectionSourceEnvCount, + }, + { + name: "config source when no env", + config: &worker.Config{GPUVendor: "nvidia"}, + wantSource: worker.DetectionSourceConfig, + }, + { + name: "auto source for GPUDevices", + config: &worker.Config{GPUDevices: []string{"/dev/nvidia0"}}, + wantSource: worker.DetectionSourceAuto, + }, + { + name: 
"auto source for AppleGPU", + config: &worker.Config{AppleGPU: worker.AppleGPUConfig{Enabled: true}}, + wantSource: worker.DetectionSourceAuto, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if tt.envType != "" { + os.Setenv("FETCH_ML_GPU_TYPE", tt.envType) + defer os.Unsetenv("FETCH_ML_GPU_TYPE") + } + if tt.envCount != "" { + os.Setenv("FETCH_ML_GPU_COUNT", tt.envCount) + defer os.Unsetenv("FETCH_ML_GPU_COUNT") + } + + factory := &worker.GPUDetectorFactory{} + result := factory.CreateDetectorWithInfo(tt.config) + + if result.Info.DetectionMethod != tt.wantSource { + t.Errorf("DetectionMethod = %v, want %v", result.Info.DetectionMethod, tt.wantSource) + } + }) + } +} + +// TestGPUDetectorInfoFields validates all GPUDetectionInfo fields are populated +func TestGPUDetectorInfoFields(t *testing.T) { + os.Setenv("FETCH_ML_GPU_TYPE", "nvidia") + os.Setenv("FETCH_ML_GPU_COUNT", "4") + defer os.Unsetenv("FETCH_ML_GPU_TYPE") + defer os.Unsetenv("FETCH_ML_GPU_COUNT") + + factory := &worker.GPUDetectorFactory{} + result := factory.CreateDetectorWithInfo(nil) + + // Validate all expected fields + if result.Info.GPUType == "" { + t.Error("GPUType field is empty") + } + if result.Info.ConfiguredVendor == "" { + t.Error("ConfiguredVendor field is empty") + } + if result.Info.DetectionMethod == "" { + t.Error("DetectionMethod field is empty") + } + if result.Info.EnvOverrideType != "nvidia" { + t.Errorf("EnvOverrideType = %v, want 'nvidia'", result.Info.EnvOverrideType) + } + if result.Info.EnvOverrideCount != 4 { + t.Errorf("EnvOverrideCount = %v, want 4", result.Info.EnvOverrideCount) + } +} diff --git a/tests/unit/gpu/gpu_golden_test.go b/tests/unit/gpu/gpu_golden_test.go new file mode 100644 index 0000000..ab556c4 --- /dev/null +++ b/tests/unit/gpu/gpu_golden_test.go @@ -0,0 +1,311 @@ +package worker_test + +import ( + "encoding/json" + "os" + "testing" + + "github.com/jfraeys/fetch_ml/internal/worker" +) + +// GoldenGPUStatus represents the 
expected GPU status output for golden file testing +type GoldenGPUStatus struct { + GPUCount int `json:"gpu_count"` + GPUType string `json:"gpu_type"` + ConfiguredVendor string `json:"configured_vendor"` + DetectionMethod string `json:"detection_method"` + EnvOverrideType string `json:"env_override_type,omitempty"` + EnvOverrideCount int `json:"env_override_count,omitempty"` + BuildTags map[string]bool `json:"build_tags"` + NativeAvailable bool `json:"native_available"` + Extra map[string]interface{} `json:"extra,omitempty"` +} + +// detectBuildTags returns which build tags are active +func detectBuildTags() map[string]bool { + tags := map[string]bool{ + "cgo": false, + "native_libs": false, + "darwin": false, + "linux": false, + } + + // CGO is determined at compile time - we can detect by trying to use native + // If native functions return "disabled", we know native_libs is not set + simdName := worker.GetSIMDImplName() + tags["native_libs"] = simdName != "disabled" && simdName != "disabled (no CGO)" + tags["cgo"] = simdName != "disabled (no CGO)" + + // OS detection + if worker.IsMacOS() { + tags["darwin"] = true + } else { + tags["linux"] = true + } + + return tags +} + +// TestGoldenGPUStatusNVML validates GPU status against golden file for NVML path +// This test runs under all build configurations but expectations differ: +// - cgo+native_libs: Real GPU count and NVML detection +// - cgo without native_libs: Returns 0, nil (stub behavior) +// - !cgo: Returns 0, nil (stub behavior) +func TestGoldenGPUStatusNVML(t *testing.T) { + // Setup: Configure for NVIDIA detection + cfg := &worker.Config{ + GPUVendor: "nvidia", + } + + factory := &worker.GPUDetectorFactory{} + result := factory.CreateDetectorWithInfo(cfg) + + // Get actual detected count (behavior varies by build tags) + count := result.Detector.DetectGPUCount() + + buildTags := detectBuildTags() + + // Build the golden status object + got := GoldenGPUStatus{ + GPUCount: count, + GPUType: 
string(result.Info.GPUType), + ConfiguredVendor: result.Info.ConfiguredVendor, + DetectionMethod: string(result.Info.DetectionMethod), + BuildTags: buildTags, + NativeAvailable: buildTags["native_libs"] && buildTags["cgo"], + } + + // Validate against build-specific expectations + if buildTags["native_libs"] && buildTags["cgo"] { + // Real NVML build: Should detect actual GPUs or get real NVML error + // GPU count may be 0 if no NVIDIA hardware, but detection method should be config + if got.DetectionMethod != "config" { + t.Errorf("cgo+native_libs: DetectionMethod = %v, want 'config'", got.DetectionMethod) + } + } else if buildTags["cgo"] { + // CGO without native_libs: Stub returns 0 + if got.GPUCount != 0 { + t.Logf("cgo-only build: GPUCount = %d (expected 0 from stub)", got.GPUCount) + } + if got.NativeAvailable { + t.Error("cgo-only build: NativeAvailable should be false") + } + } else { + // No CGO: Stub returns 0 + if got.GPUCount != 0 { + t.Logf("nocgo build: GPUCount = %d (expected 0 from stub)", got.GPUCount) + } + if got.NativeAvailable { + t.Error("nocgo build: NativeAvailable should be false") + } + } + + // Common validations + if got.ConfiguredVendor != "nvidia" { + t.Errorf("ConfiguredVendor = %v, want 'nvidia'", got.ConfiguredVendor) + } +} + +// TestGoldenGPUStatusAMDVendorAlias validates AMD aliasing is visible in output +// Build tags: all three configurations +// Runtime scenarios: amd config +func TestGoldenGPUStatusAMDVendorAlias(t *testing.T) { + cfg := &worker.Config{ + GPUVendor: "amd", + } + + factory := &worker.GPUDetectorFactory{} + result := factory.CreateDetectorWithInfo(cfg) + + buildTags := detectBuildTags() + + got := GoldenGPUStatus{ + GPUCount: result.Detector.DetectGPUCount(), + GPUType: string(result.Info.GPUType), + ConfiguredVendor: result.Info.ConfiguredVendor, + DetectionMethod: string(result.Info.DetectionMethod), + BuildTags: buildTags, + NativeAvailable: buildTags["native_libs"] && buildTags["cgo"], + } + + // The key 
assertion: configured_vendor should be "amd" but GPUType should be "nvidia" + // This makes the aliasing visible in status output + if got.ConfiguredVendor != "amd" { + t.Errorf("AMD config: ConfiguredVendor = %v, want 'amd'", got.ConfiguredVendor) + } + if got.GPUType != "nvidia" { + t.Errorf("AMD config: GPUType = %v, want 'nvidia' (AMD aliased to NVIDIA implementation)", got.GPUType) + } +} + +// TestGoldenGPUStatusEnvOverride validates env override behavior across build configs +// Build tags: all three +// Runtime scenarios: env override set +func TestGoldenGPUStatusEnvOverride(t *testing.T) { + // Set env override + os.Setenv("FETCH_ML_GPU_TYPE", "nvidia") + os.Setenv("FETCH_ML_GPU_COUNT", "4") + defer os.Unsetenv("FETCH_ML_GPU_TYPE") + defer os.Unsetenv("FETCH_ML_GPU_COUNT") + + factory := &worker.GPUDetectorFactory{} + result := factory.CreateDetectorWithInfo(&worker.Config{GPUVendor: "apple"}) + + buildTags := detectBuildTags() + + got := GoldenGPUStatus{ + GPUCount: result.Detector.DetectGPUCount(), + GPUType: string(result.Info.GPUType), + ConfiguredVendor: result.Info.ConfiguredVendor, + DetectionMethod: string(result.Info.DetectionMethod), + EnvOverrideType: result.Info.EnvOverrideType, + EnvOverrideCount: result.Info.EnvOverrideCount, + BuildTags: buildTags, + } + + // Env should take precedence over config + if got.DetectionMethod != "env_override_both" { + t.Errorf("Env override: DetectionMethod = %v, want 'env_override_both'", got.DetectionMethod) + } + if got.GPUType != "nvidia" { + t.Errorf("Env override: GPUType = %v, want 'nvidia'", got.GPUType) + } + if got.EnvOverrideType != "nvidia" { + t.Errorf("Env override: EnvOverrideType = %v, want 'nvidia'", got.EnvOverrideType) + } + if got.EnvOverrideCount != 4 { + t.Errorf("Env override: EnvOverrideCount = %v, want 4", got.EnvOverrideCount) + } +} + +// TestGoldenGPUStatusMacOS validates macOS detection when running on Darwin +// Build tags: cgo+native_libs on Darwin +// Runtime scenarios: darwin 
+func TestGoldenGPUStatusMacOS(t *testing.T) { + if !worker.IsMacOS() { + t.Skip("Skipping macOS-specific test on non-Darwin platform") + } + + cfg := &worker.Config{ + GPUVendor: "apple", + AppleGPU: worker.AppleGPUConfig{Enabled: true}, + } + + factory := &worker.GPUDetectorFactory{} + result := factory.CreateDetectorWithInfo(cfg) + + buildTags := detectBuildTags() + + got := GoldenGPUStatus{ + GPUCount: result.Detector.DetectGPUCount(), + GPUType: string(result.Info.GPUType), + ConfiguredVendor: result.Info.ConfiguredVendor, + DetectionMethod: string(result.Info.DetectionMethod), + BuildTags: buildTags, + NativeAvailable: buildTags["darwin"], + } + + if got.ConfiguredVendor != "apple" { + t.Errorf("macOS: ConfiguredVendor = %v, want 'apple'", got.ConfiguredVendor) + } + if got.GPUType != "apple" { + t.Errorf("macOS: GPUType = %v, want 'apple'", got.GPUType) + } + if !got.BuildTags["darwin"] { + t.Error("macOS: darwin build tag should be true") + } +} + +// TestGoldenGPUStatusNone validates no-GPU configuration +// Build tags: all three +// Runtime scenarios: none +func TestGoldenGPUStatusNone(t *testing.T) { + cfg := &worker.Config{ + GPUVendor: "none", + } + + factory := &worker.GPUDetectorFactory{} + result := factory.CreateDetectorWithInfo(cfg) + + if result.Detector.DetectGPUCount() != 0 { + t.Errorf("none config: GPUCount = %v, want 0", result.Detector.DetectGPUCount()) + } + if result.Info.ConfiguredVendor != "none" { + t.Errorf("none config: ConfiguredVendor = %v, want 'none'", result.Info.ConfiguredVendor) + } +} + +// TestGoldenJSONSerialization validates the GPU status serializes to JSON correctly +func TestGoldenJSONSerialization(t *testing.T) { + os.Setenv("FETCH_ML_GPU_TYPE", "nvidia") + os.Setenv("FETCH_ML_GPU_COUNT", "2") + defer os.Unsetenv("FETCH_ML_GPU_TYPE") + defer os.Unsetenv("FETCH_ML_GPU_COUNT") + + factory := &worker.GPUDetectorFactory{} + result := factory.CreateDetectorWithInfo(nil) + + status := GoldenGPUStatus{ + GPUCount: 
result.Detector.DetectGPUCount(), + GPUType: string(result.Info.GPUType), + ConfiguredVendor: result.Info.ConfiguredVendor, + DetectionMethod: string(result.Info.DetectionMethod), + EnvOverrideType: result.Info.EnvOverrideType, + EnvOverrideCount: result.Info.EnvOverrideCount, + BuildTags: detectBuildTags(), + } + + // Serialize to JSON (this mimics what ml status --json would output) + jsonData, err := json.MarshalIndent(status, "", " ") + if err != nil { + t.Fatalf("JSON serialization failed: %v", err) + } + + // Verify JSON can be parsed back + var parsed GoldenGPUStatus + if err := json.Unmarshal(jsonData, &parsed); err != nil { + t.Fatalf("JSON deserialization failed: %v", err) + } + + if parsed.ConfiguredVendor != status.ConfiguredVendor { + t.Errorf("JSON roundtrip: ConfiguredVendor mismatch") + } + if parsed.DetectionMethod != status.DetectionMethod { + t.Errorf("JSON roundtrip: DetectionMethod mismatch") + } +} + +// TestBuildTagMatrix validates that all expected build tag combinations are testable +// This test documents the three build configurations: +// 1. cgo + native_libs: Real native library implementations +// 2. cgo without native_libs: Stubs that return errors +// 3. 
!cgo: Stubs that return "disabled (no CGO)" +func TestBuildTagMatrix(t *testing.T) { + tags := detectBuildTags() + + // Log the current build configuration for CI visibility + t.Logf("Build configuration: cgo=%v native_libs=%v darwin=%v linux=%v", + tags["cgo"], tags["native_libs"], tags["darwin"], tags["linux"]) + + // Validate SIMD implementation name matches build tags + simdName := worker.GetSIMDImplName() + t.Logf("SIMD implementation: %s", simdName) + + switch { + case tags["native_libs"]: + // Should have real implementation name (avx2, sha_ni, armv8_crypto, or generic) + if simdName == "disabled" || simdName == "disabled (no CGO)" { + t.Errorf("native_libs build: SIMD impl should be active, got %q", simdName) + } + case tags["cgo"]: + // Should be disabled without native_libs + if simdName != "disabled" { + t.Errorf("cgo-only build: SIMD impl should be 'disabled', got %q", simdName) + } + default: + // No CGO + if simdName != "disabled (no CGO)" { + t.Errorf("nocgo build: SIMD impl should be 'disabled (no CGO)', got %q", simdName) + } + } +} diff --git a/tests/unit/worker/artifacts_test.go b/tests/unit/worker/artifacts_test.go index dd527f4..a99df85 100644 --- a/tests/unit/worker/artifacts_test.go +++ b/tests/unit/worker/artifacts_test.go @@ -30,7 +30,7 @@ func TestScanArtifacts_SkipsKnownPathsAndLogs(t *testing.T) { mustWrite("checkpoints/best.pt", []byte("checkpoint")) mustWrite("plots/loss.png", []byte("png")) - art, err := worker.ScanArtifacts(runDir) + art, err := worker.ScanArtifacts(runDir, false) if err != nil { t.Fatalf("scanArtifacts: %v", err) }