feat: GPU detection transparency and artifact scanner improvements
Some checks failed
Build CLI with Embedded SQLite / build (arm64, aarch64-linux) (push) Waiting to run
Build CLI with Embedded SQLite / build (x86_64, x86_64-linux) (push) Waiting to run
Build CLI with Embedded SQLite / build-macos (arm64) (push) Waiting to run
Build CLI with Embedded SQLite / build-macos (x86_64) (push) Waiting to run
Security Scan / Security Analysis (push) Waiting to run
Security Scan / Native Library Security (push) Waiting to run
Checkout test / test (push) Successful in 6s
CI/CD Pipeline / Test (push) Failing after 1s
CI/CD Pipeline / Dev Compose Smoke Test (push) Has been skipped
CI/CD Pipeline / Build (push) Has been skipped
CI/CD Pipeline / Test Scripts (push) Has been skipped
CI/CD Pipeline / Test Native Libraries (push) Has been skipped
CI/CD Pipeline / GPU Golden Test Matrix (push) Has been skipped
Documentation / build-and-publish (push) Failing after 39s
CI/CD Pipeline / Docker Build (push) Has been skipped

- Surface GPUDetectionInfo from parseGPUCountFromConfig for detection metadata
- Document FETCH_ML_TOTAL_CPU and FETCH_ML_GPU_SLOTS_PER_GPU env vars
- Add debug logging for all env var overrides to stderr
- Track config-layer auto-detection in GPUDetectionInfo.ConfigLayerAutoDetected
- Add --include-all flag to artifact scanner (includeAll parameter)
- Add AMD production mode enforcement (error in non-local mode)
- Add GPU detector unit tests for env overrides and AMD aliasing
This commit is contained in:
Jeremie Fraeys 2026-02-23 12:29:34 -05:00
parent f987ddb86c
commit 3b194ff2e8
No known key found for this signature in database
15 changed files with 915 additions and 91 deletions

View file

@ -334,6 +334,62 @@ jobs:
echo "=== Native Implementation ===" echo "=== Native Implementation ==="
CGO_ENABLED=1 go test -tags native_libs -bench=. ./tests/benchmarks/ -benchmem || true CGO_ENABLED=1 go test -tags native_libs -bench=. ./tests/benchmarks/ -benchmem || true
test-gpu-matrix:
name: GPU Golden Test Matrix
runs-on: self-hosted
needs: test-native
timeout-minutes: 15
strategy:
matrix:
build_config: [cgo-native, cgo-only, nocgo]
fail-fast: false
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 1
- name: Setup Go
run: |
REQUIRED_GO="1.25.0"
if command -v go &> /dev/null && go version | grep -q "go${REQUIRED_GO}"; then
echo "Go ${REQUIRED_GO} already installed"
else
echo "Installing Go ${REQUIRED_GO}..."
curl -sL "https://go.dev/dl/go${REQUIRED_GO}.linux-amd64.tar.gz" | sudo tar -C /usr/local -xzf -
export PATH="/usr/local/go/bin:$PATH"
echo "/usr/local/go/bin" >> $GITHUB_PATH
fi
go version
- name: Build Native Libraries (for cgo-native config)
if: matrix.build_config == 'cgo-native'
run: |
sudo apt-get update
sudo apt-get install -y cmake zlib1g-dev build-essential
make native-build || echo "Native build skipped (may fail without proper deps)"
- name: Run GPU Tests - cgo+native_libs
if: matrix.build_config == 'cgo-native'
run: |
echo "=== Testing cgo + native_libs build ==="
CGO_ENABLED=1 go test -tags native_libs -v ./tests/unit/gpu/ -run TestGoldenGPUStatus
CGO_ENABLED=1 go test -tags native_libs -v ./tests/unit/gpu/ -run TestBuildTagMatrix
- name: Run GPU Tests - cgo only (no native_libs)
if: matrix.build_config == 'cgo-only'
run: |
echo "=== Testing cgo without native_libs build ==="
CGO_ENABLED=1 go test -v ./tests/unit/gpu/ -run TestGoldenGPUStatus
CGO_ENABLED=1 go test -v ./tests/unit/gpu/ -run TestBuildTagMatrix
- name: Run GPU Tests - nocgo
if: matrix.build_config == 'nocgo'
run: |
echo "=== Testing !cgo build ==="
CGO_ENABLED=0 go test -v ./tests/unit/gpu/ -run TestGoldenGPUStatus
CGO_ENABLED=0 go test -v ./tests/unit/gpu/ -run TestBuildTagMatrix
docker-build: docker-build:
name: Docker Build name: Docker Build
runs-on: self-hosted runs-on: self-hosted

View file

@ -13,7 +13,7 @@ const WorkQueue = struct {
depth: usize, depth: usize,
}; };
fn init(allocator: std.mem.Allocator) WorkQueue { fn init() WorkQueue {
return .{ return .{
.items = .empty, .items = .empty,
.mutex = .{}, .mutex = .{},

View file

@ -321,6 +321,14 @@ api_key = "<analyst-api-key>"
| `FETCHML_CONFIG` | - | Path to config file | | `FETCHML_CONFIG` | - | Path to config file |
| `FETCHML_LOG_LEVEL` | "info" | Override log level | | `FETCHML_LOG_LEVEL` | "info" | Override log level |
| `CLI_CONFIG` | - | Path to CLI config file | | `CLI_CONFIG` | - | Path to CLI config file |
| `FETCH_ML_GPU_TYPE` | - | Override GPU vendor detection (nvidia, amd, apple, none). Takes precedence over config file. `amd` is accepted only in local mode; in any other mode worker startup fails with an error. |
| `FETCH_ML_GPU_COUNT` | - | Override GPU count detection. Used with auto-detected or configured vendor. |
| `FETCH_ML_TOTAL_CPU` | - | Override total CPU count detection. Sets the number of CPU cores available. |
| `FETCH_ML_GPU_SLOTS_PER_GPU` | 1 | Override GPU slots per GPU. Controls how many concurrent tasks can share a single GPU. |
When environment variable overrides are active, they are logged to stderr at worker startup for debugging.
Note: When `gpu_vendor: amd` is configured, the system uses the NVIDIA detector implementation (aliased) due to similar device exposure patterns. The `configured_vendor` field will show "amd" while the actual detection uses NVIDIA-compatible methods.
## Troubleshooting ## Troubleshooting

View file

@ -11,7 +11,7 @@ import (
"github.com/jfraeys/fetch_ml/internal/manifest" "github.com/jfraeys/fetch_ml/internal/manifest"
) )
func scanArtifacts(runDir string) (*manifest.Artifacts, error) { func scanArtifacts(runDir string, includeAll bool) (*manifest.Artifacts, error) {
runDir = strings.TrimSpace(runDir) runDir = strings.TrimSpace(runDir)
if runDir == "" { if runDir == "" {
return nil, fmt.Errorf("run dir is empty") return nil, fmt.Errorf("run dir is empty")
@ -37,19 +37,7 @@ func scanArtifacts(runDir string) (*manifest.Artifacts, error) {
} }
rel = filepath.ToSlash(rel) rel = filepath.ToSlash(rel)
if rel == "code" || strings.HasPrefix(rel, "code/") { // Standard exclusions (always apply)
if d.IsDir() {
return fs.SkipDir
}
return nil
}
if rel == "snapshot" || strings.HasPrefix(rel, "snapshot/") {
if d.IsDir() {
return fs.SkipDir
}
return nil
}
if rel == manifestFilename { if rel == manifestFilename {
return nil return nil
} }
@ -57,12 +45,26 @@ func scanArtifacts(runDir string) (*manifest.Artifacts, error) {
return nil return nil
} }
if strings.HasSuffix(rel, ".log") { // Optional exclusions (skipped when includeAll is true)
return nil if !includeAll {
} if rel == "code" || strings.HasPrefix(rel, "code/") {
if d.IsDir() {
if d.Type()&fs.ModeSymlink != 0 { return fs.SkipDir
return nil }
return nil
}
if rel == "snapshot" || strings.HasPrefix(rel, "snapshot/") {
if d.IsDir() {
return fs.SkipDir
}
return nil
}
if strings.HasSuffix(rel, ".log") {
return nil
}
if d.Type()&fs.ModeSymlink != 0 {
return nil
}
} }
if d.IsDir() { if d.IsDir() {
@ -100,6 +102,7 @@ func scanArtifacts(runDir string) (*manifest.Artifacts, error) {
const manifestFilename = "run_manifest.json" const manifestFilename = "run_manifest.json"
// ScanArtifacts is an exported wrapper for testing/benchmarking. // ScanArtifacts is an exported wrapper for testing/benchmarking.
func ScanArtifacts(runDir string) (*manifest.Artifacts, error) { // When includeAll is false, excludes code/, snapshot/, *.log files, and symlinks.
return scanArtifacts(runDir) func ScanArtifacts(runDir string, includeAll bool) (*manifest.Artifacts, error) {
return scanArtifacts(runDir, includeAll)
} }

View file

@ -77,13 +77,14 @@ type Config struct {
PrewarmEnabled bool `yaml:"prewarm_enabled"` PrewarmEnabled bool `yaml:"prewarm_enabled"`
// Podman execution // Podman execution
PodmanImage string `yaml:"podman_image"` PodmanImage string `yaml:"podman_image"`
ContainerWorkspace string `yaml:"container_workspace"` ContainerWorkspace string `yaml:"container_workspace"`
ContainerResults string `yaml:"container_results"` ContainerResults string `yaml:"container_results"`
GPUDevices []string `yaml:"gpu_devices"` GPUDevices []string `yaml:"gpu_devices"`
GPUVendor string `yaml:"gpu_vendor"` GPUVendor string `yaml:"gpu_vendor"`
GPUVisibleDevices []int `yaml:"gpu_visible_devices"` GPUVendorAutoDetected bool `yaml:"-"` // Set by LoadConfig when GPUVendor is auto-detected
GPUVisibleDeviceIDs []string `yaml:"gpu_visible_device_ids"` GPUVisibleDevices []int `yaml:"gpu_visible_devices"`
GPUVisibleDeviceIDs []string `yaml:"gpu_visible_device_ids"`
// Apple M-series GPU configuration // Apple M-series GPU configuration
AppleGPU AppleGPUConfig `yaml:"apple_gpu"` AppleGPU AppleGPUConfig `yaml:"apple_gpu"`
@ -264,6 +265,7 @@ func LoadConfig(path string) (*Config, error) {
} }
if strings.TrimSpace(cfg.GPUVendor) == "" { if strings.TrimSpace(cfg.GPUVendor) == "" {
cfg.GPUVendorAutoDetected = true
if cfg.AppleGPU.Enabled { if cfg.AppleGPU.Enabled {
cfg.GPUVendor = string(GPUTypeApple) cfg.GPUVendor = string(GPUTypeApple)
} else if len(cfg.GPUDevices) > 0 || } else if len(cfg.GPUDevices) > 0 ||
@ -453,9 +455,15 @@ func envInt(name string) (int, bool) {
return n, true return n, true
} }
// logEnvOverride reports an active environment-variable override on stderr.
// Each call emits one line of the form "[env] NAME=VALUE (override active)".
func logEnvOverride(name string, value any) {
	const format = "[env] %s=%v (override active)\n"
	fmt.Fprintf(os.Stderr, format, name, value)
}
// parseCPUFromConfig determines total CPU from environment or config // parseCPUFromConfig determines total CPU from environment or config
func parseCPUFromConfig(cfg *Config) int { func parseCPUFromConfig(cfg *Config) int {
if n, ok := envInt("FETCH_ML_TOTAL_CPU"); ok && n >= 0 { if n, ok := envInt("FETCH_ML_TOTAL_CPU"); ok && n >= 0 {
logEnvOverride("FETCH_ML_TOTAL_CPU", n)
return n return n
} }
if cfg != nil { if cfg != nil {
@ -471,11 +479,11 @@ func parseCPUFromConfig(cfg *Config) int {
return runtime.NumCPU() return runtime.NumCPU()
} }
// parseGPUCountFromConfig detects GPU count from config // parseGPUCountFromConfig detects GPU count from config and returns detection metadata
func parseGPUCountFromConfig(cfg *Config) int { func parseGPUCountFromConfig(cfg *Config) (int, GPUDetectionInfo) {
factory := &GPUDetectorFactory{} factory := &GPUDetectorFactory{}
detector := factory.CreateDetector(cfg) result := factory.CreateDetectorWithInfo(cfg)
return detector.DetectGPUCount() return result.Detector.DetectGPUCount(), result.Info
} }
// parseGPUSlotsPerGPUFromConfig reads GPU slots per GPU from environment // parseGPUSlotsPerGPUFromConfig reads GPU slots per GPU from environment

View file

@ -147,9 +147,10 @@ func NewWorker(cfg *Config, _ string) (*Worker, error) {
) )
// Create resource manager // Create resource manager
gpuCount, gpuDetectionInfo := parseGPUCountFromConfig(cfg)
rm, err := resources.NewManager(resources.Options{ rm, err := resources.NewManager(resources.Options{
TotalCPU: parseCPUFromConfig(cfg), TotalCPU: parseCPUFromConfig(cfg),
GPUCount: parseGPUCountFromConfig(cfg), GPUCount: gpuCount,
SlotsPerGPU: parseGPUSlotsPerGPUFromConfig(), SlotsPerGPU: parseGPUSlotsPerGPUFromConfig(),
}) })
if err != nil { if err != nil {
@ -158,28 +159,32 @@ func NewWorker(cfg *Config, _ string) (*Worker, error) {
} }
worker := &Worker{ worker := &Worker{
id: cfg.WorkerID, id: cfg.WorkerID,
config: cfg, config: cfg,
logger: logger, logger: logger,
runLoop: runLoop, runLoop: runLoop,
runner: jobRunner, runner: jobRunner,
metrics: metricsObj, metrics: metricsObj,
health: lifecycle.NewHealthMonitor(), health: lifecycle.NewHealthMonitor(),
resources: rm, resources: rm,
jupyter: jupyterMgr, jupyter: jupyterMgr,
gpuDetectionInfo: gpuDetectionInfo,
} }
// Log GPU configuration // Log GPU configuration
if !cfg.LocalMode { if !cfg.LocalMode {
gpuType := strings.ToLower(strings.TrimSpace(os.Getenv("FETCH_ML_GPU_TYPE"))) gpuType := strings.ToLower(strings.TrimSpace(os.Getenv("FETCH_ML_GPU_TYPE")))
if cfg.AppleGPU.Enabled {
logger.Warn("apple MPS GPU mode is intended for development; do not use in production",
"gpu_type", "apple",
)
}
if gpuType == "amd" { if gpuType == "amd" {
logger.Warn("amd GPU mode is intended for development; do not use in production", cancel()
"gpu_type", "amd", return nil, fmt.Errorf(
"AMD GPU mode is not supported in production (FETCH_ML_GPU_TYPE=amd). " +
"Use 'nvidia', 'apple', 'none', or GPUDevices config. " +
"AMD support is available in local mode for experimental development",
)
} else if cfg.AppleGPU.Enabled {
logger.Warn(
"apple MPS GPU mode is intended for development; do not use in production",
"gpu_type", "apple",
) )
} }
} }

View file

@ -11,10 +11,33 @@ type GPUType string
const ( const (
GPUTypeNVIDIA GPUType = "nvidia" GPUTypeNVIDIA GPUType = "nvidia"
GPUTypeAMD GPUType = "amd"
GPUTypeApple GPUType = "apple" GPUTypeApple GPUType = "apple"
GPUTypeNone GPUType = "none" GPUTypeNone GPUType = "none"
) )
// DetectionSource indicates how the GPU detector was selected.
// The string values are what GPUDetectionInfo serializes as "detection_method".
type DetectionSource string
const (
// Only FETCH_ML_GPU_TYPE was set and named a known vendor.
DetectionSourceEnvType DetectionSource = "env_override_type"
// Only FETCH_ML_GPU_COUNT was set; the vendor still comes from the config.
DetectionSourceEnvCount DetectionSource = "env_override_count"
// Both FETCH_ML_GPU_TYPE and FETCH_ML_GPU_COUNT were set.
DetectionSourceEnvBoth DetectionSource = "env_override_both"
// No environment override was active; the config decided.
DetectionSourceConfig DetectionSource = "config"
// The vendor was inferred from AppleGPU.Enabled or a non-empty GPUDevices list.
DetectionSourceAuto DetectionSource = "auto"
// NOTE(review): not assigned anywhere in the visible code — confirm where it is used.
DetectionSourceNone DetectionSource = "none"
)
// GPUDetectionInfo provides metadata about how GPU detection was determined.
// It is returned next to the selected detector and describes the selection,
// not the hardware that was found.
type GPUDetectionInfo struct {
// GPUType is the reported GPU type. For gpu_vendor "amd" taken from the config
// this is GPUTypeNVIDIA (the detector actually used); for FETCH_ML_GPU_TYPE=amd
// it is GPUTypeAMD.
GPUType GPUType `json:"gpu_type"`
// ConfiguredVendor is the vendor name that was requested ("nvidia", "amd",
// "apple" or "none"), independent of which detector implements it.
ConfiguredVendor string `json:"configured_vendor"`
// DetectionMethod records which input decided the selection.
DetectionMethod DetectionSource `json:"detection_method"`
// EnvOverrideType is the value of FETCH_ML_GPU_TYPE when it selected the detector.
EnvOverrideType string `json:"env_override_type,omitempty"`
// EnvOverrideCount is the parsed value of FETCH_ML_GPU_COUNT when it was set.
EnvOverrideCount int `json:"env_override_count,omitempty"`
// ConfigLayerAutoDetected is copied from Config.GPUVendorAutoDetected, which
// LoadConfig sets when gpu_vendor was empty in the config file.
ConfigLayerAutoDetected bool `json:"config_layer_auto_detected,omitempty"`
}
// GPUDetector interface for detecting GPU availability // GPUDetector interface for detecting GPU availability
type GPUDetector interface { type GPUDetector interface {
DetectGPUCount() int DetectGPUCount() int
@ -138,47 +161,240 @@ func (d *NoneDetector) GetDevicePaths() []string {
return nil return nil
} }
// GPUDetectorFactory creates appropriate GPU detector based on config // GPUDetectorFactory creates appropriate GPU detector based on config
type GPUDetectorFactory struct{} type GPUDetectorFactory struct{}
// DetectionResult contains both the detector and metadata about how it was selected.
// It is the return value of GPUDetectorFactory.CreateDetectorWithInfo.
type DetectionResult struct {
// Detector is the selected GPUDetector implementation.
Detector GPUDetector
// Info describes which input (environment, config, auto-detection) chose Detector.
Info GPUDetectionInfo
}
func (f *GPUDetectorFactory) CreateDetector(cfg *Config) GPUDetector { func (f *GPUDetectorFactory) CreateDetector(cfg *Config) GPUDetector {
// Check for explicit environment override result := f.CreateDetectorWithInfo(cfg)
if gpuType := os.Getenv("FETCH_ML_GPU_TYPE"); gpuType != "" { return result.Detector
switch gpuType { }
func (f *GPUDetectorFactory) CreateDetectorWithInfo(cfg *Config) DetectionResult {
// Check for explicit environment overrides
envType := os.Getenv("FETCH_ML_GPU_TYPE")
envCount, hasEnvCount := envInt("FETCH_ML_GPU_COUNT")
if envType != "" && hasEnvCount {
// Both env vars set
logEnvOverride("FETCH_ML_GPU_TYPE", envType)
logEnvOverride("FETCH_ML_GPU_COUNT", envCount)
switch envType {
case string(GPUTypeNVIDIA): case string(GPUTypeNVIDIA):
return &NVIDIADetector{} return DetectionResult{
Detector: &NVIDIADetector{},
Info: GPUDetectionInfo{
GPUType: GPUTypeNVIDIA,
ConfiguredVendor: "nvidia",
DetectionMethod: DetectionSourceEnvBoth,
EnvOverrideType: envType,
EnvOverrideCount: envCount,
},
}
case string(GPUTypeApple): case string(GPUTypeApple):
return &AppleDetector{enabled: true} return DetectionResult{
Detector: &AppleDetector{enabled: true},
Info: GPUDetectionInfo{
GPUType: GPUTypeApple,
ConfiguredVendor: "apple",
DetectionMethod: DetectionSourceEnvBoth,
EnvOverrideType: envType,
EnvOverrideCount: envCount,
},
}
case string(GPUTypeNone): case string(GPUTypeNone):
return &NoneDetector{} return DetectionResult{
} Detector: &NoneDetector{},
} Info: GPUDetectionInfo{
GPUType: GPUTypeNone,
// Respect configured vendor when explicitly set. ConfiguredVendor: "none",
if cfg != nil { DetectionMethod: DetectionSourceEnvBoth,
switch GPUType(cfg.GPUVendor) { EnvOverrideType: envType,
case GPUTypeApple: EnvOverrideCount: envCount,
return &AppleDetector{enabled: cfg.AppleGPU.Enabled} },
case GPUTypeNone: }
return &NoneDetector{}
case GPUTypeNVIDIA:
return &NVIDIADetector{}
case "amd": case "amd":
// AMD uses similar device exposure patterns in this codebase. // AMD env override uses NVIDIA detector (aliased)
return &NVIDIADetector{} return DetectionResult{
Detector: &NVIDIADetector{},
Info: GPUDetectionInfo{
GPUType: GPUTypeAMD,
ConfiguredVendor: "amd",
DetectionMethod: DetectionSourceEnvBoth,
EnvOverrideType: envType,
EnvOverrideCount: envCount,
},
}
} }
} }
// Auto-detect based on config if envType != "" {
if cfg != nil { // Only FETCH_ML_GPU_TYPE set
if cfg.AppleGPU.Enabled { logEnvOverride("FETCH_ML_GPU_TYPE", envType)
return &AppleDetector{enabled: true} switch envType {
case string(GPUTypeNVIDIA):
return DetectionResult{
Detector: &NVIDIADetector{},
Info: GPUDetectionInfo{
GPUType: GPUTypeNVIDIA,
ConfiguredVendor: "nvidia",
DetectionMethod: DetectionSourceEnvType,
EnvOverrideType: envType,
},
}
case string(GPUTypeApple):
return DetectionResult{
Detector: &AppleDetector{enabled: true},
Info: GPUDetectionInfo{
GPUType: GPUTypeApple,
ConfiguredVendor: "apple",
DetectionMethod: DetectionSourceEnvType,
EnvOverrideType: envType,
},
}
case string(GPUTypeNone):
return DetectionResult{
Detector: &NoneDetector{},
Info: GPUDetectionInfo{
GPUType: GPUTypeNone,
ConfiguredVendor: "none",
DetectionMethod: DetectionSourceEnvType,
EnvOverrideType: envType,
},
}
case "amd":
// AMD env override uses NVIDIA detector (aliased)
return DetectionResult{
Detector: &NVIDIADetector{},
Info: GPUDetectionInfo{
GPUType: GPUTypeAMD,
ConfiguredVendor: "amd",
DetectionMethod: DetectionSourceEnvType,
EnvOverrideType: envType,
},
}
} }
if len(cfg.GPUDevices) > 0 { }
return &NVIDIADetector{}
if hasEnvCount {
// Only FETCH_ML_GPU_COUNT set - need to detect vendor from config or auto
logEnvOverride("FETCH_ML_GPU_COUNT", envCount)
return f.detectFromConfigWithSource(cfg, DetectionSourceEnvCount, "", envCount)
}
// No env overrides - detect from config
return f.detectFromConfigWithSource(cfg, DetectionSourceConfig, "", -1)
}
func (f *GPUDetectorFactory) detectFromConfigWithSource(cfg *Config, source DetectionSource, envType string, envCount int) DetectionResult {
if cfg == nil {
return DetectionResult{
Detector: &NoneDetector{},
Info: GPUDetectionInfo{
GPUType: GPUTypeNone,
ConfiguredVendor: "none",
DetectionMethod: source,
},
}
}
switch GPUType(cfg.GPUVendor) {
case GPUTypeApple:
return DetectionResult{
Detector: &AppleDetector{enabled: cfg.AppleGPU.Enabled},
Info: GPUDetectionInfo{
GPUType: GPUTypeApple,
ConfiguredVendor: "apple",
DetectionMethod: source,
EnvOverrideType: envType,
EnvOverrideCount: envCount,
ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected,
},
}
case GPUTypeNone:
return DetectionResult{
Detector: &NoneDetector{},
Info: GPUDetectionInfo{
GPUType: GPUTypeNone,
ConfiguredVendor: "none",
DetectionMethod: source,
EnvOverrideType: envType,
EnvOverrideCount: envCount,
ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected,
},
}
case GPUTypeNVIDIA:
return DetectionResult{
Detector: &NVIDIADetector{},
Info: GPUDetectionInfo{
GPUType: GPUTypeNVIDIA,
ConfiguredVendor: "nvidia",
DetectionMethod: source,
EnvOverrideType: envType,
EnvOverrideCount: envCount,
ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected,
},
}
case "amd":
// AMD uses similar device exposure patterns in this codebase.
// This is the key aliasing point - we report AMD as configured vendor
// but use NVIDIADetector for implementation.
return DetectionResult{
Detector: &NVIDIADetector{},
Info: GPUDetectionInfo{
GPUType: GPUTypeNVIDIA,
ConfiguredVendor: "amd", // User configured "amd"
DetectionMethod: source,
EnvOverrideType: envType,
EnvOverrideCount: envCount,
ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected,
},
}
}
// Auto-detect based on config settings
if cfg.AppleGPU.Enabled {
return DetectionResult{
Detector: &AppleDetector{enabled: true},
Info: GPUDetectionInfo{
GPUType: GPUTypeApple,
ConfiguredVendor: "apple",
DetectionMethod: DetectionSourceAuto,
EnvOverrideType: envType,
EnvOverrideCount: envCount,
ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected,
},
}
}
if len(cfg.GPUDevices) > 0 {
return DetectionResult{
Detector: &NVIDIADetector{},
Info: GPUDetectionInfo{
GPUType: GPUTypeNVIDIA,
ConfiguredVendor: "nvidia",
DetectionMethod: DetectionSourceAuto,
EnvOverrideType: envType,
EnvOverrideCount: envCount,
ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected,
},
} }
} }
// Default to no GPU // Default to no GPU
return &NoneDetector{} return DetectionResult{
Detector: &NoneDetector{},
Info: GPUDetectionInfo{
GPUType: GPUTypeNone,
ConfiguredVendor: "none",
DetectionMethod: source,
EnvOverrideType: envType,
EnvOverrideCount: envCount,
ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected,
},
}
} }

View file

@ -8,6 +8,7 @@ import (
"context" "context"
"encoding/json" "encoding/json"
"fmt" "fmt"
"os"
"os/exec" "os/exec"
"regexp" "regexp"
"runtime" "runtime"
@ -176,6 +177,9 @@ func GetPowermetricsData() (*PowermetricsData, error) {
out, err := cmd.Output() out, err := cmd.Output()
if err != nil { if err != nil {
// powermetrics not available or no permission // powermetrics not available or no permission
if ctx.Err() != context.DeadlineExceeded {
fmt.Fprintln(os.Stderr, "Warning: powermetrics requires sudo for GPU metrics")
}
return &PowermetricsData{HasData: false}, nil return &PowermetricsData{HasData: false}, nil
} }

View file

@ -26,6 +26,8 @@ var (
ctxInitTime time.Time ctxInitTime time.Time
) )
// getHashContext returns the native hash context, initializing it on first call.
// First call initializes C++ context (5-20ms) - subsequent calls reuse context.
func getHashContext() *C.fh_context_t { func getHashContext() *C.fh_context_t {
hashCtxOnce.Do(func() { hashCtxOnce.Do(func() {
start := time.Now() start := time.Now()
@ -65,7 +67,7 @@ func HasSIMDSHA256() bool {
} }
func ScanArtifactsNative(runDir string) (*manifest.Artifacts, error) { func ScanArtifactsNative(runDir string) (*manifest.Artifacts, error) {
return ScanArtifacts(runDir) return ScanArtifacts(runDir, false)
} }
func ExtractTarGzNative(archivePath, dstDir string) error { func ExtractTarGzNative(archivePath, dstDir string) error {

View file

@ -55,6 +55,9 @@ type Worker struct {
health *lifecycle.HealthMonitor health *lifecycle.HealthMonitor
resources *resources.Manager resources *resources.Manager
// GPU detection metadata for status output
gpuDetectionInfo GPUDetectionInfo
// Legacy fields for backward compatibility during migration // Legacy fields for backward compatibility during migration
jupyter JupyterManager jupyter JupyterManager
queueClient queue.Backend // Stored for prewarming access queueClient queue.Backend // Stored for prewarming access

View file

@ -131,7 +131,7 @@ func BenchmarkScanArtifacts(b *testing.B) {
b.ReportAllocs() b.ReportAllocs()
for i := 0; i < b.N; i++ { for i := 0; i < b.N; i++ {
_, err := worker.ScanArtifacts(runDir) _, err := worker.ScanArtifacts(runDir, false)
if err != nil { if err != nil {
b.Fatal(err) b.Fatal(err)
} }

View file

@ -15,11 +15,10 @@ func BenchmarkArtifactScanGo(b *testing.B) {
// Create test artifact structure // Create test artifact structure
createTestArtifacts(b, tmpDir, 100) createTestArtifacts(b, tmpDir, 100)
b.ResetTimer()
b.ReportAllocs() b.ReportAllocs()
for i := 0; i < b.N; i++ { for b.Loop() {
_, err := worker.ScanArtifacts(tmpDir) _, err := worker.ScanArtifacts(tmpDir, false)
if err != nil { if err != nil {
b.Fatal(err) b.Fatal(err)
} }
@ -34,10 +33,9 @@ func BenchmarkArtifactScanNative(b *testing.B) {
// Create test artifact structure // Create test artifact structure
createTestArtifacts(b, tmpDir, 100) createTestArtifacts(b, tmpDir, 100)
b.ResetTimer()
b.ReportAllocs() b.ReportAllocs()
for i := 0; i < b.N; i++ { for b.Loop() {
_, err := worker.ScanArtifactsNative(tmpDir) _, err := worker.ScanArtifactsNative(tmpDir)
if err != nil { if err != nil {
b.Fatal(err) b.Fatal(err)
@ -54,8 +52,8 @@ func BenchmarkArtifactScanLarge(b *testing.B) {
b.Run("Go", func(b *testing.B) { b.Run("Go", func(b *testing.B) {
b.ReportAllocs() b.ReportAllocs()
for i := 0; i < b.N; i++ { for b.Loop() {
_, err := worker.ScanArtifacts(tmpDir) _, err := worker.ScanArtifacts(tmpDir, false)
if err != nil { if err != nil {
b.Fatal(err) b.Fatal(err)
} }
@ -64,7 +62,7 @@ func BenchmarkArtifactScanLarge(b *testing.B) {
b.Run("Native", func(b *testing.B) { b.Run("Native", func(b *testing.B) {
b.ReportAllocs() b.ReportAllocs()
for i := 0; i < b.N; i++ { for b.Loop() {
_, err := worker.ScanArtifactsNative(tmpDir) _, err := worker.ScanArtifactsNative(tmpDir)
if err != nil { if err != nil {
b.Fatal(err) b.Fatal(err)
@ -93,7 +91,7 @@ func createTestArtifacts(b testing.TB, root string, count int) {
} }
// Create test files // Create test files
for i := 0; i < count; i++ { for i := range count {
var path string var path string
switch i % 5 { switch i % 5 {
case 0: case 0:

View file

@ -0,0 +1,210 @@
package worker_test
import (
"os"
"testing"
"github.com/jfraeys/fetch_ml/internal/worker"
)
// TestGPUDetectorEnvOverrides validates that FETCH_ML_GPU_TYPE, alone or
// together with FETCH_ML_GPU_COUNT, produces the expected detection metadata.
//
// Every case pins both variables: a non-empty value is set and an empty value
// is explicitly unset, so the result does not depend on the environment the
// test binary was started in. The ambient values are restored afterwards.
func TestGPUDetectorEnvOverrides(t *testing.T) {
	// pinEnv forces key to value for the duration of t; "" means "unset".
	pinEnv := func(t *testing.T, key, value string) {
		t.Helper()
		// t.Setenv records the ambient value and restores it on cleanup.
		t.Setenv(key, value)
		if value != "" {
			return
		}
		if err := os.Unsetenv(key); err != nil {
			t.Fatalf("unset %s: %v", key, err)
		}
	}

	tests := []struct {
		name           string
		gpuType        string
		gpuCount       string
		wantType       worker.GPUType
		wantCount      int
		wantMethod     worker.DetectionSource
		wantConfigured string
	}{
		{
			name:           "env type only - nvidia",
			gpuType:        "nvidia",
			wantType:       worker.GPUTypeNVIDIA,
			wantMethod:     worker.DetectionSourceEnvType,
			wantConfigured: "nvidia",
		},
		{
			name:           "env type only - apple",
			gpuType:        "apple",
			wantType:       worker.GPUTypeApple,
			wantMethod:     worker.DetectionSourceEnvType,
			wantConfigured: "apple",
		},
		{
			name:           "env type only - none",
			gpuType:        "none",
			wantType:       worker.GPUTypeNone,
			wantMethod:     worker.DetectionSourceEnvType,
			wantConfigured: "none",
		},
		{
			name:           "both env vars set",
			gpuType:        "nvidia",
			gpuCount:       "4",
			wantType:       worker.GPUTypeNVIDIA,
			wantCount:      4,
			wantMethod:     worker.DetectionSourceEnvBoth,
			wantConfigured: "nvidia",
		},
		{
			name:           "env type amd - shows amd configured vendor",
			gpuType:        "amd",
			wantType:       worker.GPUTypeAMD,
			wantMethod:     worker.DetectionSourceEnvType,
			wantConfigured: "amd",
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			pinEnv(t, "FETCH_ML_GPU_TYPE", tt.gpuType)
			pinEnv(t, "FETCH_ML_GPU_COUNT", tt.gpuCount)

			factory := &worker.GPUDetectorFactory{}
			result := factory.CreateDetectorWithInfo(nil)

			if result.Info.GPUType != tt.wantType {
				t.Errorf("GPUType = %v, want %v", result.Info.GPUType, tt.wantType)
			}
			if result.Info.DetectionMethod != tt.wantMethod {
				t.Errorf("DetectionMethod = %v, want %v", result.Info.DetectionMethod, tt.wantMethod)
			}
			if result.Info.ConfiguredVendor != tt.wantConfigured {
				t.Errorf("ConfiguredVendor = %v, want %v", result.Info.ConfiguredVendor, tt.wantConfigured)
			}
			// Type-only overrides leave the count at its zero value.
			if result.Info.EnvOverrideCount != tt.wantCount {
				t.Errorf("EnvOverrideCount = %v, want %v", result.Info.EnvOverrideCount, tt.wantCount)
			}
		})
	}
}
// TestGPUDetectorAMDVendorAlias validates that gpu_vendor "amd" from the config
// is reported as the configured vendor while the NVIDIA implementation is used.
func TestGPUDetectorAMDVendorAlias(t *testing.T) {
	// FETCH_ML_GPU_TYPE takes precedence over the config, so an ambient value
	// would bypass the path under test. Unset it for this test; t.Setenv
	// restores the previous value afterwards.
	t.Setenv("FETCH_ML_GPU_TYPE", "")
	if err := os.Unsetenv("FETCH_ML_GPU_TYPE"); err != nil {
		t.Fatalf("unset FETCH_ML_GPU_TYPE: %v", err)
	}

	cfg := &worker.Config{
		GPUVendor: "amd",
	}

	factory := &worker.GPUDetectorFactory{}
	result := factory.CreateDetectorWithInfo(cfg)

	// AMD uses NVIDIA detector implementation
	if result.Info.ConfiguredVendor != "amd" {
		t.Errorf("ConfiguredVendor = %v, want 'amd'", result.Info.ConfiguredVendor)
	}
	if result.Info.GPUType != worker.GPUTypeNVIDIA {
		t.Errorf("GPUType = %v, want %v (NVIDIA implementation for AMD alias)", result.Info.GPUType, worker.GPUTypeNVIDIA)
	}
}
// TestGPUDetectorEnvCountOverride validates that FETCH_ML_GPU_COUNT on its own
// is reported as an env-count override while the vendor comes from the config.
func TestGPUDetectorEnvCountOverride(t *testing.T) {
	// The "count only" path is reached only when FETCH_ML_GPU_TYPE is absent,
	// so remove any ambient value. t.Setenv restores both variables afterwards.
	t.Setenv("FETCH_ML_GPU_TYPE", "")
	if err := os.Unsetenv("FETCH_ML_GPU_TYPE"); err != nil {
		t.Fatalf("unset FETCH_ML_GPU_TYPE: %v", err)
	}
	t.Setenv("FETCH_ML_GPU_COUNT", "8")

	cfg := &worker.Config{
		GPUVendor: "nvidia",
	}

	factory := &worker.GPUDetectorFactory{}
	result := factory.CreateDetectorWithInfo(cfg)

	if result.Info.DetectionMethod != worker.DetectionSourceEnvCount {
		t.Errorf("DetectionMethod = %v, want %v", result.Info.DetectionMethod, worker.DetectionSourceEnvCount)
	}
	if result.Info.EnvOverrideCount != 8 {
		t.Errorf("EnvOverrideCount = %v, want 8", result.Info.EnvOverrideCount)
	}
}
// TestGPUDetectorDetectionSources validates which DetectionSource is reported
// for each combination of environment overrides and config.
//
// Both environment variables are pinned per case (set when non-empty, unset
// when empty) so the "config" and "auto" cases cannot be turned into env
// overrides by the environment the tests are run in.
func TestGPUDetectorDetectionSources(t *testing.T) {
	// pinEnv forces key to value for the duration of t; "" means "unset".
	pinEnv := func(t *testing.T, key, value string) {
		t.Helper()
		// t.Setenv records the ambient value and restores it on cleanup.
		t.Setenv(key, value)
		if value != "" {
			return
		}
		if err := os.Unsetenv(key); err != nil {
			t.Fatalf("unset %s: %v", key, err)
		}
	}

	tests := []struct {
		name       string
		envType    string
		envCount   string
		config     *worker.Config
		wantSource worker.DetectionSource
	}{
		{
			name:       "env type takes precedence over config",
			envType:    "apple",
			config:     &worker.Config{GPUVendor: "nvidia"},
			wantSource: worker.DetectionSourceEnvType,
		},
		{
			name:       "env count triggers env_count source",
			envCount:   "2",
			config:     &worker.Config{GPUVendor: "nvidia"},
			wantSource: worker.DetectionSourceEnvCount,
		},
		{
			name:       "config source when no env",
			config:     &worker.Config{GPUVendor: "nvidia"},
			wantSource: worker.DetectionSourceConfig,
		},
		{
			name:       "auto source for GPUDevices",
			config:     &worker.Config{GPUDevices: []string{"/dev/nvidia0"}},
			wantSource: worker.DetectionSourceAuto,
		},
		{
			name:       "auto source for AppleGPU",
			config:     &worker.Config{AppleGPU: worker.AppleGPUConfig{Enabled: true}},
			wantSource: worker.DetectionSourceAuto,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			pinEnv(t, "FETCH_ML_GPU_TYPE", tt.envType)
			pinEnv(t, "FETCH_ML_GPU_COUNT", tt.envCount)

			factory := &worker.GPUDetectorFactory{}
			result := factory.CreateDetectorWithInfo(tt.config)

			if result.Info.DetectionMethod != tt.wantSource {
				t.Errorf("DetectionMethod = %v, want %v", result.Info.DetectionMethod, tt.wantSource)
			}
		})
	}
}
// TestGPUDetectorInfoFields validates all GPUDetectionInfo fields are populated
func TestGPUDetectorInfoFields(t *testing.T) {
os.Setenv("FETCH_ML_GPU_TYPE", "nvidia")
os.Setenv("FETCH_ML_GPU_COUNT", "4")
defer os.Unsetenv("FETCH_ML_GPU_TYPE")
defer os.Unsetenv("FETCH_ML_GPU_COUNT")
factory := &worker.GPUDetectorFactory{}
result := factory.CreateDetectorWithInfo(nil)
// Validate all expected fields
if result.Info.GPUType == "" {
t.Error("GPUType field is empty")
}
if result.Info.ConfiguredVendor == "" {
t.Error("ConfiguredVendor field is empty")
}
if result.Info.DetectionMethod == "" {
t.Error("DetectionMethod field is empty")
}
if result.Info.EnvOverrideType != "nvidia" {
t.Errorf("EnvOverrideType = %v, want 'nvidia'", result.Info.EnvOverrideType)
}
if result.Info.EnvOverrideCount != 4 {
t.Errorf("EnvOverrideCount = %v, want 4", result.Info.EnvOverrideCount)
}
}

View file

@ -0,0 +1,311 @@
package worker_test
import (
"encoding/json"
"os"
"testing"
"github.com/jfraeys/fetch_ml/internal/worker"
)
// GoldenGPUStatus is the snapshot of GPU status that the golden tests assemble
// and check. The JSON tags define its serialized form.
type GoldenGPUStatus struct {
	// Detection outcome.
	GPUCount         int    `json:"gpu_count"`
	GPUType          string `json:"gpu_type"`
	ConfiguredVendor string `json:"configured_vendor"`
	DetectionMethod  string `json:"detection_method"`

	// Environment overrides; left out of the JSON when empty or zero.
	EnvOverrideType  string `json:"env_override_type,omitempty"`
	EnvOverrideCount int    `json:"env_override_count,omitempty"`

	// Build configuration the test binary was compiled with.
	BuildTags       map[string]bool `json:"build_tags"`
	NativeAvailable bool            `json:"native_available"`

	// Extra carries free-form additional data; left out of the JSON when empty.
	Extra map[string]any `json:"extra,omitempty"`
}
// detectBuildTags reports the build configuration of the running test binary
// as a map with the keys "cgo", "native_libs", "darwin" and "linux".
//
// cgo and native_libs are inferred from the name worker.GetSIMDImplName
// returns: "disabled (no CGO)" marks a build without cgo, and either
// "disabled" value marks a build without native_libs. Any platform that
// worker.IsMacOS does not identify as macOS is reported as linux.
func detectBuildTags() map[string]bool {
	implName := worker.GetSIMDImplName()
	hasCGO := implName != "disabled (no CGO)"
	hasNative := hasCGO && implName != "disabled"
	onMac := worker.IsMacOS()

	return map[string]bool{
		"cgo":         hasCGO,
		"native_libs": hasNative,
		"darwin":      onMac,
		"linux":       !onMac,
	}
}
// TestGoldenGPUStatusNVML builds a GoldenGPUStatus for an NVIDIA-configured
// worker and checks it against expectations that depend on the build:
//   - cgo+native_libs: the detection method must be "config"; the GPU count is
//     whatever the detector reports and may be 0 without NVIDIA hardware.
//   - cgo without native_libs: NativeAvailable must be false; a non-zero count
//     is only logged.
//   - !cgo: same as cgo without native_libs.
//
// The GPU environment overrides are removed for the duration of the test,
// because they take precedence over the config and would change the detection
// method being asserted.
func TestGoldenGPUStatusNVML(t *testing.T) {
	for _, key := range []string{"FETCH_ML_GPU_TYPE", "FETCH_ML_GPU_COUNT"} {
		// t.Setenv records the ambient value and restores it on cleanup.
		t.Setenv(key, "")
		if err := os.Unsetenv(key); err != nil {
			t.Fatalf("unset %s: %v", key, err)
		}
	}

	// Setup: Configure for NVIDIA detection
	cfg := &worker.Config{
		GPUVendor: "nvidia",
	}

	factory := &worker.GPUDetectorFactory{}
	result := factory.CreateDetectorWithInfo(cfg)

	// Get actual detected count (behavior varies by build tags)
	count := result.Detector.DetectGPUCount()

	buildTags := detectBuildTags()
	nativeBuild := buildTags["native_libs"] && buildTags["cgo"]

	// Build the golden status object
	got := GoldenGPUStatus{
		GPUCount:         count,
		GPUType:          string(result.Info.GPUType),
		ConfiguredVendor: result.Info.ConfiguredVendor,
		DetectionMethod:  string(result.Info.DetectionMethod),
		BuildTags:        buildTags,
		NativeAvailable:  nativeBuild,
	}

	// Validate against build-specific expectations
	switch {
	case nativeBuild:
		// Real NVML build: the count depends on the hardware, but the
		// detection method must come from the config.
		if got.DetectionMethod != "config" {
			t.Errorf("cgo+native_libs: DetectionMethod = %v, want 'config'", got.DetectionMethod)
		}
	case buildTags["cgo"]:
		// CGO without native_libs: the stub is expected to report 0.
		if got.GPUCount != 0 {
			t.Logf("cgo-only build: GPUCount = %d (expected 0 from stub)", got.GPUCount)
		}
		if got.NativeAvailable {
			t.Error("cgo-only build: NativeAvailable should be false")
		}
	default:
		// No CGO: the stub is expected to report 0.
		if got.GPUCount != 0 {
			t.Logf("nocgo build: GPUCount = %d (expected 0 from stub)", got.GPUCount)
		}
		if got.NativeAvailable {
			t.Error("nocgo build: NativeAvailable should be false")
		}
	}

	// Common validations
	if got.ConfiguredVendor != "nvidia" {
		t.Errorf("ConfiguredVendor = %v, want 'nvidia'", got.ConfiguredVendor)
	}
}
// TestGoldenGPUStatusAMDVendorAlias checks that an "amd" config stays visible as
// the configured vendor while the effective GPU type is reported as "nvidia",
// so the aliasing can be seen in the status output.
func TestGoldenGPUStatusAMDVendorAlias(t *testing.T) {
	factory := new(worker.GPUDetectorFactory)
	result := factory.CreateDetectorWithInfo(&worker.Config{GPUVendor: "amd"})
	buildTags := detectBuildTags()

	var got GoldenGPUStatus
	got.GPUCount = result.Detector.DetectGPUCount()
	got.GPUType = string(result.Info.GPUType)
	got.ConfiguredVendor = result.Info.ConfiguredVendor
	got.DetectionMethod = string(result.Info.DetectionMethod)
	got.BuildTags = buildTags
	got.NativeAvailable = buildTags["native_libs"] && buildTags["cgo"]

	// What the user configured must be reported unchanged...
	if vendor := got.ConfiguredVendor; vendor != "amd" {
		t.Errorf("AMD config: ConfiguredVendor = %v, want 'amd'", vendor)
	}
	// ...while the effective type is the NVIDIA one.
	if gpuType := got.GPUType; gpuType != "nvidia" {
		t.Errorf("AMD config: GPUType = %v, want 'nvidia' (AMD aliased to NVIDIA implementation)", gpuType)
	}
}
// TestGoldenGPUStatusEnvOverride validates env override behavior across build configs
// Build tags: all three
// Runtime scenarios: env override set
func TestGoldenGPUStatusEnvOverride(t *testing.T) {
// Set env override
os.Setenv("FETCH_ML_GPU_TYPE", "nvidia")
os.Setenv("FETCH_ML_GPU_COUNT", "4")
defer os.Unsetenv("FETCH_ML_GPU_TYPE")
defer os.Unsetenv("FETCH_ML_GPU_COUNT")
factory := &worker.GPUDetectorFactory{}
result := factory.CreateDetectorWithInfo(&worker.Config{GPUVendor: "apple"})
buildTags := detectBuildTags()
got := GoldenGPUStatus{
GPUCount: result.Detector.DetectGPUCount(),
GPUType: string(result.Info.GPUType),
ConfiguredVendor: result.Info.ConfiguredVendor,
DetectionMethod: string(result.Info.DetectionMethod),
EnvOverrideType: result.Info.EnvOverrideType,
EnvOverrideCount: result.Info.EnvOverrideCount,
BuildTags: buildTags,
}
// Env should take precedence over config
if got.DetectionMethod != "env_override_both" {
t.Errorf("Env override: DetectionMethod = %v, want 'env_override_both'", got.DetectionMethod)
}
if got.GPUType != "nvidia" {
t.Errorf("Env override: GPUType = %v, want 'nvidia'", got.GPUType)
}
if got.EnvOverrideType != "nvidia" {
t.Errorf("Env override: EnvOverrideType = %v, want 'nvidia'", got.EnvOverrideType)
}
if got.EnvOverrideCount != 4 {
t.Errorf("Env override: EnvOverrideCount = %v, want 4", got.EnvOverrideCount)
}
}
// TestGoldenGPUStatusMacOS checks the Apple GPU path. It is skipped unless
// worker.IsMacOS reports true. With an "apple" config and AppleGPU enabled, the
// configured vendor and GPU type must both be "apple" and detectBuildTags must
// flag "darwin".
func TestGoldenGPUStatusMacOS(t *testing.T) {
	if onDarwin := worker.IsMacOS(); !onDarwin {
		t.Skip("Skipping macOS-specific test on non-Darwin platform")
	}

	factory := new(worker.GPUDetectorFactory)
	result := factory.CreateDetectorWithInfo(&worker.Config{
		GPUVendor: "apple",
		AppleGPU:  worker.AppleGPUConfig{Enabled: true},
	})
	buildTags := detectBuildTags()

	var got GoldenGPUStatus
	got.GPUCount = result.Detector.DetectGPUCount()
	got.GPUType = string(result.Info.GPUType)
	got.ConfiguredVendor = result.Info.ConfiguredVendor
	got.DetectionMethod = string(result.Info.DetectionMethod)
	got.BuildTags = buildTags
	// On this path native availability is tied to the darwin flag.
	got.NativeAvailable = buildTags["darwin"]

	if vendor := got.ConfiguredVendor; vendor != "apple" {
		t.Errorf("macOS: ConfiguredVendor = %v, want 'apple'", vendor)
	}
	if gpuType := got.GPUType; gpuType != "apple" {
		t.Errorf("macOS: GPUType = %v, want 'apple'", gpuType)
	}
	if darwin := got.BuildTags["darwin"]; !darwin {
		t.Error("macOS: darwin build tag should be true")
	}
}
// TestGoldenGPUStatusNone checks the no-GPU configuration: with the vendor set
// to "none" the detector must report zero GPUs and the configured vendor must
// be surfaced as "none".
func TestGoldenGPUStatusNone(t *testing.T) {
	cfg := &worker.Config{
		GPUVendor: "none",
	}
	factory := &worker.GPUDetectorFactory{}
	result := factory.CreateDetectorWithInfo(cfg)

	// Query the detector once so the value in the failure message is exactly
	// the value that was compared.
	if count := result.Detector.DetectGPUCount(); count != 0 {
		t.Errorf("none config: GPUCount = %v, want 0", count)
	}
	if result.Info.ConfiguredVendor != "none" {
		t.Errorf("none config: ConfiguredVendor = %v, want 'none'", result.Info.ConfiguredVendor)
	}
}
// TestGoldenJSONSerialization round-trips a GoldenGPUStatus through JSON. The
// status is built with env overrides (type "nvidia", count "2") and a nil
// config; after marshalling and unmarshalling, ConfiguredVendor and
// DetectionMethod must be unchanged.
func TestGoldenJSONSerialization(t *testing.T) {
	os.Setenv("FETCH_ML_GPU_TYPE", "nvidia")
	os.Setenv("FETCH_ML_GPU_COUNT", "2")
	// Remove the overrides again in reverse order of setting.
	defer func() {
		os.Unsetenv("FETCH_ML_GPU_COUNT")
		os.Unsetenv("FETCH_ML_GPU_TYPE")
	}()

	result := new(worker.GPUDetectorFactory).CreateDetectorWithInfo(nil)

	var status GoldenGPUStatus
	status.GPUCount = result.Detector.DetectGPUCount()
	status.GPUType = string(result.Info.GPUType)
	status.ConfiguredVendor = result.Info.ConfiguredVendor
	status.DetectionMethod = string(result.Info.DetectionMethod)
	status.EnvOverrideType = result.Info.EnvOverrideType
	status.EnvOverrideCount = result.Info.EnvOverrideCount
	status.BuildTags = detectBuildTags()

	// Encode as indented JSON.
	encoded, err := json.MarshalIndent(status, "", " ")
	if err != nil {
		t.Fatalf("JSON serialization failed: %v", err)
	}

	// Decode into a fresh value and compare the fields of interest.
	var parsed GoldenGPUStatus
	err = json.Unmarshal(encoded, &parsed)
	if err != nil {
		t.Fatalf("JSON deserialization failed: %v", err)
	}
	if status.ConfiguredVendor != parsed.ConfiguredVendor {
		t.Errorf("JSON roundtrip: ConfiguredVendor mismatch")
	}
	if status.DetectionMethod != parsed.DetectionMethod {
		t.Errorf("JSON roundtrip: DetectionMethod mismatch")
	}
}
// TestBuildTagMatrix logs the build configuration reported by detectBuildTags
// and checks that worker.GetSIMDImplName agrees with it:
//  1. native_libs: the name must be neither "disabled" nor "disabled (no CGO)".
//  2. cgo without native_libs: the name must be exactly "disabled".
//  3. no cgo: the name must be exactly "disabled (no CGO)".
func TestBuildTagMatrix(t *testing.T) {
	tags := detectBuildTags()

	// Logged so the active configuration is visible in CI output.
	t.Logf("Build configuration: cgo=%v native_libs=%v darwin=%v linux=%v",
		tags["cgo"], tags["native_libs"], tags["darwin"], tags["linux"])

	simdName := worker.GetSIMDImplName()
	t.Logf("SIMD implementation: %s", simdName)

	if tags["native_libs"] {
		// Any non-disabled implementation name is accepted here.
		switch simdName {
		case "disabled", "disabled (no CGO)":
			t.Errorf("native_libs build: SIMD impl should be active, got %q", simdName)
		}
		return
	}

	if tags["cgo"] {
		if simdName != "disabled" {
			t.Errorf("cgo-only build: SIMD impl should be 'disabled', got %q", simdName)
		}
		return
	}

	// Neither native_libs nor cgo.
	if simdName != "disabled (no CGO)" {
		t.Errorf("nocgo build: SIMD impl should be 'disabled (no CGO)', got %q", simdName)
	}
}

View file

@ -30,7 +30,7 @@ func TestScanArtifacts_SkipsKnownPathsAndLogs(t *testing.T) {
mustWrite("checkpoints/best.pt", []byte("checkpoint")) mustWrite("checkpoints/best.pt", []byte("checkpoint"))
mustWrite("plots/loss.png", []byte("png")) mustWrite("plots/loss.png", []byte("png"))
art, err := worker.ScanArtifacts(runDir) art, err := worker.ScanArtifacts(runDir, false)
if err != nil { if err != nil {
t.Fatalf("scanArtifacts: %v", err) t.Fatalf("scanArtifacts: %v", err)
} }