fetch_ml/internal/worker/gpu_detector.go
Jeremie Fraeys 3b194ff2e8
Some checks failed
Build CLI with Embedded SQLite / build (arm64, aarch64-linux) (push) Waiting to run
Build CLI with Embedded SQLite / build (x86_64, x86_64-linux) (push) Waiting to run
Build CLI with Embedded SQLite / build-macos (arm64) (push) Waiting to run
Build CLI with Embedded SQLite / build-macos (x86_64) (push) Waiting to run
Security Scan / Security Analysis (push) Waiting to run
Security Scan / Native Library Security (push) Waiting to run
Checkout test / test (push) Successful in 6s
CI/CD Pipeline / Test (push) Failing after 1s
CI/CD Pipeline / Dev Compose Smoke Test (push) Has been skipped
CI/CD Pipeline / Build (push) Has been skipped
CI/CD Pipeline / Test Scripts (push) Has been skipped
CI/CD Pipeline / Test Native Libraries (push) Has been skipped
CI/CD Pipeline / GPU Golden Test Matrix (push) Has been skipped
Documentation / build-and-publish (push) Failing after 39s
CI/CD Pipeline / Docker Build (push) Has been skipped
feat: GPU detection transparency and artifact scanner improvements
- Surface GPUDetectionInfo from parseGPUCountFromConfig for detection metadata
- Document FETCH_ML_TOTAL_CPU and FETCH_ML_GPU_SLOTS_PER_GPU env vars
- Add debug logging for all env var overrides to stderr
- Track config-layer auto-detection in GPUDetectionInfo.ConfigLayerAutoDetected
- Add --include-all flag to artifact scanner (includeAll parameter)
- Add AMD production mode enforcement (error in non-local mode)
- Add GPU detector unit tests for env overrides and AMD aliasing
2026-02-23 12:29:34 -05:00

400 lines
10 KiB
Go

package worker
import (
"os"
"path/filepath"
"strings"
)
// GPUType names a GPU vendor. Its string values are the ones compared against
// the FETCH_ML_GPU_TYPE environment variable and Config.GPUVendor in this file.
type GPUType string
const (
// GPUTypeNVIDIA selects NVIDIADetector.
GPUTypeNVIDIA GPUType = "nvidia"
// GPUTypeAMD has no detector of its own in this file; AMD selections are
// served by NVIDIADetector.
GPUTypeAMD GPUType = "amd"
// GPUTypeApple selects AppleDetector.
GPUTypeApple GPUType = "apple"
// GPUTypeNone selects NoneDetector.
GPUTypeNone GPUType = "none"
)
// DetectionSource indicates how the GPU detector was selected.
type DetectionSource string
const (
// DetectionSourceEnvType: a recognized FETCH_ML_GPU_TYPE chose the detector
// and FETCH_ML_GPU_COUNT was not set.
DetectionSourceEnvType DetectionSource = "env_override_type"
// DetectionSourceEnvCount: only FETCH_ML_GPU_COUNT was usable; the vendor
// came from the config.
DetectionSourceEnvCount DetectionSource = "env_override_count"
// DetectionSourceEnvBoth: a recognized FETCH_ML_GPU_TYPE chose the detector
// and FETCH_ML_GPU_COUNT was also set.
DetectionSourceEnvBoth DetectionSource = "env_override_both"
// DetectionSourceConfig: no environment override applied; the config decided.
DetectionSourceConfig DetectionSource = "config"
// DetectionSourceAuto: Config.GPUVendor matched no known vendor and the
// detector was inferred from AppleGPU.Enabled or GPUDevices.
DetectionSourceAuto DetectionSource = "auto"
// DetectionSourceNone is not assigned anywhere in the visible part of this file.
DetectionSourceNone DetectionSource = "none"
)
// GPUDetectionInfo provides metadata about how GPU detection was determined.
type GPUDetectionInfo struct {
// GPUType is the vendor type recorded for the selection.
GPUType GPUType `json:"gpu_type"`
// ConfiguredVendor is the vendor string that was requested
// ("nvidia", "amd", "apple" or "none" in this file).
ConfiguredVendor string `json:"configured_vendor"`
// DetectionMethod records which input decided the selection.
DetectionMethod DetectionSource `json:"detection_method"`
// EnvOverrideType is the FETCH_ML_GPU_TYPE value when it selected the detector.
EnvOverrideType string `json:"env_override_type,omitempty"`
// EnvOverrideCount is the FETCH_ML_GPU_COUNT value when it was set. The
// no-override path in this file passes -1, which omitempty does not drop
// (only 0 is omitted from the JSON).
EnvOverrideCount int `json:"env_override_count,omitempty"`
// ConfigLayerAutoDetected is copied from Config.GPUVendorAutoDetected.
ConfigLayerAutoDetected bool `json:"config_layer_auto_detected,omitempty"`
}
// GPUDetector is the interface for detecting GPU availability. It is
// implemented in this file by NVIDIADetector, AppleDetector and NoneDetector.
type GPUDetector interface {
// DetectGPUCount returns the number of GPUs the detector reports; 0 means none.
DetectGPUCount() int
// GetGPUType returns the vendor type the detector stands for.
GetGPUType() GPUType
// GetDevicePaths returns the device paths associated with the detector.
GetDevicePaths() []string
}
// NVIDIADetector is the GPUDetector for NVIDIA devices. The factory in this
// file also hands it out for AMD selections.
type NVIDIADetector struct{}

// DetectGPUCount reports how many GPUs are available. A positive count from
// NVML wins; otherwise a non-negative FETCH_ML_GPU_COUNT is used; otherwise
// the result is 0.
func (d *NVIDIADetector) DetectGPUCount() int {
	if IsNVMLAvailable() {
		if n, err := GetGPUCount(); err == nil && n > 0 {
			return n
		}
	}

	fromEnv, ok := envInt("FETCH_ML_GPU_COUNT")
	if !ok || fromEnv < 0 {
		return 0
	}
	return fromEnv
}

// GetGPUType always returns GPUTypeNVIDIA.
func (d *NVIDIADetector) GetGPUType() GPUType {
	return GPUTypeNVIDIA
}

// GetDevicePaths lists the NVIDIA device nodes that exist on this host: the
// well-known control nodes first, then anything else matching /dev/nvidia*,
// each path at most once. When none exist it falls back to /dev/dri if that
// is present. The returned slice is never nil.
func (d *NVIDIADetector) GetDevicePaths() []string {
	candidates := []string{
		"/dev/nvidiactl",
		"/dev/nvidia-modeset",
		"/dev/nvidia-uvm",
		"/dev/nvidia-uvm-tools",
		"/dev/nvidia*",
	}

	paths := make([]string, 0, 8)
	recorded := make(map[string]struct{})

	// record appends p once, and only if it can be stat'ed.
	record := func(p string) {
		if _, dup := recorded[p]; dup {
			return
		}
		if _, err := os.Stat(p); err != nil {
			return
		}
		recorded[p] = struct{}{}
		paths = append(paths, p)
	}

	for _, candidate := range candidates {
		if !strings.Contains(candidate, "*") {
			record(candidate)
			continue
		}
		// Glob only fails on a malformed pattern and the pattern is a
		// constant, so the error is deliberately ignored.
		expanded, _ := filepath.Glob(candidate)
		for _, match := range expanded {
			record(match)
		}
	}

	// Fallback for non-NVIDIA setups where only the generic DRM device exists.
	if len(paths) == 0 {
		if _, err := os.Stat("/dev/dri"); err == nil {
			paths = append(paths, "/dev/dri")
		}
	}
	return paths
}
// AppleDetector is the GPUDetector for Apple M-series GPUs.
type AppleDetector struct {
	// enabled makes DetectGPUCount report one GPU when neither the macOS
	// probe nor FETCH_ML_GPU_COUNT yields a usable count.
	enabled bool
}

// DetectGPUCount reports how many GPUs are available. On macOS a positive
// probed count wins; next a non-negative FETCH_ML_GPU_COUNT; then 1 if the
// detector is enabled; otherwise 0.
func (d *AppleDetector) DetectGPUCount() int {
	if IsMacOS() {
		if probed, err := GetMacOSGPUCount(); err == nil && probed > 0 {
			return probed
		}
	}

	fromEnv, ok := envInt("FETCH_ML_GPU_COUNT")
	switch {
	case ok && fromEnv >= 0:
		return fromEnv
	case d.enabled:
		return 1
	}
	return 0
}

// GetGPUType always returns GPUTypeApple.
func (d *AppleDetector) GetGPUType() GPUType {
	return GPUTypeApple
}

// GetDevicePaths returns the fixed pair of paths this detector reports; the
// filesystem is not consulted.
func (d *AppleDetector) GetDevicePaths() []string {
	paths := make([]string, 0, 2)
	paths = append(paths, "/dev/metal", "/dev/mps")
	return paths
}
// NoneDetector is the GPUDetector used when no GPU is available or wanted.
type NoneDetector struct{}

// DetectGPUCount always reports zero GPUs.
func (*NoneDetector) DetectGPUCount() int { return 0 }

// GetGPUType always returns GPUTypeNone.
func (*NoneDetector) GetGPUType() GPUType { return GPUTypeNone }

// GetDevicePaths always returns a nil slice.
func (*NoneDetector) GetDevicePaths() []string { return nil }
// GPUDetectorFactory creates the appropriate GPU detector based on the
// FETCH_ML_GPU_TYPE / FETCH_ML_GPU_COUNT environment overrides and the config.
// It holds no state.
type GPUDetectorFactory struct{}
// DetectionResult contains both the detector and metadata about how it was selected.
type DetectionResult struct {
// Detector is the selected GPUDetector implementation.
Detector GPUDetector
// Info describes which input (environment, config, auto) led to Detector.
Info GPUDetectionInfo
}
// CreateDetector selects a detector exactly as CreateDetectorWithInfo does
// and returns only the detector, discarding the selection metadata.
func (f *GPUDetectorFactory) CreateDetector(cfg *Config) GPUDetector {
	return f.CreateDetectorWithInfo(cfg).Detector
}
// CreateDetectorWithInfo selects a GPU detector and reports how it was chosen.
//
// Precedence:
//  1. A recognized FETCH_ML_GPU_TYPE ("nvidia", "amd", "apple", "none") picks
//     the detector directly. The method is DetectionSourceEnvBoth when
//     FETCH_ML_GPU_COUNT is also set, DetectionSourceEnvType otherwise.
//  2. Otherwise, if FETCH_ML_GPU_COUNT is set, the vendor comes from cfg and
//     the method is DetectionSourceEnvCount.
//  3. Otherwise cfg alone decides (DetectionSourceConfig, count sentinel -1).
//
// An unrecognized FETCH_ML_GPU_TYPE is logged and then ignored. Each override
// that is set is logged exactly once, whichever path is taken.
func (f *GPUDetectorFactory) CreateDetectorWithInfo(cfg *Config) DetectionResult {
	envType := os.Getenv("FETCH_ML_GPU_TYPE")
	envCount, hasEnvCount := envInt("FETCH_ML_GPU_COUNT")

	// Log up front so an unrecognized type, which falls through to the
	// config path below, cannot cause the same override to be logged twice.
	if envType != "" {
		logEnvOverride("FETCH_ML_GPU_TYPE", envType)
	}
	if hasEnvCount {
		logEnvOverride("FETCH_ML_GPU_COUNT", envCount)
	}

	if detector, gpuType, ok := detectorForEnvType(envType); ok {
		info := GPUDetectionInfo{
			GPUType:          gpuType,
			ConfiguredVendor: string(gpuType),
			DetectionMethod:  DetectionSourceEnvType,
			EnvOverrideType:  envType,
		}
		if hasEnvCount {
			info.DetectionMethod = DetectionSourceEnvBoth
			info.EnvOverrideCount = envCount
		}
		return DetectionResult{Detector: detector, Info: info}
	}

	if hasEnvCount {
		// Only the count is usable: the vendor comes from the config.
		return f.detectFromConfigWithSource(cfg, DetectionSourceEnvCount, "", envCount)
	}
	// No usable env overrides; -1 marks "no count override".
	return f.detectFromConfigWithSource(cfg, DetectionSourceConfig, "", -1)
}

// detectorForEnvType maps a FETCH_ML_GPU_TYPE value to the detector and GPU
// type recorded for it. ok is false for the empty string and for any value
// that is not an exact match of a known GPUType.
//
// NOTE(review): "amd" is served by NVIDIADetector but recorded as GPUTypeAMD
// here, whereas detectFromConfigWithSource records GPUTypeNVIDIA for a config
// vendor of "amd" — confirm which of the two is intended.
func detectorForEnvType(envType string) (detector GPUDetector, gpuType GPUType, ok bool) {
	switch GPUType(envType) {
	case GPUTypeNVIDIA:
		return &NVIDIADetector{}, GPUTypeNVIDIA, true
	case GPUTypeAMD:
		return &NVIDIADetector{}, GPUTypeAMD, true
	case GPUTypeApple:
		return &AppleDetector{enabled: true}, GPUTypeApple, true
	case GPUTypeNone:
		return &NoneDetector{}, GPUTypeNone, true
	}
	return nil, "", false
}
// detectFromConfigWithSource picks a detector from cfg and stamps the result
// with the caller-supplied source, envType and envCount.
//
// Resolution order:
//   - nil cfg: NoneDetector.
//   - cfg.GPUVendor of "apple", "none", "nvidia" or "amd": the matching
//     detector, with DetectionMethod set to source.
//   - any other vendor value: inferred from cfg.AppleGPU.Enabled, then from
//     a non-empty cfg.GPUDevices, with DetectionMethod DetectionSourceAuto.
//   - otherwise NoneDetector, with DetectionMethod set to source.
func (f *GPUDetectorFactory) detectFromConfigWithSource(cfg *Config, source DetectionSource, envType string, envCount int) DetectionResult {
	if cfg == nil {
		// NOTE(review): unlike every other branch, this one does not record
		// envType/envCount in the Info — confirm whether that is intended.
		info := GPUDetectionInfo{
			GPUType:          GPUTypeNone,
			ConfiguredVendor: "none",
			DetectionMethod:  source,
		}
		return DetectionResult{Detector: &NoneDetector{}, Info: info}
	}

	// result fills in the fields that are common to every non-nil-cfg outcome.
	result := func(detector GPUDetector, gpuType GPUType, vendor string, method DetectionSource) DetectionResult {
		return DetectionResult{
			Detector: detector,
			Info: GPUDetectionInfo{
				GPUType:                 gpuType,
				ConfiguredVendor:        vendor,
				DetectionMethod:         method,
				EnvOverrideType:         envType,
				EnvOverrideCount:        envCount,
				ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected,
			},
		}
	}

	// Explicit vendor in the config.
	switch GPUType(cfg.GPUVendor) {
	case GPUTypeApple:
		return result(&AppleDetector{enabled: cfg.AppleGPU.Enabled}, GPUTypeApple, "apple", source)
	case GPUTypeNone:
		return result(&NoneDetector{}, GPUTypeNone, "none", source)
	case GPUTypeNVIDIA:
		return result(&NVIDIADetector{}, GPUTypeNVIDIA, "nvidia", source)
	case GPUTypeAMD:
		// AMD is aliased onto NVIDIADetector: the configured vendor stays
		// "amd" while the recorded GPUType is NVIDIA.
		// NOTE(review): CreateDetectorWithInfo records GPUTypeAMD for an
		// "amd" env override — confirm which of the two is intended.
		return result(&NVIDIADetector{}, GPUTypeNVIDIA, "amd", source)
	}

	// No recognized vendor: infer from the remaining config settings.
	switch {
	case cfg.AppleGPU.Enabled:
		return result(&AppleDetector{enabled: true}, GPUTypeApple, "apple", DetectionSourceAuto)
	case len(cfg.GPUDevices) > 0:
		return result(&NVIDIADetector{}, GPUTypeNVIDIA, "nvidia", DetectionSourceAuto)
	}

	// Default to no GPU.
	return result(&NoneDetector{}, GPUTypeNone, "none", source)
}