package worker import ( "os" "path/filepath" "strings" ) // GPUType represents different GPU types type GPUType string const ( GPUTypeNVIDIA GPUType = "nvidia" GPUTypeAMD GPUType = "amd" GPUTypeApple GPUType = "apple" GPUTypeNone GPUType = "none" ) // DetectionSource indicates how the GPU detector was selected type DetectionSource string const ( DetectionSourceEnvType DetectionSource = "env_override_type" DetectionSourceEnvCount DetectionSource = "env_override_count" DetectionSourceEnvBoth DetectionSource = "env_override_both" DetectionSourceConfig DetectionSource = "config" DetectionSourceAuto DetectionSource = "auto" DetectionSourceNone DetectionSource = "none" ) // GPUDetectionInfo provides metadata about how GPU detection was determined type GPUDetectionInfo struct { GPUType GPUType `json:"gpu_type"` ConfiguredVendor string `json:"configured_vendor"` DetectionMethod DetectionSource `json:"detection_method"` EnvOverrideType string `json:"env_override_type,omitempty"` EnvOverrideCount int `json:"env_override_count,omitempty"` ConfigLayerAutoDetected bool `json:"config_layer_auto_detected,omitempty"` } // GPUDetector interface for detecting GPU availability type GPUDetector interface { DetectGPUCount() int GetGPUType() GPUType GetDevicePaths() []string } // NVIDIA GPUDetector implementation type NVIDIADetector struct{} func (d *NVIDIADetector) DetectGPUCount() int { // First try NVML for accurate detection if IsNVMLAvailable() { count, err := GetGPUCount() if err == nil && count > 0 { return count } } // Fall back to environment variable if n, ok := envInt("FETCH_ML_GPU_COUNT"); ok && n >= 0 { return n } return 0 } func (d *NVIDIADetector) GetGPUType() GPUType { return GPUTypeNVIDIA } func (d *NVIDIADetector) GetDevicePaths() []string { // Prefer standard NVIDIA device nodes when present. patterns := []string{ "/dev/nvidiactl", "/dev/nvidia-modeset", "/dev/nvidia-uvm", "/dev/nvidia-uvm-tools", "/dev/nvidia*", } seen := make(map[string]struct{}) out := make([]string, 0, 8) for _, pat := range patterns { if filepath.Base(pat) == pat { continue } if strings.Contains(pat, "*") { matches, _ := filepath.Glob(pat) for _, m := range matches { if _, ok := seen[m]; ok { continue } if _, err := os.Stat(m); err != nil { continue } seen[m] = struct{}{} out = append(out, m) } continue } if _, ok := seen[pat]; ok { continue } if _, err := os.Stat(pat); err != nil { continue } seen[pat] = struct{}{} out = append(out, pat) } // Fallback for non-NVIDIA setups where only generic DRM device exists. if len(out) == 0 { if _, err := os.Stat("/dev/dri"); err == nil { out = append(out, "/dev/dri") } } return out } // Apple M-series GPUDetector implementation type AppleDetector struct { enabled bool } func (d *AppleDetector) DetectGPUCount() int { // First try actual macOS GPU detection if IsMacOS() { count, err := GetMacOSGPUCount() if err == nil && count > 0 { return count } } if n, ok := envInt("FETCH_ML_GPU_COUNT"); ok && n >= 0 { return n } if d.enabled { return 1 } return 0 } func (d *AppleDetector) GetGPUType() GPUType { return GPUTypeApple } func (d *AppleDetector) GetDevicePaths() []string { return []string{"/dev/metal", "/dev/mps"} } // None GPUDetector implementation type NoneDetector struct{} func (d *NoneDetector) DetectGPUCount() int { return 0 } func (d *NoneDetector) GetGPUType() GPUType { return GPUTypeNone } func (d *NoneDetector) GetDevicePaths() []string { return nil } // GPUDetectorFactory creates appropriate GPU detector based config type GPUDetectorFactory struct{} // DetectionResult contains both the detector and metadata about how it was selected type DetectionResult struct { Detector GPUDetector Info GPUDetectionInfo } func (f *GPUDetectorFactory) CreateDetector(cfg *Config) GPUDetector { result := f.CreateDetectorWithInfo(cfg) return result.Detector } func (f *GPUDetectorFactory) CreateDetectorWithInfo(cfg *Config) DetectionResult { // Check for explicit environment overrides envType := os.Getenv("FETCH_ML_GPU_TYPE") envCount, hasEnvCount := envInt("FETCH_ML_GPU_COUNT") if envType != "" && hasEnvCount { // Both env vars set logEnvOverride("FETCH_ML_GPU_TYPE", envType) logEnvOverride("FETCH_ML_GPU_COUNT", envCount) switch envType { case string(GPUTypeNVIDIA): return DetectionResult{ Detector: &NVIDIADetector{}, Info: GPUDetectionInfo{ GPUType: GPUTypeNVIDIA, ConfiguredVendor: "nvidia", DetectionMethod: DetectionSourceEnvBoth, EnvOverrideType: envType, EnvOverrideCount: envCount, }, } case string(GPUTypeApple): return DetectionResult{ Detector: &AppleDetector{enabled: true}, Info: GPUDetectionInfo{ GPUType: GPUTypeApple, ConfiguredVendor: "apple", DetectionMethod: DetectionSourceEnvBoth, EnvOverrideType: envType, EnvOverrideCount: envCount, }, } case string(GPUTypeNone): return DetectionResult{ Detector: &NoneDetector{}, Info: GPUDetectionInfo{ GPUType: GPUTypeNone, ConfiguredVendor: "none", DetectionMethod: DetectionSourceEnvBoth, EnvOverrideType: envType, EnvOverrideCount: envCount, }, } case "amd": // AMD env override uses NVIDIA detector (aliased) return DetectionResult{ Detector: &NVIDIADetector{}, Info: GPUDetectionInfo{ GPUType: GPUTypeAMD, ConfiguredVendor: "amd", DetectionMethod: DetectionSourceEnvBoth, EnvOverrideType: envType, EnvOverrideCount: envCount, }, } } } if envType != "" { // Only FETCH_ML_GPU_TYPE set logEnvOverride("FETCH_ML_GPU_TYPE", envType) switch envType { case string(GPUTypeNVIDIA): return DetectionResult{ Detector: &NVIDIADetector{}, Info: GPUDetectionInfo{ GPUType: GPUTypeNVIDIA, ConfiguredVendor: "nvidia", DetectionMethod: DetectionSourceEnvType, EnvOverrideType: envType, }, } case string(GPUTypeApple): return DetectionResult{ Detector: &AppleDetector{enabled: true}, Info: GPUDetectionInfo{ GPUType: GPUTypeApple, ConfiguredVendor: "apple", DetectionMethod: DetectionSourceEnvType, EnvOverrideType: envType, }, } case string(GPUTypeNone): return DetectionResult{ Detector: &NoneDetector{}, Info: GPUDetectionInfo{ GPUType: GPUTypeNone, ConfiguredVendor: "none", DetectionMethod: DetectionSourceEnvType, EnvOverrideType: envType, }, } case "amd": // AMD env override uses NVIDIA detector (aliased) return DetectionResult{ Detector: &NVIDIADetector{}, Info: GPUDetectionInfo{ GPUType: GPUTypeAMD, ConfiguredVendor: "amd", DetectionMethod: DetectionSourceEnvType, EnvOverrideType: envType, }, } } } if hasEnvCount { // Only FETCH_ML_GPU_COUNT set - need to detect vendor from config or auto logEnvOverride("FETCH_ML_GPU_COUNT", envCount) return f.detectFromConfigWithSource(cfg, DetectionSourceEnvCount, "", envCount) } // No env overrides - detect from config return f.detectFromConfigWithSource(cfg, DetectionSourceConfig, "", -1) } func (f *GPUDetectorFactory) detectFromConfigWithSource(cfg *Config, source DetectionSource, envType string, envCount int) DetectionResult { if cfg == nil { return DetectionResult{ Detector: &NoneDetector{}, Info: GPUDetectionInfo{ GPUType: GPUTypeNone, ConfiguredVendor: "none", DetectionMethod: source, }, } } switch GPUType(cfg.GPUVendor) { case GPUTypeApple: return DetectionResult{ Detector: &AppleDetector{enabled: cfg.AppleGPU.Enabled}, Info: GPUDetectionInfo{ GPUType: GPUTypeApple, ConfiguredVendor: "apple", DetectionMethod: source, EnvOverrideType: envType, EnvOverrideCount: envCount, ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected, }, } case GPUTypeNone: return DetectionResult{ Detector: &NoneDetector{}, Info: GPUDetectionInfo{ GPUType: GPUTypeNone, ConfiguredVendor: "none", DetectionMethod: source, EnvOverrideType: envType, EnvOverrideCount: envCount, ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected, }, } case GPUTypeNVIDIA: return DetectionResult{ Detector: &NVIDIADetector{}, Info: GPUDetectionInfo{ GPUType: GPUTypeNVIDIA, ConfiguredVendor: "nvidia", DetectionMethod: source, EnvOverrideType: envType, EnvOverrideCount: envCount, ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected, }, } case "amd": // AMD uses similar device exposure patterns in this codebase. // This is the key aliasing point - we report AMD as configured vendor // but use NVIDIADetector for implementation. return DetectionResult{ Detector: &NVIDIADetector{}, Info: GPUDetectionInfo{ GPUType: GPUTypeNVIDIA, ConfiguredVendor: "amd", // User configured "amd" DetectionMethod: source, EnvOverrideType: envType, EnvOverrideCount: envCount, ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected, }, } } // Auto-detect based on config settings if cfg.AppleGPU.Enabled { return DetectionResult{ Detector: &AppleDetector{enabled: true}, Info: GPUDetectionInfo{ GPUType: GPUTypeApple, ConfiguredVendor: "apple", DetectionMethod: DetectionSourceAuto, EnvOverrideType: envType, EnvOverrideCount: envCount, ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected, }, } } if len(cfg.GPUDevices) > 0 { return DetectionResult{ Detector: &NVIDIADetector{}, Info: GPUDetectionInfo{ GPUType: GPUTypeNVIDIA, ConfiguredVendor: "nvidia", DetectionMethod: DetectionSourceAuto, EnvOverrideType: envType, EnvOverrideCount: envCount, ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected, }, } } // Default to no GPU return DetectionResult{ Detector: &NoneDetector{}, Info: GPUDetectionInfo{ GPUType: GPUTypeNone, ConfiguredVendor: "none", DetectionMethod: source, EnvOverrideType: envType, EnvOverrideCount: envCount, ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected, }, } }