GPU detection refactor:
- Major rewrite of gpu_detector.go with unified detection interface
- Support for NVIDIA (NVML), AMD (ROCm), and Apple Metal
- Runtime GPU capability querying for scheduler matching

macOS improvements:
- gpu_macos.go: native Metal device enumeration and memory queries
- Support for Apple Silicon (M1/M2/M3) unified memory reporting
- Fallback to system profiler for Intel Macs

Testing infrastructure:
- Add gpu_detector_mock.go for testing without hardware
- Update gpu_golden_test.go with platform-specific expectations
- Cross-platform GPU info validation
594 lines
16 KiB
Go
594 lines
16 KiB
Go
package worker
|
|
|
|
import (
|
|
"fmt"
|
|
"log/slog"
|
|
"os"
|
|
"os/exec"
|
|
"path/filepath"
|
|
"runtime"
|
|
"strings"
|
|
|
|
"github.com/jfraeys/fetch_ml/internal/scheduler"
|
|
)
|
|
|
|
// logWarningf renders a printf-style warning and emits it through slog at
// WARN level.
//
// The record's own message is always the fixed string "warning"; the rendered
// text is attached as the "message" attribute. Keeping caller-supplied text
// out of the record message is intended to avoid log injection.
func logWarningf(format string, args ...any) {
	slog.Warn("warning", "message", fmt.Sprintf(format, args...))
}
|
|
|
|
// GPUType is the string identifier of a GPU vendor family.
type GPUType string

// GPU vendor identifiers used by the detectors and the detector factory.
const (
	// GPUTypeNVIDIA identifies NVIDIA GPUs.
	GPUTypeNVIDIA GPUType = "nvidia"
	// GPUTypeAMD identifies AMD GPUs.
	GPUTypeAMD GPUType = "amd"
	// GPUTypeApple identifies Apple GPUs.
	GPUTypeApple GPUType = "apple"
	// GPUTypeNone means no GPU is in use.
	GPUTypeNone GPUType = "none"
)
|
|
|
|
// DetectionSource records which input decided the choice of GPU detector.
type DetectionSource string

// Possible origins of a detector selection.
const (
	// DetectionSourceEnvType: only FETCH_ML_GPU_TYPE was set.
	DetectionSourceEnvType DetectionSource = "env_override_type"
	// DetectionSourceEnvCount: only FETCH_ML_GPU_COUNT was set.
	DetectionSourceEnvCount DetectionSource = "env_override_count"
	// DetectionSourceEnvBoth: both environment overrides were set.
	DetectionSourceEnvBoth DetectionSource = "env_override_both"
	// DetectionSourceConfig: the vendor came from configuration.
	DetectionSourceConfig DetectionSource = "config"
	// DetectionSourceAuto: the configuration vendor was auto-detected.
	DetectionSourceAuto DetectionSource = "auto"
	// DetectionSourceNone: no source applies.
	DetectionSourceNone DetectionSource = "none"
)
|
|
|
|
// GPUDetectionInfo provides metadata about how GPU detection was determined
|
|
type GPUDetectionInfo struct {
|
|
GPUType GPUType `json:"gpu_type"`
|
|
ConfiguredVendor string `json:"configured_vendor"`
|
|
DetectionMethod DetectionSource `json:"detection_method"`
|
|
EnvOverrideType string `json:"env_override_type,omitempty"`
|
|
EnvOverrideCount int `json:"env_override_count,omitempty"`
|
|
ConfigLayerAutoDetected bool `json:"config_layer_auto_detected,omitempty"`
|
|
}
|
|
|
|
// GPUDetector interface for detecting GPU availability
|
|
type GPUDetector interface {
|
|
DetectGPUCount() int
|
|
GetGPUType() GPUType
|
|
GetDevicePaths() []string
|
|
}
|
|
|
|
// NVIDIADetector is the GPUDetector implementation for NVIDIA hardware. It
// carries no state.
type NVIDIADetector struct{}
|
|
|
|
func (d *NVIDIADetector) DetectGPUCount() int {
|
|
// First try NVML for accurate detection
|
|
if IsNVMLAvailable() {
|
|
count, err := GetGPUCount()
|
|
if err == nil && count > 0 {
|
|
return count
|
|
}
|
|
}
|
|
|
|
// Fall back to environment variable
|
|
if n, ok := envInt("FETCH_ML_GPU_COUNT"); ok && n >= 0 {
|
|
return n
|
|
}
|
|
return 0
|
|
}
|
|
|
|
func (d *NVIDIADetector) GetGPUType() GPUType {
|
|
return GPUTypeNVIDIA
|
|
}
|
|
|
|
func (d *NVIDIADetector) GetDevicePaths() []string {
|
|
// Prefer standard NVIDIA device nodes when present.
|
|
patterns := []string{
|
|
"/dev/nvidiactl",
|
|
"/dev/nvidia-modeset",
|
|
"/dev/nvidia-uvm",
|
|
"/dev/nvidia-uvm-tools",
|
|
"/dev/nvidia*",
|
|
}
|
|
seen := make(map[string]struct{})
|
|
out := make([]string, 0, 8)
|
|
for _, pat := range patterns {
|
|
if filepath.Base(pat) == pat {
|
|
continue
|
|
}
|
|
if strings.Contains(pat, "*") {
|
|
matches, _ := filepath.Glob(pat)
|
|
for _, m := range matches {
|
|
if _, ok := seen[m]; ok {
|
|
continue
|
|
}
|
|
if _, err := os.Stat(m); err != nil {
|
|
continue
|
|
}
|
|
seen[m] = struct{}{}
|
|
out = append(out, m)
|
|
}
|
|
continue
|
|
}
|
|
if _, ok := seen[pat]; ok {
|
|
continue
|
|
}
|
|
if _, err := os.Stat(pat); err != nil {
|
|
continue
|
|
}
|
|
seen[pat] = struct{}{}
|
|
out = append(out, pat)
|
|
}
|
|
// Fallback for non-NVIDIA setups where only generic DRM device exists.
|
|
if len(out) == 0 {
|
|
if _, err := os.Stat("/dev/dri"); err == nil {
|
|
out = append(out, "/dev/dri")
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
// AppleDetector is the GPUDetector implementation for Apple GPUs.
type AppleDetector struct {
	// enabled makes DetectGPUCount report one GPU when neither hardware
	// detection nor FETCH_ML_GPU_COUNT yields a count.
	enabled bool
}
|
|
|
|
func (d *AppleDetector) DetectGPUCount() int {
|
|
// First try actual macOS GPU detection
|
|
if IsMacOS() {
|
|
count, err := GetMacOSGPUCount()
|
|
if err == nil && count > 0 {
|
|
return count
|
|
}
|
|
}
|
|
|
|
if n, ok := envInt("FETCH_ML_GPU_COUNT"); ok && n >= 0 {
|
|
return n
|
|
}
|
|
if d.enabled {
|
|
return 1
|
|
}
|
|
return 0
|
|
}
|
|
|
|
func (d *AppleDetector) GetGPUType() GPUType {
|
|
return GPUTypeApple
|
|
}
|
|
|
|
func (d *AppleDetector) GetDevicePaths() []string {
|
|
return []string{"/dev/metal", "/dev/mps"}
|
|
}
|
|
|
|
// NoneDetector is the GPUDetector implementation used when no GPU is in use.
// It carries no state.
type NoneDetector struct{}
|
|
|
|
func (d *NoneDetector) DetectGPUCount() int {
|
|
return 0
|
|
}
|
|
|
|
func (d *NoneDetector) GetGPUType() GPUType {
|
|
return GPUTypeNone
|
|
}
|
|
|
|
func (d *NoneDetector) GetDevicePaths() []string {
|
|
return nil
|
|
}
|
|
|
|
// GPUDetectorFactory creates the appropriate GPU detector based on
// environment overrides and configuration. It carries no state.
type GPUDetectorFactory struct{}
|
|
|
|
// DetectionResult contains both the detector and metadata about how it was selected
|
|
type DetectionResult struct {
|
|
Detector GPUDetector
|
|
Info GPUDetectionInfo
|
|
}
|
|
|
|
func (f *GPUDetectorFactory) CreateDetector(cfg *Config) GPUDetector {
|
|
result := f.CreateDetectorWithInfo(cfg)
|
|
return result.Detector
|
|
}
|
|
|
|
func (f *GPUDetectorFactory) CreateDetectorWithInfo(cfg *Config) DetectionResult {
|
|
// Check for explicit environment overrides
|
|
envType := os.Getenv("FETCH_ML_GPU_TYPE")
|
|
envCount, hasEnvCount := envInt("FETCH_ML_GPU_COUNT")
|
|
|
|
if envType != "" && hasEnvCount {
|
|
// Both env vars set
|
|
logEnvOverride("FETCH_ML_GPU_TYPE", envType)
|
|
logEnvOverride("FETCH_ML_GPU_COUNT", envCount)
|
|
switch envType {
|
|
case string(GPUTypeNVIDIA):
|
|
return DetectionResult{
|
|
Detector: &NVIDIADetector{},
|
|
Info: GPUDetectionInfo{
|
|
GPUType: GPUTypeNVIDIA,
|
|
ConfiguredVendor: "nvidia",
|
|
DetectionMethod: DetectionSourceEnvBoth,
|
|
EnvOverrideType: envType,
|
|
EnvOverrideCount: envCount,
|
|
},
|
|
}
|
|
case string(GPUTypeApple):
|
|
return DetectionResult{
|
|
Detector: &AppleDetector{enabled: true},
|
|
Info: GPUDetectionInfo{
|
|
GPUType: GPUTypeApple,
|
|
ConfiguredVendor: "apple",
|
|
DetectionMethod: DetectionSourceEnvBoth,
|
|
EnvOverrideType: envType,
|
|
EnvOverrideCount: envCount,
|
|
},
|
|
}
|
|
case string(GPUTypeNone):
|
|
return DetectionResult{
|
|
Detector: &NoneDetector{},
|
|
Info: GPUDetectionInfo{
|
|
GPUType: GPUTypeNone,
|
|
ConfiguredVendor: "none",
|
|
DetectionMethod: DetectionSourceEnvBoth,
|
|
EnvOverrideType: envType,
|
|
EnvOverrideCount: envCount,
|
|
},
|
|
}
|
|
case "amd":
|
|
// AMD env override uses NVIDIA detector (aliased)
|
|
return DetectionResult{
|
|
Detector: &NVIDIADetector{},
|
|
Info: GPUDetectionInfo{
|
|
GPUType: GPUTypeAMD,
|
|
ConfiguredVendor: "amd",
|
|
DetectionMethod: DetectionSourceEnvBoth,
|
|
EnvOverrideType: envType,
|
|
EnvOverrideCount: envCount,
|
|
},
|
|
}
|
|
default:
|
|
// Defensive: unknown env type should not silently fall through
|
|
logWarningf("unrecognized FETCH_ML_GPU_TYPE value %q, using no GPU", envType)
|
|
return DetectionResult{
|
|
Detector: &NoneDetector{},
|
|
Info: GPUDetectionInfo{
|
|
GPUType: GPUTypeNone,
|
|
ConfiguredVendor: "none",
|
|
DetectionMethod: DetectionSourceEnvBoth,
|
|
EnvOverrideType: envType,
|
|
EnvOverrideCount: envCount,
|
|
},
|
|
}
|
|
}
|
|
}
|
|
|
|
if envType != "" {
|
|
// Only FETCH_ML_GPU_TYPE set
|
|
logEnvOverride("FETCH_ML_GPU_TYPE", envType)
|
|
switch envType {
|
|
case string(GPUTypeNVIDIA):
|
|
return DetectionResult{
|
|
Detector: &NVIDIADetector{},
|
|
Info: GPUDetectionInfo{
|
|
GPUType: GPUTypeNVIDIA,
|
|
ConfiguredVendor: "nvidia",
|
|
DetectionMethod: DetectionSourceEnvType,
|
|
EnvOverrideType: envType,
|
|
},
|
|
}
|
|
case string(GPUTypeApple):
|
|
return DetectionResult{
|
|
Detector: &AppleDetector{enabled: true},
|
|
Info: GPUDetectionInfo{
|
|
GPUType: GPUTypeApple,
|
|
ConfiguredVendor: "apple",
|
|
DetectionMethod: DetectionSourceEnvType,
|
|
EnvOverrideType: envType,
|
|
},
|
|
}
|
|
case string(GPUTypeNone):
|
|
return DetectionResult{
|
|
Detector: &NoneDetector{},
|
|
Info: GPUDetectionInfo{
|
|
GPUType: GPUTypeNone,
|
|
ConfiguredVendor: "none",
|
|
DetectionMethod: DetectionSourceEnvType,
|
|
EnvOverrideType: envType,
|
|
},
|
|
}
|
|
case "amd":
|
|
// AMD env override uses NVIDIA detector (aliased)
|
|
return DetectionResult{
|
|
Detector: &NVIDIADetector{},
|
|
Info: GPUDetectionInfo{
|
|
GPUType: GPUTypeAMD,
|
|
ConfiguredVendor: "amd",
|
|
DetectionMethod: DetectionSourceEnvType,
|
|
EnvOverrideType: envType,
|
|
},
|
|
}
|
|
default:
|
|
// Defensive: unknown env type should not silently fall through
|
|
logWarningf("unrecognized FETCH_ML_GPU_TYPE value %q, using no GPU", envType)
|
|
return DetectionResult{
|
|
Detector: &NoneDetector{},
|
|
Info: GPUDetectionInfo{
|
|
GPUType: GPUTypeNone,
|
|
ConfiguredVendor: "none",
|
|
DetectionMethod: DetectionSourceEnvType,
|
|
EnvOverrideType: envType,
|
|
},
|
|
}
|
|
}
|
|
}
|
|
|
|
if hasEnvCount {
|
|
// Only FETCH_ML_GPU_COUNT set - need to detect vendor from config or auto
|
|
logEnvOverride("FETCH_ML_GPU_COUNT", envCount)
|
|
return f.detectFromConfigWithSource(cfg, DetectionSourceEnvCount, "", envCount)
|
|
}
|
|
|
|
// No env overrides - detect from config
|
|
return f.detectFromConfigWithSource(cfg, DetectionSourceConfig, "", -1)
|
|
}
|
|
|
|
func (f *GPUDetectorFactory) detectFromConfigWithSource(cfg *Config, source DetectionSource, envType string, envCount int) DetectionResult {
|
|
if cfg == nil {
|
|
return DetectionResult{
|
|
Detector: &NoneDetector{},
|
|
Info: GPUDetectionInfo{
|
|
GPUType: GPUTypeNone,
|
|
ConfiguredVendor: "none",
|
|
DetectionMethod: source,
|
|
},
|
|
}
|
|
}
|
|
|
|
// Check for auto-detection scenarios (GPUDevices provided or AppleGPU enabled without explicit vendor)
|
|
isAutoDetect := cfg.GPUVendorAutoDetected ||
|
|
(len(cfg.GPUDevices) > 0 && cfg.GPUVendor == "") ||
|
|
(cfg.AppleGPU.Enabled && cfg.GPUVendor == "")
|
|
if isAutoDetect && source == DetectionSourceConfig {
|
|
source = DetectionSourceAuto
|
|
}
|
|
|
|
switch GPUType(cfg.GPUVendor) {
|
|
case GPUTypeApple:
|
|
return DetectionResult{
|
|
Detector: &AppleDetector{enabled: cfg.AppleGPU.Enabled},
|
|
Info: GPUDetectionInfo{
|
|
GPUType: GPUTypeApple,
|
|
ConfiguredVendor: "apple",
|
|
DetectionMethod: source,
|
|
EnvOverrideType: envType,
|
|
EnvOverrideCount: envCount,
|
|
ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected,
|
|
},
|
|
}
|
|
case GPUTypeNone:
|
|
return DetectionResult{
|
|
Detector: &NoneDetector{},
|
|
Info: GPUDetectionInfo{
|
|
GPUType: GPUTypeNone,
|
|
ConfiguredVendor: "none",
|
|
DetectionMethod: source,
|
|
EnvOverrideType: envType,
|
|
EnvOverrideCount: envCount,
|
|
ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected,
|
|
},
|
|
}
|
|
case GPUTypeNVIDIA:
|
|
return DetectionResult{
|
|
Detector: &NVIDIADetector{},
|
|
Info: GPUDetectionInfo{
|
|
GPUType: GPUTypeNVIDIA,
|
|
ConfiguredVendor: "nvidia",
|
|
DetectionMethod: source,
|
|
EnvOverrideType: envType,
|
|
EnvOverrideCount: envCount,
|
|
ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected,
|
|
},
|
|
}
|
|
case "amd":
|
|
// AMD uses similar device exposure patterns in this codebase.
|
|
// This is the key aliasing point - we report AMD as configured vendor
|
|
// but use NVIDIADetector for implementation.
|
|
return DetectionResult{
|
|
Detector: &NVIDIADetector{},
|
|
Info: GPUDetectionInfo{
|
|
GPUType: GPUTypeNVIDIA,
|
|
ConfiguredVendor: "amd", // User configured "amd"
|
|
DetectionMethod: source,
|
|
EnvOverrideType: envType,
|
|
EnvOverrideCount: envCount,
|
|
ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected,
|
|
},
|
|
}
|
|
default:
|
|
// SECURITY: Explicit default prevents silent misconfiguration
|
|
// Unknown GPU vendor is treated as no GPU - fail secure
|
|
// Note: Config.Validate() should catch invalid vendors before this point
|
|
logWarningf("unrecognized GPU vendor %q, using no GPU", cfg.GPUVendor)
|
|
return DetectionResult{
|
|
Detector: &NoneDetector{},
|
|
Info: GPUDetectionInfo{
|
|
GPUType: GPUTypeNone,
|
|
ConfiguredVendor: "none",
|
|
DetectionMethod: source,
|
|
EnvOverrideType: envType,
|
|
EnvOverrideCount: envCount,
|
|
ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected,
|
|
},
|
|
}
|
|
}
|
|
}
|
|
|
|
// DetectCapabilities returns full WorkerCapabilities with backend detection
|
|
// It tries NVIDIA first, then Metal (Apple Silicon), then Vulkan, then CPU fallback
|
|
func DetectCapabilities() scheduler.WorkerCapabilities {
|
|
// Try NVIDIA first
|
|
if IsNVMLAvailable() {
|
|
count, err := GetGPUCount()
|
|
if err == nil && count > 0 {
|
|
gpus, err := GetAllGPUInfo()
|
|
if err == nil && len(gpus) > 0 {
|
|
totalVRAM := float64(0)
|
|
for _, gpu := range gpus {
|
|
totalVRAM += float64(gpu.MemoryTotal) / (1024 * 1024 * 1024) // Convert to GB
|
|
}
|
|
return scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendNVIDIA,
|
|
GPUCount: count,
|
|
GPUType: gpus[0].Name,
|
|
VRAMGB: totalVRAM,
|
|
CPUCount: runtime.NumCPU(),
|
|
MemoryGB: getSystemMemoryGB(),
|
|
Hostname: getHostname(),
|
|
GPUInfo: scheduler.GPUDetectionInfo{
|
|
GPUType: "nvidia",
|
|
Count: count,
|
|
Devices: getNVIDIADevices(),
|
|
MemTotal: gpus[0].MemoryTotal,
|
|
},
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Try Metal (macOS Apple Silicon)
|
|
if runtime.GOOS == "darwin" && IsAppleSilicon() {
|
|
gpus, err := GetMacOSGPUInfo()
|
|
if err == nil && len(gpus) > 0 {
|
|
totalVRAM := float64(0)
|
|
for _, gpu := range gpus {
|
|
totalVRAM += float64(gpu.VRAM_MB) / 1024 // Convert MB to GB
|
|
}
|
|
return scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendMetal,
|
|
GPUCount: len(gpus),
|
|
GPUType: gpus[0].ChipsetModel,
|
|
VRAMGB: totalVRAM,
|
|
CPUCount: runtime.NumCPU(),
|
|
MemoryGB: getSystemMemoryGB(),
|
|
Hostname: getHostname(),
|
|
GPUInfo: scheduler.GPUDetectionInfo{
|
|
GPUType: "apple",
|
|
Count: len(gpus),
|
|
MemTotal: uint64(totalVRAM * 1024 * 1024 * 1024),
|
|
},
|
|
}
|
|
}
|
|
}
|
|
|
|
// Try Vulkan (check for vulkaninfo or /dev/dri)
|
|
if hasVulkan() {
|
|
count := getVulkanGPUCount()
|
|
if count > 0 {
|
|
return scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendVulkan,
|
|
GPUCount: count,
|
|
GPUType: "vulkan",
|
|
VRAMGB: 0, // TODO: Query Vulkan for VRAM
|
|
CPUCount: runtime.NumCPU(),
|
|
MemoryGB: getSystemMemoryGB(),
|
|
Hostname: getHostname(),
|
|
GPUInfo: scheduler.GPUDetectionInfo{
|
|
GPUType: "vulkan",
|
|
Count: count,
|
|
Devices: getVulkanDevices(),
|
|
},
|
|
}
|
|
}
|
|
}
|
|
|
|
// CPU fallback
|
|
return scheduler.WorkerCapabilities{
|
|
GPUBackend: scheduler.BackendCPU,
|
|
GPUCount: 0,
|
|
GPUType: "cpu",
|
|
VRAMGB: 0,
|
|
CPUCount: runtime.NumCPU(),
|
|
MemoryGB: getSystemMemoryGB(),
|
|
Hostname: getHostname(),
|
|
GPUInfo: scheduler.GPUDetectionInfo{
|
|
GPUType: "cpu",
|
|
Count: 0,
|
|
},
|
|
}
|
|
}
|
|
|
|
// hasVulkan reports whether Vulkan looks available: either a vulkaninfo
// binary is on PATH or /dev/dri exists.
func hasVulkan() bool {
	if _, err := exec.LookPath("vulkaninfo"); err == nil {
		return true
	}
	_, err := os.Stat("/dev/dri")
	return err == nil
}
|
|
|
|
// getVulkanGPUCount returns the number of Vulkan devices listed by
// `vulkaninfo --summary`, never less than 1.
//
// Each device in the summary has exactly one "deviceName = ..." line, so
// those lines are counted. The line is not also required to contain "GPU":
// it carries the product name (for example "NVIDIA GeForce RTX 3090"), which
// usually lacks that word, and requiring it made multi-GPU hosts report 1.
//
// If vulkaninfo cannot be run, or lists no devices, 1 is assumed because
// callers only reach this after hasVulkan has succeeded (e.g. /dev/dri
// exists).
func getVulkanGPUCount() int {
	out, err := exec.Command("vulkaninfo", "--summary").Output()
	if err != nil {
		return 1 // Assume 1 if vulkaninfo fails but /dev/dri exists
	}

	count := 0
	for _, line := range strings.Split(string(out), "\n") {
		if strings.Contains(line, "deviceName") {
			count++
		}
	}
	if count == 0 {
		return 1 // Assume at least 1
	}
	return count
}
|
|
|
|
// getVulkanDevices returns ["/dev/dri"] when that path exists and nil
// otherwise.
func getVulkanDevices() []string {
	if _, err := os.Stat("/dev/dri"); err != nil {
		return nil
	}
	return []string{"/dev/dri"}
}
|
|
|
|
// getNVIDIADevices returns every path matching /dev/nvidia* on which os.Stat
// succeeds, in glob order. The result is nil when there are none.
func getNVIDIADevices() []string {
	// The glob error is ignored on purpose: the pattern is a constant and a
	// failed match simply yields no devices.
	matches, _ := filepath.Glob("/dev/nvidia*")

	var devices []string
	for _, path := range matches {
		if _, err := os.Stat(path); err != nil {
			continue
		}
		devices = append(devices, path)
	}
	return devices
}
|
|
|
|
// getSystemMemoryGB returns total system memory in GB, taken from the
// MemTotal line of /proc/meminfo. It returns 0 (meaning "unknown") when the
// file cannot be read, has no MemTotal line, or the line fails to parse.
func getSystemMemoryGB() float64 {
	data, err := os.ReadFile("/proc/meminfo")
	if err != nil {
		return 0
	}

	for _, line := range strings.Split(string(data), "\n") {
		if !strings.HasPrefix(line, "MemTotal:") {
			continue
		}
		// A parse failure leaves kb at zero, which reads as "unknown".
		var kb uint64
		_, _ = fmt.Sscanf(line, "MemTotal: %d kB", &kb)
		return float64(kb) / (1024 * 1024) // Convert KB to GB
	}
	return 0
}
|
|
|
|
// getHostname returns the name reported by os.Hostname. The lookup is
// best-effort: the error is dropped and whatever name came back is returned.
func getHostname() string {
	name, _ := os.Hostname()
	return name
}
|