GPU detection refactor:
- Major rewrite of gpu_detector.go with unified detection interface
- Support for NVIDIA (NVML), AMD (ROCm), and Apple Metal
- Runtime GPU capability querying for scheduler matching

macOS improvements:
- gpu_macos.go: native Metal device enumeration and memory queries
- Support for Apple Silicon (M1/M2/M3) unified memory reporting
- Fallback to system profiler for Intel Macs

Testing infrastructure:
- Add gpu_detector_mock.go for testing without hardware
- Update gpu_golden_test.go with platform-specific expectations
- Cross-platform GPU info validation
168 lines
3.9 KiB
Go
168 lines
3.9 KiB
Go
package worker
|
|
|
|
import (
	"math"
	"os"
	"strconv"

	"github.com/jfraeys/fetch_ml/internal/scheduler"
)
|
|
|
|
// MockGPUDetector provides a mock GPU detection for testing
|
|
// Use environment variables FETCH_ML_MOCK_GPU_TYPE and FETCH_ML_MOCK_GPU_COUNT
|
|
// to configure the mock detector
|
|
|
|
type MockGPUDetector struct {
|
|
gpuType GPUType
|
|
gpuCount int
|
|
vramGB float64
|
|
cpuCount int
|
|
devicePaths []string
|
|
}
|
|
|
|
// NewMockGPUDetector creates a mock GPU detector from environment variables
|
|
func NewMockGPUDetector() *MockGPUDetector {
|
|
gpuType := GPUType(os.Getenv("FETCH_ML_MOCK_GPU_TYPE"))
|
|
if gpuType == "" {
|
|
gpuType = GPUTypeNone
|
|
}
|
|
|
|
gpuCount, _ := strconv.Atoi(os.Getenv("FETCH_ML_MOCK_GPU_COUNT"))
|
|
if gpuCount < 0 {
|
|
gpuCount = 0
|
|
}
|
|
|
|
vramGB, _ := strconv.ParseFloat(os.Getenv("FETCH_ML_MOCK_VRAM_GB"), 64)
|
|
|
|
cpuCount, _ := strconv.Atoi(os.Getenv("FETCH_ML_MOCK_CPU_COUNT"))
|
|
if cpuCount == 0 {
|
|
cpuCount = 8 // Default
|
|
}
|
|
|
|
return &MockGPUDetector{
|
|
gpuType: gpuType,
|
|
gpuCount: gpuCount,
|
|
vramGB: vramGB,
|
|
cpuCount: cpuCount,
|
|
devicePaths: getMockDevicePaths(gpuType, gpuCount),
|
|
}
|
|
}
|
|
|
|
func (d *MockGPUDetector) DetectGPUCount() int {
|
|
return d.gpuCount
|
|
}
|
|
|
|
func (d *MockGPUDetector) GetGPUType() GPUType {
|
|
return d.gpuType
|
|
}
|
|
|
|
func (d *MockGPUDetector) GetDevicePaths() []string {
|
|
return d.devicePaths
|
|
}
|
|
|
|
// DetectCapabilitiesMock returns WorkerCapabilities for testing
|
|
func (d *MockGPUDetector) DetectCapabilities() scheduler.WorkerCapabilities {
|
|
backend := scheduler.BackendCPU
|
|
gpuTypeStr := "cpu"
|
|
|
|
switch d.gpuType {
|
|
case GPUTypeNVIDIA:
|
|
backend = scheduler.BackendNVIDIA
|
|
gpuTypeStr = "nvidia"
|
|
case GPUTypeApple:
|
|
backend = scheduler.BackendMetal
|
|
gpuTypeStr = "apple"
|
|
}
|
|
|
|
return scheduler.WorkerCapabilities{
|
|
GPUBackend: backend,
|
|
GPUCount: d.gpuCount,
|
|
GPUType: gpuTypeStr,
|
|
VRAMGB: d.vramGB,
|
|
CPUCount: d.cpuCount,
|
|
MemoryGB: 32.0, // Default for mock
|
|
Hostname: "mock-worker",
|
|
GPUInfo: scheduler.GPUDetectionInfo{
|
|
GPUType: gpuTypeStr,
|
|
Count: d.gpuCount,
|
|
Devices: d.devicePaths,
|
|
},
|
|
}
|
|
}
|
|
|
|
// getMockDevicePaths returns mock device paths based on GPU type
|
|
func getMockDevicePaths(gpuType GPUType, count int) []string {
|
|
var paths []string
|
|
|
|
switch gpuType {
|
|
case GPUTypeNVIDIA:
|
|
paths = append(paths, "/dev/nvidiactl", "/dev/nvidia-uvm")
|
|
for i := 0; i < count && i < 8; i++ {
|
|
paths = append(paths, "/dev/nvidia"+strconv.Itoa(i))
|
|
}
|
|
case GPUTypeApple:
|
|
paths = append(paths, "/dev/metal", "/dev/mps")
|
|
default:
|
|
paths = []string{}
|
|
}
|
|
|
|
return paths
|
|
}
|
|
|
|
// Scenario names accepted by NewMockGPUDetectorWithScenario.
const (
	// MockScenario2xNVIDIAA100 selects a worker with two NVIDIA GPUs.
	MockScenario2xNVIDIAA100 = "2x-nvidia-a100"
	// MockScenario4xMetal selects a worker with four Apple GPUs.
	MockScenario4xMetal = "4x-metal"
	// MockScenarioCPUOnly selects a worker without any GPU.
	MockScenarioCPUOnly = "cpu-only"
)
|
|
|
|
// NewMockGPUDetectorWithScenario creates a mock detector for a predefined scenario
|
|
func NewMockGPUDetectorWithScenario(scenario string) *MockGPUDetector {
|
|
switch scenario {
|
|
case MockScenario2xNVIDIAA100:
|
|
return &MockGPUDetector{
|
|
gpuType: GPUTypeNVIDIA,
|
|
gpuCount: 2,
|
|
vramGB: 80.0, // A100 80GB
|
|
cpuCount: 64,
|
|
devicePaths: []string{"/dev/nvidia0", "/dev/nvidia1"},
|
|
}
|
|
case MockScenario4xMetal:
|
|
return &MockGPUDetector{
|
|
gpuType: GPUTypeApple,
|
|
gpuCount: 4,
|
|
vramGB: 128.0, // Unified memory
|
|
cpuCount: 24,
|
|
devicePaths: []string{"/dev/metal"},
|
|
}
|
|
case MockScenarioCPUOnly:
|
|
return &MockGPUDetector{
|
|
gpuType: GPUTypeNone,
|
|
gpuCount: 0,
|
|
vramGB: 0,
|
|
cpuCount: 32,
|
|
devicePaths: []string{},
|
|
}
|
|
default:
|
|
return &MockGPUDetector{
|
|
gpuType: GPUTypeNone,
|
|
gpuCount: 0,
|
|
vramGB: 0,
|
|
cpuCount: 8,
|
|
devicePaths: []string{},
|
|
}
|
|
}
|
|
}
|
|
|
|
// IsMockGPUEnabled reports whether mock GPU detection has been requested via
// the environment, i.e. whether FETCH_ML_MOCK_GPU_TYPE or
// FETCH_ML_MOCK_GPU_COUNT is set to a non-empty value.
func IsMockGPUEnabled() bool {
	for _, name := range []string{"FETCH_ML_MOCK_GPU_TYPE", "FETCH_ML_MOCK_GPU_COUNT"} {
		if os.Getenv(name) != "" {
			return true
		}
	}
	return false
}
|
|
|
|
// GetMockDetector returns either a mock detector or real detector based on environment
|
|
func GetMockDetector() *MockGPUDetector {
|
|
if !IsMockGPUEnabled() {
|
|
return nil
|
|
}
|
|
return NewMockGPUDetector()
|
|
}
|