Some checks failed
Build CLI with Embedded SQLite / build (arm64, aarch64-linux) (push) Waiting to run
Build CLI with Embedded SQLite / build (x86_64, x86_64-linux) (push) Waiting to run
Build CLI with Embedded SQLite / build-macos (arm64) (push) Waiting to run
Build CLI with Embedded SQLite / build-macos (x86_64) (push) Waiting to run
Security Scan / Security Analysis (push) Waiting to run
Security Scan / Native Library Security (push) Waiting to run
Checkout test / test (push) Successful in 6s
CI/CD Pipeline / Test (push) Failing after 1s
CI/CD Pipeline / Dev Compose Smoke Test (push) Has been skipped
CI/CD Pipeline / Build (push) Has been skipped
CI/CD Pipeline / Test Scripts (push) Has been skipped
CI/CD Pipeline / Test Native Libraries (push) Has been skipped
CI/CD Pipeline / GPU Golden Test Matrix (push) Has been skipped
Documentation / build-and-publish (push) Failing after 39s
CI/CD Pipeline / Docker Build (push) Has been skipped
- Surface GPUDetectionInfo from parseGPUCountFromConfig for detection metadata - Document FETCH_ML_TOTAL_CPU and FETCH_ML_GPU_SLOTS_PER_GPU env vars - Add debug logging for all env var overrides to stderr - Track config-layer auto-detection in GPUDetectionInfo.ConfigLayerAutoDetected - Add --include-all flag to artifact scanner (includeAll parameter) - Add AMD production mode enforcement (error in non-local mode) - Add GPU detector unit tests for env overrides and AMD aliasing
400 lines
10 KiB
Go
400 lines
10 KiB
Go
package worker
|
|
|
|
import (
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
)
|
|
|
|
// GPUType names the GPU vendor family that a detector reports.
type GPUType string

// Recognized GPUType values. GPUTypeNone means no GPU is in use.
const (
	GPUTypeNVIDIA GPUType = "nvidia"
	GPUTypeAMD    GPUType = "amd"
	GPUTypeApple  GPUType = "apple"
	GPUTypeNone   GPUType = "none"
)
|
|
|
|
// DetectionSource describes which input decided the GPU detector selection.
type DetectionSource string

// DetectionSource values.
//
// The EnvType/EnvCount/EnvBoth values correspond to the FETCH_ML_GPU_TYPE
// and FETCH_ML_GPU_COUNT environment overrides being set individually or
// together. Config means the selection came from the worker config, and
// Auto means it was inferred from other config settings when no vendor
// was named.
const (
	DetectionSourceEnvType  DetectionSource = "env_override_type"
	DetectionSourceEnvCount DetectionSource = "env_override_count"
	DetectionSourceEnvBoth  DetectionSource = "env_override_both"
	DetectionSourceConfig   DetectionSource = "config"
	DetectionSourceAuto     DetectionSource = "auto"
	DetectionSourceNone     DetectionSource = "none"
)
|
|
|
|
// GPUDetectionInfo provides metadata about how GPU detection was determined
|
|
type GPUDetectionInfo struct {
|
|
GPUType GPUType `json:"gpu_type"`
|
|
ConfiguredVendor string `json:"configured_vendor"`
|
|
DetectionMethod DetectionSource `json:"detection_method"`
|
|
EnvOverrideType string `json:"env_override_type,omitempty"`
|
|
EnvOverrideCount int `json:"env_override_count,omitempty"`
|
|
ConfigLayerAutoDetected bool `json:"config_layer_auto_detected,omitempty"`
|
|
}
|
|
|
|
// GPUDetector interface for detecting GPU availability
|
|
type GPUDetector interface {
|
|
DetectGPUCount() int
|
|
GetGPUType() GPUType
|
|
GetDevicePaths() []string
|
|
}
|
|
|
|
// NVIDIADetector is the GPUDetector for NVIDIA hardware. It holds no state.
// The factory also hands it out when "amd" is requested.
type NVIDIADetector struct{}
|
|
|
|
func (d *NVIDIADetector) DetectGPUCount() int {
|
|
// First try NVML for accurate detection
|
|
if IsNVMLAvailable() {
|
|
count, err := GetGPUCount()
|
|
if err == nil && count > 0 {
|
|
return count
|
|
}
|
|
}
|
|
|
|
// Fall back to environment variable
|
|
if n, ok := envInt("FETCH_ML_GPU_COUNT"); ok && n >= 0 {
|
|
return n
|
|
}
|
|
return 0
|
|
}
|
|
|
|
func (d *NVIDIADetector) GetGPUType() GPUType {
|
|
return GPUTypeNVIDIA
|
|
}
|
|
|
|
func (d *NVIDIADetector) GetDevicePaths() []string {
|
|
// Prefer standard NVIDIA device nodes when present.
|
|
patterns := []string{
|
|
"/dev/nvidiactl",
|
|
"/dev/nvidia-modeset",
|
|
"/dev/nvidia-uvm",
|
|
"/dev/nvidia-uvm-tools",
|
|
"/dev/nvidia*",
|
|
}
|
|
seen := make(map[string]struct{})
|
|
out := make([]string, 0, 8)
|
|
for _, pat := range patterns {
|
|
if filepath.Base(pat) == pat {
|
|
continue
|
|
}
|
|
if strings.Contains(pat, "*") {
|
|
matches, _ := filepath.Glob(pat)
|
|
for _, m := range matches {
|
|
if _, ok := seen[m]; ok {
|
|
continue
|
|
}
|
|
if _, err := os.Stat(m); err != nil {
|
|
continue
|
|
}
|
|
seen[m] = struct{}{}
|
|
out = append(out, m)
|
|
}
|
|
continue
|
|
}
|
|
if _, ok := seen[pat]; ok {
|
|
continue
|
|
}
|
|
if _, err := os.Stat(pat); err != nil {
|
|
continue
|
|
}
|
|
seen[pat] = struct{}{}
|
|
out = append(out, pat)
|
|
}
|
|
// Fallback for non-NVIDIA setups where only generic DRM device exists.
|
|
if len(out) == 0 {
|
|
if _, err := os.Stat("/dev/dri"); err == nil {
|
|
out = append(out, "/dev/dri")
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
// AppleDetector is the GPUDetector for Apple M-series GPUs.
type AppleDetector struct {
	// enabled makes DetectGPUCount report a single GPU when neither macOS
	// detection nor FETCH_ML_GPU_COUNT yields a count.
	enabled bool
}
|
|
|
|
func (d *AppleDetector) DetectGPUCount() int {
|
|
// First try actual macOS GPU detection
|
|
if IsMacOS() {
|
|
count, err := GetMacOSGPUCount()
|
|
if err == nil && count > 0 {
|
|
return count
|
|
}
|
|
}
|
|
|
|
if n, ok := envInt("FETCH_ML_GPU_COUNT"); ok && n >= 0 {
|
|
return n
|
|
}
|
|
if d.enabled {
|
|
return 1
|
|
}
|
|
return 0
|
|
}
|
|
|
|
func (d *AppleDetector) GetGPUType() GPUType {
|
|
return GPUTypeApple
|
|
}
|
|
|
|
func (d *AppleDetector) GetDevicePaths() []string {
|
|
return []string{"/dev/metal", "/dev/mps"}
|
|
}
|
|
|
|
// NoneDetector is the GPUDetector used when no GPU is configured or
// detected. It holds no state.
type NoneDetector struct{}
|
|
|
|
func (d *NoneDetector) DetectGPUCount() int {
|
|
return 0
|
|
}
|
|
|
|
func (d *NoneDetector) GetGPUType() GPUType {
|
|
return GPUTypeNone
|
|
}
|
|
|
|
func (d *NoneDetector) GetDevicePaths() []string {
|
|
return nil
|
|
}
|
|
|
|
// GPUDetectorFactory creates the appropriate GPUDetector based on
// environment overrides and the worker config. It holds no state.
type GPUDetectorFactory struct{}
|
|
|
|
// DetectionResult contains both the detector and metadata about how it was selected
|
|
type DetectionResult struct {
|
|
Detector GPUDetector
|
|
Info GPUDetectionInfo
|
|
}
|
|
|
|
func (f *GPUDetectorFactory) CreateDetector(cfg *Config) GPUDetector {
|
|
result := f.CreateDetectorWithInfo(cfg)
|
|
return result.Detector
|
|
}
|
|
|
|
func (f *GPUDetectorFactory) CreateDetectorWithInfo(cfg *Config) DetectionResult {
|
|
// Check for explicit environment overrides
|
|
envType := os.Getenv("FETCH_ML_GPU_TYPE")
|
|
envCount, hasEnvCount := envInt("FETCH_ML_GPU_COUNT")
|
|
|
|
if envType != "" && hasEnvCount {
|
|
// Both env vars set
|
|
logEnvOverride("FETCH_ML_GPU_TYPE", envType)
|
|
logEnvOverride("FETCH_ML_GPU_COUNT", envCount)
|
|
switch envType {
|
|
case string(GPUTypeNVIDIA):
|
|
return DetectionResult{
|
|
Detector: &NVIDIADetector{},
|
|
Info: GPUDetectionInfo{
|
|
GPUType: GPUTypeNVIDIA,
|
|
ConfiguredVendor: "nvidia",
|
|
DetectionMethod: DetectionSourceEnvBoth,
|
|
EnvOverrideType: envType,
|
|
EnvOverrideCount: envCount,
|
|
},
|
|
}
|
|
case string(GPUTypeApple):
|
|
return DetectionResult{
|
|
Detector: &AppleDetector{enabled: true},
|
|
Info: GPUDetectionInfo{
|
|
GPUType: GPUTypeApple,
|
|
ConfiguredVendor: "apple",
|
|
DetectionMethod: DetectionSourceEnvBoth,
|
|
EnvOverrideType: envType,
|
|
EnvOverrideCount: envCount,
|
|
},
|
|
}
|
|
case string(GPUTypeNone):
|
|
return DetectionResult{
|
|
Detector: &NoneDetector{},
|
|
Info: GPUDetectionInfo{
|
|
GPUType: GPUTypeNone,
|
|
ConfiguredVendor: "none",
|
|
DetectionMethod: DetectionSourceEnvBoth,
|
|
EnvOverrideType: envType,
|
|
EnvOverrideCount: envCount,
|
|
},
|
|
}
|
|
case "amd":
|
|
// AMD env override uses NVIDIA detector (aliased)
|
|
return DetectionResult{
|
|
Detector: &NVIDIADetector{},
|
|
Info: GPUDetectionInfo{
|
|
GPUType: GPUTypeAMD,
|
|
ConfiguredVendor: "amd",
|
|
DetectionMethod: DetectionSourceEnvBoth,
|
|
EnvOverrideType: envType,
|
|
EnvOverrideCount: envCount,
|
|
},
|
|
}
|
|
}
|
|
}
|
|
|
|
if envType != "" {
|
|
// Only FETCH_ML_GPU_TYPE set
|
|
logEnvOverride("FETCH_ML_GPU_TYPE", envType)
|
|
switch envType {
|
|
case string(GPUTypeNVIDIA):
|
|
return DetectionResult{
|
|
Detector: &NVIDIADetector{},
|
|
Info: GPUDetectionInfo{
|
|
GPUType: GPUTypeNVIDIA,
|
|
ConfiguredVendor: "nvidia",
|
|
DetectionMethod: DetectionSourceEnvType,
|
|
EnvOverrideType: envType,
|
|
},
|
|
}
|
|
case string(GPUTypeApple):
|
|
return DetectionResult{
|
|
Detector: &AppleDetector{enabled: true},
|
|
Info: GPUDetectionInfo{
|
|
GPUType: GPUTypeApple,
|
|
ConfiguredVendor: "apple",
|
|
DetectionMethod: DetectionSourceEnvType,
|
|
EnvOverrideType: envType,
|
|
},
|
|
}
|
|
case string(GPUTypeNone):
|
|
return DetectionResult{
|
|
Detector: &NoneDetector{},
|
|
Info: GPUDetectionInfo{
|
|
GPUType: GPUTypeNone,
|
|
ConfiguredVendor: "none",
|
|
DetectionMethod: DetectionSourceEnvType,
|
|
EnvOverrideType: envType,
|
|
},
|
|
}
|
|
case "amd":
|
|
// AMD env override uses NVIDIA detector (aliased)
|
|
return DetectionResult{
|
|
Detector: &NVIDIADetector{},
|
|
Info: GPUDetectionInfo{
|
|
GPUType: GPUTypeAMD,
|
|
ConfiguredVendor: "amd",
|
|
DetectionMethod: DetectionSourceEnvType,
|
|
EnvOverrideType: envType,
|
|
},
|
|
}
|
|
}
|
|
}
|
|
|
|
if hasEnvCount {
|
|
// Only FETCH_ML_GPU_COUNT set - need to detect vendor from config or auto
|
|
logEnvOverride("FETCH_ML_GPU_COUNT", envCount)
|
|
return f.detectFromConfigWithSource(cfg, DetectionSourceEnvCount, "", envCount)
|
|
}
|
|
|
|
// No env overrides - detect from config
|
|
return f.detectFromConfigWithSource(cfg, DetectionSourceConfig, "", -1)
|
|
}
|
|
|
|
func (f *GPUDetectorFactory) detectFromConfigWithSource(cfg *Config, source DetectionSource, envType string, envCount int) DetectionResult {
|
|
if cfg == nil {
|
|
return DetectionResult{
|
|
Detector: &NoneDetector{},
|
|
Info: GPUDetectionInfo{
|
|
GPUType: GPUTypeNone,
|
|
ConfiguredVendor: "none",
|
|
DetectionMethod: source,
|
|
},
|
|
}
|
|
}
|
|
|
|
switch GPUType(cfg.GPUVendor) {
|
|
case GPUTypeApple:
|
|
return DetectionResult{
|
|
Detector: &AppleDetector{enabled: cfg.AppleGPU.Enabled},
|
|
Info: GPUDetectionInfo{
|
|
GPUType: GPUTypeApple,
|
|
ConfiguredVendor: "apple",
|
|
DetectionMethod: source,
|
|
EnvOverrideType: envType,
|
|
EnvOverrideCount: envCount,
|
|
ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected,
|
|
},
|
|
}
|
|
case GPUTypeNone:
|
|
return DetectionResult{
|
|
Detector: &NoneDetector{},
|
|
Info: GPUDetectionInfo{
|
|
GPUType: GPUTypeNone,
|
|
ConfiguredVendor: "none",
|
|
DetectionMethod: source,
|
|
EnvOverrideType: envType,
|
|
EnvOverrideCount: envCount,
|
|
ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected,
|
|
},
|
|
}
|
|
case GPUTypeNVIDIA:
|
|
return DetectionResult{
|
|
Detector: &NVIDIADetector{},
|
|
Info: GPUDetectionInfo{
|
|
GPUType: GPUTypeNVIDIA,
|
|
ConfiguredVendor: "nvidia",
|
|
DetectionMethod: source,
|
|
EnvOverrideType: envType,
|
|
EnvOverrideCount: envCount,
|
|
ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected,
|
|
},
|
|
}
|
|
case "amd":
|
|
// AMD uses similar device exposure patterns in this codebase.
|
|
// This is the key aliasing point - we report AMD as configured vendor
|
|
// but use NVIDIADetector for implementation.
|
|
return DetectionResult{
|
|
Detector: &NVIDIADetector{},
|
|
Info: GPUDetectionInfo{
|
|
GPUType: GPUTypeNVIDIA,
|
|
ConfiguredVendor: "amd", // User configured "amd"
|
|
DetectionMethod: source,
|
|
EnvOverrideType: envType,
|
|
EnvOverrideCount: envCount,
|
|
ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected,
|
|
},
|
|
}
|
|
}
|
|
|
|
// Auto-detect based on config settings
|
|
if cfg.AppleGPU.Enabled {
|
|
return DetectionResult{
|
|
Detector: &AppleDetector{enabled: true},
|
|
Info: GPUDetectionInfo{
|
|
GPUType: GPUTypeApple,
|
|
ConfiguredVendor: "apple",
|
|
DetectionMethod: DetectionSourceAuto,
|
|
EnvOverrideType: envType,
|
|
EnvOverrideCount: envCount,
|
|
ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected,
|
|
},
|
|
}
|
|
}
|
|
if len(cfg.GPUDevices) > 0 {
|
|
return DetectionResult{
|
|
Detector: &NVIDIADetector{},
|
|
Info: GPUDetectionInfo{
|
|
GPUType: GPUTypeNVIDIA,
|
|
ConfiguredVendor: "nvidia",
|
|
DetectionMethod: DetectionSourceAuto,
|
|
EnvOverrideType: envType,
|
|
EnvOverrideCount: envCount,
|
|
ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected,
|
|
},
|
|
}
|
|
}
|
|
|
|
// Default to no GPU
|
|
return DetectionResult{
|
|
Detector: &NoneDetector{},
|
|
Info: GPUDetectionInfo{
|
|
GPUType: GPUTypeNone,
|
|
ConfiguredVendor: "none",
|
|
DetectionMethod: source,
|
|
EnvOverrideType: envType,
|
|
EnvOverrideCount: envCount,
|
|
ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected,
|
|
},
|
|
}
|
|
}
|