Update worker system for scheduler integration: - Worker server with scheduler registration - Configuration with scheduler endpoint support - Artifact handling with integrity verification - Container executor with supply chain validation - Local executor enhancements - GPU detection improvements (cross-platform) - Error handling with execution context - Factory pattern for executor instantiation - Hash integrity with native library support
415 lines
11 KiB
Go
415 lines
11 KiB
Go
package worker
|
|
|
|
import (
|
|
"fmt"
|
|
"log/slog"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
)
|
|
|
|
// logWarningf renders a printf-style message and emits it at warning level
// through the default slog logger.
func logWarningf(format string, args ...any) {
	msg := fmt.Sprintf(format, args...)
	slog.Warn(msg)
}
|
|
|
|
// GPUType is the string identifier of a GPU vendor family.
type GPUType string

// Known GPU vendor identifiers. GPUTypeNone means "no GPU".
const (
	GPUTypeNVIDIA GPUType = "nvidia"
	GPUTypeAMD    GPUType = "amd"
	GPUTypeApple  GPUType = "apple"
	GPUTypeNone   GPUType = "none"
)
|
|
|
|
// DetectionSource records which input decided the choice of GPU detector.
type DetectionSource string

// Detection sources reported in GPUDetectionInfo.DetectionMethod.
const (
	// DetectionSourceEnvType: only FETCH_ML_GPU_TYPE was set.
	DetectionSourceEnvType DetectionSource = "env_override_type"
	// DetectionSourceEnvCount: only FETCH_ML_GPU_COUNT was set.
	DetectionSourceEnvCount DetectionSource = "env_override_count"
	// DetectionSourceEnvBoth: both environment overrides were set.
	DetectionSourceEnvBoth DetectionSource = "env_override_both"
	// DetectionSourceConfig: the vendor came from the configuration.
	DetectionSourceConfig DetectionSource = "config"
	// DetectionSourceAuto: the configuration was marked as auto-detected.
	DetectionSourceAuto DetectionSource = "auto"
	// DetectionSourceNone: no detection source.
	DetectionSourceNone DetectionSource = "none"
)
|
|
|
|
// GPUDetectionInfo provides metadata about how GPU detection was determined
|
|
type GPUDetectionInfo struct {
|
|
GPUType GPUType `json:"gpu_type"`
|
|
ConfiguredVendor string `json:"configured_vendor"`
|
|
DetectionMethod DetectionSource `json:"detection_method"`
|
|
EnvOverrideType string `json:"env_override_type,omitempty"`
|
|
EnvOverrideCount int `json:"env_override_count,omitempty"`
|
|
ConfigLayerAutoDetected bool `json:"config_layer_auto_detected,omitempty"`
|
|
}
|
|
|
|
// GPUDetector interface for detecting GPU availability
|
|
type GPUDetector interface {
|
|
DetectGPUCount() int
|
|
GetGPUType() GPUType
|
|
GetDevicePaths() []string
|
|
}
|
|
|
|
// NVIDIADetector is the GPUDetector implementation for NVIDIA devices. It
// carries no state.
type NVIDIADetector struct{}
|
|
|
|
func (d *NVIDIADetector) DetectGPUCount() int {
|
|
// First try NVML for accurate detection
|
|
if IsNVMLAvailable() {
|
|
count, err := GetGPUCount()
|
|
if err == nil && count > 0 {
|
|
return count
|
|
}
|
|
}
|
|
|
|
// Fall back to environment variable
|
|
if n, ok := envInt("FETCH_ML_GPU_COUNT"); ok && n >= 0 {
|
|
return n
|
|
}
|
|
return 0
|
|
}
|
|
|
|
func (d *NVIDIADetector) GetGPUType() GPUType {
|
|
return GPUTypeNVIDIA
|
|
}
|
|
|
|
func (d *NVIDIADetector) GetDevicePaths() []string {
|
|
// Prefer standard NVIDIA device nodes when present.
|
|
patterns := []string{
|
|
"/dev/nvidiactl",
|
|
"/dev/nvidia-modeset",
|
|
"/dev/nvidia-uvm",
|
|
"/dev/nvidia-uvm-tools",
|
|
"/dev/nvidia*",
|
|
}
|
|
seen := make(map[string]struct{})
|
|
out := make([]string, 0, 8)
|
|
for _, pat := range patterns {
|
|
if filepath.Base(pat) == pat {
|
|
continue
|
|
}
|
|
if strings.Contains(pat, "*") {
|
|
matches, _ := filepath.Glob(pat)
|
|
for _, m := range matches {
|
|
if _, ok := seen[m]; ok {
|
|
continue
|
|
}
|
|
if _, err := os.Stat(m); err != nil {
|
|
continue
|
|
}
|
|
seen[m] = struct{}{}
|
|
out = append(out, m)
|
|
}
|
|
continue
|
|
}
|
|
if _, ok := seen[pat]; ok {
|
|
continue
|
|
}
|
|
if _, err := os.Stat(pat); err != nil {
|
|
continue
|
|
}
|
|
seen[pat] = struct{}{}
|
|
out = append(out, pat)
|
|
}
|
|
// Fallback for non-NVIDIA setups where only generic DRM device exists.
|
|
if len(out) == 0 {
|
|
if _, err := os.Stat("/dev/dri"); err == nil {
|
|
out = append(out, "/dev/dri")
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
// AppleDetector is the GPUDetector implementation for Apple GPUs.
type AppleDetector struct {
	// enabled makes DetectGPUCount report one GPU when no other source
	// provides a count.
	enabled bool
}
|
|
|
|
func (d *AppleDetector) DetectGPUCount() int {
|
|
// First try actual macOS GPU detection
|
|
if IsMacOS() {
|
|
count, err := GetMacOSGPUCount()
|
|
if err == nil && count > 0 {
|
|
return count
|
|
}
|
|
}
|
|
|
|
if n, ok := envInt("FETCH_ML_GPU_COUNT"); ok && n >= 0 {
|
|
return n
|
|
}
|
|
if d.enabled {
|
|
return 1
|
|
}
|
|
return 0
|
|
}
|
|
|
|
func (d *AppleDetector) GetGPUType() GPUType {
|
|
return GPUTypeApple
|
|
}
|
|
|
|
func (d *AppleDetector) GetDevicePaths() []string {
|
|
return []string{"/dev/metal", "/dev/mps"}
|
|
}
|
|
|
|
// NoneDetector is the GPUDetector used when no GPU is available or wanted.
// It carries no state.
type NoneDetector struct{}
|
|
|
|
func (d *NoneDetector) DetectGPUCount() int {
|
|
return 0
|
|
}
|
|
|
|
func (d *NoneDetector) GetGPUType() GPUType {
|
|
return GPUTypeNone
|
|
}
|
|
|
|
func (d *NoneDetector) GetDevicePaths() []string {
|
|
return nil
|
|
}
|
|
|
|
// GPUDetectorFactory creates the appropriate GPUDetector based on environment
// overrides and configuration. It carries no state.
type GPUDetectorFactory struct{}
|
|
|
|
// DetectionResult contains both the detector and metadata about how it was selected
|
|
type DetectionResult struct {
|
|
Detector GPUDetector
|
|
Info GPUDetectionInfo
|
|
}
|
|
|
|
func (f *GPUDetectorFactory) CreateDetector(cfg *Config) GPUDetector {
|
|
result := f.CreateDetectorWithInfo(cfg)
|
|
return result.Detector
|
|
}
|
|
|
|
func (f *GPUDetectorFactory) CreateDetectorWithInfo(cfg *Config) DetectionResult {
|
|
// Check for explicit environment overrides
|
|
envType := os.Getenv("FETCH_ML_GPU_TYPE")
|
|
envCount, hasEnvCount := envInt("FETCH_ML_GPU_COUNT")
|
|
|
|
if envType != "" && hasEnvCount {
|
|
// Both env vars set
|
|
logEnvOverride("FETCH_ML_GPU_TYPE", envType)
|
|
logEnvOverride("FETCH_ML_GPU_COUNT", envCount)
|
|
switch envType {
|
|
case string(GPUTypeNVIDIA):
|
|
return DetectionResult{
|
|
Detector: &NVIDIADetector{},
|
|
Info: GPUDetectionInfo{
|
|
GPUType: GPUTypeNVIDIA,
|
|
ConfiguredVendor: "nvidia",
|
|
DetectionMethod: DetectionSourceEnvBoth,
|
|
EnvOverrideType: envType,
|
|
EnvOverrideCount: envCount,
|
|
},
|
|
}
|
|
case string(GPUTypeApple):
|
|
return DetectionResult{
|
|
Detector: &AppleDetector{enabled: true},
|
|
Info: GPUDetectionInfo{
|
|
GPUType: GPUTypeApple,
|
|
ConfiguredVendor: "apple",
|
|
DetectionMethod: DetectionSourceEnvBoth,
|
|
EnvOverrideType: envType,
|
|
EnvOverrideCount: envCount,
|
|
},
|
|
}
|
|
case string(GPUTypeNone):
|
|
return DetectionResult{
|
|
Detector: &NoneDetector{},
|
|
Info: GPUDetectionInfo{
|
|
GPUType: GPUTypeNone,
|
|
ConfiguredVendor: "none",
|
|
DetectionMethod: DetectionSourceEnvBoth,
|
|
EnvOverrideType: envType,
|
|
EnvOverrideCount: envCount,
|
|
},
|
|
}
|
|
case "amd":
|
|
// AMD env override uses NVIDIA detector (aliased)
|
|
return DetectionResult{
|
|
Detector: &NVIDIADetector{},
|
|
Info: GPUDetectionInfo{
|
|
GPUType: GPUTypeAMD,
|
|
ConfiguredVendor: "amd",
|
|
DetectionMethod: DetectionSourceEnvBoth,
|
|
EnvOverrideType: envType,
|
|
EnvOverrideCount: envCount,
|
|
},
|
|
}
|
|
default:
|
|
// Defensive: unknown env type should not silently fall through
|
|
logWarningf("unrecognized FETCH_ML_GPU_TYPE value %q, using no GPU", envType)
|
|
return DetectionResult{
|
|
Detector: &NoneDetector{},
|
|
Info: GPUDetectionInfo{
|
|
GPUType: GPUTypeNone,
|
|
ConfiguredVendor: "none",
|
|
DetectionMethod: DetectionSourceEnvBoth,
|
|
EnvOverrideType: envType,
|
|
EnvOverrideCount: envCount,
|
|
},
|
|
}
|
|
}
|
|
}
|
|
|
|
if envType != "" {
|
|
// Only FETCH_ML_GPU_TYPE set
|
|
logEnvOverride("FETCH_ML_GPU_TYPE", envType)
|
|
switch envType {
|
|
case string(GPUTypeNVIDIA):
|
|
return DetectionResult{
|
|
Detector: &NVIDIADetector{},
|
|
Info: GPUDetectionInfo{
|
|
GPUType: GPUTypeNVIDIA,
|
|
ConfiguredVendor: "nvidia",
|
|
DetectionMethod: DetectionSourceEnvType,
|
|
EnvOverrideType: envType,
|
|
},
|
|
}
|
|
case string(GPUTypeApple):
|
|
return DetectionResult{
|
|
Detector: &AppleDetector{enabled: true},
|
|
Info: GPUDetectionInfo{
|
|
GPUType: GPUTypeApple,
|
|
ConfiguredVendor: "apple",
|
|
DetectionMethod: DetectionSourceEnvType,
|
|
EnvOverrideType: envType,
|
|
},
|
|
}
|
|
case string(GPUTypeNone):
|
|
return DetectionResult{
|
|
Detector: &NoneDetector{},
|
|
Info: GPUDetectionInfo{
|
|
GPUType: GPUTypeNone,
|
|
ConfiguredVendor: "none",
|
|
DetectionMethod: DetectionSourceEnvType,
|
|
EnvOverrideType: envType,
|
|
},
|
|
}
|
|
case "amd":
|
|
// AMD env override uses NVIDIA detector (aliased)
|
|
return DetectionResult{
|
|
Detector: &NVIDIADetector{},
|
|
Info: GPUDetectionInfo{
|
|
GPUType: GPUTypeAMD,
|
|
ConfiguredVendor: "amd",
|
|
DetectionMethod: DetectionSourceEnvType,
|
|
EnvOverrideType: envType,
|
|
},
|
|
}
|
|
default:
|
|
// Defensive: unknown env type should not silently fall through
|
|
logWarningf("unrecognized FETCH_ML_GPU_TYPE value %q, using no GPU", envType)
|
|
return DetectionResult{
|
|
Detector: &NoneDetector{},
|
|
Info: GPUDetectionInfo{
|
|
GPUType: GPUTypeNone,
|
|
ConfiguredVendor: "none",
|
|
DetectionMethod: DetectionSourceEnvType,
|
|
EnvOverrideType: envType,
|
|
},
|
|
}
|
|
}
|
|
}
|
|
|
|
if hasEnvCount {
|
|
// Only FETCH_ML_GPU_COUNT set - need to detect vendor from config or auto
|
|
logEnvOverride("FETCH_ML_GPU_COUNT", envCount)
|
|
return f.detectFromConfigWithSource(cfg, DetectionSourceEnvCount, "", envCount)
|
|
}
|
|
|
|
// No env overrides - detect from config
|
|
return f.detectFromConfigWithSource(cfg, DetectionSourceConfig, "", -1)
|
|
}
|
|
|
|
func (f *GPUDetectorFactory) detectFromConfigWithSource(cfg *Config, source DetectionSource, envType string, envCount int) DetectionResult {
|
|
if cfg == nil {
|
|
return DetectionResult{
|
|
Detector: &NoneDetector{},
|
|
Info: GPUDetectionInfo{
|
|
GPUType: GPUTypeNone,
|
|
ConfiguredVendor: "none",
|
|
DetectionMethod: source,
|
|
},
|
|
}
|
|
}
|
|
|
|
// Check for auto-detection scenarios (GPUDevices provided or AppleGPU enabled without explicit vendor)
|
|
isAutoDetect := cfg.GPUVendorAutoDetected ||
|
|
(len(cfg.GPUDevices) > 0 && cfg.GPUVendor == "") ||
|
|
(cfg.AppleGPU.Enabled && cfg.GPUVendor == "")
|
|
if isAutoDetect && source == DetectionSourceConfig {
|
|
source = DetectionSourceAuto
|
|
}
|
|
|
|
switch GPUType(cfg.GPUVendor) {
|
|
case GPUTypeApple:
|
|
return DetectionResult{
|
|
Detector: &AppleDetector{enabled: cfg.AppleGPU.Enabled},
|
|
Info: GPUDetectionInfo{
|
|
GPUType: GPUTypeApple,
|
|
ConfiguredVendor: "apple",
|
|
DetectionMethod: source,
|
|
EnvOverrideType: envType,
|
|
EnvOverrideCount: envCount,
|
|
ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected,
|
|
},
|
|
}
|
|
case GPUTypeNone:
|
|
return DetectionResult{
|
|
Detector: &NoneDetector{},
|
|
Info: GPUDetectionInfo{
|
|
GPUType: GPUTypeNone,
|
|
ConfiguredVendor: "none",
|
|
DetectionMethod: source,
|
|
EnvOverrideType: envType,
|
|
EnvOverrideCount: envCount,
|
|
ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected,
|
|
},
|
|
}
|
|
case GPUTypeNVIDIA:
|
|
return DetectionResult{
|
|
Detector: &NVIDIADetector{},
|
|
Info: GPUDetectionInfo{
|
|
GPUType: GPUTypeNVIDIA,
|
|
ConfiguredVendor: "nvidia",
|
|
DetectionMethod: source,
|
|
EnvOverrideType: envType,
|
|
EnvOverrideCount: envCount,
|
|
ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected,
|
|
},
|
|
}
|
|
case "amd":
|
|
// AMD uses similar device exposure patterns in this codebase.
|
|
// This is the key aliasing point - we report AMD as configured vendor
|
|
// but use NVIDIADetector for implementation.
|
|
return DetectionResult{
|
|
Detector: &NVIDIADetector{},
|
|
Info: GPUDetectionInfo{
|
|
GPUType: GPUTypeNVIDIA,
|
|
ConfiguredVendor: "amd", // User configured "amd"
|
|
DetectionMethod: source,
|
|
EnvOverrideType: envType,
|
|
EnvOverrideCount: envCount,
|
|
ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected,
|
|
},
|
|
}
|
|
default:
|
|
// SECURITY: Explicit default prevents silent misconfiguration
|
|
// Unknown GPU vendor is treated as no GPU - fail secure
|
|
// Note: Config.Validate() should catch invalid vendors before this point
|
|
logWarningf("unrecognized GPU vendor %q, using no GPU", cfg.GPUVendor)
|
|
return DetectionResult{
|
|
Detector: &NoneDetector{},
|
|
Info: GPUDetectionInfo{
|
|
GPUType: GPUTypeNone,
|
|
ConfiguredVendor: "none",
|
|
DetectionMethod: source,
|
|
EnvOverrideType: envType,
|
|
EnvOverrideCount: envCount,
|
|
ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected,
|
|
},
|
|
}
|
|
}
|
|
}
|