// fetch_ml/internal/worker/gpu_detector.go

package worker

import (
	"fmt"
	"log/slog"
	"os"
	"path/filepath"
	"strings"
)

// logWarningf renders format and args with fmt.Sprintf and emits the result
// as a single warning-level message through the default slog logger.
func logWarningf(format string, args ...any) {
	msg := fmt.Sprintf(format, args...)
	slog.Warn(msg)
}

// GPUType is the string identifier of a GPU vendor/backend as it appears in
// configuration, environment overrides and detection metadata.
type GPUType string

// Known GPUType values.
//
// GPUTypeAMD has no dedicated detector in this file: the factory serves it
// with NVIDIADetector. GPUTypeNone is also what the factory falls back to when
// a vendor string is not recognized.
const (
	GPUTypeNVIDIA GPUType = "nvidia"
	GPUTypeAMD    GPUType = "amd"
	GPUTypeApple  GPUType = "apple"
	GPUTypeNone   GPUType = "none"
)

// DetectionSource indicates how the GPU detector was selected.
type DetectionSource string

// DetectionSource values as assigned by GPUDetectorFactory:
//
//   - DetectionSourceEnvType:  only FETCH_ML_GPU_TYPE was set.
//   - DetectionSourceEnvCount: only FETCH_ML_GPU_COUNT was set; the vendor
//     still comes from the config.
//   - DetectionSourceEnvBoth:  both environment variables were set.
//   - DetectionSourceConfig:   no environment override; vendor taken from the
//     config.
//   - DetectionSourceAuto:     replaces DetectionSourceConfig when the config
//     indicates the vendor was auto-detected.
//   - DetectionSourceNone:     not assigned anywhere in this file.
const (
	DetectionSourceEnvType  DetectionSource = "env_override_type"
	DetectionSourceEnvCount DetectionSource = "env_override_count"
	DetectionSourceEnvBoth  DetectionSource = "env_override_both"
	DetectionSourceConfig   DetectionSource = "config"
	DetectionSourceAuto     DetectionSource = "auto"
	DetectionSourceNone     DetectionSource = "none"
)

// GPUDetectionInfo provides metadata about how GPU detection was determined.
// It is produced by GPUDetectorFactory alongside the selected detector and is
// JSON-serializable.
//
// Fields:
//   - GPUType: the GPU type recorded for the selection.
//   - ConfiguredVendor: the vendor string recorded for the selection; "amd" is
//     kept here even though an NVIDIADetector serves it, and unrecognized
//     values are recorded as "none".
//   - DetectionMethod: which source (environment, config, auto) decided the
//     selection.
//   - EnvOverrideType: the raw FETCH_ML_GPU_TYPE value when the factory used
//     it; omitted from JSON when empty.
//   - EnvOverrideCount: the parsed FETCH_ML_GPU_COUNT value handed through by
//     the factory; omitted from JSON when zero.
//   - ConfigLayerAutoDetected: copied from the config's GPUVendorAutoDetected
//     flag on the config-based path; omitted from JSON when false.
type GPUDetectionInfo struct {
	GPUType                 GPUType         `json:"gpu_type"`
	ConfiguredVendor        string          `json:"configured_vendor"`
	DetectionMethod         DetectionSource `json:"detection_method"`
	EnvOverrideType         string          `json:"env_override_type,omitempty"`
	EnvOverrideCount        int             `json:"env_override_count,omitempty"`
	ConfigLayerAutoDetected bool            `json:"config_layer_auto_detected,omitempty"`
}

// GPUDetector is the vendor-specific strategy for discovering GPUs. The
// implementations in this file are NVIDIADetector, AppleDetector and
// NoneDetector.
type GPUDetector interface {
	// DetectGPUCount returns the number of GPUs the detector reports; 0 means
	// none were found or configured.
	DetectGPUCount() int
	// GetGPUType returns the GPU type the detector implementation stands for.
	GetGPUType() GPUType
	// GetDevicePaths returns the device paths associated with the detector.
	// The result may be nil or empty.
	GetDevicePaths() []string
}

// NVIDIADetector is the GPUDetector implementation for NVIDIA GPUs. It holds
// no state. The factory in this file also uses it when the vendor is "amd".
type NVIDIADetector struct{}

// DetectGPUCount reports how many GPUs are available.
//
// When IsNVMLAvailable reports true and GetGPUCount succeeds with a positive
// value, that value is returned. Otherwise a non-negative FETCH_ML_GPU_COUNT
// environment value is used, and 0 is returned when neither source yields a
// count.
//
// NOTE(review): a positive NVML count takes precedence over
// FETCH_ML_GPU_COUNT, so the variable acts as a fallback here even though the
// factory records it as an override - confirm that is intended.
func (d *NVIDIADetector) DetectGPUCount() int {
	if IsNVMLAvailable() {
		if detected, err := GetGPUCount(); err == nil && detected > 0 {
			return detected
		}
	}

	fallback, found := envInt("FETCH_ML_GPU_COUNT")
	if !found || fallback < 0 {
		return 0
	}
	return fallback
}

// GetGPUType always returns GPUTypeNVIDIA, including when the factory selected
// this detector for an "amd" vendor.
func (*NVIDIADetector) GetGPUType() GPUType { return GPUTypeNVIDIA }

// GetDevicePaths returns the NVIDIA device nodes that currently exist.
//
// The well-known control nodes are checked first so they keep a stable
// position in the result; the trailing glob then adds every remaining
// /dev/nvidia* entry. Each path appears at most once and only if os.Stat
// succeeds for it. When nothing matches and /dev/dri exists, the result is
// just "/dev/dri". The returned slice is never nil but may be empty.
func (d *NVIDIADetector) GetDevicePaths() []string {
	candidates := []string{
		"/dev/nvidiactl",
		"/dev/nvidia-modeset",
		"/dev/nvidia-uvm",
		"/dev/nvidia-uvm-tools",
		"/dev/nvidia*",
	}

	paths := make([]string, 0, 8)
	known := make(map[string]struct{})

	// record appends path once, and only if it can be stat'ed right now.
	record := func(path string) {
		if _, dup := known[path]; dup {
			return
		}
		if _, err := os.Stat(path); err != nil {
			return
		}
		known[path] = struct{}{}
		paths = append(paths, path)
	}

	for _, candidate := range candidates {
		// Entries without a directory component are ignored; none of the
		// candidates above is affected by this guard.
		if filepath.Base(candidate) == candidate {
			continue
		}
		if !strings.Contains(candidate, "*") {
			record(candidate)
			continue
		}
		// Glob only fails on a malformed pattern; a failure is treated the
		// same as "no matches".
		expanded, _ := filepath.Glob(candidate)
		for _, match := range expanded {
			record(match)
		}
	}

	if len(paths) > 0 {
		return paths
	}
	// No NVIDIA nodes at all: fall back to the generic DRM directory if the
	// host has one.
	if _, err := os.Stat("/dev/dri"); err == nil {
		paths = append(paths, "/dev/dri")
	}
	return paths
}

// AppleDetector is the GPUDetector implementation for Apple M-series GPUs.
type AppleDetector struct {
	// enabled makes DetectGPUCount report one GPU when neither the macOS
	// query nor FETCH_ML_GPU_COUNT yields a count.
	enabled bool
}

// DetectGPUCount reports how many Apple GPUs are available.
//
// Sources are consulted in order: a positive, error-free GetMacOSGPUCount
// result when IsMacOS reports true; a non-negative FETCH_ML_GPU_COUNT
// environment value; and finally 1 if the detector was created enabled,
// otherwise 0.
func (d *AppleDetector) DetectGPUCount() int {
	if IsMacOS() {
		if detected, err := GetMacOSGPUCount(); err == nil && detected > 0 {
			return detected
		}
	}

	switch fallback, found := envInt("FETCH_ML_GPU_COUNT"); {
	case found && fallback >= 0:
		return fallback
	case d.enabled:
		return 1
	default:
		return 0
	}
}

// GetGPUType always returns GPUTypeApple.
func (*AppleDetector) GetGPUType() GPUType { return GPUTypeApple }

// GetDevicePaths returns a fixed pair of device paths. The paths are not
// checked for existence, and each call returns a fresh slice.
func (*AppleDetector) GetDevicePaths() []string {
	paths := make([]string, 0, 2)
	return append(paths, "/dev/metal", "/dev/mps")
}

// NoneDetector is the GPUDetector used when no GPU is to be exposed. The
// factory also selects it for unrecognized vendors and for a nil config.
type NoneDetector struct{}

// DetectGPUCount always reports zero GPUs.
func (*NoneDetector) DetectGPUCount() int { return 0 }

// GetGPUType always returns GPUTypeNone.
func (*NoneDetector) GetGPUType() GPUType { return GPUTypeNone }

// GetDevicePaths always returns nil.
func (*NoneDetector) GetDevicePaths() []string { return nil }

// GPUDetectorFactory creates the appropriate GPUDetector based on the
// FETCH_ML_GPU_TYPE / FETCH_ML_GPU_COUNT environment overrides and the worker
// config. It holds no state.
type GPUDetectorFactory struct{}

// DetectionResult contains both the detector and metadata about how it was
// selected.
type DetectionResult struct {
	// Detector is the selected implementation; the factory in this file
	// always sets it to a non-nil value.
	Detector GPUDetector
	// Info describes which source decided the selection.
	Info GPUDetectionInfo
}

// CreateDetector returns only the detector chosen by CreateDetectorWithInfo,
// discarding the selection metadata.
func (f *GPUDetectorFactory) CreateDetector(cfg *Config) GPUDetector {
	return f.CreateDetectorWithInfo(cfg).Detector
}

// CreateDetectorWithInfo selects a GPU detector and reports how the choice was
// made.
//
// Precedence:
//  1. FETCH_ML_GPU_TYPE set: the detector is chosen from that value alone and
//     cfg is not consulted. The method is DetectionSourceEnvBoth when
//     FETCH_ML_GPU_COUNT is set as well, DetectionSourceEnvType otherwise. An
//     unrecognized value logs a warning and selects NoneDetector.
//  2. Only FETCH_ML_GPU_COUNT set: the vendor comes from cfg and the method is
//     DetectionSourceEnvCount.
//  3. Neither set: everything comes from cfg.
//
// Every environment override that is used is logged via logEnvOverride.
func (f *GPUDetectorFactory) CreateDetectorWithInfo(cfg *Config) DetectionResult {
	envType := os.Getenv("FETCH_ML_GPU_TYPE")
	envCount, hasEnvCount := envInt("FETCH_ML_GPU_COUNT")

	if envType == "" {
		if hasEnvCount {
			// Count-only override: the vendor still has to come from cfg.
			logEnvOverride("FETCH_ML_GPU_COUNT", envCount)
			return f.detectFromConfigWithSource(cfg, DetectionSourceEnvCount, "", envCount)
		}
		// No env overrides; -1 tells the config path there is no count.
		return f.detectFromConfigWithSource(cfg, DetectionSourceConfig, "", -1)
	}

	logEnvOverride("FETCH_ML_GPU_TYPE", envType)
	info := GPUDetectionInfo{
		DetectionMethod: DetectionSourceEnvType,
		EnvOverrideType: envType,
	}
	if hasEnvCount {
		logEnvOverride("FETCH_ML_GPU_COUNT", envCount)
		info.DetectionMethod = DetectionSourceEnvBoth
		info.EnvOverrideCount = envCount
	}

	var detector GPUDetector
	switch GPUType(envType) {
	case GPUTypeNVIDIA:
		detector = &NVIDIADetector{}
		info.GPUType = GPUTypeNVIDIA
	case GPUTypeApple:
		detector = &AppleDetector{enabled: true}
		info.GPUType = GPUTypeApple
	case GPUTypeNone:
		detector = &NoneDetector{}
		info.GPUType = GPUTypeNone
	case GPUTypeAMD:
		// AMD env override uses the NVIDIA detector (aliased).
		// NOTE(review): this path reports GPUTypeAMD, whereas the config path
		// reports GPUTypeNVIDIA for an "amd" vendor - confirm which is intended.
		detector = &NVIDIADetector{}
		info.GPUType = GPUTypeAMD
	default:
		// Defensive: an unknown env type must not silently fall through.
		logWarningf("unrecognized FETCH_ML_GPU_TYPE value %q, using no GPU", envType)
		detector = &NoneDetector{}
		info.GPUType = GPUTypeNone
	}
	// On the env path the configured vendor always equals the reported type
	// ("none" for unrecognized values).
	info.ConfiguredVendor = string(info.GPUType)

	return DetectionResult{Detector: detector, Info: info}
}

// detectFromConfigWithSource selects a detector from cfg.GPUVendor and records
// how the selection was made.
//
// source is the detection method to report. DetectionSourceConfig is upgraded
// to DetectionSourceAuto when cfg signals auto-detection (GPUVendorAutoDetected
// set, or GPUDevices / AppleGPU.Enabled given without an explicit vendor).
//
// envType and envCount are echoed into the returned info on every path,
// including the nil-cfg one. A negative envCount means "no FETCH_ML_GPU_COUNT
// override" and is recorded as 0, so the sentinel never reaches the
// JSON-serialized EnvOverrideCount and the field is omitted like on the other
// paths.
//
// A nil cfg, a "none" vendor or an unrecognized vendor yields NoneDetector;
// the unrecognized case additionally logs a warning.
func (f *GPUDetectorFactory) detectFromConfigWithSource(cfg *Config, source DetectionSource, envType string, envCount int) DetectionResult {
	if envCount < 0 {
		envCount = 0
	}

	// Start from the fail-secure "no GPU" result and refine it below.
	var detector GPUDetector = &NoneDetector{}
	info := GPUDetectionInfo{
		GPUType:          GPUTypeNone,
		ConfiguredVendor: "none",
		DetectionMethod:  source,
		EnvOverrideType:  envType,
		EnvOverrideCount: envCount,
	}
	if cfg == nil {
		return DetectionResult{Detector: detector, Info: info}
	}

	// Check for auto-detection scenarios (GPUDevices provided or AppleGPU
	// enabled without an explicit vendor).
	isAutoDetect := cfg.GPUVendorAutoDetected ||
		(len(cfg.GPUDevices) > 0 && cfg.GPUVendor == "") ||
		(cfg.AppleGPU.Enabled && cfg.GPUVendor == "")
	if isAutoDetect && source == DetectionSourceConfig {
		info.DetectionMethod = DetectionSourceAuto
	}
	info.ConfigLayerAutoDetected = cfg.GPUVendorAutoDetected

	switch GPUType(cfg.GPUVendor) {
	case GPUTypeApple:
		detector = &AppleDetector{enabled: cfg.AppleGPU.Enabled}
		info.GPUType, info.ConfiguredVendor = GPUTypeApple, "apple"
	case GPUTypeNVIDIA:
		detector = &NVIDIADetector{}
		info.GPUType, info.ConfiguredVendor = GPUTypeNVIDIA, "nvidia"
	case GPUTypeAMD:
		// AMD uses similar device exposure patterns in this codebase. This is
		// the key aliasing point: "amd" is reported as the configured vendor
		// while NVIDIADetector provides the implementation and the type.
		detector = &NVIDIADetector{}
		info.GPUType, info.ConfiguredVendor = GPUTypeNVIDIA, "amd"
	case GPUTypeNone:
		// Explicit "none": the defaults above already describe it.
	default:
		// SECURITY: explicit default prevents silent misconfiguration. An
		// unknown GPU vendor is treated as no GPU - fail secure.
		// Note: Config.Validate() should catch invalid vendors before this point.
		logWarningf("unrecognized GPU vendor %q, using no GPU", cfg.GPUVendor)
	}

	return DetectionResult{Detector: detector, Info: info}
}