// fetch_ml/internal/worker/gpu_detector.go

package worker

import (
	"os"
	"path/filepath"
	"strings"
)

// GPUType names the GPU vendor family reported by a detector and recorded
// in GPUDetectionInfo.
type GPUType string

// Known GPUType values. GPUTypeNone means that no GPU is in use.
const (
	GPUTypeNVIDIA = GPUType("nvidia")
	GPUTypeAMD    = GPUType("amd")
	GPUTypeApple  = GPUType("apple")
	GPUTypeNone   = GPUType("none")
)

// DetectionSource records which input decided the GPU detector selection.
type DetectionSource string

// Known DetectionSource values:
//   - DetectionSourceEnvType: FETCH_ML_GPU_TYPE alone selected the detector.
//   - DetectionSourceEnvCount: only FETCH_ML_GPU_COUNT was set; the vendor
//     still came from the config.
//   - DetectionSourceEnvBoth: both environment variables were set.
//   - DetectionSourceConfig: no environment override; the config decided.
//   - DetectionSourceAuto: the vendor was inferred from other config settings.
//   - DetectionSourceNone: declared here but not referenced in this file.
const (
	DetectionSourceEnvType  = DetectionSource("env_override_type")
	DetectionSourceEnvCount = DetectionSource("env_override_count")
	DetectionSourceEnvBoth  = DetectionSource("env_override_both")
	DetectionSourceConfig   = DetectionSource("config")
	DetectionSourceAuto     = DetectionSource("auto")
	DetectionSourceNone     = DetectionSource("none")
)

// GPUDetectionInfo provides metadata about how GPU detection was determined.
//
// Fields:
//   - GPUType: GPU type recorded for the selected detector.
//   - ConfiguredVendor: vendor name that was requested. It can differ from
//     GPUType: a config vendor of "amd" is recorded with GPUType "nvidia".
//   - DetectionMethod: which DetectionSource decided the selection.
//   - EnvOverrideType: value of FETCH_ML_GPU_TYPE when it selected the
//     detector; omitted from JSON when empty.
//   - EnvOverrideCount: value of FETCH_ML_GPU_COUNT when it was set. It is
//     omitted from JSON only when zero; the config-only path stores -1 here,
//     which is serialized.
//   - ConfigLayerAutoDetected: copied from Config.GPUVendorAutoDetected;
//     omitted from JSON when false.
type GPUDetectionInfo struct {
	GPUType                 GPUType         `json:"gpu_type"`
	ConfiguredVendor        string          `json:"configured_vendor"`
	DetectionMethod         DetectionSource `json:"detection_method"`
	EnvOverrideType         string          `json:"env_override_type,omitempty"`
	EnvOverrideCount        int             `json:"env_override_count,omitempty"`
	ConfigLayerAutoDetected bool            `json:"config_layer_auto_detected,omitempty"`
}

// GPUDetector is the contract implemented by every GPU detection backend.
type GPUDetector interface {
	// GetGPUType reports which GPUType the backend represents.
	GetGPUType() GPUType
	// DetectGPUCount reports how many GPUs the backend sees; 0 means none.
	DetectGPUCount() int
	// GetDevicePaths lists the device paths associated with the backend.
	// The result may be nil or empty.
	GetDevicePaths() []string
}

// NVIDIADetector is the GPUDetector for NVIDIA devices. The factory in this
// file also hands it out when the requested vendor is "amd".
type NVIDIADetector struct{}

// DetectGPUCount returns the number of GPUs to report.
//
// A positive count obtained through NVML takes precedence. Otherwise a
// non-negative FETCH_ML_GPU_COUNT is used, and failing that the result is 0.
func (d *NVIDIADetector) DetectGPUCount() int {
	if IsNVMLAvailable() {
		if n, err := GetGPUCount(); err == nil && n > 0 {
			return n
		}
	}

	// NVML was unavailable, failed, or reported nothing: use the env value.
	n, ok := envInt("FETCH_ML_GPU_COUNT")
	if !ok || n < 0 {
		return 0
	}
	return n
}

// GetGPUType always returns GPUTypeNVIDIA.
func (d *NVIDIADetector) GetGPUType() GPUType {
	return GPUTypeNVIDIA
}

// GetDevicePaths returns the GPU device nodes that exist on this host.
//
// The well-known NVIDIA control nodes are listed first, followed by any
// other /dev/nvidia* entry; each path appears once and only if os.Stat
// succeeds on it. When nothing matches, /dev/dri is returned if present.
// The result is never nil.
func (d *NVIDIADetector) GetDevicePaths() []string {
	candidates := []string{
		"/dev/nvidiactl",
		"/dev/nvidia-modeset",
		"/dev/nvidia-uvm",
		"/dev/nvidia-uvm-tools",
		"/dev/nvidia*",
	}

	paths := make([]string, 0, 8)
	known := make(map[string]struct{})

	// record appends path the first time it is seen, provided it can be
	// stat'ed.
	record := func(path string) {
		if _, dup := known[path]; dup {
			return
		}
		if _, err := os.Stat(path); err != nil {
			return
		}
		known[path] = struct{}{}
		paths = append(paths, path)
	}

	for _, candidate := range candidates {
		if !strings.Contains(candidate, "*") {
			record(candidate)
			continue
		}
		// Glob only reports malformed patterns; the pattern here is a
		// constant, so the error is deliberately ignored.
		expanded, _ := filepath.Glob(candidate)
		for _, path := range expanded {
			record(path)
		}
	}

	// Fallback for non-NVIDIA setups where only the generic DRM device exists.
	if len(paths) == 0 {
		if _, err := os.Stat("/dev/dri"); err == nil {
			paths = append(paths, "/dev/dri")
		}
	}
	return paths
}

// AppleDetector is the GPUDetector for Apple M-series GPUs.
type AppleDetector struct {
	// enabled makes DetectGPUCount report one GPU when neither macOS
	// detection nor FETCH_ML_GPU_COUNT supplies a usable count.
	enabled bool
}

// DetectGPUCount returns the number of GPUs to report.
//
// Order of precedence: a positive count from macOS detection, then a
// non-negative FETCH_ML_GPU_COUNT, then 1 if the detector is enabled,
// and finally 0.
func (d *AppleDetector) DetectGPUCount() int {
	if IsMacOS() {
		if n, err := GetMacOSGPUCount(); err == nil && n > 0 {
			return n
		}
	}

	n, ok := envInt("FETCH_ML_GPU_COUNT")
	switch {
	case ok && n >= 0:
		return n
	case d.enabled:
		return 1
	default:
		return 0
	}
}

// GetGPUType always returns GPUTypeApple.
func (d *AppleDetector) GetGPUType() GPUType {
	return GPUTypeApple
}

// GetDevicePaths returns a fixed pair of paths; unlike the NVIDIA detector
// it does not check that they exist on the host.
func (d *AppleDetector) GetDevicePaths() []string {
	return []string{"/dev/metal", "/dev/mps"}
}

// NoneDetector is the GPUDetector that reports the absence of any GPU. The
// factory in this file selects it when the vendor is "none", when the config
// is nil, or when nothing else matches.
type NoneDetector struct{}

// DetectGPUCount always returns 0.
func (d *NoneDetector) DetectGPUCount() int {
	return 0
}

// GetGPUType always returns GPUTypeNone.
func (d *NoneDetector) GetGPUType() GPUType {
	return GPUTypeNone
}

// GetDevicePaths always returns nil: there are no device paths to report.
func (d *NoneDetector) GetDevicePaths() []string {
	return nil
}

// GPUDetectorFactory creates the appropriate GPU detector based on
// environment overrides and the worker config. It carries no state.
type GPUDetectorFactory struct{}

// DetectionResult contains both the selected detector and the metadata
// describing how it was selected.
type DetectionResult struct {
	Detector GPUDetector
	Info     GPUDetectionInfo
}

// CreateDetector returns only the detector chosen by CreateDetectorWithInfo,
// discarding the selection metadata.
func (f *GPUDetectorFactory) CreateDetector(cfg *Config) GPUDetector {
	return f.CreateDetectorWithInfo(cfg).Detector
}

// CreateDetectorWithInfo selects a GPU detector and reports how it was chosen.
//
// Selection order:
//  1. FETCH_ML_GPU_TYPE set to a recognized vendor ("nvidia", "amd", "apple"
//     or "none") picks the detector directly. The detection method is
//     DetectionSourceEnvBoth when FETCH_ML_GPU_COUNT is also set, otherwise
//     DetectionSourceEnvType.
//  2. Otherwise the vendor comes from cfg via detectFromConfigWithSource,
//     with DetectionSourceEnvCount when FETCH_ML_GPU_COUNT is set and
//     DetectionSourceConfig when it is not.
//
// An unrecognized FETCH_ML_GPU_TYPE is logged and then ignored. Each
// environment override that is present is logged exactly once. cfg may be nil.
func (f *GPUDetectorFactory) CreateDetectorWithInfo(cfg *Config) DetectionResult {
	envType := os.Getenv("FETCH_ML_GPU_TYPE")
	envCount, hasEnvCount := envInt("FETCH_ML_GPU_COUNT")

	// Log up front so no path below can report the same override twice.
	if envType != "" {
		logEnvOverride("FETCH_ML_GPU_TYPE", envType)
	}
	if hasEnvCount {
		logEnvOverride("FETCH_ML_GPU_COUNT", envCount)
	}

	if envType != "" {
		var detector GPUDetector
		switch GPUType(envType) {
		case GPUTypeNVIDIA, GPUTypeAMD:
			// AMD is aliased onto the NVIDIA detector; the reported
			// GPUType still reflects the requested vendor.
			detector = &NVIDIADetector{}
		case GPUTypeApple:
			detector = &AppleDetector{enabled: true}
		case GPUTypeNone:
			detector = &NoneDetector{}
		}

		if detector != nil {
			info := GPUDetectionInfo{
				GPUType:          GPUType(envType),
				ConfiguredVendor: envType,
				DetectionMethod:  DetectionSourceEnvType,
				EnvOverrideType:  envType,
			}
			if hasEnvCount {
				info.DetectionMethod = DetectionSourceEnvBoth
				info.EnvOverrideCount = envCount
			}
			return DetectionResult{Detector: detector, Info: info}
		}
		// Unrecognized vendor: fall through and let the config decide.
	}

	if hasEnvCount {
		// Only the count is usable; the vendor comes from config or auto-detect.
		return f.detectFromConfigWithSource(cfg, DetectionSourceEnvCount, "", envCount)
	}

	// No env overrides. NOTE(review): -1 marks "no count override" and is
	// copied into Info.EnvOverrideCount, where omitempty does not suppress it.
	return f.detectFromConfigWithSource(cfg, DetectionSourceConfig, "", -1)
}

// detectFromConfigWithSource picks a detector from cfg and records the result
// under the given detection source.
//
// A nil cfg yields the NoneDetector with only the source recorded; the env
// values are not copied in that case. An explicit cfg.GPUVendor of "apple",
// "none", "nvidia" or "amd" is honored first; "amd" is served by the
// NVIDIADetector and recorded with GPUType "nvidia" and ConfiguredVendor
// "amd". Any other vendor value falls back to inference: AppleGPU.Enabled
// selects Apple, then a non-empty GPUDevices list selects NVIDIA, both
// recorded as DetectionSourceAuto. If nothing matches, the NoneDetector is
// returned under the caller's source.
//
// envType and envCount are copied verbatim into the returned Info.
func (f *GPUDetectorFactory) detectFromConfigWithSource(cfg *Config, source DetectionSource, envType string, envCount int) DetectionResult {
	if cfg == nil {
		info := GPUDetectionInfo{
			GPUType:          GPUTypeNone,
			ConfiguredVendor: "none",
			DetectionMethod:  source,
		}
		return DetectionResult{Detector: &NoneDetector{}, Info: info}
	}

	// result assembles the DetectionResult shared by every non-nil-config
	// outcome; only the detector, type, vendor and method vary.
	result := func(det GPUDetector, kind GPUType, vendor string, method DetectionSource) DetectionResult {
		return DetectionResult{
			Detector: det,
			Info: GPUDetectionInfo{
				GPUType:                 kind,
				ConfiguredVendor:        vendor,
				DetectionMethod:         method,
				EnvOverrideType:         envType,
				EnvOverrideCount:        envCount,
				ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected,
			},
		}
	}

	// Explicitly configured vendor.
	switch GPUType(cfg.GPUVendor) {
	case GPUTypeApple:
		return result(&AppleDetector{enabled: cfg.AppleGPU.Enabled}, GPUTypeApple, "apple", source)
	case GPUTypeNone:
		return result(&NoneDetector{}, GPUTypeNone, "none", source)
	case GPUTypeNVIDIA:
		return result(&NVIDIADetector{}, GPUTypeNVIDIA, "nvidia", source)
	case GPUTypeAMD:
		// Key aliasing point: the user configured "amd", but the NVIDIA
		// detector does the work and is what GPUType reports.
		return result(&NVIDIADetector{}, GPUTypeNVIDIA, "amd", source)
	}

	// No usable vendor: infer one from the remaining config settings.
	switch {
	case cfg.AppleGPU.Enabled:
		return result(&AppleDetector{enabled: true}, GPUTypeApple, "apple", DetectionSourceAuto)
	case len(cfg.GPUDevices) > 0:
		return result(&NVIDIADetector{}, GPUTypeNVIDIA, "nvidia", DetectionSourceAuto)
	default:
		return result(&NoneDetector{}, GPUTypeNone, "none", source)
	}
}