feat: GPU detection transparency and artifact scanner improvements
Some checks failed
Build CLI with Embedded SQLite / build (arm64, aarch64-linux) (push) Waiting to run
Build CLI with Embedded SQLite / build (x86_64, x86_64-linux) (push) Waiting to run
Build CLI with Embedded SQLite / build-macos (arm64) (push) Waiting to run
Build CLI with Embedded SQLite / build-macos (x86_64) (push) Waiting to run
Security Scan / Security Analysis (push) Waiting to run
Security Scan / Native Library Security (push) Waiting to run
Checkout test / test (push) Successful in 6s
CI/CD Pipeline / Test (push) Failing after 1s
CI/CD Pipeline / Dev Compose Smoke Test (push) Has been skipped
CI/CD Pipeline / Build (push) Has been skipped
CI/CD Pipeline / Test Scripts (push) Has been skipped
CI/CD Pipeline / Test Native Libraries (push) Has been skipped
CI/CD Pipeline / GPU Golden Test Matrix (push) Has been skipped
Documentation / build-and-publish (push) Failing after 39s
CI/CD Pipeline / Docker Build (push) Has been skipped
Some checks failed
Build CLI with Embedded SQLite / build (arm64, aarch64-linux) (push) Waiting to run
Build CLI with Embedded SQLite / build (x86_64, x86_64-linux) (push) Waiting to run
Build CLI with Embedded SQLite / build-macos (arm64) (push) Waiting to run
Build CLI with Embedded SQLite / build-macos (x86_64) (push) Waiting to run
Security Scan / Security Analysis (push) Waiting to run
Security Scan / Native Library Security (push) Waiting to run
Checkout test / test (push) Successful in 6s
CI/CD Pipeline / Test (push) Failing after 1s
CI/CD Pipeline / Dev Compose Smoke Test (push) Has been skipped
CI/CD Pipeline / Build (push) Has been skipped
CI/CD Pipeline / Test Scripts (push) Has been skipped
CI/CD Pipeline / Test Native Libraries (push) Has been skipped
CI/CD Pipeline / GPU Golden Test Matrix (push) Has been skipped
Documentation / build-and-publish (push) Failing after 39s
CI/CD Pipeline / Docker Build (push) Has been skipped
- Surface GPUDetectionInfo from parseGPUCountFromConfig for detection metadata
- Document FETCH_ML_TOTAL_CPU and FETCH_ML_GPU_SLOTS_PER_GPU env vars
- Add debug logging for all env var overrides to stderr
- Track config-layer auto-detection in GPUDetectionInfo.ConfigLayerAutoDetected
- Add --include-all flag to artifact scanner (includeAll parameter)
- Add AMD production mode enforcement (error in non-local mode)
- Add GPU detector unit tests for env overrides and AMD aliasing
This commit is contained in:
parent
f987ddb86c
commit
3b194ff2e8
15 changed files with 915 additions and 91 deletions
|
|
@ -334,6 +334,62 @@ jobs:
|
||||||
echo "=== Native Implementation ==="
|
echo "=== Native Implementation ==="
|
||||||
CGO_ENABLED=1 go test -tags native_libs -bench=. ./tests/benchmarks/ -benchmem || true
|
CGO_ENABLED=1 go test -tags native_libs -bench=. ./tests/benchmarks/ -benchmem || true
|
||||||
|
|
||||||
|
test-gpu-matrix:
|
||||||
|
name: GPU Golden Test Matrix
|
||||||
|
runs-on: self-hosted
|
||||||
|
needs: test-native
|
||||||
|
timeout-minutes: 15
|
||||||
|
strategy:
|
||||||
|
matrix:
|
||||||
|
build_config: [cgo-native, cgo-only, nocgo]
|
||||||
|
fail-fast: false
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
with:
|
||||||
|
fetch-depth: 1
|
||||||
|
|
||||||
|
- name: Setup Go
|
||||||
|
run: |
|
||||||
|
REQUIRED_GO="1.25.0"
|
||||||
|
if command -v go &> /dev/null && go version | grep -q "go${REQUIRED_GO}"; then
|
||||||
|
echo "Go ${REQUIRED_GO} already installed"
|
||||||
|
else
|
||||||
|
echo "Installing Go ${REQUIRED_GO}..."
|
||||||
|
curl -sL "https://go.dev/dl/go${REQUIRED_GO}.linux-amd64.tar.gz" | sudo tar -C /usr/local -xzf -
|
||||||
|
export PATH="/usr/local/go/bin:$PATH"
|
||||||
|
echo "/usr/local/go/bin" >> $GITHUB_PATH
|
||||||
|
fi
|
||||||
|
go version
|
||||||
|
|
||||||
|
- name: Build Native Libraries (for cgo-native config)
|
||||||
|
if: matrix.build_config == 'cgo-native'
|
||||||
|
run: |
|
||||||
|
sudo apt-get update
|
||||||
|
sudo apt-get install -y cmake zlib1g-dev build-essential
|
||||||
|
make native-build || echo "Native build skipped (may fail without proper deps)"
|
||||||
|
|
||||||
|
- name: Run GPU Tests - cgo+native_libs
|
||||||
|
if: matrix.build_config == 'cgo-native'
|
||||||
|
run: |
|
||||||
|
echo "=== Testing cgo + native_libs build ==="
|
||||||
|
CGO_ENABLED=1 go test -tags native_libs -v ./tests/unit/gpu/ -run TestGoldenGPUStatus
|
||||||
|
CGO_ENABLED=1 go test -tags native_libs -v ./tests/unit/gpu/ -run TestBuildTagMatrix
|
||||||
|
|
||||||
|
- name: Run GPU Tests - cgo only (no native_libs)
|
||||||
|
if: matrix.build_config == 'cgo-only'
|
||||||
|
run: |
|
||||||
|
echo "=== Testing cgo without native_libs build ==="
|
||||||
|
CGO_ENABLED=1 go test -v ./tests/unit/gpu/ -run TestGoldenGPUStatus
|
||||||
|
CGO_ENABLED=1 go test -v ./tests/unit/gpu/ -run TestBuildTagMatrix
|
||||||
|
|
||||||
|
- name: Run GPU Tests - nocgo
|
||||||
|
if: matrix.build_config == 'nocgo'
|
||||||
|
run: |
|
||||||
|
echo "=== Testing !cgo build ==="
|
||||||
|
CGO_ENABLED=0 go test -v ./tests/unit/gpu/ -run TestGoldenGPUStatus
|
||||||
|
CGO_ENABLED=0 go test -v ./tests/unit/gpu/ -run TestBuildTagMatrix
|
||||||
|
|
||||||
docker-build:
|
docker-build:
|
||||||
name: Docker Build
|
name: Docker Build
|
||||||
runs-on: self-hosted
|
runs-on: self-hosted
|
||||||
|
|
|
||||||
|
|
@ -13,7 +13,7 @@ const WorkQueue = struct {
|
||||||
depth: usize,
|
depth: usize,
|
||||||
};
|
};
|
||||||
|
|
||||||
fn init(allocator: std.mem.Allocator) WorkQueue {
|
fn init() WorkQueue {
|
||||||
return .{
|
return .{
|
||||||
.items = .empty,
|
.items = .empty,
|
||||||
.mutex = .{},
|
.mutex = .{},
|
||||||
|
|
|
||||||
|
|
@ -321,6 +321,14 @@ api_key = "<analyst-api-key>"
|
||||||
| `FETCHML_CONFIG` | - | Path to config file |
|
| `FETCHML_CONFIG` | - | Path to config file |
|
||||||
| `FETCHML_LOG_LEVEL` | "info" | Override log level |
|
| `FETCHML_LOG_LEVEL` | "info" | Override log level |
|
||||||
| `CLI_CONFIG` | - | Path to CLI config file |
|
| `CLI_CONFIG` | - | Path to CLI config file |
|
||||||
|
| `FETCH_ML_GPU_TYPE` | - | Override GPU vendor detection (nvidia, amd, apple, none). Takes precedence over config file. |
|
||||||
|
| `FETCH_ML_GPU_COUNT` | - | Override GPU count detection. Used with auto-detected or configured vendor. |
|
||||||
|
| `FETCH_ML_TOTAL_CPU` | - | Override total CPU count detection. Sets the number of CPU cores available. |
|
||||||
|
| `FETCH_ML_GPU_SLOTS_PER_GPU` | 1 | Override GPU slots per GPU. Controls how many concurrent tasks can share a single GPU. |
|
||||||
|
|
||||||
|
When environment variable overrides are active, they are logged to stderr at worker startup for debugging.
|
||||||
|
|
||||||
|
Note: When `gpu_vendor: amd` is configured, the system uses the NVIDIA detector implementation (aliased) due to similar device exposure patterns. The `configured_vendor` field will show "amd" while the actual detection uses NVIDIA-compatible methods.
|
||||||
|
|
||||||
## Troubleshooting
|
## Troubleshooting
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -11,7 +11,7 @@ import (
|
||||||
"github.com/jfraeys/fetch_ml/internal/manifest"
|
"github.com/jfraeys/fetch_ml/internal/manifest"
|
||||||
)
|
)
|
||||||
|
|
||||||
func scanArtifacts(runDir string) (*manifest.Artifacts, error) {
|
func scanArtifacts(runDir string, includeAll bool) (*manifest.Artifacts, error) {
|
||||||
runDir = strings.TrimSpace(runDir)
|
runDir = strings.TrimSpace(runDir)
|
||||||
if runDir == "" {
|
if runDir == "" {
|
||||||
return nil, fmt.Errorf("run dir is empty")
|
return nil, fmt.Errorf("run dir is empty")
|
||||||
|
|
@ -37,19 +37,7 @@ func scanArtifacts(runDir string) (*manifest.Artifacts, error) {
|
||||||
}
|
}
|
||||||
rel = filepath.ToSlash(rel)
|
rel = filepath.ToSlash(rel)
|
||||||
|
|
||||||
if rel == "code" || strings.HasPrefix(rel, "code/") {
|
// Standard exclusions (always apply)
|
||||||
if d.IsDir() {
|
|
||||||
return fs.SkipDir
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
if rel == "snapshot" || strings.HasPrefix(rel, "snapshot/") {
|
|
||||||
if d.IsDir() {
|
|
||||||
return fs.SkipDir
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
if rel == manifestFilename {
|
if rel == manifestFilename {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
@ -57,12 +45,26 @@ func scanArtifacts(runDir string) (*manifest.Artifacts, error) {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
if strings.HasSuffix(rel, ".log") {
|
// Optional exclusions (skipped when includeAll is true)
|
||||||
return nil
|
if !includeAll {
|
||||||
}
|
if rel == "code" || strings.HasPrefix(rel, "code/") {
|
||||||
|
if d.IsDir() {
|
||||||
if d.Type()&fs.ModeSymlink != 0 {
|
return fs.SkipDir
|
||||||
return nil
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if rel == "snapshot" || strings.HasPrefix(rel, "snapshot/") {
|
||||||
|
if d.IsDir() {
|
||||||
|
return fs.SkipDir
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if strings.HasSuffix(rel, ".log") {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if d.Type()&fs.ModeSymlink != 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if d.IsDir() {
|
if d.IsDir() {
|
||||||
|
|
@ -100,6 +102,7 @@ func scanArtifacts(runDir string) (*manifest.Artifacts, error) {
|
||||||
const manifestFilename = "run_manifest.json"
|
const manifestFilename = "run_manifest.json"
|
||||||
|
|
||||||
// ScanArtifacts is an exported wrapper for testing/benchmarking.
|
// ScanArtifacts is an exported wrapper for testing/benchmarking.
|
||||||
func ScanArtifacts(runDir string) (*manifest.Artifacts, error) {
|
// When includeAll is false, excludes code/, snapshot/, *.log files, and symlinks.
|
||||||
return scanArtifacts(runDir)
|
func ScanArtifacts(runDir string, includeAll bool) (*manifest.Artifacts, error) {
|
||||||
|
return scanArtifacts(runDir, includeAll)
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -77,13 +77,14 @@ type Config struct {
|
||||||
PrewarmEnabled bool `yaml:"prewarm_enabled"`
|
PrewarmEnabled bool `yaml:"prewarm_enabled"`
|
||||||
|
|
||||||
// Podman execution
|
// Podman execution
|
||||||
PodmanImage string `yaml:"podman_image"`
|
PodmanImage string `yaml:"podman_image"`
|
||||||
ContainerWorkspace string `yaml:"container_workspace"`
|
ContainerWorkspace string `yaml:"container_workspace"`
|
||||||
ContainerResults string `yaml:"container_results"`
|
ContainerResults string `yaml:"container_results"`
|
||||||
GPUDevices []string `yaml:"gpu_devices"`
|
GPUDevices []string `yaml:"gpu_devices"`
|
||||||
GPUVendor string `yaml:"gpu_vendor"`
|
GPUVendor string `yaml:"gpu_vendor"`
|
||||||
GPUVisibleDevices []int `yaml:"gpu_visible_devices"`
|
GPUVendorAutoDetected bool `yaml:"-"` // Set by LoadConfig when GPUVendor is auto-detected
|
||||||
GPUVisibleDeviceIDs []string `yaml:"gpu_visible_device_ids"`
|
GPUVisibleDevices []int `yaml:"gpu_visible_devices"`
|
||||||
|
GPUVisibleDeviceIDs []string `yaml:"gpu_visible_device_ids"`
|
||||||
|
|
||||||
// Apple M-series GPU configuration
|
// Apple M-series GPU configuration
|
||||||
AppleGPU AppleGPUConfig `yaml:"apple_gpu"`
|
AppleGPU AppleGPUConfig `yaml:"apple_gpu"`
|
||||||
|
|
@ -264,6 +265,7 @@ func LoadConfig(path string) (*Config, error) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if strings.TrimSpace(cfg.GPUVendor) == "" {
|
if strings.TrimSpace(cfg.GPUVendor) == "" {
|
||||||
|
cfg.GPUVendorAutoDetected = true
|
||||||
if cfg.AppleGPU.Enabled {
|
if cfg.AppleGPU.Enabled {
|
||||||
cfg.GPUVendor = string(GPUTypeApple)
|
cfg.GPUVendor = string(GPUTypeApple)
|
||||||
} else if len(cfg.GPUDevices) > 0 ||
|
} else if len(cfg.GPUDevices) > 0 ||
|
||||||
|
|
@ -453,9 +455,15 @@ func envInt(name string) (int, bool) {
|
||||||
return n, true
|
return n, true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// logEnvOverride logs environment variable overrides to stderr for debugging
|
||||||
|
func logEnvOverride(name string, value interface{}) {
|
||||||
|
fmt.Fprintf(os.Stderr, "[env] %s=%v (override active)\n", name, value)
|
||||||
|
}
|
||||||
|
|
||||||
// parseCPUFromConfig determines total CPU from environment or config
|
// parseCPUFromConfig determines total CPU from environment or config
|
||||||
func parseCPUFromConfig(cfg *Config) int {
|
func parseCPUFromConfig(cfg *Config) int {
|
||||||
if n, ok := envInt("FETCH_ML_TOTAL_CPU"); ok && n >= 0 {
|
if n, ok := envInt("FETCH_ML_TOTAL_CPU"); ok && n >= 0 {
|
||||||
|
logEnvOverride("FETCH_ML_TOTAL_CPU", n)
|
||||||
return n
|
return n
|
||||||
}
|
}
|
||||||
if cfg != nil {
|
if cfg != nil {
|
||||||
|
|
@ -471,11 +479,11 @@ func parseCPUFromConfig(cfg *Config) int {
|
||||||
return runtime.NumCPU()
|
return runtime.NumCPU()
|
||||||
}
|
}
|
||||||
|
|
||||||
// parseGPUCountFromConfig detects GPU count from config
|
// parseGPUCountFromConfig detects GPU count from config and returns detection metadata
|
||||||
func parseGPUCountFromConfig(cfg *Config) int {
|
func parseGPUCountFromConfig(cfg *Config) (int, GPUDetectionInfo) {
|
||||||
factory := &GPUDetectorFactory{}
|
factory := &GPUDetectorFactory{}
|
||||||
detector := factory.CreateDetector(cfg)
|
result := factory.CreateDetectorWithInfo(cfg)
|
||||||
return detector.DetectGPUCount()
|
return result.Detector.DetectGPUCount(), result.Info
|
||||||
}
|
}
|
||||||
|
|
||||||
// parseGPUSlotsPerGPUFromConfig reads GPU slots per GPU from environment
|
// parseGPUSlotsPerGPUFromConfig reads GPU slots per GPU from environment
|
||||||
|
|
|
||||||
|
|
@ -147,9 +147,10 @@ func NewWorker(cfg *Config, _ string) (*Worker, error) {
|
||||||
)
|
)
|
||||||
|
|
||||||
// Create resource manager
|
// Create resource manager
|
||||||
|
gpuCount, gpuDetectionInfo := parseGPUCountFromConfig(cfg)
|
||||||
rm, err := resources.NewManager(resources.Options{
|
rm, err := resources.NewManager(resources.Options{
|
||||||
TotalCPU: parseCPUFromConfig(cfg),
|
TotalCPU: parseCPUFromConfig(cfg),
|
||||||
GPUCount: parseGPUCountFromConfig(cfg),
|
GPUCount: gpuCount,
|
||||||
SlotsPerGPU: parseGPUSlotsPerGPUFromConfig(),
|
SlotsPerGPU: parseGPUSlotsPerGPUFromConfig(),
|
||||||
})
|
})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|
@ -158,28 +159,32 @@ func NewWorker(cfg *Config, _ string) (*Worker, error) {
|
||||||
}
|
}
|
||||||
|
|
||||||
worker := &Worker{
|
worker := &Worker{
|
||||||
id: cfg.WorkerID,
|
id: cfg.WorkerID,
|
||||||
config: cfg,
|
config: cfg,
|
||||||
logger: logger,
|
logger: logger,
|
||||||
runLoop: runLoop,
|
runLoop: runLoop,
|
||||||
runner: jobRunner,
|
runner: jobRunner,
|
||||||
metrics: metricsObj,
|
metrics: metricsObj,
|
||||||
health: lifecycle.NewHealthMonitor(),
|
health: lifecycle.NewHealthMonitor(),
|
||||||
resources: rm,
|
resources: rm,
|
||||||
jupyter: jupyterMgr,
|
jupyter: jupyterMgr,
|
||||||
|
gpuDetectionInfo: gpuDetectionInfo,
|
||||||
}
|
}
|
||||||
|
|
||||||
// Log GPU configuration
|
// Log GPU configuration
|
||||||
if !cfg.LocalMode {
|
if !cfg.LocalMode {
|
||||||
gpuType := strings.ToLower(strings.TrimSpace(os.Getenv("FETCH_ML_GPU_TYPE")))
|
gpuType := strings.ToLower(strings.TrimSpace(os.Getenv("FETCH_ML_GPU_TYPE")))
|
||||||
if cfg.AppleGPU.Enabled {
|
|
||||||
logger.Warn("apple MPS GPU mode is intended for development; do not use in production",
|
|
||||||
"gpu_type", "apple",
|
|
||||||
)
|
|
||||||
}
|
|
||||||
if gpuType == "amd" {
|
if gpuType == "amd" {
|
||||||
logger.Warn("amd GPU mode is intended for development; do not use in production",
|
cancel()
|
||||||
"gpu_type", "amd",
|
return nil, fmt.Errorf(
|
||||||
|
"AMD GPU mode is not supported in production (FETCH_ML_GPU_TYPE=amd). " +
|
||||||
|
"Use 'nvidia', 'apple', 'none', or GPUDevices config. " +
|
||||||
|
"AMD support is available in local mode for experimental development",
|
||||||
|
)
|
||||||
|
} else if cfg.AppleGPU.Enabled {
|
||||||
|
logger.Warn(
|
||||||
|
"apple MPS GPU mode is intended for development; do not use in production",
|
||||||
|
"gpu_type", "apple",
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -11,10 +11,33 @@ type GPUType string
|
||||||
|
|
||||||
const (
|
const (
|
||||||
GPUTypeNVIDIA GPUType = "nvidia"
|
GPUTypeNVIDIA GPUType = "nvidia"
|
||||||
|
GPUTypeAMD GPUType = "amd"
|
||||||
GPUTypeApple GPUType = "apple"
|
GPUTypeApple GPUType = "apple"
|
||||||
GPUTypeNone GPUType = "none"
|
GPUTypeNone GPUType = "none"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// DetectionSource indicates how the GPU detector was selected
|
||||||
|
type DetectionSource string
|
||||||
|
|
||||||
|
const (
|
||||||
|
DetectionSourceEnvType DetectionSource = "env_override_type"
|
||||||
|
DetectionSourceEnvCount DetectionSource = "env_override_count"
|
||||||
|
DetectionSourceEnvBoth DetectionSource = "env_override_both"
|
||||||
|
DetectionSourceConfig DetectionSource = "config"
|
||||||
|
DetectionSourceAuto DetectionSource = "auto"
|
||||||
|
DetectionSourceNone DetectionSource = "none"
|
||||||
|
)
|
||||||
|
|
||||||
|
// GPUDetectionInfo provides metadata about how GPU detection was determined
|
||||||
|
type GPUDetectionInfo struct {
|
||||||
|
GPUType GPUType `json:"gpu_type"`
|
||||||
|
ConfiguredVendor string `json:"configured_vendor"`
|
||||||
|
DetectionMethod DetectionSource `json:"detection_method"`
|
||||||
|
EnvOverrideType string `json:"env_override_type,omitempty"`
|
||||||
|
EnvOverrideCount int `json:"env_override_count,omitempty"`
|
||||||
|
ConfigLayerAutoDetected bool `json:"config_layer_auto_detected,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
// GPUDetector interface for detecting GPU availability
|
// GPUDetector interface for detecting GPU availability
|
||||||
type GPUDetector interface {
|
type GPUDetector interface {
|
||||||
DetectGPUCount() int
|
DetectGPUCount() int
|
||||||
|
|
@ -138,47 +161,240 @@ func (d *NoneDetector) GetDevicePaths() []string {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// GPUDetectorFactory creates appropriate GPU detector based on config
|
// GPUDetectorFactory creates appropriate GPU detector based on config
|
||||||
type GPUDetectorFactory struct{}
|
type GPUDetectorFactory struct{}
|
||||||
|
|
||||||
|
// DetectionResult contains both the detector and metadata about how it was selected
|
||||||
|
type DetectionResult struct {
|
||||||
|
Detector GPUDetector
|
||||||
|
Info GPUDetectionInfo
|
||||||
|
}
|
||||||
|
|
||||||
func (f *GPUDetectorFactory) CreateDetector(cfg *Config) GPUDetector {
|
func (f *GPUDetectorFactory) CreateDetector(cfg *Config) GPUDetector {
|
||||||
// Check for explicit environment override
|
result := f.CreateDetectorWithInfo(cfg)
|
||||||
if gpuType := os.Getenv("FETCH_ML_GPU_TYPE"); gpuType != "" {
|
return result.Detector
|
||||||
switch gpuType {
|
}
|
||||||
|
|
||||||
|
func (f *GPUDetectorFactory) CreateDetectorWithInfo(cfg *Config) DetectionResult {
|
||||||
|
// Check for explicit environment overrides
|
||||||
|
envType := os.Getenv("FETCH_ML_GPU_TYPE")
|
||||||
|
envCount, hasEnvCount := envInt("FETCH_ML_GPU_COUNT")
|
||||||
|
|
||||||
|
if envType != "" && hasEnvCount {
|
||||||
|
// Both env vars set
|
||||||
|
logEnvOverride("FETCH_ML_GPU_TYPE", envType)
|
||||||
|
logEnvOverride("FETCH_ML_GPU_COUNT", envCount)
|
||||||
|
switch envType {
|
||||||
case string(GPUTypeNVIDIA):
|
case string(GPUTypeNVIDIA):
|
||||||
return &NVIDIADetector{}
|
return DetectionResult{
|
||||||
|
Detector: &NVIDIADetector{},
|
||||||
|
Info: GPUDetectionInfo{
|
||||||
|
GPUType: GPUTypeNVIDIA,
|
||||||
|
ConfiguredVendor: "nvidia",
|
||||||
|
DetectionMethod: DetectionSourceEnvBoth,
|
||||||
|
EnvOverrideType: envType,
|
||||||
|
EnvOverrideCount: envCount,
|
||||||
|
},
|
||||||
|
}
|
||||||
case string(GPUTypeApple):
|
case string(GPUTypeApple):
|
||||||
return &AppleDetector{enabled: true}
|
return DetectionResult{
|
||||||
|
Detector: &AppleDetector{enabled: true},
|
||||||
|
Info: GPUDetectionInfo{
|
||||||
|
GPUType: GPUTypeApple,
|
||||||
|
ConfiguredVendor: "apple",
|
||||||
|
DetectionMethod: DetectionSourceEnvBoth,
|
||||||
|
EnvOverrideType: envType,
|
||||||
|
EnvOverrideCount: envCount,
|
||||||
|
},
|
||||||
|
}
|
||||||
case string(GPUTypeNone):
|
case string(GPUTypeNone):
|
||||||
return &NoneDetector{}
|
return DetectionResult{
|
||||||
}
|
Detector: &NoneDetector{},
|
||||||
}
|
Info: GPUDetectionInfo{
|
||||||
|
GPUType: GPUTypeNone,
|
||||||
// Respect configured vendor when explicitly set.
|
ConfiguredVendor: "none",
|
||||||
if cfg != nil {
|
DetectionMethod: DetectionSourceEnvBoth,
|
||||||
switch GPUType(cfg.GPUVendor) {
|
EnvOverrideType: envType,
|
||||||
case GPUTypeApple:
|
EnvOverrideCount: envCount,
|
||||||
return &AppleDetector{enabled: cfg.AppleGPU.Enabled}
|
},
|
||||||
case GPUTypeNone:
|
}
|
||||||
return &NoneDetector{}
|
|
||||||
case GPUTypeNVIDIA:
|
|
||||||
return &NVIDIADetector{}
|
|
||||||
case "amd":
|
case "amd":
|
||||||
// AMD uses similar device exposure patterns in this codebase.
|
// AMD env override uses NVIDIA detector (aliased)
|
||||||
return &NVIDIADetector{}
|
return DetectionResult{
|
||||||
|
Detector: &NVIDIADetector{},
|
||||||
|
Info: GPUDetectionInfo{
|
||||||
|
GPUType: GPUTypeAMD,
|
||||||
|
ConfiguredVendor: "amd",
|
||||||
|
DetectionMethod: DetectionSourceEnvBoth,
|
||||||
|
EnvOverrideType: envType,
|
||||||
|
EnvOverrideCount: envCount,
|
||||||
|
},
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Auto-detect based on config
|
if envType != "" {
|
||||||
if cfg != nil {
|
// Only FETCH_ML_GPU_TYPE set
|
||||||
if cfg.AppleGPU.Enabled {
|
logEnvOverride("FETCH_ML_GPU_TYPE", envType)
|
||||||
return &AppleDetector{enabled: true}
|
switch envType {
|
||||||
|
case string(GPUTypeNVIDIA):
|
||||||
|
return DetectionResult{
|
||||||
|
Detector: &NVIDIADetector{},
|
||||||
|
Info: GPUDetectionInfo{
|
||||||
|
GPUType: GPUTypeNVIDIA,
|
||||||
|
ConfiguredVendor: "nvidia",
|
||||||
|
DetectionMethod: DetectionSourceEnvType,
|
||||||
|
EnvOverrideType: envType,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
case string(GPUTypeApple):
|
||||||
|
return DetectionResult{
|
||||||
|
Detector: &AppleDetector{enabled: true},
|
||||||
|
Info: GPUDetectionInfo{
|
||||||
|
GPUType: GPUTypeApple,
|
||||||
|
ConfiguredVendor: "apple",
|
||||||
|
DetectionMethod: DetectionSourceEnvType,
|
||||||
|
EnvOverrideType: envType,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
case string(GPUTypeNone):
|
||||||
|
return DetectionResult{
|
||||||
|
Detector: &NoneDetector{},
|
||||||
|
Info: GPUDetectionInfo{
|
||||||
|
GPUType: GPUTypeNone,
|
||||||
|
ConfiguredVendor: "none",
|
||||||
|
DetectionMethod: DetectionSourceEnvType,
|
||||||
|
EnvOverrideType: envType,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
case "amd":
|
||||||
|
// AMD env override uses NVIDIA detector (aliased)
|
||||||
|
return DetectionResult{
|
||||||
|
Detector: &NVIDIADetector{},
|
||||||
|
Info: GPUDetectionInfo{
|
||||||
|
GPUType: GPUTypeAMD,
|
||||||
|
ConfiguredVendor: "amd",
|
||||||
|
DetectionMethod: DetectionSourceEnvType,
|
||||||
|
EnvOverrideType: envType,
|
||||||
|
},
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if len(cfg.GPUDevices) > 0 {
|
}
|
||||||
return &NVIDIADetector{}
|
|
||||||
|
if hasEnvCount {
|
||||||
|
// Only FETCH_ML_GPU_COUNT set - need to detect vendor from config or auto
|
||||||
|
logEnvOverride("FETCH_ML_GPU_COUNT", envCount)
|
||||||
|
return f.detectFromConfigWithSource(cfg, DetectionSourceEnvCount, "", envCount)
|
||||||
|
}
|
||||||
|
|
||||||
|
// No env overrides - detect from config
|
||||||
|
return f.detectFromConfigWithSource(cfg, DetectionSourceConfig, "", -1)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f *GPUDetectorFactory) detectFromConfigWithSource(cfg *Config, source DetectionSource, envType string, envCount int) DetectionResult {
|
||||||
|
if cfg == nil {
|
||||||
|
return DetectionResult{
|
||||||
|
Detector: &NoneDetector{},
|
||||||
|
Info: GPUDetectionInfo{
|
||||||
|
GPUType: GPUTypeNone,
|
||||||
|
ConfiguredVendor: "none",
|
||||||
|
DetectionMethod: source,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
switch GPUType(cfg.GPUVendor) {
|
||||||
|
case GPUTypeApple:
|
||||||
|
return DetectionResult{
|
||||||
|
Detector: &AppleDetector{enabled: cfg.AppleGPU.Enabled},
|
||||||
|
Info: GPUDetectionInfo{
|
||||||
|
GPUType: GPUTypeApple,
|
||||||
|
ConfiguredVendor: "apple",
|
||||||
|
DetectionMethod: source,
|
||||||
|
EnvOverrideType: envType,
|
||||||
|
EnvOverrideCount: envCount,
|
||||||
|
ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
case GPUTypeNone:
|
||||||
|
return DetectionResult{
|
||||||
|
Detector: &NoneDetector{},
|
||||||
|
Info: GPUDetectionInfo{
|
||||||
|
GPUType: GPUTypeNone,
|
||||||
|
ConfiguredVendor: "none",
|
||||||
|
DetectionMethod: source,
|
||||||
|
EnvOverrideType: envType,
|
||||||
|
EnvOverrideCount: envCount,
|
||||||
|
ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
case GPUTypeNVIDIA:
|
||||||
|
return DetectionResult{
|
||||||
|
Detector: &NVIDIADetector{},
|
||||||
|
Info: GPUDetectionInfo{
|
||||||
|
GPUType: GPUTypeNVIDIA,
|
||||||
|
ConfiguredVendor: "nvidia",
|
||||||
|
DetectionMethod: source,
|
||||||
|
EnvOverrideType: envType,
|
||||||
|
EnvOverrideCount: envCount,
|
||||||
|
ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
case "amd":
|
||||||
|
// AMD uses similar device exposure patterns in this codebase.
|
||||||
|
// This is the key aliasing point - we report AMD as configured vendor
|
||||||
|
// but use NVIDIADetector for implementation.
|
||||||
|
return DetectionResult{
|
||||||
|
Detector: &NVIDIADetector{},
|
||||||
|
Info: GPUDetectionInfo{
|
||||||
|
GPUType: GPUTypeNVIDIA,
|
||||||
|
ConfiguredVendor: "amd", // User configured "amd"
|
||||||
|
DetectionMethod: source,
|
||||||
|
EnvOverrideType: envType,
|
||||||
|
EnvOverrideCount: envCount,
|
||||||
|
ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Auto-detect based on config settings
|
||||||
|
if cfg.AppleGPU.Enabled {
|
||||||
|
return DetectionResult{
|
||||||
|
Detector: &AppleDetector{enabled: true},
|
||||||
|
Info: GPUDetectionInfo{
|
||||||
|
GPUType: GPUTypeApple,
|
||||||
|
ConfiguredVendor: "apple",
|
||||||
|
DetectionMethod: DetectionSourceAuto,
|
||||||
|
EnvOverrideType: envType,
|
||||||
|
EnvOverrideCount: envCount,
|
||||||
|
ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(cfg.GPUDevices) > 0 {
|
||||||
|
return DetectionResult{
|
||||||
|
Detector: &NVIDIADetector{},
|
||||||
|
Info: GPUDetectionInfo{
|
||||||
|
GPUType: GPUTypeNVIDIA,
|
||||||
|
ConfiguredVendor: "nvidia",
|
||||||
|
DetectionMethod: DetectionSourceAuto,
|
||||||
|
EnvOverrideType: envType,
|
||||||
|
EnvOverrideCount: envCount,
|
||||||
|
ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Default to no GPU
|
// Default to no GPU
|
||||||
return &NoneDetector{}
|
return DetectionResult{
|
||||||
|
Detector: &NoneDetector{},
|
||||||
|
Info: GPUDetectionInfo{
|
||||||
|
GPUType: GPUTypeNone,
|
||||||
|
ConfiguredVendor: "none",
|
||||||
|
DetectionMethod: source,
|
||||||
|
EnvOverrideType: envType,
|
||||||
|
EnvOverrideCount: envCount,
|
||||||
|
ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected,
|
||||||
|
},
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -8,6 +8,7 @@ import (
|
||||||
"context"
|
"context"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"regexp"
|
"regexp"
|
||||||
"runtime"
|
"runtime"
|
||||||
|
|
@ -176,6 +177,9 @@ func GetPowermetricsData() (*PowermetricsData, error) {
|
||||||
out, err := cmd.Output()
|
out, err := cmd.Output()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
// powermetrics not available or no permission
|
// powermetrics not available or no permission
|
||||||
|
if ctx.Err() != context.DeadlineExceeded {
|
||||||
|
fmt.Fprintln(os.Stderr, "Warning: powermetrics requires sudo for GPU metrics")
|
||||||
|
}
|
||||||
return &PowermetricsData{HasData: false}, nil
|
return &PowermetricsData{HasData: false}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -26,6 +26,8 @@ var (
|
||||||
ctxInitTime time.Time
|
ctxInitTime time.Time
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// getHashContext returns the native hash context, initializing it on first call.
|
||||||
|
// First call initializes C++ context (5-20ms) - subsequent calls reuse context.
|
||||||
func getHashContext() *C.fh_context_t {
|
func getHashContext() *C.fh_context_t {
|
||||||
hashCtxOnce.Do(func() {
|
hashCtxOnce.Do(func() {
|
||||||
start := time.Now()
|
start := time.Now()
|
||||||
|
|
@ -65,7 +67,7 @@ func HasSIMDSHA256() bool {
|
||||||
}
|
}
|
||||||
|
|
||||||
func ScanArtifactsNative(runDir string) (*manifest.Artifacts, error) {
|
func ScanArtifactsNative(runDir string) (*manifest.Artifacts, error) {
|
||||||
return ScanArtifacts(runDir)
|
return ScanArtifacts(runDir, false)
|
||||||
}
|
}
|
||||||
|
|
||||||
func ExtractTarGzNative(archivePath, dstDir string) error {
|
func ExtractTarGzNative(archivePath, dstDir string) error {
|
||||||
|
|
|
||||||
|
|
@ -55,6 +55,9 @@ type Worker struct {
|
||||||
health *lifecycle.HealthMonitor
|
health *lifecycle.HealthMonitor
|
||||||
resources *resources.Manager
|
resources *resources.Manager
|
||||||
|
|
||||||
|
// GPU detection metadata for status output
|
||||||
|
gpuDetectionInfo GPUDetectionInfo
|
||||||
|
|
||||||
// Legacy fields for backward compatibility during migration
|
// Legacy fields for backward compatibility during migration
|
||||||
jupyter JupyterManager
|
jupyter JupyterManager
|
||||||
queueClient queue.Backend // Stored for prewarming access
|
queueClient queue.Backend // Stored for prewarming access
|
||||||
|
|
|
||||||
|
|
@ -131,7 +131,7 @@ func BenchmarkScanArtifacts(b *testing.B) {
|
||||||
b.ReportAllocs()
|
b.ReportAllocs()
|
||||||
|
|
||||||
for i := 0; i < b.N; i++ {
|
for i := 0; i < b.N; i++ {
|
||||||
_, err := worker.ScanArtifacts(runDir)
|
_, err := worker.ScanArtifacts(runDir, false)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
b.Fatal(err)
|
b.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -15,11 +15,10 @@ func BenchmarkArtifactScanGo(b *testing.B) {
|
||||||
// Create test artifact structure
|
// Create test artifact structure
|
||||||
createTestArtifacts(b, tmpDir, 100)
|
createTestArtifacts(b, tmpDir, 100)
|
||||||
|
|
||||||
b.ResetTimer()
|
|
||||||
b.ReportAllocs()
|
b.ReportAllocs()
|
||||||
|
|
||||||
for i := 0; i < b.N; i++ {
|
for b.Loop() {
|
||||||
_, err := worker.ScanArtifacts(tmpDir)
|
_, err := worker.ScanArtifacts(tmpDir, false)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
b.Fatal(err)
|
b.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|
@ -34,10 +33,9 @@ func BenchmarkArtifactScanNative(b *testing.B) {
|
||||||
// Create test artifact structure
|
// Create test artifact structure
|
||||||
createTestArtifacts(b, tmpDir, 100)
|
createTestArtifacts(b, tmpDir, 100)
|
||||||
|
|
||||||
b.ResetTimer()
|
|
||||||
b.ReportAllocs()
|
b.ReportAllocs()
|
||||||
|
|
||||||
for i := 0; i < b.N; i++ {
|
for b.Loop() {
|
||||||
_, err := worker.ScanArtifactsNative(tmpDir)
|
_, err := worker.ScanArtifactsNative(tmpDir)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
b.Fatal(err)
|
b.Fatal(err)
|
||||||
|
|
@ -54,8 +52,8 @@ func BenchmarkArtifactScanLarge(b *testing.B) {
|
||||||
|
|
||||||
b.Run("Go", func(b *testing.B) {
|
b.Run("Go", func(b *testing.B) {
|
||||||
b.ReportAllocs()
|
b.ReportAllocs()
|
||||||
for i := 0; i < b.N; i++ {
|
for b.Loop() {
|
||||||
_, err := worker.ScanArtifacts(tmpDir)
|
_, err := worker.ScanArtifacts(tmpDir, false)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
b.Fatal(err)
|
b.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|
@ -64,7 +62,7 @@ func BenchmarkArtifactScanLarge(b *testing.B) {
|
||||||
|
|
||||||
b.Run("Native", func(b *testing.B) {
|
b.Run("Native", func(b *testing.B) {
|
||||||
b.ReportAllocs()
|
b.ReportAllocs()
|
||||||
for i := 0; i < b.N; i++ {
|
for b.Loop() {
|
||||||
_, err := worker.ScanArtifactsNative(tmpDir)
|
_, err := worker.ScanArtifactsNative(tmpDir)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
b.Fatal(err)
|
b.Fatal(err)
|
||||||
|
|
@ -93,7 +91,7 @@ func createTestArtifacts(b testing.TB, root string, count int) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Create test files
|
// Create test files
|
||||||
for i := 0; i < count; i++ {
|
for i := range count {
|
||||||
var path string
|
var path string
|
||||||
switch i % 5 {
|
switch i % 5 {
|
||||||
case 0:
|
case 0:
|
||||||
|
|
|
||||||
210
tests/unit/gpu/gpu_detector_test.go
Normal file
210
tests/unit/gpu/gpu_detector_test.go
Normal file
|
|
@ -0,0 +1,210 @@
|
||||||
|
package worker_test
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/jfraeys/fetch_ml/internal/worker"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestGPUDetectorEnvOverrides validates both FETCH_ML_GPU_TYPE and FETCH_ML_GPU_COUNT work
|
||||||
|
func TestGPUDetectorEnvOverrides(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
gpuType string
|
||||||
|
gpuCount string
|
||||||
|
wantType worker.GPUType
|
||||||
|
wantCount int
|
||||||
|
wantMethod worker.DetectionSource
|
||||||
|
wantConfigured string
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "env type only - nvidia",
|
||||||
|
gpuType: "nvidia",
|
||||||
|
wantType: worker.GPUTypeNVIDIA,
|
||||||
|
wantMethod: worker.DetectionSourceEnvType,
|
||||||
|
wantConfigured: "nvidia",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "env type only - apple",
|
||||||
|
gpuType: "apple",
|
||||||
|
wantType: worker.GPUTypeApple,
|
||||||
|
wantMethod: worker.DetectionSourceEnvType,
|
||||||
|
wantConfigured: "apple",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "env type only - none",
|
||||||
|
gpuType: "none",
|
||||||
|
wantType: worker.GPUTypeNone,
|
||||||
|
wantMethod: worker.DetectionSourceEnvType,
|
||||||
|
wantConfigured: "none",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "both env vars set",
|
||||||
|
gpuType: "nvidia",
|
||||||
|
gpuCount: "4",
|
||||||
|
wantType: worker.GPUTypeNVIDIA,
|
||||||
|
wantMethod: worker.DetectionSourceEnvBoth,
|
||||||
|
wantConfigured: "nvidia",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "env type amd - shows amd configured vendor",
|
||||||
|
gpuType: "amd",
|
||||||
|
wantType: worker.GPUTypeAMD,
|
||||||
|
wantMethod: worker.DetectionSourceEnvType,
|
||||||
|
wantConfigured: "amd",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
// Set env vars
|
||||||
|
if tt.gpuType != "" {
|
||||||
|
os.Setenv("FETCH_ML_GPU_TYPE", tt.gpuType)
|
||||||
|
defer os.Unsetenv("FETCH_ML_GPU_TYPE")
|
||||||
|
}
|
||||||
|
if tt.gpuCount != "" {
|
||||||
|
os.Setenv("FETCH_ML_GPU_COUNT", tt.gpuCount)
|
||||||
|
defer os.Unsetenv("FETCH_ML_GPU_COUNT")
|
||||||
|
}
|
||||||
|
|
||||||
|
factory := &worker.GPUDetectorFactory{}
|
||||||
|
result := factory.CreateDetectorWithInfo(nil)
|
||||||
|
|
||||||
|
if result.Info.GPUType != tt.wantType {
|
||||||
|
t.Errorf("GPUType = %v, want %v", result.Info.GPUType, tt.wantType)
|
||||||
|
}
|
||||||
|
if result.Info.DetectionMethod != tt.wantMethod {
|
||||||
|
t.Errorf("DetectionMethod = %v, want %v", result.Info.DetectionMethod, tt.wantMethod)
|
||||||
|
}
|
||||||
|
if result.Info.ConfiguredVendor != tt.wantConfigured {
|
||||||
|
t.Errorf("ConfiguredVendor = %v, want %v", result.Info.ConfiguredVendor, tt.wantConfigured)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestGPUDetectorAMDVendorAlias validates AMD config shows proper aliasing
|
||||||
|
func TestGPUDetectorAMDVendorAlias(t *testing.T) {
|
||||||
|
cfg := &worker.Config{
|
||||||
|
GPUVendor: "amd",
|
||||||
|
}
|
||||||
|
|
||||||
|
factory := &worker.GPUDetectorFactory{}
|
||||||
|
result := factory.CreateDetectorWithInfo(cfg)
|
||||||
|
|
||||||
|
// AMD uses NVIDIA detector implementation
|
||||||
|
if result.Info.ConfiguredVendor != "amd" {
|
||||||
|
t.Errorf("ConfiguredVendor = %v, want 'amd'", result.Info.ConfiguredVendor)
|
||||||
|
}
|
||||||
|
if result.Info.GPUType != worker.GPUTypeNVIDIA {
|
||||||
|
t.Errorf("GPUType = %v, want %v (NVIDIA implementation for AMD alias)", result.Info.GPUType, worker.GPUTypeNVIDIA)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestGPUDetectorEnvCountOverride validates FETCH_ML_GPU_COUNT with auto-detect
|
||||||
|
func TestGPUDetectorEnvCountOverride(t *testing.T) {
|
||||||
|
os.Setenv("FETCH_ML_GPU_COUNT", "8")
|
||||||
|
defer os.Unsetenv("FETCH_ML_GPU_COUNT")
|
||||||
|
|
||||||
|
cfg := &worker.Config{
|
||||||
|
GPUVendor: "nvidia",
|
||||||
|
}
|
||||||
|
|
||||||
|
factory := &worker.GPUDetectorFactory{}
|
||||||
|
result := factory.CreateDetectorWithInfo(cfg)
|
||||||
|
|
||||||
|
if result.Info.DetectionMethod != worker.DetectionSourceEnvCount {
|
||||||
|
t.Errorf("DetectionMethod = %v, want %v", result.Info.DetectionMethod, worker.DetectionSourceEnvCount)
|
||||||
|
}
|
||||||
|
if result.Info.EnvOverrideCount != 8 {
|
||||||
|
t.Errorf("EnvOverrideCount = %v, want 8", result.Info.EnvOverrideCount)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestGPUDetectorDetectionSources validates all detection source types
|
||||||
|
func TestGPUDetectorDetectionSources(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
envType string
|
||||||
|
envCount string
|
||||||
|
config *worker.Config
|
||||||
|
wantSource worker.DetectionSource
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "env type takes precedence over config",
|
||||||
|
envType: "apple",
|
||||||
|
config: &worker.Config{GPUVendor: "nvidia"},
|
||||||
|
wantSource: worker.DetectionSourceEnvType,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "env count triggers env_count source",
|
||||||
|
envCount: "2",
|
||||||
|
config: &worker.Config{GPUVendor: "nvidia"},
|
||||||
|
wantSource: worker.DetectionSourceEnvCount,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "config source when no env",
|
||||||
|
config: &worker.Config{GPUVendor: "nvidia"},
|
||||||
|
wantSource: worker.DetectionSourceConfig,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "auto source for GPUDevices",
|
||||||
|
config: &worker.Config{GPUDevices: []string{"/dev/nvidia0"}},
|
||||||
|
wantSource: worker.DetectionSourceAuto,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "auto source for AppleGPU",
|
||||||
|
config: &worker.Config{AppleGPU: worker.AppleGPUConfig{Enabled: true}},
|
||||||
|
wantSource: worker.DetectionSourceAuto,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
if tt.envType != "" {
|
||||||
|
os.Setenv("FETCH_ML_GPU_TYPE", tt.envType)
|
||||||
|
defer os.Unsetenv("FETCH_ML_GPU_TYPE")
|
||||||
|
}
|
||||||
|
if tt.envCount != "" {
|
||||||
|
os.Setenv("FETCH_ML_GPU_COUNT", tt.envCount)
|
||||||
|
defer os.Unsetenv("FETCH_ML_GPU_COUNT")
|
||||||
|
}
|
||||||
|
|
||||||
|
factory := &worker.GPUDetectorFactory{}
|
||||||
|
result := factory.CreateDetectorWithInfo(tt.config)
|
||||||
|
|
||||||
|
if result.Info.DetectionMethod != tt.wantSource {
|
||||||
|
t.Errorf("DetectionMethod = %v, want %v", result.Info.DetectionMethod, tt.wantSource)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestGPUDetectorInfoFields validates all GPUDetectionInfo fields are populated
|
||||||
|
func TestGPUDetectorInfoFields(t *testing.T) {
|
||||||
|
os.Setenv("FETCH_ML_GPU_TYPE", "nvidia")
|
||||||
|
os.Setenv("FETCH_ML_GPU_COUNT", "4")
|
||||||
|
defer os.Unsetenv("FETCH_ML_GPU_TYPE")
|
||||||
|
defer os.Unsetenv("FETCH_ML_GPU_COUNT")
|
||||||
|
|
||||||
|
factory := &worker.GPUDetectorFactory{}
|
||||||
|
result := factory.CreateDetectorWithInfo(nil)
|
||||||
|
|
||||||
|
// Validate all expected fields
|
||||||
|
if result.Info.GPUType == "" {
|
||||||
|
t.Error("GPUType field is empty")
|
||||||
|
}
|
||||||
|
if result.Info.ConfiguredVendor == "" {
|
||||||
|
t.Error("ConfiguredVendor field is empty")
|
||||||
|
}
|
||||||
|
if result.Info.DetectionMethod == "" {
|
||||||
|
t.Error("DetectionMethod field is empty")
|
||||||
|
}
|
||||||
|
if result.Info.EnvOverrideType != "nvidia" {
|
||||||
|
t.Errorf("EnvOverrideType = %v, want 'nvidia'", result.Info.EnvOverrideType)
|
||||||
|
}
|
||||||
|
if result.Info.EnvOverrideCount != 4 {
|
||||||
|
t.Errorf("EnvOverrideCount = %v, want 4", result.Info.EnvOverrideCount)
|
||||||
|
}
|
||||||
|
}
|
||||||
311
tests/unit/gpu/gpu_golden_test.go
Normal file
311
tests/unit/gpu/gpu_golden_test.go
Normal file
|
|
@ -0,0 +1,311 @@
|
||||||
|
package worker_test
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"os"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/jfraeys/fetch_ml/internal/worker"
|
||||||
|
)
|
||||||
|
|
||||||
|
// GoldenGPUStatus represents the expected GPU status output for golden file
// testing. The tests in this file fill it from a detector result and the
// active build tags; the JSON tags define its serialized field names.
//
// EnvOverrideType, EnvOverrideCount and Extra are omitted from the JSON output
// when they hold their zero value.
type GoldenGPUStatus struct {
	GPUCount         int             `json:"gpu_count"`
	GPUType          string          `json:"gpu_type"`
	ConfiguredVendor string          `json:"configured_vendor"`
	DetectionMethod  string          `json:"detection_method"`
	EnvOverrideType  string          `json:"env_override_type,omitempty"`
	EnvOverrideCount int             `json:"env_override_count,omitempty"`
	BuildTags        map[string]bool `json:"build_tags"`
	NativeAvailable  bool            `json:"native_available"`
	Extra            map[string]any  `json:"extra,omitempty"`
}
|
||||||
|
|
||||||
|
// detectBuildTags returns which build tags are active
|
||||||
|
func detectBuildTags() map[string]bool {
|
||||||
|
tags := map[string]bool{
|
||||||
|
"cgo": false,
|
||||||
|
"native_libs": false,
|
||||||
|
"darwin": false,
|
||||||
|
"linux": false,
|
||||||
|
}
|
||||||
|
|
||||||
|
// CGO is determined at compile time - we can detect by trying to use native
|
||||||
|
// If native functions return "disabled", we know native_libs is not set
|
||||||
|
simdName := worker.GetSIMDImplName()
|
||||||
|
tags["native_libs"] = simdName != "disabled" && simdName != "disabled (no CGO)"
|
||||||
|
tags["cgo"] = simdName != "disabled (no CGO)"
|
||||||
|
|
||||||
|
// OS detection
|
||||||
|
if worker.IsMacOS() {
|
||||||
|
tags["darwin"] = true
|
||||||
|
} else {
|
||||||
|
tags["linux"] = true
|
||||||
|
}
|
||||||
|
|
||||||
|
return tags
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestGoldenGPUStatusNVML validates GPU status against golden file for NVML path
|
||||||
|
// This test runs under all build configurations but expectations differ:
|
||||||
|
// - cgo+native_libs: Real GPU count and NVML detection
|
||||||
|
// - cgo without native_libs: Returns 0, nil (stub behavior)
|
||||||
|
// - !cgo: Returns 0, nil (stub behavior)
|
||||||
|
func TestGoldenGPUStatusNVML(t *testing.T) {
|
||||||
|
// Setup: Configure for NVIDIA detection
|
||||||
|
cfg := &worker.Config{
|
||||||
|
GPUVendor: "nvidia",
|
||||||
|
}
|
||||||
|
|
||||||
|
factory := &worker.GPUDetectorFactory{}
|
||||||
|
result := factory.CreateDetectorWithInfo(cfg)
|
||||||
|
|
||||||
|
// Get actual detected count (behavior varies by build tags)
|
||||||
|
count := result.Detector.DetectGPUCount()
|
||||||
|
|
||||||
|
buildTags := detectBuildTags()
|
||||||
|
|
||||||
|
// Build the golden status object
|
||||||
|
got := GoldenGPUStatus{
|
||||||
|
GPUCount: count,
|
||||||
|
GPUType: string(result.Info.GPUType),
|
||||||
|
ConfiguredVendor: result.Info.ConfiguredVendor,
|
||||||
|
DetectionMethod: string(result.Info.DetectionMethod),
|
||||||
|
BuildTags: buildTags,
|
||||||
|
NativeAvailable: buildTags["native_libs"] && buildTags["cgo"],
|
||||||
|
}
|
||||||
|
|
||||||
|
// Validate against build-specific expectations
|
||||||
|
if buildTags["native_libs"] && buildTags["cgo"] {
|
||||||
|
// Real NVML build: Should detect actual GPUs or get real NVML error
|
||||||
|
// GPU count may be 0 if no NVIDIA hardware, but detection method should be config
|
||||||
|
if got.DetectionMethod != "config" {
|
||||||
|
t.Errorf("cgo+native_libs: DetectionMethod = %v, want 'config'", got.DetectionMethod)
|
||||||
|
}
|
||||||
|
} else if buildTags["cgo"] {
|
||||||
|
// CGO without native_libs: Stub returns 0
|
||||||
|
if got.GPUCount != 0 {
|
||||||
|
t.Logf("cgo-only build: GPUCount = %d (expected 0 from stub)", got.GPUCount)
|
||||||
|
}
|
||||||
|
if got.NativeAvailable {
|
||||||
|
t.Error("cgo-only build: NativeAvailable should be false")
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// No CGO: Stub returns 0
|
||||||
|
if got.GPUCount != 0 {
|
||||||
|
t.Logf("nocgo build: GPUCount = %d (expected 0 from stub)", got.GPUCount)
|
||||||
|
}
|
||||||
|
if got.NativeAvailable {
|
||||||
|
t.Error("nocgo build: NativeAvailable should be false")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Common validations
|
||||||
|
if got.ConfiguredVendor != "nvidia" {
|
||||||
|
t.Errorf("ConfiguredVendor = %v, want 'nvidia'", got.ConfiguredVendor)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestGoldenGPUStatusAMDVendorAlias validates AMD aliasing is visible in output
|
||||||
|
// Build tags: all three configurations
|
||||||
|
// Runtime scenarios: amd config
|
||||||
|
func TestGoldenGPUStatusAMDVendorAlias(t *testing.T) {
|
||||||
|
cfg := &worker.Config{
|
||||||
|
GPUVendor: "amd",
|
||||||
|
}
|
||||||
|
|
||||||
|
factory := &worker.GPUDetectorFactory{}
|
||||||
|
result := factory.CreateDetectorWithInfo(cfg)
|
||||||
|
|
||||||
|
buildTags := detectBuildTags()
|
||||||
|
|
||||||
|
got := GoldenGPUStatus{
|
||||||
|
GPUCount: result.Detector.DetectGPUCount(),
|
||||||
|
GPUType: string(result.Info.GPUType),
|
||||||
|
ConfiguredVendor: result.Info.ConfiguredVendor,
|
||||||
|
DetectionMethod: string(result.Info.DetectionMethod),
|
||||||
|
BuildTags: buildTags,
|
||||||
|
NativeAvailable: buildTags["native_libs"] && buildTags["cgo"],
|
||||||
|
}
|
||||||
|
|
||||||
|
// The key assertion: configured_vendor should be "amd" but GPUType should be "nvidia"
|
||||||
|
// This makes the aliasing visible in status output
|
||||||
|
if got.ConfiguredVendor != "amd" {
|
||||||
|
t.Errorf("AMD config: ConfiguredVendor = %v, want 'amd'", got.ConfiguredVendor)
|
||||||
|
}
|
||||||
|
if got.GPUType != "nvidia" {
|
||||||
|
t.Errorf("AMD config: GPUType = %v, want 'nvidia' (AMD aliased to NVIDIA implementation)", got.GPUType)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestGoldenGPUStatusEnvOverride validates env override behavior across build configs
|
||||||
|
// Build tags: all three
|
||||||
|
// Runtime scenarios: env override set
|
||||||
|
func TestGoldenGPUStatusEnvOverride(t *testing.T) {
|
||||||
|
// Set env override
|
||||||
|
os.Setenv("FETCH_ML_GPU_TYPE", "nvidia")
|
||||||
|
os.Setenv("FETCH_ML_GPU_COUNT", "4")
|
||||||
|
defer os.Unsetenv("FETCH_ML_GPU_TYPE")
|
||||||
|
defer os.Unsetenv("FETCH_ML_GPU_COUNT")
|
||||||
|
|
||||||
|
factory := &worker.GPUDetectorFactory{}
|
||||||
|
result := factory.CreateDetectorWithInfo(&worker.Config{GPUVendor: "apple"})
|
||||||
|
|
||||||
|
buildTags := detectBuildTags()
|
||||||
|
|
||||||
|
got := GoldenGPUStatus{
|
||||||
|
GPUCount: result.Detector.DetectGPUCount(),
|
||||||
|
GPUType: string(result.Info.GPUType),
|
||||||
|
ConfiguredVendor: result.Info.ConfiguredVendor,
|
||||||
|
DetectionMethod: string(result.Info.DetectionMethod),
|
||||||
|
EnvOverrideType: result.Info.EnvOverrideType,
|
||||||
|
EnvOverrideCount: result.Info.EnvOverrideCount,
|
||||||
|
BuildTags: buildTags,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Env should take precedence over config
|
||||||
|
if got.DetectionMethod != "env_override_both" {
|
||||||
|
t.Errorf("Env override: DetectionMethod = %v, want 'env_override_both'", got.DetectionMethod)
|
||||||
|
}
|
||||||
|
if got.GPUType != "nvidia" {
|
||||||
|
t.Errorf("Env override: GPUType = %v, want 'nvidia'", got.GPUType)
|
||||||
|
}
|
||||||
|
if got.EnvOverrideType != "nvidia" {
|
||||||
|
t.Errorf("Env override: EnvOverrideType = %v, want 'nvidia'", got.EnvOverrideType)
|
||||||
|
}
|
||||||
|
if got.EnvOverrideCount != 4 {
|
||||||
|
t.Errorf("Env override: EnvOverrideCount = %v, want 4", got.EnvOverrideCount)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestGoldenGPUStatusMacOS validates macOS detection when running on Darwin
|
||||||
|
// Build tags: cgo+native_libs on Darwin
|
||||||
|
// Runtime scenarios: darwin
|
||||||
|
func TestGoldenGPUStatusMacOS(t *testing.T) {
|
||||||
|
if !worker.IsMacOS() {
|
||||||
|
t.Skip("Skipping macOS-specific test on non-Darwin platform")
|
||||||
|
}
|
||||||
|
|
||||||
|
cfg := &worker.Config{
|
||||||
|
GPUVendor: "apple",
|
||||||
|
AppleGPU: worker.AppleGPUConfig{Enabled: true},
|
||||||
|
}
|
||||||
|
|
||||||
|
factory := &worker.GPUDetectorFactory{}
|
||||||
|
result := factory.CreateDetectorWithInfo(cfg)
|
||||||
|
|
||||||
|
buildTags := detectBuildTags()
|
||||||
|
|
||||||
|
got := GoldenGPUStatus{
|
||||||
|
GPUCount: result.Detector.DetectGPUCount(),
|
||||||
|
GPUType: string(result.Info.GPUType),
|
||||||
|
ConfiguredVendor: result.Info.ConfiguredVendor,
|
||||||
|
DetectionMethod: string(result.Info.DetectionMethod),
|
||||||
|
BuildTags: buildTags,
|
||||||
|
NativeAvailable: buildTags["darwin"],
|
||||||
|
}
|
||||||
|
|
||||||
|
if got.ConfiguredVendor != "apple" {
|
||||||
|
t.Errorf("macOS: ConfiguredVendor = %v, want 'apple'", got.ConfiguredVendor)
|
||||||
|
}
|
||||||
|
if got.GPUType != "apple" {
|
||||||
|
t.Errorf("macOS: GPUType = %v, want 'apple'", got.GPUType)
|
||||||
|
}
|
||||||
|
if !got.BuildTags["darwin"] {
|
||||||
|
t.Error("macOS: darwin build tag should be true")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestGoldenGPUStatusNone validates no-GPU configuration
|
||||||
|
// Build tags: all three
|
||||||
|
// Runtime scenarios: none
|
||||||
|
func TestGoldenGPUStatusNone(t *testing.T) {
|
||||||
|
cfg := &worker.Config{
|
||||||
|
GPUVendor: "none",
|
||||||
|
}
|
||||||
|
|
||||||
|
factory := &worker.GPUDetectorFactory{}
|
||||||
|
result := factory.CreateDetectorWithInfo(cfg)
|
||||||
|
|
||||||
|
if result.Detector.DetectGPUCount() != 0 {
|
||||||
|
t.Errorf("none config: GPUCount = %v, want 0", result.Detector.DetectGPUCount())
|
||||||
|
}
|
||||||
|
if result.Info.ConfiguredVendor != "none" {
|
||||||
|
t.Errorf("none config: ConfiguredVendor = %v, want 'none'", result.Info.ConfiguredVendor)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestGoldenJSONSerialization validates the GPU status serializes to JSON correctly
|
||||||
|
func TestGoldenJSONSerialization(t *testing.T) {
|
||||||
|
os.Setenv("FETCH_ML_GPU_TYPE", "nvidia")
|
||||||
|
os.Setenv("FETCH_ML_GPU_COUNT", "2")
|
||||||
|
defer os.Unsetenv("FETCH_ML_GPU_TYPE")
|
||||||
|
defer os.Unsetenv("FETCH_ML_GPU_COUNT")
|
||||||
|
|
||||||
|
factory := &worker.GPUDetectorFactory{}
|
||||||
|
result := factory.CreateDetectorWithInfo(nil)
|
||||||
|
|
||||||
|
status := GoldenGPUStatus{
|
||||||
|
GPUCount: result.Detector.DetectGPUCount(),
|
||||||
|
GPUType: string(result.Info.GPUType),
|
||||||
|
ConfiguredVendor: result.Info.ConfiguredVendor,
|
||||||
|
DetectionMethod: string(result.Info.DetectionMethod),
|
||||||
|
EnvOverrideType: result.Info.EnvOverrideType,
|
||||||
|
EnvOverrideCount: result.Info.EnvOverrideCount,
|
||||||
|
BuildTags: detectBuildTags(),
|
||||||
|
}
|
||||||
|
|
||||||
|
// Serialize to JSON (this mimics what ml status --json would output)
|
||||||
|
jsonData, err := json.MarshalIndent(status, "", " ")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("JSON serialization failed: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify JSON can be parsed back
|
||||||
|
var parsed GoldenGPUStatus
|
||||||
|
if err := json.Unmarshal(jsonData, &parsed); err != nil {
|
||||||
|
t.Fatalf("JSON deserialization failed: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if parsed.ConfiguredVendor != status.ConfiguredVendor {
|
||||||
|
t.Errorf("JSON roundtrip: ConfiguredVendor mismatch")
|
||||||
|
}
|
||||||
|
if parsed.DetectionMethod != status.DetectionMethod {
|
||||||
|
t.Errorf("JSON roundtrip: DetectionMethod mismatch")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestBuildTagMatrix validates that all expected build tag combinations are testable
|
||||||
|
// This test documents the three build configurations:
|
||||||
|
// 1. cgo + native_libs: Real native library implementations
|
||||||
|
// 2. cgo without native_libs: Stubs that return errors
|
||||||
|
// 3. !cgo: Stubs that return "disabled (no CGO)"
|
||||||
|
func TestBuildTagMatrix(t *testing.T) {
|
||||||
|
tags := detectBuildTags()
|
||||||
|
|
||||||
|
// Log the current build configuration for CI visibility
|
||||||
|
t.Logf("Build configuration: cgo=%v native_libs=%v darwin=%v linux=%v",
|
||||||
|
tags["cgo"], tags["native_libs"], tags["darwin"], tags["linux"])
|
||||||
|
|
||||||
|
// Validate SIMD implementation name matches build tags
|
||||||
|
simdName := worker.GetSIMDImplName()
|
||||||
|
t.Logf("SIMD implementation: %s", simdName)
|
||||||
|
|
||||||
|
switch {
|
||||||
|
case tags["native_libs"]:
|
||||||
|
// Should have real implementation name (avx2, sha_ni, armv8_crypto, or generic)
|
||||||
|
if simdName == "disabled" || simdName == "disabled (no CGO)" {
|
||||||
|
t.Errorf("native_libs build: SIMD impl should be active, got %q", simdName)
|
||||||
|
}
|
||||||
|
case tags["cgo"]:
|
||||||
|
// Should be disabled without native_libs
|
||||||
|
if simdName != "disabled" {
|
||||||
|
t.Errorf("cgo-only build: SIMD impl should be 'disabled', got %q", simdName)
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
// No CGO
|
||||||
|
if simdName != "disabled (no CGO)" {
|
||||||
|
t.Errorf("nocgo build: SIMD impl should be 'disabled (no CGO)', got %q", simdName)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -30,7 +30,7 @@ func TestScanArtifacts_SkipsKnownPathsAndLogs(t *testing.T) {
|
||||||
mustWrite("checkpoints/best.pt", []byte("checkpoint"))
|
mustWrite("checkpoints/best.pt", []byte("checkpoint"))
|
||||||
mustWrite("plots/loss.png", []byte("png"))
|
mustWrite("plots/loss.png", []byte("png"))
|
||||||
|
|
||||||
art, err := worker.ScanArtifacts(runDir)
|
art, err := worker.ScanArtifacts(runDir, false)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatalf("scanArtifacts: %v", err)
|
t.Fatalf("scanArtifacts: %v", err)
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue