feat: GPU detection transparency and artifact scanner improvements
Some checks failed
Build CLI with Embedded SQLite / build (arm64, aarch64-linux) (push) Waiting to run
Build CLI with Embedded SQLite / build (x86_64, x86_64-linux) (push) Waiting to run
Build CLI with Embedded SQLite / build-macos (arm64) (push) Waiting to run
Build CLI with Embedded SQLite / build-macos (x86_64) (push) Waiting to run
Security Scan / Security Analysis (push) Waiting to run
Security Scan / Native Library Security (push) Waiting to run
Checkout test / test (push) Successful in 6s
CI/CD Pipeline / Test (push) Failing after 1s
CI/CD Pipeline / Dev Compose Smoke Test (push) Has been skipped
CI/CD Pipeline / Build (push) Has been skipped
CI/CD Pipeline / Test Scripts (push) Has been skipped
CI/CD Pipeline / Test Native Libraries (push) Has been skipped
CI/CD Pipeline / GPU Golden Test Matrix (push) Has been skipped
Documentation / build-and-publish (push) Failing after 39s
CI/CD Pipeline / Docker Build (push) Has been skipped
Some checks failed
Build CLI with Embedded SQLite / build (arm64, aarch64-linux) (push) Waiting to run
Build CLI with Embedded SQLite / build (x86_64, x86_64-linux) (push) Waiting to run
Build CLI with Embedded SQLite / build-macos (arm64) (push) Waiting to run
Build CLI with Embedded SQLite / build-macos (x86_64) (push) Waiting to run
Security Scan / Security Analysis (push) Waiting to run
Security Scan / Native Library Security (push) Waiting to run
Checkout test / test (push) Successful in 6s
CI/CD Pipeline / Test (push) Failing after 1s
CI/CD Pipeline / Dev Compose Smoke Test (push) Has been skipped
CI/CD Pipeline / Build (push) Has been skipped
CI/CD Pipeline / Test Scripts (push) Has been skipped
CI/CD Pipeline / Test Native Libraries (push) Has been skipped
CI/CD Pipeline / GPU Golden Test Matrix (push) Has been skipped
Documentation / build-and-publish (push) Failing after 39s
CI/CD Pipeline / Docker Build (push) Has been skipped
- Surface GPUDetectionInfo from parseGPUCountFromConfig for detection metadata
- Document FETCH_ML_TOTAL_CPU and FETCH_ML_GPU_SLOTS_PER_GPU env vars
- Add debug logging for all env var overrides to stderr
- Track config-layer auto-detection in GPUDetectionInfo.ConfigLayerAutoDetected
- Add --include-all flag to artifact scanner (includeAll parameter)
- Add AMD production mode enforcement (error in non-local mode)
- Add GPU detector unit tests for env overrides and AMD aliasing
This commit is contained in:
parent
f987ddb86c
commit
3b194ff2e8
15 changed files with 915 additions and 91 deletions
|
|
@ -334,6 +334,62 @@ jobs:
|
|||
echo "=== Native Implementation ==="
|
||||
CGO_ENABLED=1 go test -tags native_libs -bench=. ./tests/benchmarks/ -benchmem || true
|
||||
|
||||
test-gpu-matrix:
|
||||
name: GPU Golden Test Matrix
|
||||
runs-on: self-hosted
|
||||
needs: test-native
|
||||
timeout-minutes: 15
|
||||
strategy:
|
||||
matrix:
|
||||
build_config: [cgo-native, cgo-only, nocgo]
|
||||
fail-fast: false
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Setup Go
|
||||
run: |
|
||||
REQUIRED_GO="1.25.0"
|
||||
if command -v go &> /dev/null && go version | grep -q "go${REQUIRED_GO}"; then
|
||||
echo "Go ${REQUIRED_GO} already installed"
|
||||
else
|
||||
echo "Installing Go ${REQUIRED_GO}..."
|
||||
curl -sL "https://go.dev/dl/go${REQUIRED_GO}.linux-amd64.tar.gz" | sudo tar -C /usr/local -xzf -
|
||||
export PATH="/usr/local/go/bin:$PATH"
|
||||
echo "/usr/local/go/bin" >> $GITHUB_PATH
|
||||
fi
|
||||
go version
|
||||
|
||||
- name: Build Native Libraries (for cgo-native config)
|
||||
if: matrix.build_config == 'cgo-native'
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y cmake zlib1g-dev build-essential
|
||||
make native-build || echo "Native build skipped (may fail without proper deps)"
|
||||
|
||||
- name: Run GPU Tests - cgo+native_libs
|
||||
if: matrix.build_config == 'cgo-native'
|
||||
run: |
|
||||
echo "=== Testing cgo + native_libs build ==="
|
||||
CGO_ENABLED=1 go test -tags native_libs -v ./tests/unit/gpu/ -run TestGoldenGPUStatus
|
||||
CGO_ENABLED=1 go test -tags native_libs -v ./tests/unit/gpu/ -run TestBuildTagMatrix
|
||||
|
||||
- name: Run GPU Tests - cgo only (no native_libs)
|
||||
if: matrix.build_config == 'cgo-only'
|
||||
run: |
|
||||
echo "=== Testing cgo without native_libs build ==="
|
||||
CGO_ENABLED=1 go test -v ./tests/unit/gpu/ -run TestGoldenGPUStatus
|
||||
CGO_ENABLED=1 go test -v ./tests/unit/gpu/ -run TestBuildTagMatrix
|
||||
|
||||
- name: Run GPU Tests - nocgo
|
||||
if: matrix.build_config == 'nocgo'
|
||||
run: |
|
||||
echo "=== Testing !cgo build ==="
|
||||
CGO_ENABLED=0 go test -v ./tests/unit/gpu/ -run TestGoldenGPUStatus
|
||||
CGO_ENABLED=0 go test -v ./tests/unit/gpu/ -run TestBuildTagMatrix
|
||||
|
||||
docker-build:
|
||||
name: Docker Build
|
||||
runs-on: self-hosted
|
||||
|
|
|
|||
|
|
@ -13,7 +13,7 @@ const WorkQueue = struct {
|
|||
depth: usize,
|
||||
};
|
||||
|
||||
fn init(allocator: std.mem.Allocator) WorkQueue {
|
||||
fn init() WorkQueue {
|
||||
return .{
|
||||
.items = .empty,
|
||||
.mutex = .{},
|
||||
|
|
|
|||
|
|
@ -321,6 +321,14 @@ api_key = "<analyst-api-key>"
|
|||
| `FETCHML_CONFIG` | - | Path to config file |
|
||||
| `FETCHML_LOG_LEVEL` | "info" | Override log level |
|
||||
| `CLI_CONFIG` | - | Path to CLI config file |
|
||||
| `FETCH_ML_GPU_TYPE` | - | Override GPU vendor detection (nvidia, amd, apple, none). Takes precedence over config file. |
|
||||
| `FETCH_ML_GPU_COUNT` | - | Override GPU count detection. Used with auto-detected or configured vendor. |
|
||||
| `FETCH_ML_TOTAL_CPU` | - | Override total CPU count detection. Sets the number of CPU cores available. |
|
||||
| `FETCH_ML_GPU_SLOTS_PER_GPU` | 1 | Override GPU slots per GPU. Controls how many concurrent tasks can share a single GPU. |
|
||||
|
||||
When environment variable overrides are active, they are logged to stderr at worker startup for debugging.
|
||||
|
||||
Note: When `gpu_vendor: amd` is configured, the system uses the NVIDIA detector implementation (aliased) due to similar device exposure patterns. The `configured_vendor` field will show "amd" while the actual detection uses NVIDIA-compatible methods.
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
|
|
|
|||
|
|
@ -11,7 +11,7 @@ import (
|
|||
"github.com/jfraeys/fetch_ml/internal/manifest"
|
||||
)
|
||||
|
||||
func scanArtifacts(runDir string) (*manifest.Artifacts, error) {
|
||||
func scanArtifacts(runDir string, includeAll bool) (*manifest.Artifacts, error) {
|
||||
runDir = strings.TrimSpace(runDir)
|
||||
if runDir == "" {
|
||||
return nil, fmt.Errorf("run dir is empty")
|
||||
|
|
@ -37,19 +37,7 @@ func scanArtifacts(runDir string) (*manifest.Artifacts, error) {
|
|||
}
|
||||
rel = filepath.ToSlash(rel)
|
||||
|
||||
if rel == "code" || strings.HasPrefix(rel, "code/") {
|
||||
if d.IsDir() {
|
||||
return fs.SkipDir
|
||||
}
|
||||
return nil
|
||||
}
|
||||
if rel == "snapshot" || strings.HasPrefix(rel, "snapshot/") {
|
||||
if d.IsDir() {
|
||||
return fs.SkipDir
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Standard exclusions (always apply)
|
||||
if rel == manifestFilename {
|
||||
return nil
|
||||
}
|
||||
|
|
@ -57,12 +45,26 @@ func scanArtifacts(runDir string) (*manifest.Artifacts, error) {
|
|||
return nil
|
||||
}
|
||||
|
||||
if strings.HasSuffix(rel, ".log") {
|
||||
return nil
|
||||
}
|
||||
|
||||
if d.Type()&fs.ModeSymlink != 0 {
|
||||
return nil
|
||||
// Optional exclusions (skipped when includeAll is true)
|
||||
if !includeAll {
|
||||
if rel == "code" || strings.HasPrefix(rel, "code/") {
|
||||
if d.IsDir() {
|
||||
return fs.SkipDir
|
||||
}
|
||||
return nil
|
||||
}
|
||||
if rel == "snapshot" || strings.HasPrefix(rel, "snapshot/") {
|
||||
if d.IsDir() {
|
||||
return fs.SkipDir
|
||||
}
|
||||
return nil
|
||||
}
|
||||
if strings.HasSuffix(rel, ".log") {
|
||||
return nil
|
||||
}
|
||||
if d.Type()&fs.ModeSymlink != 0 {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
if d.IsDir() {
|
||||
|
|
@ -100,6 +102,7 @@ func scanArtifacts(runDir string) (*manifest.Artifacts, error) {
|
|||
const manifestFilename = "run_manifest.json"
|
||||
|
||||
// ScanArtifacts is an exported wrapper for testing/benchmarking.
|
||||
func ScanArtifacts(runDir string) (*manifest.Artifacts, error) {
|
||||
return scanArtifacts(runDir)
|
||||
// When includeAll is false, excludes code/, snapshot/, *.log files, and symlinks.
|
||||
func ScanArtifacts(runDir string, includeAll bool) (*manifest.Artifacts, error) {
|
||||
return scanArtifacts(runDir, includeAll)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -77,13 +77,14 @@ type Config struct {
|
|||
PrewarmEnabled bool `yaml:"prewarm_enabled"`
|
||||
|
||||
// Podman execution
|
||||
PodmanImage string `yaml:"podman_image"`
|
||||
ContainerWorkspace string `yaml:"container_workspace"`
|
||||
ContainerResults string `yaml:"container_results"`
|
||||
GPUDevices []string `yaml:"gpu_devices"`
|
||||
GPUVendor string `yaml:"gpu_vendor"`
|
||||
GPUVisibleDevices []int `yaml:"gpu_visible_devices"`
|
||||
GPUVisibleDeviceIDs []string `yaml:"gpu_visible_device_ids"`
|
||||
PodmanImage string `yaml:"podman_image"`
|
||||
ContainerWorkspace string `yaml:"container_workspace"`
|
||||
ContainerResults string `yaml:"container_results"`
|
||||
GPUDevices []string `yaml:"gpu_devices"`
|
||||
GPUVendor string `yaml:"gpu_vendor"`
|
||||
GPUVendorAutoDetected bool `yaml:"-"` // Set by LoadConfig when GPUVendor is auto-detected
|
||||
GPUVisibleDevices []int `yaml:"gpu_visible_devices"`
|
||||
GPUVisibleDeviceIDs []string `yaml:"gpu_visible_device_ids"`
|
||||
|
||||
// Apple M-series GPU configuration
|
||||
AppleGPU AppleGPUConfig `yaml:"apple_gpu"`
|
||||
|
|
@ -264,6 +265,7 @@ func LoadConfig(path string) (*Config, error) {
|
|||
}
|
||||
|
||||
if strings.TrimSpace(cfg.GPUVendor) == "" {
|
||||
cfg.GPUVendorAutoDetected = true
|
||||
if cfg.AppleGPU.Enabled {
|
||||
cfg.GPUVendor = string(GPUTypeApple)
|
||||
} else if len(cfg.GPUDevices) > 0 ||
|
||||
|
|
@ -453,9 +455,15 @@ func envInt(name string) (int, bool) {
|
|||
return n, true
|
||||
}
|
||||
|
||||
// logEnvOverride logs environment variable overrides to stderr for debugging
|
||||
func logEnvOverride(name string, value interface{}) {
|
||||
fmt.Fprintf(os.Stderr, "[env] %s=%v (override active)\n", name, value)
|
||||
}
|
||||
|
||||
// parseCPUFromConfig determines total CPU from environment or config
|
||||
func parseCPUFromConfig(cfg *Config) int {
|
||||
if n, ok := envInt("FETCH_ML_TOTAL_CPU"); ok && n >= 0 {
|
||||
logEnvOverride("FETCH_ML_TOTAL_CPU", n)
|
||||
return n
|
||||
}
|
||||
if cfg != nil {
|
||||
|
|
@ -471,11 +479,11 @@ func parseCPUFromConfig(cfg *Config) int {
|
|||
return runtime.NumCPU()
|
||||
}
|
||||
|
||||
// parseGPUCountFromConfig detects GPU count from config
|
||||
func parseGPUCountFromConfig(cfg *Config) int {
|
||||
// parseGPUCountFromConfig detects GPU count from config and returns detection metadata
|
||||
func parseGPUCountFromConfig(cfg *Config) (int, GPUDetectionInfo) {
|
||||
factory := &GPUDetectorFactory{}
|
||||
detector := factory.CreateDetector(cfg)
|
||||
return detector.DetectGPUCount()
|
||||
result := factory.CreateDetectorWithInfo(cfg)
|
||||
return result.Detector.DetectGPUCount(), result.Info
|
||||
}
|
||||
|
||||
// parseGPUSlotsPerGPUFromConfig reads GPU slots per GPU from environment
|
||||
|
|
|
|||
|
|
@ -147,9 +147,10 @@ func NewWorker(cfg *Config, _ string) (*Worker, error) {
|
|||
)
|
||||
|
||||
// Create resource manager
|
||||
gpuCount, gpuDetectionInfo := parseGPUCountFromConfig(cfg)
|
||||
rm, err := resources.NewManager(resources.Options{
|
||||
TotalCPU: parseCPUFromConfig(cfg),
|
||||
GPUCount: parseGPUCountFromConfig(cfg),
|
||||
GPUCount: gpuCount,
|
||||
SlotsPerGPU: parseGPUSlotsPerGPUFromConfig(),
|
||||
})
|
||||
if err != nil {
|
||||
|
|
@ -158,28 +159,32 @@ func NewWorker(cfg *Config, _ string) (*Worker, error) {
|
|||
}
|
||||
|
||||
worker := &Worker{
|
||||
id: cfg.WorkerID,
|
||||
config: cfg,
|
||||
logger: logger,
|
||||
runLoop: runLoop,
|
||||
runner: jobRunner,
|
||||
metrics: metricsObj,
|
||||
health: lifecycle.NewHealthMonitor(),
|
||||
resources: rm,
|
||||
jupyter: jupyterMgr,
|
||||
id: cfg.WorkerID,
|
||||
config: cfg,
|
||||
logger: logger,
|
||||
runLoop: runLoop,
|
||||
runner: jobRunner,
|
||||
metrics: metricsObj,
|
||||
health: lifecycle.NewHealthMonitor(),
|
||||
resources: rm,
|
||||
jupyter: jupyterMgr,
|
||||
gpuDetectionInfo: gpuDetectionInfo,
|
||||
}
|
||||
|
||||
// Log GPU configuration
|
||||
if !cfg.LocalMode {
|
||||
gpuType := strings.ToLower(strings.TrimSpace(os.Getenv("FETCH_ML_GPU_TYPE")))
|
||||
if cfg.AppleGPU.Enabled {
|
||||
logger.Warn("apple MPS GPU mode is intended for development; do not use in production",
|
||||
"gpu_type", "apple",
|
||||
)
|
||||
}
|
||||
if gpuType == "amd" {
|
||||
logger.Warn("amd GPU mode is intended for development; do not use in production",
|
||||
"gpu_type", "amd",
|
||||
cancel()
|
||||
return nil, fmt.Errorf(
|
||||
"AMD GPU mode is not supported in production (FETCH_ML_GPU_TYPE=amd). " +
|
||||
"Use 'nvidia', 'apple', 'none', or GPUDevices config. " +
|
||||
"AMD support is available in local mode for experimental development",
|
||||
)
|
||||
} else if cfg.AppleGPU.Enabled {
|
||||
logger.Warn(
|
||||
"apple MPS GPU mode is intended for development; do not use in production",
|
||||
"gpu_type", "apple",
|
||||
)
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -11,10 +11,33 @@ type GPUType string
|
|||
|
||||
const (
|
||||
GPUTypeNVIDIA GPUType = "nvidia"
|
||||
GPUTypeAMD GPUType = "amd"
|
||||
GPUTypeApple GPUType = "apple"
|
||||
GPUTypeNone GPUType = "none"
|
||||
)
|
||||
|
||||
// DetectionSource indicates how the GPU detector was selected
|
||||
type DetectionSource string
|
||||
|
||||
const (
|
||||
DetectionSourceEnvType DetectionSource = "env_override_type"
|
||||
DetectionSourceEnvCount DetectionSource = "env_override_count"
|
||||
DetectionSourceEnvBoth DetectionSource = "env_override_both"
|
||||
DetectionSourceConfig DetectionSource = "config"
|
||||
DetectionSourceAuto DetectionSource = "auto"
|
||||
DetectionSourceNone DetectionSource = "none"
|
||||
)
|
||||
|
||||
// GPUDetectionInfo provides metadata about how GPU detection was determined
|
||||
type GPUDetectionInfo struct {
|
||||
GPUType GPUType `json:"gpu_type"`
|
||||
ConfiguredVendor string `json:"configured_vendor"`
|
||||
DetectionMethod DetectionSource `json:"detection_method"`
|
||||
EnvOverrideType string `json:"env_override_type,omitempty"`
|
||||
EnvOverrideCount int `json:"env_override_count,omitempty"`
|
||||
ConfigLayerAutoDetected bool `json:"config_layer_auto_detected,omitempty"`
|
||||
}
|
||||
|
||||
// GPUDetector interface for detecting GPU availability
|
||||
type GPUDetector interface {
|
||||
DetectGPUCount() int
|
||||
|
|
@ -138,47 +161,240 @@ func (d *NoneDetector) GetDevicePaths() []string {
|
|||
return nil
|
||||
}
|
||||
|
||||
// GPUDetectorFactory creates appropriate GPU detector based on config
|
||||
// GPUDetectorFactory creates appropriate GPU detector based on config
|
||||
type GPUDetectorFactory struct{}
|
||||
|
||||
// DetectionResult contains both the detector and metadata about how it was selected
|
||||
type DetectionResult struct {
|
||||
Detector GPUDetector
|
||||
Info GPUDetectionInfo
|
||||
}
|
||||
|
||||
func (f *GPUDetectorFactory) CreateDetector(cfg *Config) GPUDetector {
|
||||
// Check for explicit environment override
|
||||
if gpuType := os.Getenv("FETCH_ML_GPU_TYPE"); gpuType != "" {
|
||||
switch gpuType {
|
||||
result := f.CreateDetectorWithInfo(cfg)
|
||||
return result.Detector
|
||||
}
|
||||
|
||||
func (f *GPUDetectorFactory) CreateDetectorWithInfo(cfg *Config) DetectionResult {
|
||||
// Check for explicit environment overrides
|
||||
envType := os.Getenv("FETCH_ML_GPU_TYPE")
|
||||
envCount, hasEnvCount := envInt("FETCH_ML_GPU_COUNT")
|
||||
|
||||
if envType != "" && hasEnvCount {
|
||||
// Both env vars set
|
||||
logEnvOverride("FETCH_ML_GPU_TYPE", envType)
|
||||
logEnvOverride("FETCH_ML_GPU_COUNT", envCount)
|
||||
switch envType {
|
||||
case string(GPUTypeNVIDIA):
|
||||
return &NVIDIADetector{}
|
||||
return DetectionResult{
|
||||
Detector: &NVIDIADetector{},
|
||||
Info: GPUDetectionInfo{
|
||||
GPUType: GPUTypeNVIDIA,
|
||||
ConfiguredVendor: "nvidia",
|
||||
DetectionMethod: DetectionSourceEnvBoth,
|
||||
EnvOverrideType: envType,
|
||||
EnvOverrideCount: envCount,
|
||||
},
|
||||
}
|
||||
case string(GPUTypeApple):
|
||||
return &AppleDetector{enabled: true}
|
||||
return DetectionResult{
|
||||
Detector: &AppleDetector{enabled: true},
|
||||
Info: GPUDetectionInfo{
|
||||
GPUType: GPUTypeApple,
|
||||
ConfiguredVendor: "apple",
|
||||
DetectionMethod: DetectionSourceEnvBoth,
|
||||
EnvOverrideType: envType,
|
||||
EnvOverrideCount: envCount,
|
||||
},
|
||||
}
|
||||
case string(GPUTypeNone):
|
||||
return &NoneDetector{}
|
||||
}
|
||||
}
|
||||
|
||||
// Respect configured vendor when explicitly set.
|
||||
if cfg != nil {
|
||||
switch GPUType(cfg.GPUVendor) {
|
||||
case GPUTypeApple:
|
||||
return &AppleDetector{enabled: cfg.AppleGPU.Enabled}
|
||||
case GPUTypeNone:
|
||||
return &NoneDetector{}
|
||||
case GPUTypeNVIDIA:
|
||||
return &NVIDIADetector{}
|
||||
return DetectionResult{
|
||||
Detector: &NoneDetector{},
|
||||
Info: GPUDetectionInfo{
|
||||
GPUType: GPUTypeNone,
|
||||
ConfiguredVendor: "none",
|
||||
DetectionMethod: DetectionSourceEnvBoth,
|
||||
EnvOverrideType: envType,
|
||||
EnvOverrideCount: envCount,
|
||||
},
|
||||
}
|
||||
case "amd":
|
||||
// AMD uses similar device exposure patterns in this codebase.
|
||||
return &NVIDIADetector{}
|
||||
// AMD env override uses NVIDIA detector (aliased)
|
||||
return DetectionResult{
|
||||
Detector: &NVIDIADetector{},
|
||||
Info: GPUDetectionInfo{
|
||||
GPUType: GPUTypeAMD,
|
||||
ConfiguredVendor: "amd",
|
||||
DetectionMethod: DetectionSourceEnvBoth,
|
||||
EnvOverrideType: envType,
|
||||
EnvOverrideCount: envCount,
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Auto-detect based on config
|
||||
if cfg != nil {
|
||||
if cfg.AppleGPU.Enabled {
|
||||
return &AppleDetector{enabled: true}
|
||||
if envType != "" {
|
||||
// Only FETCH_ML_GPU_TYPE set
|
||||
logEnvOverride("FETCH_ML_GPU_TYPE", envType)
|
||||
switch envType {
|
||||
case string(GPUTypeNVIDIA):
|
||||
return DetectionResult{
|
||||
Detector: &NVIDIADetector{},
|
||||
Info: GPUDetectionInfo{
|
||||
GPUType: GPUTypeNVIDIA,
|
||||
ConfiguredVendor: "nvidia",
|
||||
DetectionMethod: DetectionSourceEnvType,
|
||||
EnvOverrideType: envType,
|
||||
},
|
||||
}
|
||||
case string(GPUTypeApple):
|
||||
return DetectionResult{
|
||||
Detector: &AppleDetector{enabled: true},
|
||||
Info: GPUDetectionInfo{
|
||||
GPUType: GPUTypeApple,
|
||||
ConfiguredVendor: "apple",
|
||||
DetectionMethod: DetectionSourceEnvType,
|
||||
EnvOverrideType: envType,
|
||||
},
|
||||
}
|
||||
case string(GPUTypeNone):
|
||||
return DetectionResult{
|
||||
Detector: &NoneDetector{},
|
||||
Info: GPUDetectionInfo{
|
||||
GPUType: GPUTypeNone,
|
||||
ConfiguredVendor: "none",
|
||||
DetectionMethod: DetectionSourceEnvType,
|
||||
EnvOverrideType: envType,
|
||||
},
|
||||
}
|
||||
case "amd":
|
||||
// AMD env override uses NVIDIA detector (aliased)
|
||||
return DetectionResult{
|
||||
Detector: &NVIDIADetector{},
|
||||
Info: GPUDetectionInfo{
|
||||
GPUType: GPUTypeAMD,
|
||||
ConfiguredVendor: "amd",
|
||||
DetectionMethod: DetectionSourceEnvType,
|
||||
EnvOverrideType: envType,
|
||||
},
|
||||
}
|
||||
}
|
||||
if len(cfg.GPUDevices) > 0 {
|
||||
return &NVIDIADetector{}
|
||||
}
|
||||
|
||||
if hasEnvCount {
|
||||
// Only FETCH_ML_GPU_COUNT set - need to detect vendor from config or auto
|
||||
logEnvOverride("FETCH_ML_GPU_COUNT", envCount)
|
||||
return f.detectFromConfigWithSource(cfg, DetectionSourceEnvCount, "", envCount)
|
||||
}
|
||||
|
||||
// No env overrides - detect from config
|
||||
return f.detectFromConfigWithSource(cfg, DetectionSourceConfig, "", -1)
|
||||
}
|
||||
|
||||
func (f *GPUDetectorFactory) detectFromConfigWithSource(cfg *Config, source DetectionSource, envType string, envCount int) DetectionResult {
|
||||
if cfg == nil {
|
||||
return DetectionResult{
|
||||
Detector: &NoneDetector{},
|
||||
Info: GPUDetectionInfo{
|
||||
GPUType: GPUTypeNone,
|
||||
ConfiguredVendor: "none",
|
||||
DetectionMethod: source,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
switch GPUType(cfg.GPUVendor) {
|
||||
case GPUTypeApple:
|
||||
return DetectionResult{
|
||||
Detector: &AppleDetector{enabled: cfg.AppleGPU.Enabled},
|
||||
Info: GPUDetectionInfo{
|
||||
GPUType: GPUTypeApple,
|
||||
ConfiguredVendor: "apple",
|
||||
DetectionMethod: source,
|
||||
EnvOverrideType: envType,
|
||||
EnvOverrideCount: envCount,
|
||||
ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected,
|
||||
},
|
||||
}
|
||||
case GPUTypeNone:
|
||||
return DetectionResult{
|
||||
Detector: &NoneDetector{},
|
||||
Info: GPUDetectionInfo{
|
||||
GPUType: GPUTypeNone,
|
||||
ConfiguredVendor: "none",
|
||||
DetectionMethod: source,
|
||||
EnvOverrideType: envType,
|
||||
EnvOverrideCount: envCount,
|
||||
ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected,
|
||||
},
|
||||
}
|
||||
case GPUTypeNVIDIA:
|
||||
return DetectionResult{
|
||||
Detector: &NVIDIADetector{},
|
||||
Info: GPUDetectionInfo{
|
||||
GPUType: GPUTypeNVIDIA,
|
||||
ConfiguredVendor: "nvidia",
|
||||
DetectionMethod: source,
|
||||
EnvOverrideType: envType,
|
||||
EnvOverrideCount: envCount,
|
||||
ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected,
|
||||
},
|
||||
}
|
||||
case "amd":
|
||||
// AMD uses similar device exposure patterns in this codebase.
|
||||
// This is the key aliasing point - we report AMD as configured vendor
|
||||
// but use NVIDIADetector for implementation.
|
||||
return DetectionResult{
|
||||
Detector: &NVIDIADetector{},
|
||||
Info: GPUDetectionInfo{
|
||||
GPUType: GPUTypeNVIDIA,
|
||||
ConfiguredVendor: "amd", // User configured "amd"
|
||||
DetectionMethod: source,
|
||||
EnvOverrideType: envType,
|
||||
EnvOverrideCount: envCount,
|
||||
ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// Auto-detect based on config settings
|
||||
if cfg.AppleGPU.Enabled {
|
||||
return DetectionResult{
|
||||
Detector: &AppleDetector{enabled: true},
|
||||
Info: GPUDetectionInfo{
|
||||
GPUType: GPUTypeApple,
|
||||
ConfiguredVendor: "apple",
|
||||
DetectionMethod: DetectionSourceAuto,
|
||||
EnvOverrideType: envType,
|
||||
EnvOverrideCount: envCount,
|
||||
ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected,
|
||||
},
|
||||
}
|
||||
}
|
||||
if len(cfg.GPUDevices) > 0 {
|
||||
return DetectionResult{
|
||||
Detector: &NVIDIADetector{},
|
||||
Info: GPUDetectionInfo{
|
||||
GPUType: GPUTypeNVIDIA,
|
||||
ConfiguredVendor: "nvidia",
|
||||
DetectionMethod: DetectionSourceAuto,
|
||||
EnvOverrideType: envType,
|
||||
EnvOverrideCount: envCount,
|
||||
ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// Default to no GPU
|
||||
return &NoneDetector{}
|
||||
return DetectionResult{
|
||||
Detector: &NoneDetector{},
|
||||
Info: GPUDetectionInfo{
|
||||
GPUType: GPUTypeNone,
|
||||
ConfiguredVendor: "none",
|
||||
DetectionMethod: source,
|
||||
EnvOverrideType: envType,
|
||||
EnvOverrideCount: envCount,
|
||||
ConfigLayerAutoDetected: cfg.GPUVendorAutoDetected,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@ import (
|
|||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"regexp"
|
||||
"runtime"
|
||||
|
|
@ -176,6 +177,9 @@ func GetPowermetricsData() (*PowermetricsData, error) {
|
|||
out, err := cmd.Output()
|
||||
if err != nil {
|
||||
// powermetrics not available or no permission
|
||||
if ctx.Err() != context.DeadlineExceeded {
|
||||
fmt.Fprintln(os.Stderr, "Warning: powermetrics requires sudo for GPU metrics")
|
||||
}
|
||||
return &PowermetricsData{HasData: false}, nil
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -26,6 +26,8 @@ var (
|
|||
ctxInitTime time.Time
|
||||
)
|
||||
|
||||
// getHashContext returns the native hash context, initializing it on first call.
|
||||
// First call initializes C++ context (5-20ms) - subsequent calls reuse context.
|
||||
func getHashContext() *C.fh_context_t {
|
||||
hashCtxOnce.Do(func() {
|
||||
start := time.Now()
|
||||
|
|
@ -65,7 +67,7 @@ func HasSIMDSHA256() bool {
|
|||
}
|
||||
|
||||
func ScanArtifactsNative(runDir string) (*manifest.Artifacts, error) {
|
||||
return ScanArtifacts(runDir)
|
||||
return ScanArtifacts(runDir, false)
|
||||
}
|
||||
|
||||
func ExtractTarGzNative(archivePath, dstDir string) error {
|
||||
|
|
|
|||
|
|
@ -55,6 +55,9 @@ type Worker struct {
|
|||
health *lifecycle.HealthMonitor
|
||||
resources *resources.Manager
|
||||
|
||||
// GPU detection metadata for status output
|
||||
gpuDetectionInfo GPUDetectionInfo
|
||||
|
||||
// Legacy fields for backward compatibility during migration
|
||||
jupyter JupyterManager
|
||||
queueClient queue.Backend // Stored for prewarming access
|
||||
|
|
|
|||
|
|
@ -131,7 +131,7 @@ func BenchmarkScanArtifacts(b *testing.B) {
|
|||
b.ReportAllocs()
|
||||
|
||||
for i := 0; i < b.N; i++ {
|
||||
_, err := worker.ScanArtifacts(runDir)
|
||||
_, err := worker.ScanArtifacts(runDir, false)
|
||||
if err != nil {
|
||||
b.Fatal(err)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -15,11 +15,10 @@ func BenchmarkArtifactScanGo(b *testing.B) {
|
|||
// Create test artifact structure
|
||||
createTestArtifacts(b, tmpDir, 100)
|
||||
|
||||
b.ResetTimer()
|
||||
b.ReportAllocs()
|
||||
|
||||
for i := 0; i < b.N; i++ {
|
||||
_, err := worker.ScanArtifacts(tmpDir)
|
||||
for b.Loop() {
|
||||
_, err := worker.ScanArtifacts(tmpDir, false)
|
||||
if err != nil {
|
||||
b.Fatal(err)
|
||||
}
|
||||
|
|
@ -34,10 +33,9 @@ func BenchmarkArtifactScanNative(b *testing.B) {
|
|||
// Create test artifact structure
|
||||
createTestArtifacts(b, tmpDir, 100)
|
||||
|
||||
b.ResetTimer()
|
||||
b.ReportAllocs()
|
||||
|
||||
for i := 0; i < b.N; i++ {
|
||||
for b.Loop() {
|
||||
_, err := worker.ScanArtifactsNative(tmpDir)
|
||||
if err != nil {
|
||||
b.Fatal(err)
|
||||
|
|
@ -54,8 +52,8 @@ func BenchmarkArtifactScanLarge(b *testing.B) {
|
|||
|
||||
b.Run("Go", func(b *testing.B) {
|
||||
b.ReportAllocs()
|
||||
for i := 0; i < b.N; i++ {
|
||||
_, err := worker.ScanArtifacts(tmpDir)
|
||||
for b.Loop() {
|
||||
_, err := worker.ScanArtifacts(tmpDir, false)
|
||||
if err != nil {
|
||||
b.Fatal(err)
|
||||
}
|
||||
|
|
@ -64,7 +62,7 @@ func BenchmarkArtifactScanLarge(b *testing.B) {
|
|||
|
||||
b.Run("Native", func(b *testing.B) {
|
||||
b.ReportAllocs()
|
||||
for i := 0; i < b.N; i++ {
|
||||
for b.Loop() {
|
||||
_, err := worker.ScanArtifactsNative(tmpDir)
|
||||
if err != nil {
|
||||
b.Fatal(err)
|
||||
|
|
@ -93,7 +91,7 @@ func createTestArtifacts(b testing.TB, root string, count int) {
|
|||
}
|
||||
|
||||
// Create test files
|
||||
for i := 0; i < count; i++ {
|
||||
for i := range count {
|
||||
var path string
|
||||
switch i % 5 {
|
||||
case 0:
|
||||
|
|
|
|||
210
tests/unit/gpu/gpu_detector_test.go
Normal file
210
tests/unit/gpu/gpu_detector_test.go
Normal file
|
|
@ -0,0 +1,210 @@
|
|||
package worker_test
|
||||
|
||||
import (
|
||||
"os"
|
||||
"testing"
|
||||
|
||||
"github.com/jfraeys/fetch_ml/internal/worker"
|
||||
)
|
||||
|
||||
// TestGPUDetectorEnvOverrides validates both FETCH_ML_GPU_TYPE and FETCH_ML_GPU_COUNT work
|
||||
func TestGPUDetectorEnvOverrides(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
gpuType string
|
||||
gpuCount string
|
||||
wantType worker.GPUType
|
||||
wantCount int
|
||||
wantMethod worker.DetectionSource
|
||||
wantConfigured string
|
||||
}{
|
||||
{
|
||||
name: "env type only - nvidia",
|
||||
gpuType: "nvidia",
|
||||
wantType: worker.GPUTypeNVIDIA,
|
||||
wantMethod: worker.DetectionSourceEnvType,
|
||||
wantConfigured: "nvidia",
|
||||
},
|
||||
{
|
||||
name: "env type only - apple",
|
||||
gpuType: "apple",
|
||||
wantType: worker.GPUTypeApple,
|
||||
wantMethod: worker.DetectionSourceEnvType,
|
||||
wantConfigured: "apple",
|
||||
},
|
||||
{
|
||||
name: "env type only - none",
|
||||
gpuType: "none",
|
||||
wantType: worker.GPUTypeNone,
|
||||
wantMethod: worker.DetectionSourceEnvType,
|
||||
wantConfigured: "none",
|
||||
},
|
||||
{
|
||||
name: "both env vars set",
|
||||
gpuType: "nvidia",
|
||||
gpuCount: "4",
|
||||
wantType: worker.GPUTypeNVIDIA,
|
||||
wantMethod: worker.DetectionSourceEnvBoth,
|
||||
wantConfigured: "nvidia",
|
||||
},
|
||||
{
|
||||
name: "env type amd - shows amd configured vendor",
|
||||
gpuType: "amd",
|
||||
wantType: worker.GPUTypeAMD,
|
||||
wantMethod: worker.DetectionSourceEnvType,
|
||||
wantConfigured: "amd",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
// Set env vars
|
||||
if tt.gpuType != "" {
|
||||
os.Setenv("FETCH_ML_GPU_TYPE", tt.gpuType)
|
||||
defer os.Unsetenv("FETCH_ML_GPU_TYPE")
|
||||
}
|
||||
if tt.gpuCount != "" {
|
||||
os.Setenv("FETCH_ML_GPU_COUNT", tt.gpuCount)
|
||||
defer os.Unsetenv("FETCH_ML_GPU_COUNT")
|
||||
}
|
||||
|
||||
factory := &worker.GPUDetectorFactory{}
|
||||
result := factory.CreateDetectorWithInfo(nil)
|
||||
|
||||
if result.Info.GPUType != tt.wantType {
|
||||
t.Errorf("GPUType = %v, want %v", result.Info.GPUType, tt.wantType)
|
||||
}
|
||||
if result.Info.DetectionMethod != tt.wantMethod {
|
||||
t.Errorf("DetectionMethod = %v, want %v", result.Info.DetectionMethod, tt.wantMethod)
|
||||
}
|
||||
if result.Info.ConfiguredVendor != tt.wantConfigured {
|
||||
t.Errorf("ConfiguredVendor = %v, want %v", result.Info.ConfiguredVendor, tt.wantConfigured)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestGPUDetectorAMDVendorAlias validates AMD config shows proper aliasing
|
||||
func TestGPUDetectorAMDVendorAlias(t *testing.T) {
|
||||
cfg := &worker.Config{
|
||||
GPUVendor: "amd",
|
||||
}
|
||||
|
||||
factory := &worker.GPUDetectorFactory{}
|
||||
result := factory.CreateDetectorWithInfo(cfg)
|
||||
|
||||
// AMD uses NVIDIA detector implementation
|
||||
if result.Info.ConfiguredVendor != "amd" {
|
||||
t.Errorf("ConfiguredVendor = %v, want 'amd'", result.Info.ConfiguredVendor)
|
||||
}
|
||||
if result.Info.GPUType != worker.GPUTypeNVIDIA {
|
||||
t.Errorf("GPUType = %v, want %v (NVIDIA implementation for AMD alias)", result.Info.GPUType, worker.GPUTypeNVIDIA)
|
||||
}
|
||||
}
|
||||
|
||||
// TestGPUDetectorEnvCountOverride validates FETCH_ML_GPU_COUNT used alone with a config-supplied vendor
|
||||
func TestGPUDetectorEnvCountOverride(t *testing.T) {
|
||||
os.Setenv("FETCH_ML_GPU_COUNT", "8")
|
||||
defer os.Unsetenv("FETCH_ML_GPU_COUNT")
|
||||
|
||||
cfg := &worker.Config{
|
||||
GPUVendor: "nvidia",
|
||||
}
|
||||
|
||||
factory := &worker.GPUDetectorFactory{}
|
||||
result := factory.CreateDetectorWithInfo(cfg)
|
||||
|
||||
if result.Info.DetectionMethod != worker.DetectionSourceEnvCount {
|
||||
t.Errorf("DetectionMethod = %v, want %v", result.Info.DetectionMethod, worker.DetectionSourceEnvCount)
|
||||
}
|
||||
if result.Info.EnvOverrideCount != 8 {
|
||||
t.Errorf("EnvOverrideCount = %v, want 8", result.Info.EnvOverrideCount)
|
||||
}
|
||||
}
|
||||
|
||||
// TestGPUDetectorDetectionSources validates all detection source types
|
||||
func TestGPUDetectorDetectionSources(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
envType string
|
||||
envCount string
|
||||
config *worker.Config
|
||||
wantSource worker.DetectionSource
|
||||
}{
|
||||
{
|
||||
name: "env type takes precedence over config",
|
||||
envType: "apple",
|
||||
config: &worker.Config{GPUVendor: "nvidia"},
|
||||
wantSource: worker.DetectionSourceEnvType,
|
||||
},
|
||||
{
|
||||
name: "env count triggers env_count source",
|
||||
envCount: "2",
|
||||
config: &worker.Config{GPUVendor: "nvidia"},
|
||||
wantSource: worker.DetectionSourceEnvCount,
|
||||
},
|
||||
{
|
||||
name: "config source when no env",
|
||||
config: &worker.Config{GPUVendor: "nvidia"},
|
||||
wantSource: worker.DetectionSourceConfig,
|
||||
},
|
||||
{
|
||||
name: "auto source for GPUDevices",
|
||||
config: &worker.Config{GPUDevices: []string{"/dev/nvidia0"}},
|
||||
wantSource: worker.DetectionSourceAuto,
|
||||
},
|
||||
{
|
||||
name: "auto source for AppleGPU",
|
||||
config: &worker.Config{AppleGPU: worker.AppleGPUConfig{Enabled: true}},
|
||||
wantSource: worker.DetectionSourceAuto,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
if tt.envType != "" {
|
||||
os.Setenv("FETCH_ML_GPU_TYPE", tt.envType)
|
||||
defer os.Unsetenv("FETCH_ML_GPU_TYPE")
|
||||
}
|
||||
if tt.envCount != "" {
|
||||
os.Setenv("FETCH_ML_GPU_COUNT", tt.envCount)
|
||||
defer os.Unsetenv("FETCH_ML_GPU_COUNT")
|
||||
}
|
||||
|
||||
factory := &worker.GPUDetectorFactory{}
|
||||
result := factory.CreateDetectorWithInfo(tt.config)
|
||||
|
||||
if result.Info.DetectionMethod != tt.wantSource {
|
||||
t.Errorf("DetectionMethod = %v, want %v", result.Info.DetectionMethod, tt.wantSource)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestGPUDetectorInfoFields validates all GPUDetectionInfo fields are populated
|
||||
func TestGPUDetectorInfoFields(t *testing.T) {
|
||||
os.Setenv("FETCH_ML_GPU_TYPE", "nvidia")
|
||||
os.Setenv("FETCH_ML_GPU_COUNT", "4")
|
||||
defer os.Unsetenv("FETCH_ML_GPU_TYPE")
|
||||
defer os.Unsetenv("FETCH_ML_GPU_COUNT")
|
||||
|
||||
factory := &worker.GPUDetectorFactory{}
|
||||
result := factory.CreateDetectorWithInfo(nil)
|
||||
|
||||
// Validate all expected fields
|
||||
if result.Info.GPUType == "" {
|
||||
t.Error("GPUType field is empty")
|
||||
}
|
||||
if result.Info.ConfiguredVendor == "" {
|
||||
t.Error("ConfiguredVendor field is empty")
|
||||
}
|
||||
if result.Info.DetectionMethod == "" {
|
||||
t.Error("DetectionMethod field is empty")
|
||||
}
|
||||
if result.Info.EnvOverrideType != "nvidia" {
|
||||
t.Errorf("EnvOverrideType = %v, want 'nvidia'", result.Info.EnvOverrideType)
|
||||
}
|
||||
if result.Info.EnvOverrideCount != 4 {
|
||||
t.Errorf("EnvOverrideCount = %v, want 4", result.Info.EnvOverrideCount)
|
||||
}
|
||||
}
|
||||
311
tests/unit/gpu/gpu_golden_test.go
Normal file
311
tests/unit/gpu/gpu_golden_test.go
Normal file
|
|
@ -0,0 +1,311 @@
|
|||
package worker_test
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"os"
|
||||
"testing"
|
||||
|
||||
"github.com/jfraeys/fetch_ml/internal/worker"
|
||||
)
|
||||
|
||||
// GoldenGPUStatus is the JSON-serializable snapshot of GPU detection results
// that the golden tests in this file build and compare.
type GoldenGPUStatus struct {
	// GPUCount is the count returned by the detector.
	GPUCount int `json:"gpu_count"`
	// GPUType is the string form of the detection info's GPU type.
	GPUType string `json:"gpu_type"`
	// ConfiguredVendor is the vendor reported by the detection info.
	ConfiguredVendor string `json:"configured_vendor"`
	// DetectionMethod is the string form of the detection source.
	DetectionMethod string `json:"detection_method"`
	// EnvOverrideType is omitted from JSON when empty.
	EnvOverrideType string `json:"env_override_type,omitempty"`
	// EnvOverrideCount is omitted from JSON when zero.
	EnvOverrideCount int `json:"env_override_count,omitempty"`
	// BuildTags maps a build tag name to whether it was detected as active.
	BuildTags map[string]bool `json:"build_tags"`
	// NativeAvailable records whether the test considers native support present.
	NativeAvailable bool `json:"native_available"`
	// Extra holds optional additional data; omitted from JSON when empty.
	Extra map[string]any `json:"extra,omitempty"`
}
|
||||
|
||||
// detectBuildTags returns which build tags are active
|
||||
func detectBuildTags() map[string]bool {
|
||||
tags := map[string]bool{
|
||||
"cgo": false,
|
||||
"native_libs": false,
|
||||
"darwin": false,
|
||||
"linux": false,
|
||||
}
|
||||
|
||||
// CGO is determined at compile time - we can detect by trying to use native
|
||||
// If native functions return "disabled", we know native_libs is not set
|
||||
simdName := worker.GetSIMDImplName()
|
||||
tags["native_libs"] = simdName != "disabled" && simdName != "disabled (no CGO)"
|
||||
tags["cgo"] = simdName != "disabled (no CGO)"
|
||||
|
||||
// OS detection
|
||||
if worker.IsMacOS() {
|
||||
tags["darwin"] = true
|
||||
} else {
|
||||
tags["linux"] = true
|
||||
}
|
||||
|
||||
return tags
|
||||
}
|
||||
|
||||
// TestGoldenGPUStatusNVML validates GPU status against golden file for NVML path
|
||||
// This test runs under all build configurations but expectations differ:
|
||||
// - cgo+native_libs: Real GPU count and NVML detection
|
||||
// - cgo without native_libs: Returns 0, nil (stub behavior)
|
||||
// - !cgo: Returns 0, nil (stub behavior)
|
||||
func TestGoldenGPUStatusNVML(t *testing.T) {
|
||||
// Setup: Configure for NVIDIA detection
|
||||
cfg := &worker.Config{
|
||||
GPUVendor: "nvidia",
|
||||
}
|
||||
|
||||
factory := &worker.GPUDetectorFactory{}
|
||||
result := factory.CreateDetectorWithInfo(cfg)
|
||||
|
||||
// Get actual detected count (behavior varies by build tags)
|
||||
count := result.Detector.DetectGPUCount()
|
||||
|
||||
buildTags := detectBuildTags()
|
||||
|
||||
// Build the golden status object
|
||||
got := GoldenGPUStatus{
|
||||
GPUCount: count,
|
||||
GPUType: string(result.Info.GPUType),
|
||||
ConfiguredVendor: result.Info.ConfiguredVendor,
|
||||
DetectionMethod: string(result.Info.DetectionMethod),
|
||||
BuildTags: buildTags,
|
||||
NativeAvailable: buildTags["native_libs"] && buildTags["cgo"],
|
||||
}
|
||||
|
||||
// Validate against build-specific expectations
|
||||
if buildTags["native_libs"] && buildTags["cgo"] {
|
||||
// Real NVML build: Should detect actual GPUs or get real NVML error
|
||||
// GPU count may be 0 if no NVIDIA hardware, but detection method should be config
|
||||
if got.DetectionMethod != "config" {
|
||||
t.Errorf("cgo+native_libs: DetectionMethod = %v, want 'config'", got.DetectionMethod)
|
||||
}
|
||||
} else if buildTags["cgo"] {
|
||||
// CGO without native_libs: Stub returns 0
|
||||
if got.GPUCount != 0 {
|
||||
t.Logf("cgo-only build: GPUCount = %d (expected 0 from stub)", got.GPUCount)
|
||||
}
|
||||
if got.NativeAvailable {
|
||||
t.Error("cgo-only build: NativeAvailable should be false")
|
||||
}
|
||||
} else {
|
||||
// No CGO: Stub returns 0
|
||||
if got.GPUCount != 0 {
|
||||
t.Logf("nocgo build: GPUCount = %d (expected 0 from stub)", got.GPUCount)
|
||||
}
|
||||
if got.NativeAvailable {
|
||||
t.Error("nocgo build: NativeAvailable should be false")
|
||||
}
|
||||
}
|
||||
|
||||
// Common validations
|
||||
if got.ConfiguredVendor != "nvidia" {
|
||||
t.Errorf("ConfiguredVendor = %v, want 'nvidia'", got.ConfiguredVendor)
|
||||
}
|
||||
}
|
||||
|
||||
// TestGoldenGPUStatusAMDVendorAlias validates AMD aliasing is visible in output
|
||||
// Build tags: all three configurations
|
||||
// Runtime scenarios: amd config
|
||||
func TestGoldenGPUStatusAMDVendorAlias(t *testing.T) {
|
||||
cfg := &worker.Config{
|
||||
GPUVendor: "amd",
|
||||
}
|
||||
|
||||
factory := &worker.GPUDetectorFactory{}
|
||||
result := factory.CreateDetectorWithInfo(cfg)
|
||||
|
||||
buildTags := detectBuildTags()
|
||||
|
||||
got := GoldenGPUStatus{
|
||||
GPUCount: result.Detector.DetectGPUCount(),
|
||||
GPUType: string(result.Info.GPUType),
|
||||
ConfiguredVendor: result.Info.ConfiguredVendor,
|
||||
DetectionMethod: string(result.Info.DetectionMethod),
|
||||
BuildTags: buildTags,
|
||||
NativeAvailable: buildTags["native_libs"] && buildTags["cgo"],
|
||||
}
|
||||
|
||||
// The key assertion: configured_vendor should be "amd" but GPUType should be "nvidia"
|
||||
// This makes the aliasing visible in status output
|
||||
if got.ConfiguredVendor != "amd" {
|
||||
t.Errorf("AMD config: ConfiguredVendor = %v, want 'amd'", got.ConfiguredVendor)
|
||||
}
|
||||
if got.GPUType != "nvidia" {
|
||||
t.Errorf("AMD config: GPUType = %v, want 'nvidia' (AMD aliased to NVIDIA implementation)", got.GPUType)
|
||||
}
|
||||
}
|
||||
|
||||
// TestGoldenGPUStatusEnvOverride validates env override behavior across build configs
|
||||
// Build tags: all three
|
||||
// Runtime scenarios: env override set
|
||||
func TestGoldenGPUStatusEnvOverride(t *testing.T) {
|
||||
// Set env override
|
||||
os.Setenv("FETCH_ML_GPU_TYPE", "nvidia")
|
||||
os.Setenv("FETCH_ML_GPU_COUNT", "4")
|
||||
defer os.Unsetenv("FETCH_ML_GPU_TYPE")
|
||||
defer os.Unsetenv("FETCH_ML_GPU_COUNT")
|
||||
|
||||
factory := &worker.GPUDetectorFactory{}
|
||||
result := factory.CreateDetectorWithInfo(&worker.Config{GPUVendor: "apple"})
|
||||
|
||||
buildTags := detectBuildTags()
|
||||
|
||||
got := GoldenGPUStatus{
|
||||
GPUCount: result.Detector.DetectGPUCount(),
|
||||
GPUType: string(result.Info.GPUType),
|
||||
ConfiguredVendor: result.Info.ConfiguredVendor,
|
||||
DetectionMethod: string(result.Info.DetectionMethod),
|
||||
EnvOverrideType: result.Info.EnvOverrideType,
|
||||
EnvOverrideCount: result.Info.EnvOverrideCount,
|
||||
BuildTags: buildTags,
|
||||
}
|
||||
|
||||
// Env should take precedence over config
|
||||
if got.DetectionMethod != "env_override_both" {
|
||||
t.Errorf("Env override: DetectionMethod = %v, want 'env_override_both'", got.DetectionMethod)
|
||||
}
|
||||
if got.GPUType != "nvidia" {
|
||||
t.Errorf("Env override: GPUType = %v, want 'nvidia'", got.GPUType)
|
||||
}
|
||||
if got.EnvOverrideType != "nvidia" {
|
||||
t.Errorf("Env override: EnvOverrideType = %v, want 'nvidia'", got.EnvOverrideType)
|
||||
}
|
||||
if got.EnvOverrideCount != 4 {
|
||||
t.Errorf("Env override: EnvOverrideCount = %v, want 4", got.EnvOverrideCount)
|
||||
}
|
||||
}
|
||||
|
||||
// TestGoldenGPUStatusMacOS validates macOS detection when running on Darwin
|
||||
// Build tags: cgo+native_libs on Darwin
|
||||
// Runtime scenarios: darwin
|
||||
func TestGoldenGPUStatusMacOS(t *testing.T) {
|
||||
if !worker.IsMacOS() {
|
||||
t.Skip("Skipping macOS-specific test on non-Darwin platform")
|
||||
}
|
||||
|
||||
cfg := &worker.Config{
|
||||
GPUVendor: "apple",
|
||||
AppleGPU: worker.AppleGPUConfig{Enabled: true},
|
||||
}
|
||||
|
||||
factory := &worker.GPUDetectorFactory{}
|
||||
result := factory.CreateDetectorWithInfo(cfg)
|
||||
|
||||
buildTags := detectBuildTags()
|
||||
|
||||
got := GoldenGPUStatus{
|
||||
GPUCount: result.Detector.DetectGPUCount(),
|
||||
GPUType: string(result.Info.GPUType),
|
||||
ConfiguredVendor: result.Info.ConfiguredVendor,
|
||||
DetectionMethod: string(result.Info.DetectionMethod),
|
||||
BuildTags: buildTags,
|
||||
NativeAvailable: buildTags["darwin"],
|
||||
}
|
||||
|
||||
if got.ConfiguredVendor != "apple" {
|
||||
t.Errorf("macOS: ConfiguredVendor = %v, want 'apple'", got.ConfiguredVendor)
|
||||
}
|
||||
if got.GPUType != "apple" {
|
||||
t.Errorf("macOS: GPUType = %v, want 'apple'", got.GPUType)
|
||||
}
|
||||
if !got.BuildTags["darwin"] {
|
||||
t.Error("macOS: darwin build tag should be true")
|
||||
}
|
||||
}
|
||||
|
||||
// TestGoldenGPUStatusNone validates no-GPU configuration
|
||||
// Build tags: all three
|
||||
// Runtime scenarios: none
|
||||
func TestGoldenGPUStatusNone(t *testing.T) {
|
||||
cfg := &worker.Config{
|
||||
GPUVendor: "none",
|
||||
}
|
||||
|
||||
factory := &worker.GPUDetectorFactory{}
|
||||
result := factory.CreateDetectorWithInfo(cfg)
|
||||
|
||||
if result.Detector.DetectGPUCount() != 0 {
|
||||
t.Errorf("none config: GPUCount = %v, want 0", result.Detector.DetectGPUCount())
|
||||
}
|
||||
if result.Info.ConfiguredVendor != "none" {
|
||||
t.Errorf("none config: ConfiguredVendor = %v, want 'none'", result.Info.ConfiguredVendor)
|
||||
}
|
||||
}
|
||||
|
||||
// TestGoldenJSONSerialization validates the GPU status serializes to JSON correctly
|
||||
func TestGoldenJSONSerialization(t *testing.T) {
|
||||
os.Setenv("FETCH_ML_GPU_TYPE", "nvidia")
|
||||
os.Setenv("FETCH_ML_GPU_COUNT", "2")
|
||||
defer os.Unsetenv("FETCH_ML_GPU_TYPE")
|
||||
defer os.Unsetenv("FETCH_ML_GPU_COUNT")
|
||||
|
||||
factory := &worker.GPUDetectorFactory{}
|
||||
result := factory.CreateDetectorWithInfo(nil)
|
||||
|
||||
status := GoldenGPUStatus{
|
||||
GPUCount: result.Detector.DetectGPUCount(),
|
||||
GPUType: string(result.Info.GPUType),
|
||||
ConfiguredVendor: result.Info.ConfiguredVendor,
|
||||
DetectionMethod: string(result.Info.DetectionMethod),
|
||||
EnvOverrideType: result.Info.EnvOverrideType,
|
||||
EnvOverrideCount: result.Info.EnvOverrideCount,
|
||||
BuildTags: detectBuildTags(),
|
||||
}
|
||||
|
||||
// Serialize to JSON (this mimics what ml status --json would output)
|
||||
jsonData, err := json.MarshalIndent(status, "", " ")
|
||||
if err != nil {
|
||||
t.Fatalf("JSON serialization failed: %v", err)
|
||||
}
|
||||
|
||||
// Verify JSON can be parsed back
|
||||
var parsed GoldenGPUStatus
|
||||
if err := json.Unmarshal(jsonData, &parsed); err != nil {
|
||||
t.Fatalf("JSON deserialization failed: %v", err)
|
||||
}
|
||||
|
||||
if parsed.ConfiguredVendor != status.ConfiguredVendor {
|
||||
t.Errorf("JSON roundtrip: ConfiguredVendor mismatch")
|
||||
}
|
||||
if parsed.DetectionMethod != status.DetectionMethod {
|
||||
t.Errorf("JSON roundtrip: DetectionMethod mismatch")
|
||||
}
|
||||
}
|
||||
|
||||
// TestBuildTagMatrix validates that all expected build tag combinations are testable
|
||||
// This test documents the three build configurations:
|
||||
// 1. cgo + native_libs: Real native library implementations
|
||||
// 2. cgo without native_libs: Stubs that return errors
|
||||
// 3. !cgo: Stubs that return "disabled (no CGO)"
|
||||
func TestBuildTagMatrix(t *testing.T) {
|
||||
tags := detectBuildTags()
|
||||
|
||||
// Log the current build configuration for CI visibility
|
||||
t.Logf("Build configuration: cgo=%v native_libs=%v darwin=%v linux=%v",
|
||||
tags["cgo"], tags["native_libs"], tags["darwin"], tags["linux"])
|
||||
|
||||
// Validate SIMD implementation name matches build tags
|
||||
simdName := worker.GetSIMDImplName()
|
||||
t.Logf("SIMD implementation: %s", simdName)
|
||||
|
||||
switch {
|
||||
case tags["native_libs"]:
|
||||
// Should have real implementation name (avx2, sha_ni, armv8_crypto, or generic)
|
||||
if simdName == "disabled" || simdName == "disabled (no CGO)" {
|
||||
t.Errorf("native_libs build: SIMD impl should be active, got %q", simdName)
|
||||
}
|
||||
case tags["cgo"]:
|
||||
// Should be disabled without native_libs
|
||||
if simdName != "disabled" {
|
||||
t.Errorf("cgo-only build: SIMD impl should be 'disabled', got %q", simdName)
|
||||
}
|
||||
default:
|
||||
// No CGO
|
||||
if simdName != "disabled (no CGO)" {
|
||||
t.Errorf("nocgo build: SIMD impl should be 'disabled (no CGO)', got %q", simdName)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -30,7 +30,7 @@ func TestScanArtifacts_SkipsKnownPathsAndLogs(t *testing.T) {
|
|||
mustWrite("checkpoints/best.pt", []byte("checkpoint"))
|
||||
mustWrite("plots/loss.png", []byte("png"))
|
||||
|
||||
art, err := worker.ScanArtifacts(runDir)
|
||||
art, err := worker.ScanArtifacts(runDir, false)
|
||||
if err != nil {
|
||||
t.Fatalf("scanArtifacts: %v", err)
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue