fetch_ml/internal/worker/gpu_nvml_native.go
Jeremie Fraeys be39b37aec
feat: native GPU detection and NVML bridge for macOS and Linux
- Add dynamic NVML loading for Linux GPU detection
- Add macOS GPU detection via IOKit framework
- Add Zig NVML wrapper for cross-platform GPU queries
- Update native bridge to support platform-specific GPU libs
- Add CMake support for NVML dynamic library
2026-02-21 17:59:59 -05:00

169 lines
4.1 KiB
Go

//go:build cgo && native_libs && linux
// +build cgo,native_libs,linux
package worker
// #cgo LDFLAGS: -L${SRCDIR}/../../native/build -Wl,-rpath,${SRCDIR}/../../native/build -lnvml_gpu -lnvidia-ml
// #include "../../native/nvml_gpu/nvml_gpu.h"
// #include <stdlib.h>
import "C"
import (
"errors"
"fmt"
)
// GPUInfo describes one GPU: its identifying attributes together with the
// metric values read for it.
type GPUInfo struct {
	// Index is the device index of the GPU.
	Index uint32
	// Name is the device name.
	Name string
	// Utilization is the GPU utilization as a percentage (0-100).
	Utilization uint32
	// MemoryUsed is the amount of memory in use, in bytes.
	MemoryUsed uint64
	// MemoryTotal is the total amount of memory, in bytes.
	MemoryTotal uint64
	// Temperature is the temperature in degrees Celsius.
	Temperature uint32
	// PowerDraw is the power draw in milliwatts.
	PowerDraw uint32
	// ClockSM is the SM clock in MHz.
	ClockSM uint32
	// ClockMemory is the memory clock in MHz.
	ClockMemory uint32
	// PCIeGen is the PCIe generation.
	PCIeGen uint32
	// PCIeWidth is the PCIe link width.
	PCIeWidth uint32
	// UUID is the GPU UUID.
	UUID string
	// VBIOSVersion is the VBIOS version string.
	VBIOSVersion string
}
// nvmlInitialized is set once InitNVML succeeds and cleared again by
// ShutdownNVML; the query helpers consult it to decide whether they must
// initialize the library first.
//
// NOTE(review): the flag is read and written without any locking in this
// file — confirm callers serialize access.
var nvmlInitialized bool
// InitNVML initializes the native GPU library by calling gpu_init.
//
// A non-zero status from gpu_init is reported as an error whose text is taken
// from gpu_last_error; in that case the initialized flag is left untouched.
// On success the package is marked as initialized and nil is returned.
func InitNVML() error {
	if status := C.gpu_init(); status != 0 {
		msg := C.GoString(C.gpu_last_error())
		return errors.New(msg)
	}

	nvmlInitialized = true
	return nil
}
// ShutdownNVML releases the native GPU library via gpu_shutdown and clears
// the initialized flag. It does nothing when the package is not currently
// marked as initialized, so calling it repeatedly is harmless.
func ShutdownNVML() {
	if !nvmlInitialized {
		return
	}

	C.gpu_shutdown()
	nvmlInitialized = false
}
// IsNVMLAvailable reports whether the native library says NVML can be used
// at runtime. Only a gpu_is_available result of exactly 1 counts as
// available; every other value yields false.
func IsNVMLAvailable() bool {
	available := C.gpu_is_available()
	return available == 1
}
// GetGPUCount returns the number of GPUs reported by gpu_get_count.
//
// If the package has not been initialized yet, InitNVML is called first and
// any error from it is returned unchanged. A negative count from the native
// library is treated as a failure and reported with the text from
// gpu_last_error.
func GetGPUCount() (int, error) {
	if !nvmlInitialized {
		initErr := InitNVML()
		if initErr != nil {
			return 0, initErr
		}
	}

	n := C.gpu_get_count()
	if n < 0 {
		msg := C.GoString(C.gpu_last_error())
		return 0, errors.New(msg)
	}
	return int(n), nil
}
// GetGPUInfo returns the full set of attributes for the GPU at index.
//
// If the package has not been initialized yet, InitNVML is called first and
// any error from it is returned unchanged. The native gpu_get_info call fills
// a gpu_info_t record; a non-zero status is reported as an error with the
// text from gpu_last_error. On success every field of the record is copied
// into a freshly allocated GPUInfo, converting the C character arrays to Go
// strings.
func GetGPUInfo(index uint32) (*GPUInfo, error) {
	if !nvmlInitialized {
		initErr := InitNVML()
		if initErr != nil {
			return nil, initErr
		}
	}

	var raw C.gpu_info_t
	if status := C.gpu_get_info(C.uint32_t(index), &raw); status != 0 {
		msg := C.GoString(C.gpu_last_error())
		return nil, errors.New(msg)
	}

	info := new(GPUInfo)
	info.Index = uint32(raw.index)
	// The string fields are C char arrays; pass the address of the first
	// element so cgo reads up to the terminating NUL.
	info.Name = C.GoString(&raw.name[0])
	info.Utilization = uint32(raw.utilization)
	info.MemoryUsed = uint64(raw.memory_used)
	info.MemoryTotal = uint64(raw.memory_total)
	info.Temperature = uint32(raw.temperature)
	info.PowerDraw = uint32(raw.power_draw)
	info.ClockSM = uint32(raw.clock_sm)
	info.ClockMemory = uint32(raw.clock_memory)
	info.PCIeGen = uint32(raw.pcie_gen)
	info.PCIeWidth = uint32(raw.pcie_width)
	info.UUID = C.GoString(&raw.uuid[0])
	info.VBIOSVersion = C.GoString(&raw.vbios_version[0])
	return info, nil
}
// GetAllGPUInfo returns one GPUInfo per GPU, ordered by device index.
//
// The number of devices comes from GetGPUCount; an error from it is returned
// unchanged. Each device is then queried with GetGPUInfo. The first failing
// query aborts the whole call: nil is returned together with an error that
// names the device index and wraps the underlying cause.
func GetAllGPUInfo() ([]*GPUInfo, error) {
	total, err := GetGPUCount()
	if err != nil {
		return nil, err
	}

	// The final length is known up front, so fill the slice by index.
	all := make([]*GPUInfo, total)
	for idx := range all {
		gpu, infoErr := GetGPUInfo(uint32(idx))
		if infoErr != nil {
			return nil, fmt.Errorf("failed to get GPU %d info: %w", idx, infoErr)
		}
		all[idx] = gpu
	}
	return all, nil
}
// GetGPUUtilization returns the utilization value that gpu_get_utilization
// reports for the GPU at index.
//
// If the package has not been initialized yet, InitNVML is called first and
// any error from it is returned unchanged. A non-zero status from the native
// call is reported as an error with the text from gpu_last_error.
//
// NOTE(review): the value is presumably a percentage (0-100), matching the
// GPUInfo.Utilization field — confirm against nvml_gpu.h.
func GetGPUUtilization(index uint32) (uint32, error) {
	if !nvmlInitialized {
		initErr := InitNVML()
		if initErr != nil {
			return 0, initErr
		}
	}

	var pct C.uint32_t
	if status := C.gpu_get_utilization(C.uint32_t(index), &pct); status != 0 {
		msg := C.GoString(C.gpu_last_error())
		return 0, errors.New(msg)
	}
	return uint32(pct), nil
}
// GetGPUMemory returns the used and total memory values that gpu_get_memory
// reports for the GPU at index.
//
// If the package has not been initialized yet, InitNVML is called first and
// any error from it is returned unchanged. A non-zero status from the native
// call is reported as an error with the text from gpu_last_error; both sizes
// are zero whenever an error is returned.
//
// NOTE(review): the sizes are presumably in bytes, matching the
// GPUInfo.MemoryUsed / MemoryTotal fields — confirm against nvml_gpu.h.
func GetGPUMemory(index uint32) (used uint64, total uint64, err error) {
	if !nvmlInitialized {
		if initErr := InitNVML(); initErr != nil {
			return 0, 0, initErr
		}
	}

	var (
		rawUsed  C.uint64_t
		rawTotal C.uint64_t
	)
	status := C.gpu_get_memory(C.uint32_t(index), &rawUsed, &rawTotal)
	if status != 0 {
		msg := C.GoString(C.gpu_last_error())
		return 0, 0, errors.New(msg)
	}
	return uint64(rawUsed), uint64(rawTotal), nil
}
// GetGPUTemperature returns the temperature value that gpu_get_temperature
// reports for the GPU at index.
//
// If the package has not been initialized yet, InitNVML is called first and
// any error from it is returned unchanged. A non-zero status from the native
// call is reported as an error with the text from gpu_last_error.
//
// NOTE(review): the value is presumably in degrees Celsius, matching the
// GPUInfo.Temperature field — confirm against nvml_gpu.h.
func GetGPUTemperature(index uint32) (uint32, error) {
	if !nvmlInitialized {
		initErr := InitNVML()
		if initErr != nil {
			return 0, initErr
		}
	}

	var celsius C.uint32_t
	if status := C.gpu_get_temperature(C.uint32_t(index), &celsius); status != 0 {
		msg := C.GoString(C.gpu_last_error())
		return 0, errors.New(msg)
	}
	return uint32(celsius), nil
}