- Add native/nvml_gpu/ C++ library wrapping the NVIDIA Management Library (NVML)
- Add Go bindings in internal/worker/gpu_nvml_native.go and gpu_nvml_stub.go
- Update gpu_detector.go to use NVML for accurate GPU count detection
- Update native/CMakeLists.txt to build the nvml_gpu library
- Provides real-time GPU utilization, memory, temperature, clocks, and power
- Falls back to an environment variable when NVML is unavailable
169 lines
4.1 KiB
Go
169 lines
4.1 KiB
Go
//go:build cgo && native_libs
|
|
// +build cgo,native_libs
|
|
|
|
package worker
|
|
|
|
// #cgo LDFLAGS: -L${SRCDIR}/../../native/build -Wl,-rpath,${SRCDIR}/../../native/build -lnvml_gpu -lnvidia-ml
|
|
// #include "../../native/nvml_gpu/nvml_gpu.h"
|
|
// #include <stdlib.h>
|
|
import "C"
|
|
import (
|
|
"errors"
|
|
"fmt"
|
|
)
|
|
|
|
// GPUInfo is the Go-side description of a single GPU. Field order and types
// are part of the type's layout and are kept exactly as originally declared.
type GPUInfo struct {
	// Index is the index of the GPU this record describes.
	Index uint32
	// Name is the GPU's name string.
	Name string
	// Utilization is the GPU utilization percentage (0-100).
	Utilization uint32
	// MemoryUsed is the memory in use, in bytes.
	MemoryUsed uint64
	// MemoryTotal is the total memory, in bytes.
	MemoryTotal uint64
	// Temperature is the temperature in degrees Celsius.
	Temperature uint32
	// PowerDraw is the power draw in milliwatts.
	PowerDraw uint32
	// ClockSM is the SM clock in MHz.
	ClockSM uint32
	// ClockMemory is the memory clock in MHz.
	ClockMemory uint32
	// PCIeGen is the PCIe generation.
	PCIeGen uint32
	// PCIeWidth is the PCIe link width.
	PCIeWidth uint32
	// UUID is the GPU UUID.
	UUID string
	// VBIOSVersion is the VBIOS version string.
	VBIOSVersion string
}
|
|
|
|
// nvmlInitialized is true after InitNVML has succeeded and until ShutdownNVML
// resets it. It starts out false. The code in this file reads and writes it
// without any synchronization.
var nvmlInitialized bool
|
|
|
|
// InitNVML initializes the NVML library
|
|
func InitNVML() error {
|
|
result := C.gpu_init()
|
|
if result != 0 {
|
|
return errors.New(C.GoString(C.gpu_last_error()))
|
|
}
|
|
nvmlInitialized = true
|
|
return nil
|
|
}
|
|
|
|
// ShutdownNVML shuts down the NVML library
|
|
func ShutdownNVML() {
|
|
if nvmlInitialized {
|
|
C.gpu_shutdown()
|
|
nvmlInitialized = false
|
|
}
|
|
}
|
|
|
|
// IsNVMLAvailable checks if NVML is available at runtime
|
|
func IsNVMLAvailable() bool {
|
|
return C.gpu_is_available() == 1
|
|
}
|
|
|
|
// GetGPUCount returns the number of GPUs
|
|
func GetGPUCount() (int, error) {
|
|
if !nvmlInitialized {
|
|
if err := InitNVML(); err != nil {
|
|
return 0, err
|
|
}
|
|
}
|
|
|
|
count := C.gpu_get_count()
|
|
if count < 0 {
|
|
return 0, errors.New(C.GoString(C.gpu_last_error()))
|
|
}
|
|
return int(count), nil
|
|
}
|
|
|
|
// GetGPUInfo returns detailed information about a GPU
|
|
func GetGPUInfo(index uint32) (*GPUInfo, error) {
|
|
if !nvmlInitialized {
|
|
if err := InitNVML(); err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
|
|
var cInfo C.gpu_info_t
|
|
result := C.gpu_get_info(C.uint32_t(index), &cInfo)
|
|
if result != 0 {
|
|
return nil, errors.New(C.GoString(C.gpu_last_error()))
|
|
}
|
|
|
|
return &GPUInfo{
|
|
Index: uint32(cInfo.index),
|
|
Name: C.GoString(&cInfo.name[0]),
|
|
Utilization: uint32(cInfo.utilization),
|
|
MemoryUsed: uint64(cInfo.memory_used),
|
|
MemoryTotal: uint64(cInfo.memory_total),
|
|
Temperature: uint32(cInfo.temperature),
|
|
PowerDraw: uint32(cInfo.power_draw),
|
|
ClockSM: uint32(cInfo.clock_sm),
|
|
ClockMemory: uint32(cInfo.clock_memory),
|
|
PCIeGen: uint32(cInfo.pcie_gen),
|
|
PCIeWidth: uint32(cInfo.pcie_width),
|
|
UUID: C.GoString(&cInfo.uuid[0]),
|
|
VBIOSVersion: C.GoString(&cInfo.vbios_version[0]),
|
|
}, nil
|
|
}
|
|
|
|
// GetAllGPUInfo returns information about all GPUs
|
|
func GetAllGPUInfo() ([]*GPUInfo, error) {
|
|
count, err := GetGPUCount()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
gpus := make([]*GPUInfo, 0, count)
|
|
for i := 0; i < count; i++ {
|
|
info, err := GetGPUInfo(uint32(i))
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to get GPU %d info: %w", i, err)
|
|
}
|
|
gpus = append(gpus, info)
|
|
}
|
|
return gpus, nil
|
|
}
|
|
|
|
// GetGPUUtilization returns the current GPU utilization
|
|
func GetGPUUtilization(index uint32) (uint32, error) {
|
|
if !nvmlInitialized {
|
|
if err := InitNVML(); err != nil {
|
|
return 0, err
|
|
}
|
|
}
|
|
|
|
var utilization C.uint32_t
|
|
result := C.gpu_get_utilization(C.uint32_t(index), &utilization)
|
|
if result != 0 {
|
|
return 0, errors.New(C.GoString(C.gpu_last_error()))
|
|
}
|
|
return uint32(utilization), nil
|
|
}
|
|
|
|
// GetGPUMemory returns the current GPU memory usage
|
|
func GetGPUMemory(index uint32) (used uint64, total uint64, err error) {
|
|
if !nvmlInitialized {
|
|
if err := InitNVML(); err != nil {
|
|
return 0, 0, err
|
|
}
|
|
}
|
|
|
|
var cUsed, cTotal C.uint64_t
|
|
result := C.gpu_get_memory(C.uint32_t(index), &cUsed, &cTotal)
|
|
if result != 0 {
|
|
return 0, 0, errors.New(C.GoString(C.gpu_last_error()))
|
|
}
|
|
return uint64(cUsed), uint64(cTotal), nil
|
|
}
|
|
|
|
// GetGPUTemperature returns the current GPU temperature
|
|
func GetGPUTemperature(index uint32) (uint32, error) {
|
|
if !nvmlInitialized {
|
|
if err := InitNVML(); err != nil {
|
|
return 0, err
|
|
}
|
|
}
|
|
|
|
var temp C.uint32_t
|
|
result := C.gpu_get_temperature(C.uint32_t(index), &temp)
|
|
if result != 0 {
|
|
return 0, errors.New(C.GoString(C.gpu_last_error()))
|
|
}
|
|
return uint32(temp), nil
|
|
}
|