feat: implement NVML-based GPU monitoring

- Add native/nvml_gpu/ C++ library wrapping NVIDIA Management Library
- Add Go bindings in internal/worker/gpu_nvml_native.go and gpu_nvml_stub.go
- Update gpu_detector.go to use NVML for accurate GPU count detection
- Update native/CMakeLists.txt to build nvml_gpu library
- Provides real-time GPU utilization, memory, temperature, clocks, power
- Falls back to environment variable when NVML unavailable
This commit is contained in:
Jeremie Fraeys 2026-02-21 15:16:09 -05:00
parent d6265df0bd
commit 05b7af6991
No known key found for this signature in database
7 changed files with 532 additions and 1 deletions

View file

@ -26,10 +26,18 @@ type GPUDetector interface {
type NVIDIADetector struct{}
// DetectGPUCount reports how many NVIDIA GPUs this worker should use.
//
// NVML is consulted first; its answer is used only when the query succeeds and
// reports at least one device. Otherwise the FETCH_ML_GPU_COUNT environment
// variable is used, provided envInt reports it as present and non-negative.
// When neither source yields a value, 0 is returned.
func (d *NVIDIADetector) DetectGPUCount() int {
	if IsNVMLAvailable() {
		if n, err := GetGPUCount(); err == nil && n > 0 {
			return n
		}
	}

	envCount, found := envInt("FETCH_ML_GPU_COUNT")
	if found && envCount >= 0 {
		return envCount
	}

	// Other probes (e.g. nvidia-smi) could be added here.
	return 0
}

View file

@ -0,0 +1,169 @@
//go:build cgo && native_libs
// +build cgo,native_libs
package worker
// #cgo LDFLAGS: -L${SRCDIR}/../../native/build -Wl,-rpath,${SRCDIR}/../../native/build -lnvml_gpu -lnvidia-ml
// #include "../../native/nvml_gpu/nvml_gpu.h"
// #include <stdlib.h>
import "C"
import (
	"errors"
	"fmt"
	"runtime"
)
// GPUInfo is a point-in-time snapshot of one GPU as reported by the native
// NVML wrapper (gpu_info_t on the C side).
type GPUInfo struct {
	Index       uint32 // Device index the snapshot was requested for
	Name        string // Device name
	Utilization uint32 // GPU utilization percentage (0-100)

	// Memory used and total memory, both in bytes.
	MemoryUsed, MemoryTotal uint64

	Temperature uint32 // Temperature in Celsius
	PowerDraw   uint32 // Power draw in milliwatts

	// SM clock and memory clock, both in MHz.
	ClockSM, ClockMemory uint32

	// PCIe generation and PCIe link width.
	PCIeGen, PCIeWidth uint32

	// GPU UUID and VBIOS version.
	UUID, VBIOSVersion string
}
// nvmlInitialized records whether InitNVML has succeeded and ShutdownNVML has
// not run since. It is a plain bool with no synchronization around it.
var nvmlInitialized bool
// InitNVML initializes the NVML library
func InitNVML() error {
result := C.gpu_init()
if result != 0 {
return errors.New(C.GoString(C.gpu_last_error()))
}
nvmlInitialized = true
return nil
}
// ShutdownNVML releases NVML if this package initialized it; when NVML was
// never initialized the call does nothing.
func ShutdownNVML() {
	if !nvmlInitialized {
		return
	}
	C.gpu_shutdown()
	nvmlInitialized = false
}
// IsNVMLAvailable reports whether the native wrapper's runtime probe
// (gpu_is_available) says NVML can be used.
func IsNVMLAvailable() bool {
	available := C.gpu_is_available()
	return available == 1
}
// GetGPUCount returns the number of GPUs reported by NVML, initializing NVML
// first if that has not happened yet.
//
// An error is returned when initialization fails or when the native wrapper
// reports a negative count; the message comes from gpu_last_error.
func GetGPUCount() (int, error) {
	// gpu_last_error reads a thread-local buffer on the C side, so the failing
	// call and the error lookup must run on the same OS thread.
	runtime.LockOSThread()
	defer runtime.UnlockOSThread()

	if !nvmlInitialized {
		if err := InitNVML(); err != nil {
			return 0, err
		}
	}

	n := C.gpu_get_count()
	if n < 0 {
		return 0, errors.New(C.GoString(C.gpu_last_error()))
	}
	return int(n), nil
}
// GetGPUInfo returns a snapshot of the GPU at the given index, initializing
// NVML first if that has not happened yet.
//
// An error is returned when initialization fails or when the native wrapper
// reports a failure; the message comes from gpu_last_error.
func GetGPUInfo(index uint32) (*GPUInfo, error) {
	// gpu_last_error reads a thread-local buffer on the C side, so the failing
	// call and the error lookup must run on the same OS thread.
	runtime.LockOSThread()
	defer runtime.UnlockOSThread()

	if !nvmlInitialized {
		if err := InitNVML(); err != nil {
			return nil, err
		}
	}

	var raw C.gpu_info_t
	if rc := C.gpu_get_info(C.uint32_t(index), &raw); rc != 0 {
		return nil, errors.New(C.GoString(C.gpu_last_error()))
	}

	// The char arrays are NUL-terminated C strings; copy them into Go strings.
	return &GPUInfo{
		Index:        uint32(raw.index),
		Name:         C.GoString(&raw.name[0]),
		Utilization:  uint32(raw.utilization),
		MemoryUsed:   uint64(raw.memory_used),
		MemoryTotal:  uint64(raw.memory_total),
		Temperature:  uint32(raw.temperature),
		PowerDraw:    uint32(raw.power_draw),
		ClockSM:      uint32(raw.clock_sm),
		ClockMemory:  uint32(raw.clock_memory),
		PCIeGen:      uint32(raw.pcie_gen),
		PCIeWidth:    uint32(raw.pcie_width),
		UUID:         C.GoString(&raw.uuid[0]),
		VBIOSVersion: C.GoString(&raw.vbios_version[0]),
	}, nil
}
// GetAllGPUInfo collects a snapshot for every GPU index in [0, GetGPUCount()).
//
// The first failure aborts the scan: no partial slice is returned, and the
// error identifies the index that could not be queried.
func GetAllGPUInfo() ([]*GPUInfo, error) {
	total, err := GetGPUCount()
	if err != nil {
		return nil, err
	}

	result := make([]*GPUInfo, 0, total)
	for idx := 0; idx < total; idx++ {
		gpu, infoErr := GetGPUInfo(uint32(idx))
		if infoErr != nil {
			return nil, fmt.Errorf("failed to get GPU %d info: %w", idx, infoErr)
		}
		result = append(result, gpu)
	}
	return result, nil
}
// GetGPUUtilization returns the current utilization of the GPU at the given
// index, initializing NVML first if that has not happened yet.
//
// An error is returned when initialization fails or when the native wrapper
// reports a failure; the message comes from gpu_last_error.
func GetGPUUtilization(index uint32) (uint32, error) {
	// gpu_last_error reads a thread-local buffer on the C side, so the failing
	// call and the error lookup must run on the same OS thread.
	runtime.LockOSThread()
	defer runtime.UnlockOSThread()

	if !nvmlInitialized {
		if err := InitNVML(); err != nil {
			return 0, err
		}
	}

	var utilization C.uint32_t
	if rc := C.gpu_get_utilization(C.uint32_t(index), &utilization); rc != 0 {
		return 0, errors.New(C.GoString(C.gpu_last_error()))
	}
	return uint32(utilization), nil
}
// GetGPUMemory returns the used and total memory, in bytes, of the GPU at the
// given index, initializing NVML first if that has not happened yet.
//
// An error is returned when initialization fails or when the native wrapper
// reports a failure; the message comes from gpu_last_error.
func GetGPUMemory(index uint32) (used uint64, total uint64, err error) {
	// gpu_last_error reads a thread-local buffer on the C side, so the failing
	// call and the error lookup must run on the same OS thread.
	runtime.LockOSThread()
	defer runtime.UnlockOSThread()

	if !nvmlInitialized {
		if initErr := InitNVML(); initErr != nil {
			return 0, 0, initErr
		}
	}

	var cUsed, cTotal C.uint64_t
	if rc := C.gpu_get_memory(C.uint32_t(index), &cUsed, &cTotal); rc != 0 {
		return 0, 0, errors.New(C.GoString(C.gpu_last_error()))
	}
	return uint64(cUsed), uint64(cTotal), nil
}
// GetGPUTemperature returns the current temperature of the GPU at the given
// index, initializing NVML first if that has not happened yet.
//
// An error is returned when initialization fails or when the native wrapper
// reports a failure; the message comes from gpu_last_error.
func GetGPUTemperature(index uint32) (uint32, error) {
	// gpu_last_error reads a thread-local buffer on the C side, so the failing
	// call and the error lookup must run on the same OS thread.
	runtime.LockOSThread()
	defer runtime.UnlockOSThread()

	if !nvmlInitialized {
		if err := InitNVML(); err != nil {
			return 0, err
		}
	}

	var temp C.uint32_t
	if rc := C.gpu_get_temperature(C.uint32_t(index), &temp); rc != 0 {
		return 0, errors.New(C.GoString(C.gpu_last_error()))
	}
	return uint32(temp), nil
}

View file

@ -0,0 +1,42 @@
//go:build cgo && !native_libs
// +build cgo,!native_libs
package worker
import "errors"
// Stub implementations used when the native_libs build tag is not present.

// InitNVML always fails in stub builds because the native wrapper is not linked.
func InitNVML() error {
	const msg = "NVML requires native_libs build tag"
	return errors.New(msg)
}
// ShutdownNVML is a no-op in stub builds; it exists so callers compile
// without the native_libs build tag.
func ShutdownNVML() {}
// IsNVMLAvailable always reports false in stub builds.
func IsNVMLAvailable() bool {
	const nativeBuild = false // stub builds never link the NVML wrapper
	return nativeBuild
}
// GetGPUCount always returns 0 and an error in stub builds.
func GetGPUCount() (int, error) {
	err := errors.New("NVML requires native_libs build tag")
	return 0, err
}
// GPUInfo holds information about a GPU.
//
// It is declared here because the only other declaration lives in the file
// built with the native_libs tag; without this copy the stub signatures below
// would reference an undefined type. Keep the fields in sync with that file.
type GPUInfo struct {
	Index        uint32
	Name         string
	Utilization  uint32 // GPU utilization percentage (0-100)
	MemoryUsed   uint64 // Memory used in bytes
	MemoryTotal  uint64 // Total memory in bytes
	Temperature  uint32 // Temperature in Celsius
	PowerDraw    uint32 // Power draw in milliwatts
	ClockSM      uint32 // SM clock in MHz
	ClockMemory  uint32 // Memory clock in MHz
	PCIeGen      uint32 // PCIe generation
	PCIeWidth    uint32 // PCIe link width
	UUID         string // GPU UUID
	VBIOSVersion string // VBIOS version
}

// GetGPUInfo always returns nil and an error in stub builds.
func GetGPUInfo(index uint32) (*GPUInfo, error) {
	return nil, errors.New("NVML requires native_libs build tag")
}
// GetAllGPUInfo always returns a nil slice and an error in stub builds.
func GetAllGPUInfo() ([]*GPUInfo, error) {
	err := errors.New("NVML requires native_libs build tag")
	return nil, err
}
// GetGPUUtilization always returns 0 and an error in stub builds; index is ignored.
func GetGPUUtilization(index uint32) (uint32, error) {
	const msg = "NVML requires native_libs build tag"
	return 0, errors.New(msg)
}
// GetGPUMemory always returns zero values and an error in stub builds; index is ignored.
func GetGPUMemory(index uint32) (used uint64, total uint64, err error) {
	err = errors.New("NVML requires native_libs build tag")
	return 0, 0, err
}
// GetGPUTemperature always returns 0 and an error in stub builds; index is ignored.
func GetGPUTemperature(index uint32) (uint32, error) {
	var none uint32
	return none, errors.New("NVML requires native_libs build tag")
}

View file

@ -71,6 +71,7 @@ enable_testing()
# Native subprojects; each directory supplies its own CMakeLists.txt.
add_subdirectory(common)
add_subdirectory(queue_index)
add_subdirectory(dataset_hash)
# NVML-based GPU monitoring library (defines the nvml_gpu target).
add_subdirectory(nvml_gpu)
# Tests from root tests/ directory
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/tests)
@ -119,4 +120,5 @@ endif()
# Aggregate target: building all_native_libs builds every library target
# listed after DEPENDS.
add_custom_target(all_native_libs DEPENDS
queue_index
dataset_hash
nvml_gpu
)

View file

@ -0,0 +1,35 @@
# Shared library wrapping NVML behind the C API declared in nvml_gpu.h.
add_library(nvml_gpu SHARED
nvml_gpu.cpp
)
# PUBLIC so targets linking nvml_gpu also get this directory (nvml_gpu.h) on
# their include path.
target_include_directories(nvml_gpu PUBLIC
${CMAKE_CURRENT_SOURCE_DIR}
)
# Locate the NVML library. The listed directories are searched in addition to
# CMake's default locations.
find_library(NVML_LIBRARY nvidia-ml
  PATHS
    /usr/lib/x86_64-linux-gnu
    /usr/local/cuda/lib64
    /usr/lib64
    /usr/lib
    /opt/cuda/lib64
  DOC "NVIDIA Management Library"
)

# Locate nvml.h. CUDA toolkit installs keep it under <cuda>/include, which is
# not on the compiler's default include path.
find_path(NVML_INCLUDE_DIR nvml.h
  PATHS
    /usr/local/cuda/include
    /opt/cuda/include
    /usr/include
  DOC "Directory containing nvml.h"
)

if(NVML_LIBRARY)
  target_link_libraries(nvml_gpu PRIVATE ${NVML_LIBRARY})
  if(NVML_INCLUDE_DIR)
    # SYSTEM keeps warnings from the vendor header out of our build output.
    target_include_directories(nvml_gpu SYSTEM PRIVATE ${NVML_INCLUDE_DIR})
  endif()
  message(STATUS "Found NVML: ${NVML_LIBRARY}")
else()
  message(WARNING "NVML library not found. GPU monitoring will be disabled.")
  # Ask the sources to build as a stub that always reports "unavailable".
  # NOTE(review): this only works if nvml_gpu.cpp guards its <nvml.h> include
  # and NVML calls behind NVML_STUB; confirm the source honors the definition,
  # otherwise this branch fails to compile or link.
  target_compile_definitions(nvml_gpu PRIVATE NVML_STUB)
endif()
# Library versioning, PIC, and language standards for the nvml_gpu target.
# NOTE(review): VERSION/SOVERSION assume the enclosing project() call declares
# a version; if it does not, both variables expand to nothing. Confirm.
set_target_properties(nvml_gpu PROPERTIES
VERSION ${PROJECT_VERSION}
SOVERSION ${PROJECT_VERSION_MAJOR}
POSITION_INDEPENDENT_CODE ON
C_STANDARD 11
CXX_STANDARD 17
)

View file

@ -0,0 +1,209 @@
#include "nvml_gpu.h"
#include <nvml.h>
#include <stdio.h>
#include <string.h>
// Per-thread storage for the most recent error message.
static __thread char last_error_buffer[256] = {0};

// Copies msg into the calling thread's error buffer. Messages longer than the
// buffer are truncated; the result is always NUL-terminated.
static void set_error(const char* msg) {
    const size_t last = sizeof(last_error_buffer) - 1;
    strncpy(last_error_buffer, msg, last);
    last_error_buffer[last] = '\0';
}
int gpu_init(void) {
nvmlReturn_t result = nvmlInit();
if (result != NVML_SUCCESS) {
set_error(nvmlErrorString(result));
return -1;
}
return 0;
}
// Shuts NVML down. The NVML status code is intentionally discarded.
void gpu_shutdown(void) {
    (void)nvmlShutdown();
}
int gpu_get_count(void) {
unsigned int count = 0;
nvmlReturn_t result = nvmlDeviceGetCount(&count);
if (result != NVML_SUCCESS) {
set_error(nvmlErrorString(result));
return -1;
}
return (int)count;
}
int gpu_get_info(uint32_t index, gpu_info_t* info) {
if (!info) {
set_error("null info pointer");
return -1;
}
memset(info, 0, sizeof(gpu_info_t));
info->index = index;
nvmlDevice_t device;
nvmlReturn_t result = nvmlDeviceGetHandleByIndex(index, &device);
if (result != NVML_SUCCESS) {
set_error(nvmlErrorString(result));
return -1;
}
// Get name
result = nvmlDeviceGetName(device, info->name, sizeof(info->name));
if (result != NVML_SUCCESS) {
set_error(nvmlErrorString(result));
return -1;
}
// Get UUID
result = nvmlDeviceGetUUID(device, info->uuid, sizeof(info->uuid));
if (result != NVML_SUCCESS) {
// Non-critical, continue
strcpy(info->uuid, "unknown");
}
// Get VBIOS version
result = nvmlDeviceGetVbiosVersion(device, info->vbios_version, sizeof(info->vbios_version));
if (result != NVML_SUCCESS) {
strcpy(info->vbios_version, "unknown");
}
// Get utilization
nvmlUtilization_t utilization;
result = nvmlDeviceGetUtilizationRates(device, &utilization);
if (result == NVML_SUCCESS) {
info->utilization = utilization.gpu;
}
// Get memory info
nvmlMemory_t memory;
result = nvmlDeviceGetMemoryInfo(device, &memory);
if (result == NVML_SUCCESS) {
info->memory_used = memory.used;
info->memory_total = memory.total;
}
// Get temperature
result = nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &info->temperature);
if (result != NVML_SUCCESS) {
info->temperature = 0; // Not available
}
// Get power draw
unsigned int power_mw;
result = nvmlDeviceGetPowerUsage(device, &power_mw);
if (result == NVML_SUCCESS) {
info->power_draw = power_mw;
}
// Get clocks
unsigned int clock;
result = nvmlDeviceGetClock(device, NVML_CLOCK_SM, NVML_CLOCK_ID_CURRENT, &clock);
if (result == NVML_SUCCESS) {
info->clock_sm = clock;
}
result = nvmlDeviceGetClock(device, NVML_CLOCK_MEM, NVML_CLOCK_ID_CURRENT, &clock);
if (result == NVML_SUCCESS) {
info->clock_memory = clock;
}
// Get PCIe info
nvmlPciInfo_t pciInfo;
result = nvmlDeviceGetPciInfo(device, &pciInfo);
if (result == NVML_SUCCESS) {
// PCIe generation and width require NVML 11.0+
// For now, we leave them as 0
info->pcie_gen = 0;
info->pcie_width = 0;
}
return 0;
}
int gpu_get_utilization(uint32_t index, uint32_t* utilization) {
if (!utilization) {
set_error("null utilization pointer");
return -1;
}
nvmlDevice_t device;
nvmlReturn_t result = nvmlDeviceGetHandleByIndex(index, &device);
if (result != NVML_SUCCESS) {
set_error(nvmlErrorString(result));
return -1;
}
nvmlUtilization_t util;
result = nvmlDeviceGetUtilizationRates(device, &util);
if (result != NVML_SUCCESS) {
set_error(nvmlErrorString(result));
return -1;
}
*utilization = util.gpu;
return 0;
}
int gpu_get_memory(uint32_t index, uint64_t* used, uint64_t* total) {
if (!used || !total) {
set_error("null pointer");
return -1;
}
nvmlDevice_t device;
nvmlReturn_t result = nvmlDeviceGetHandleByIndex(index, &device);
if (result != NVML_SUCCESS) {
set_error(nvmlErrorString(result));
return -1;
}
nvmlMemory_t memory;
result = nvmlDeviceGetMemoryInfo(device, &memory);
if (result != NVML_SUCCESS) {
set_error(nvmlErrorString(result));
return -1;
}
*used = memory.used;
*total = memory.total;
return 0;
}
int gpu_get_temperature(uint32_t index, uint32_t* temp) {
if (!temp) {
set_error("null temp pointer");
return -1;
}
nvmlDevice_t device;
nvmlReturn_t result = nvmlDeviceGetHandleByIndex(index, &device);
if (result != NVML_SUCCESS) {
set_error(nvmlErrorString(result));
return -1;
}
unsigned int temperature;
result = nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature);
if (result != NVML_SUCCESS) {
set_error(nvmlErrorString(result));
return -1;
}
*temp = temperature;
return 0;
}
int gpu_is_available(void) {
nvmlReturn_t result = nvmlInit();
if (result == NVML_SUCCESS) {
nvmlShutdown();
return 1;
}
return 0;
}
// Returns the calling thread's error buffer. The buffer is overwritten by the
// next recorded error on that thread and is not cleared on success.
const char* gpu_last_error(void) {
    return &last_error_buffer[0];
}

View file

@ -0,0 +1,66 @@
#ifndef NVML_GPU_H
#define NVML_GPU_H
#include <stddef.h>
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
// GPU information structure: a snapshot of one device filled by gpu_get_info().
// gpu_get_info() zeroes the struct first, so any numeric field whose query
// fails reads as 0. String fields are NUL-terminated.
typedef struct {
uint32_t index; // Device index the snapshot was requested for
char name[256]; // Device name
uint32_t utilization; // GPU utilization percentage (0-100)
uint64_t memory_used; // Memory used in bytes
uint64_t memory_total; // Total memory in bytes
uint32_t temperature; // Temperature in Celsius (0 when not available)
uint32_t power_draw; // Power draw in milliwatts
uint32_t clock_sm; // SM clock in MHz
uint32_t clock_memory; // Memory clock in MHz
uint32_t pcie_gen; // PCIe generation
uint32_t pcie_width; // PCIe link width
char uuid[80]; // GPU UUID ("unknown" when the query fails)
char vbios_version[32]; // VBIOS version ("unknown" when the query fails)
} gpu_info_t;
// Initialize the NVML library.
// Returns 0 on success, non-zero on failure; on failure the message is
// available from gpu_last_error().
int gpu_init(void);
// Shut down the NVML library.
void gpu_shutdown(void);
// Get the number of GPUs.
// Returns -1 on error (message via gpu_last_error()), >= 0 on success.
int gpu_get_count(void);
// Fill *info with a snapshot of the GPU at `index`.
// Returns 0 on success, non-zero on failure (null `info`, unknown index, or
// the device name could not be read). Other fields are best-effort.
int gpu_get_info(uint32_t index, gpu_info_t* info);
// Get the current utilization percentage of the GPU at `index`.
// Returns 0 on success, non-zero on failure (including a null pointer).
int gpu_get_utilization(uint32_t index, uint32_t* utilization);
// Get used and total memory, in bytes, of the GPU at `index`.
// Returns 0 on success, non-zero on failure (including a null pointer).
int gpu_get_memory(uint32_t index, uint64_t* used, uint64_t* total);
// Get the temperature, in Celsius, of the GPU at `index`.
// Returns 0 on success, non-zero on failure (including a null pointer).
int gpu_get_temperature(uint32_t index, uint32_t* temp);
// Check whether NVML is available (runtime detection): NVML is initialized
// and, on success, shut down again straight away.
// Returns 1 if available, 0 if not.
int gpu_is_available(void);
// Get the last error message recorded on the calling thread. The buffer is
// thread-local: read it on the same thread as the failing call, before that
// thread records another error. It is not cleared by successful calls.
const char* gpu_last_error(void);
#ifdef __cplusplus
}
#endif
#endif // NVML_GPU_H