From 05b7af69911a4de1777c2499cbc3ad5a5d0efe87 Mon Sep 17 00:00:00 2001
From: Jeremie Fraeys
Date: Sat, 21 Feb 2026 15:16:09 -0500
Subject: [PATCH] feat: implement NVML-based GPU monitoring

- Add native/nvml_gpu/ C++ library wrapping NVIDIA Management Library
- Add Go bindings in internal/worker/gpu_nvml_native.go and gpu_nvml_stub.go
- Update gpu_detector.go to use NVML for accurate GPU count detection
- Update native/CMakeLists.txt to build nvml_gpu library
- Provides real-time GPU utilization, memory, temperature, clocks, power
- Falls back to environment variable when NVML unavailable
- Stub bindings also cover builds without cgo and declare GPUInfo themselves
- Go bindings pin the OS thread so thread-local native errors are not lost
- nvml_gpu builds as a stub (NVML_STUB) when NVML is not found
---
 internal/worker/gpu_detector.go    |  10 +-
 internal/worker/gpu_nvml_native.go | 209 ++++++++++++++++++++++++++++++++
 internal/worker/gpu_nvml_stub.go   |  76 ++++++++++++
 native/CMakeLists.txt              |   2 +
 native/nvml_gpu/CMakeLists.txt     |  46 +++++++
 native/nvml_gpu/nvml_gpu.cpp       | 216 +++++++++++++++++++++++++++++++++
 native/nvml_gpu/nvml_gpu.h         |  71 +++++++++++
 7 files changed, 629 insertions(+), 1 deletion(-)
 create mode 100644 internal/worker/gpu_nvml_native.go
 create mode 100644 internal/worker/gpu_nvml_stub.go
 create mode 100644 native/nvml_gpu/CMakeLists.txt
 create mode 100644 native/nvml_gpu/nvml_gpu.cpp
 create mode 100644 native/nvml_gpu/nvml_gpu.h

diff --git a/internal/worker/gpu_detector.go b/internal/worker/gpu_detector.go
index 86b1a4c..61693e6 100644
--- a/internal/worker/gpu_detector.go
+++ b/internal/worker/gpu_detector.go
@@ -26,10 +26,18 @@ type GPUDetector interface {
 type NVIDIADetector struct{}
 
 func (d *NVIDIADetector) DetectGPUCount() int {
+	// First try NVML for accurate detection
+	if IsNVMLAvailable() {
+		count, err := GetGPUCount()
+		if err == nil && count > 0 {
+			return count
+		}
+	}
+
+	// Fall back to environment variable
 	if n, ok := envInt("FETCH_ML_GPU_COUNT"); ok && n >= 0 {
 		return n
 	}
-	// Could use nvidia-sml or other detection methods here
 	return 0
 }
 
diff --git a/internal/worker/gpu_nvml_native.go b/internal/worker/gpu_nvml_native.go
new file mode 100644
index 0000000..2feb72a
--- /dev/null
+++ b/internal/worker/gpu_nvml_native.go
@@ -0,0 +1,209 @@
+//go:build cgo && native_libs
+// +build cgo,native_libs
+
+package worker
+
+// #cgo LDFLAGS: -L${SRCDIR}/../../native/build -Wl,-rpath,${SRCDIR}/../../native/build -lnvml_gpu -lnvidia-ml
+// #include "../../native/nvml_gpu/nvml_gpu.h"
+// #include <stdlib.h>
+import "C"
+import (
+	"errors"
+	"fmt"
+	"runtime"
+	"sync"
+)
+
+// GPUInfo holds information about a GPU.
+//
+// gpu_nvml_stub.go declares an identical type for builds without NVML; keep
+// the two definitions in sync.
+type GPUInfo struct {
+	Index        uint32
+	Name         string
+	Utilization  uint32 // GPU utilization percentage (0-100)
+	MemoryUsed   uint64 // Memory used in bytes
+	MemoryTotal  uint64 // Total memory in bytes
+	Temperature  uint32 // Temperature in Celsius
+	PowerDraw    uint32 // Power draw in milliwatts
+	ClockSM      uint32 // SM clock in MHz
+	ClockMemory  uint32 // Memory clock in MHz
+	PCIeGen      uint32 // PCIe generation
+	PCIeWidth    uint32 // PCIe link width
+	UUID         string // GPU UUID
+	VBIOSVersion string // VBIOS version
+}
+
+var (
+	// nvmlMu guards nvmlInitialized so concurrent callers cannot race on lazy
+	// initialization or shutdown.
+	nvmlMu          sync.Mutex
+	nvmlInitialized bool
+)
+
+// callNVML runs fn on a pinned OS thread and, when fn reports failure,
+// returns the native library's last error message as a Go error.
+//
+// The native library keeps its last error in thread-local storage, and the Go
+// scheduler may move a goroutine to another OS thread between two cgo calls.
+// Pinning the thread keeps the failing call and the gpu_last_error lookup on
+// the same thread so the message is not lost.
+func callNVML(fn func() bool) error {
+	runtime.LockOSThread()
+	defer runtime.UnlockOSThread()
+
+	if !fn() {
+		return errors.New(C.GoString(C.gpu_last_error()))
+	}
+	return nil
+}
+
+// InitNVML initializes the NVML library. It is safe to call repeatedly and
+// from multiple goroutines: once initialization has succeeded, later calls
+// return nil without touching NVML again.
+func InitNVML() error {
+	nvmlMu.Lock()
+	defer nvmlMu.Unlock()
+
+	if nvmlInitialized {
+		return nil
+	}
+	if err := callNVML(func() bool { return C.gpu_init() == 0 }); err != nil {
+		return err
+	}
+	nvmlInitialized = true
+	return nil
+}
+
+// ShutdownNVML shuts down the NVML library if InitNVML previously succeeded;
+// otherwise it does nothing.
+func ShutdownNVML() {
+	nvmlMu.Lock()
+	defer nvmlMu.Unlock()
+
+	if nvmlInitialized {
+		C.gpu_shutdown()
+		nvmlInitialized = false
+	}
+}
+
+// IsNVMLAvailable checks if NVML is available at runtime.
+func IsNVMLAvailable() bool {
+	return C.gpu_is_available() == 1
+}
+
+// GetGPUCount returns the number of GPUs, initializing NVML on first use.
+func GetGPUCount() (int, error) {
+	if err := InitNVML(); err != nil {
+		return 0, err
+	}
+
+	var count C.int
+	err := callNVML(func() bool {
+		count = C.gpu_get_count()
+		return count >= 0
+	})
+	if err != nil {
+		return 0, err
+	}
+	return int(count), nil
+}
+
+// GetGPUInfo returns detailed information about the GPU at index,
+// initializing NVML on first use.
+func GetGPUInfo(index uint32) (*GPUInfo, error) {
+	if err := InitNVML(); err != nil {
+		return nil, err
+	}
+
+	var cInfo C.gpu_info_t
+	err := callNVML(func() bool {
+		return C.gpu_get_info(C.uint32_t(index), &cInfo) == 0
+	})
+	if err != nil {
+		return nil, err
+	}
+
+	return &GPUInfo{
+		Index:        uint32(cInfo.index),
+		Name:         C.GoString(&cInfo.name[0]),
+		Utilization:  uint32(cInfo.utilization),
+		MemoryUsed:   uint64(cInfo.memory_used),
+		MemoryTotal:  uint64(cInfo.memory_total),
+		Temperature:  uint32(cInfo.temperature),
+		PowerDraw:    uint32(cInfo.power_draw),
+		ClockSM:      uint32(cInfo.clock_sm),
+		ClockMemory:  uint32(cInfo.clock_memory),
+		PCIeGen:      uint32(cInfo.pcie_gen),
+		PCIeWidth:    uint32(cInfo.pcie_width),
+		UUID:         C.GoString(&cInfo.uuid[0]),
+		VBIOSVersion: C.GoString(&cInfo.vbios_version[0]),
+	}, nil
+}
+
+// GetAllGPUInfo returns information about all GPUs. It fails as a whole if
+// any single GPU cannot be queried.
+func GetAllGPUInfo() ([]*GPUInfo, error) {
+	count, err := GetGPUCount()
+	if err != nil {
+		return nil, err
+	}
+
+	gpus := make([]*GPUInfo, 0, count)
+	for i := 0; i < count; i++ {
+		info, err := GetGPUInfo(uint32(i))
+		if err != nil {
+			return nil, fmt.Errorf("failed to get GPU %d info: %w", i, err)
+		}
+		gpus = append(gpus, info)
+	}
+	return gpus, nil
+}
+
+// GetGPUUtilization returns the current GPU utilization percentage (0-100).
+func GetGPUUtilization(index uint32) (uint32, error) {
+	if err := InitNVML(); err != nil {
+		return 0, err
+	}
+
+	var utilization C.uint32_t
+	err := callNVML(func() bool {
+		return C.gpu_get_utilization(C.uint32_t(index), &utilization) == 0
+	})
+	if err != nil {
+		return 0, err
+	}
+	return uint32(utilization), nil
+}
+
+// GetGPUMemory returns the used and total GPU memory in bytes.
+func GetGPUMemory(index uint32) (used uint64, total uint64, err error) {
+	if err := InitNVML(); err != nil {
+		return 0, 0, err
+	}
+
+	var cUsed, cTotal C.uint64_t
+	err = callNVML(func() bool {
+		return C.gpu_get_memory(C.uint32_t(index), &cUsed, &cTotal) == 0
+	})
+	if err != nil {
+		return 0, 0, err
+	}
+	return uint64(cUsed), uint64(cTotal), nil
+}
+
+// GetGPUTemperature returns the current GPU temperature in Celsius.
+func GetGPUTemperature(index uint32) (uint32, error) {
+	if err := InitNVML(); err != nil {
+		return 0, err
+	}
+
+	var temp C.uint32_t
+	err := callNVML(func() bool {
+		return C.gpu_get_temperature(C.uint32_t(index), &temp) == 0
+	})
+	if err != nil {
+		return 0, err
+	}
+	return uint32(temp), nil
+}
diff --git a/internal/worker/gpu_nvml_stub.go b/internal/worker/gpu_nvml_stub.go
new file mode 100644
index 0000000..337a7f8
--- /dev/null
+++ b/internal/worker/gpu_nvml_stub.go
@@ -0,0 +1,76 @@
+//go:build !cgo || !native_libs
+// +build !cgo !native_libs
+
+package worker
+
+import "errors"
+
+// Stub implementations used when the NVML bindings are not compiled in, i.e.
+// when cgo is disabled or the native_libs build tag is not present.
+
+// GPUInfo holds information about a GPU.
+//
+// It mirrors the type declared in gpu_nvml_native.go so that code referring
+// to GPUInfo compiles under either build configuration; keep the two
+// definitions in sync.
+type GPUInfo struct {
+	Index        uint32
+	Name         string
+	Utilization  uint32 // GPU utilization percentage (0-100)
+	MemoryUsed   uint64 // Memory used in bytes
+	MemoryTotal  uint64 // Total memory in bytes
+	Temperature  uint32 // Temperature in Celsius
+	PowerDraw    uint32 // Power draw in milliwatts
+	ClockSM      uint32 // SM clock in MHz
+	ClockMemory  uint32 // Memory clock in MHz
+	PCIeGen      uint32 // PCIe generation
+	PCIeWidth    uint32 // PCIe link width
+	UUID         string // GPU UUID
+	VBIOSVersion string // VBIOS version
+}
+
+// errNVMLUnavailable is returned by every stubbed NVML query.
+var errNVMLUnavailable = errors.New("NVML requires native_libs build tag")
+
+// InitNVML always fails in stub builds.
+func InitNVML() error {
+	return errNVMLUnavailable
+}
+
+// ShutdownNVML is a no-op in stub builds.
+func ShutdownNVML() {}
+
+// IsNVMLAvailable always reports false in stub builds.
+func IsNVMLAvailable() bool {
+	return false
+}
+
+// GetGPUCount always fails in stub builds.
+func GetGPUCount() (int, error) {
+	return 0, errNVMLUnavailable
+}
+
+// GetGPUInfo always fails in stub builds.
+func GetGPUInfo(index uint32) (*GPUInfo, error) {
+	return nil, errNVMLUnavailable
+}
+
+// GetAllGPUInfo always fails in stub builds.
+func GetAllGPUInfo() ([]*GPUInfo, error) {
+	return nil, errNVMLUnavailable
+}
+
+// GetGPUUtilization always fails in stub builds.
+func GetGPUUtilization(index uint32) (uint32, error) {
+	return 0, errNVMLUnavailable
+}
+
+// GetGPUMemory always fails in stub builds.
+func GetGPUMemory(index uint32) (used uint64, total uint64, err error) {
+	return 0, 0, errNVMLUnavailable
+}
+
+// GetGPUTemperature always fails in stub builds.
+func GetGPUTemperature(index uint32) (uint32, error) {
+	return 0, errNVMLUnavailable
+}
diff --git a/native/CMakeLists.txt b/native/CMakeLists.txt
index e9f5d75..c8b823c 100644
--- a/native/CMakeLists.txt
+++ b/native/CMakeLists.txt
@@ -71,6 +71,7 @@ enable_testing()
 add_subdirectory(common)
 add_subdirectory(queue_index)
 add_subdirectory(dataset_hash)
+add_subdirectory(nvml_gpu)
 
 # Tests from root tests/ directory
 if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/tests)
@@ -119,4 +120,5 @@ endif()
 add_custom_target(all_native_libs DEPENDS
     queue_index
     dataset_hash
+    nvml_gpu
 )
diff --git a/native/nvml_gpu/CMakeLists.txt b/native/nvml_gpu/CMakeLists.txt
new file mode 100644
index 0000000..d8a2882
--- /dev/null
+++ b/native/nvml_gpu/CMakeLists.txt
@@ -0,0 +1,46 @@
+add_library(nvml_gpu SHARED
+    nvml_gpu.cpp
+)
+
+target_include_directories(nvml_gpu PUBLIC
+    ${CMAKE_CURRENT_SOURCE_DIR}
+)
+
+# Find the NVML library and, if present, the directory holding nvml.h
+find_library(NVML_LIBRARY nvidia-ml
+    PATHS
+        /usr/lib/x86_64-linux-gnu
+        /usr/local/cuda/lib64
+        /usr/lib64
+        /usr/lib
+        /opt/cuda/lib64
+    DOC "NVIDIA Management Library"
+)
+find_path(NVML_INCLUDE_DIR nvml.h
+    PATHS
+        /usr/local/cuda/include
+        /opt/cuda/include
+        /usr/include
+    DOC "NVIDIA Management Library header directory"
+)
+
+if(NVML_LIBRARY)
+    target_link_libraries(nvml_gpu PRIVATE ${NVML_LIBRARY})
+    if(NVML_INCLUDE_DIR)
+        target_include_directories(nvml_gpu PRIVATE ${NVML_INCLUDE_DIR})
+    endif()
+    message(STATUS "Found NVML: ${NVML_LIBRARY}")
+else()
+    message(WARNING "NVML library not found. GPU monitoring will be disabled.")
+    # Build the stub implementation in nvml_gpu.cpp, which reports NVML as
+    # unavailable and needs neither nvml.h nor libnvidia-ml.
+    target_compile_definitions(nvml_gpu PRIVATE NVML_STUB)
+endif()
+
+set_target_properties(nvml_gpu PROPERTIES
+    VERSION ${PROJECT_VERSION}
+    SOVERSION ${PROJECT_VERSION_MAJOR}
+    POSITION_INDEPENDENT_CODE ON
+    C_STANDARD 11
+    CXX_STANDARD 17
+)
diff --git a/native/nvml_gpu/nvml_gpu.cpp b/native/nvml_gpu/nvml_gpu.cpp
new file mode 100644
index 0000000..0286029
--- /dev/null
+++ b/native/nvml_gpu/nvml_gpu.cpp
@@ -0,0 +1,216 @@
+#include "nvml_gpu.h"
+
+#include <stdio.h>
+#include <string.h>
+
+#ifndef NVML_STUB
+#include <nvml.h>
+#endif
+
+// Thread-local error buffer: each thread reads back only the message set by
+// its own most recent failing call.
+static thread_local char last_error_buffer[256] = {0};
+
+// Stores msg (truncated to fit) as the calling thread's last error.
+static void set_error(const char* msg) {
+    snprintf(last_error_buffer, sizeof(last_error_buffer), "%s",
+             msg ? msg : "unknown error");
+}
+
+const char* gpu_last_error(void) {
+    return last_error_buffer;
+}
+
+#ifdef NVML_STUB
+
+// Stub implementation, selected by CMake when NVML was not found at build
+// time: every query fails and gpu_is_available() reports 0.
+
+static int fail_unavailable(void) {
+    set_error("NVML support not compiled in");
+    return -1;
+}
+
+int gpu_init(void) { return fail_unavailable(); }
+void gpu_shutdown(void) {}
+int gpu_get_count(void) { return fail_unavailable(); }
+int gpu_get_info(uint32_t, gpu_info_t*) { return fail_unavailable(); }
+int gpu_get_utilization(uint32_t, uint32_t*) { return fail_unavailable(); }
+int gpu_get_memory(uint32_t, uint64_t*, uint64_t*) { return fail_unavailable(); }
+int gpu_get_temperature(uint32_t, uint32_t*) { return fail_unavailable(); }
+int gpu_is_available(void) { return 0; }
+
+#else  // !NVML_STUB
+
+// Records NVML's description of result as the last error and returns -1.
+static int fail(nvmlReturn_t result) {
+    set_error(nvmlErrorString(result));
+    return -1;
+}
+
+// Looks up the device handle for index. Returns 0 on success; on failure
+// records the error and returns -1.
+static int get_device(uint32_t index, nvmlDevice_t* device) {
+    nvmlReturn_t result = nvmlDeviceGetHandleByIndex(index, device);
+    return result == NVML_SUCCESS ? 0 : fail(result);
+}
+
+int gpu_init(void) {
+    nvmlReturn_t result = nvmlInit();
+    return result == NVML_SUCCESS ? 0 : fail(result);
+}
+
+void gpu_shutdown(void) {
+    nvmlShutdown();
+}
+
+int gpu_get_count(void) {
+    unsigned int count = 0;
+    nvmlReturn_t result = nvmlDeviceGetCount(&count);
+    if (result != NVML_SUCCESS) {
+        return fail(result);
+    }
+    return (int)count;
+}
+
+int gpu_get_info(uint32_t index, gpu_info_t* info) {
+    if (!info) {
+        set_error("null info pointer");
+        return -1;
+    }
+
+    memset(info, 0, sizeof(*info));
+    info->index = index;
+
+    nvmlDevice_t device;
+    if (get_device(index, &device) != 0) {
+        return -1;
+    }
+
+    // The name is the only mandatory field; everything below is best effort
+    // and keeps its zero / "unknown" default when NVML cannot report it.
+    nvmlReturn_t result = nvmlDeviceGetName(device, info->name, sizeof(info->name));
+    if (result != NVML_SUCCESS) {
+        return fail(result);
+    }
+
+    if (nvmlDeviceGetUUID(device, info->uuid, sizeof(info->uuid)) != NVML_SUCCESS) {
+        snprintf(info->uuid, sizeof(info->uuid), "%s", "unknown");
+    }
+    if (nvmlDeviceGetVbiosVersion(device, info->vbios_version,
+                                  sizeof(info->vbios_version)) != NVML_SUCCESS) {
+        snprintf(info->vbios_version, sizeof(info->vbios_version), "%s", "unknown");
+    }
+
+    nvmlUtilization_t utilization;
+    if (nvmlDeviceGetUtilizationRates(device, &utilization) == NVML_SUCCESS) {
+        info->utilization = utilization.gpu;
+    }
+
+    nvmlMemory_t memory;
+    if (nvmlDeviceGetMemoryInfo(device, &memory) == NVML_SUCCESS) {
+        info->memory_used = memory.used;
+        info->memory_total = memory.total;
+    }
+
+    // NVML writes these readings through unsigned int*, so collect each one
+    // in a local of that type before storing it in the uint32_t field.
+    unsigned int value = 0;
+    if (nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &value) == NVML_SUCCESS) {
+        info->temperature = value;
+    }
+    if (nvmlDeviceGetPowerUsage(device, &value) == NVML_SUCCESS) {
+        info->power_draw = value;
+    }
+    if (nvmlDeviceGetClock(device, NVML_CLOCK_SM, NVML_CLOCK_ID_CURRENT, &value) == NVML_SUCCESS) {
+        info->clock_sm = value;
+    }
+    if (nvmlDeviceGetClock(device, NVML_CLOCK_MEM, NVML_CLOCK_ID_CURRENT, &value) == NVML_SUCCESS) {
+        info->clock_memory = value;
+    }
+
+    // Current PCIe link generation and width.
+    if (nvmlDeviceGetCurrPcieLinkGeneration(device, &value) == NVML_SUCCESS) {
+        info->pcie_gen = value;
+    }
+    if (nvmlDeviceGetCurrPcieLinkWidth(device, &value) == NVML_SUCCESS) {
+        info->pcie_width = value;
+    }
+
+    return 0;
+}
+
+int gpu_get_utilization(uint32_t index, uint32_t* utilization) {
+    if (!utilization) {
+        set_error("null utilization pointer");
+        return -1;
+    }
+
+    nvmlDevice_t device;
+    if (get_device(index, &device) != 0) {
+        return -1;
+    }
+
+    nvmlUtilization_t util;
+    nvmlReturn_t result = nvmlDeviceGetUtilizationRates(device, &util);
+    if (result != NVML_SUCCESS) {
+        return fail(result);
+    }
+
+    *utilization = util.gpu;
+    return 0;
+}
+
+int gpu_get_memory(uint32_t index, uint64_t* used, uint64_t* total) {
+    if (!used || !total) {
+        set_error("null pointer");
+        return -1;
+    }
+
+    nvmlDevice_t device;
+    if (get_device(index, &device) != 0) {
+        return -1;
+    }
+
+    nvmlMemory_t memory;
+    nvmlReturn_t result = nvmlDeviceGetMemoryInfo(device, &memory);
+    if (result != NVML_SUCCESS) {
+        return fail(result);
+    }
+
+    *used = memory.used;
+    *total = memory.total;
+    return 0;
+}
+
+int gpu_get_temperature(uint32_t index, uint32_t* temp) {
+    if (!temp) {
+        set_error("null temp pointer");
+        return -1;
+    }
+
+    nvmlDevice_t device;
+    if (get_device(index, &device) != 0) {
+        return -1;
+    }
+
+    unsigned int temperature;
+    nvmlReturn_t result = nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature);
+    if (result != NVML_SUCCESS) {
+        return fail(result);
+    }
+
+    *temp = temperature;
+    return 0;
+}
+
+int gpu_is_available(void) {
+    // Probe by initializing NVML and immediately shutting it down again.
+    if (nvmlInit() != NVML_SUCCESS) {
+        return 0;
+    }
+    nvmlShutdown();
+    return 1;
+}
+
+#endif  // NVML_STUB
diff --git a/native/nvml_gpu/nvml_gpu.h b/native/nvml_gpu/nvml_gpu.h
new file mode 100644
index 0000000..cc47877
--- /dev/null
+++ b/native/nvml_gpu/nvml_gpu.h
@@ -0,0 +1,71 @@
+#ifndef NVML_GPU_H
+#define NVML_GPU_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// GPU information structure
+typedef struct {
+    uint32_t index;
+    char name[256];
+    uint32_t utilization;     // GPU utilization percentage (0-100)
+    uint64_t memory_used;     // Memory used in bytes
+    uint64_t memory_total;    // Total memory in bytes
+    uint32_t temperature;     // Temperature in Celsius
+    uint32_t power_draw;      // Power draw in milliwatts
+    uint32_t clock_sm;        // SM clock in MHz
+    uint32_t clock_memory;    // Memory clock in MHz
+    uint32_t pcie_gen;        // PCIe generation
+    uint32_t pcie_width;      // PCIe link width
+    char uuid[80];            // GPU UUID
+    char vbios_version[32];   // VBIOS version
+} gpu_info_t;
+
+// Initialize NVML library
+// Returns 0 on success, non-zero on failure
+int gpu_init(void);
+
+// Shutdown NVML library
+void gpu_shutdown(void);
+
+// Get number of GPUs
+// Returns -1 on error, >= 0 on success
+int gpu_get_count(void);
+
+// Get GPU info by index
+// Fails if info is null or the device handle or name cannot be read; any other
+// field NVML cannot report is left as 0 ("unknown" for uuid / vbios_version).
+// Returns 0 on success, non-zero on failure
+int gpu_get_info(uint32_t index, gpu_info_t* info);
+
+// Get current utilization for a GPU
+// Returns 0 on success, non-zero on failure
+int gpu_get_utilization(uint32_t index, uint32_t* utilization);
+
+// Get memory info for a GPU
+// Returns 0 on success, non-zero on failure
+int gpu_get_memory(uint32_t index, uint64_t* used, uint64_t* total);
+
+// Get temperature for a GPU
+// Returns 0 on success, non-zero on failure
+int gpu_get_temperature(uint32_t index, uint32_t* temp);
+
+// Check if NVML is available (runtime detection)
+// Returns 1 if available, 0 if not
+int gpu_is_available(void);
+
+// Get last error message
+// The message is kept per thread: read it on the same thread that made the
+// failing call. The returned pointer refers to that thread's buffer, whose
+// contents are replaced by the thread's next failing call.
+const char* gpu_last_error(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // NVML_GPU_H