feat: implement NVML-based GPU monitoring

- Add native/nvml_gpu/ C++ library wrapping the NVIDIA Management Library (NVML)
- Add Go bindings in internal/worker/gpu_nvml_native.go and gpu_nvml_stub.go
- Update gpu_detector.go to use NVML for accurate GPU count detection
- Update native/CMakeLists.txt to build the nvml_gpu library
- Provide real-time GPU utilization, memory, temperature, clocks, and power
- Fall back to the FETCH_ML_GPU_COUNT environment variable when NVML is unavailable
2026-02-21 15:16:09 -05:00 · 2026-02-21 15:16:09 -05:00 · 05b7af6991
commit 05b7af6991
parent d6265df0bd
7 changed files with 532 additions and 1 deletions
--- a/internal/worker/gpu_detector.go
+++ b/internal/worker/gpu_detector.go
@ -26,10 +26,18 @@ type GPUDetector interface {
 type NVIDIADetector struct{}

+// DetectGPUCount returns the detected number of NVIDIA GPUs.
+//
+// NVML is consulted first; its count is used only when the query succeeds and
+// reports at least one device. Otherwise the FETCH_ML_GPU_COUNT environment
+// variable is used when envInt yields a non-negative value, and 0 is returned
+// when neither source provides a count.
 func (d *NVIDIADetector) DetectGPUCount() int {
+	// First try NVML for accurate detection
+	if IsNVMLAvailable() {
+		count, err := GetGPUCount()
+		// An NVML error or a zero count is not final: fall through to the
+		// environment variable below.
+		if err == nil && count > 0 {
+			return count
+		}
+	}
+
+	// Fall back to environment variable
 	if n, ok := envInt("FETCH_ML_GPU_COUNT"); ok && n >= 0 {
 		return n
 	}
-	// Could use nvidia-sml or other detection methods here
 	return 0
 }

--- a/internal/worker/gpu_nvml_native.go
+++ b/internal/worker/gpu_nvml_native.go
@ -0,0 +1,169 @@
+//go:build cgo && native_libs
+// +build cgo,native_libs
+
+package worker
+
+// #cgo LDFLAGS: -L${SRCDIR}/../../native/build -Wl,-rpath,${SRCDIR}/../../native/build -lnvml_gpu -lnvidia-ml
+// #include "../../native/nvml_gpu/nvml_gpu.h"
+// #include <stdlib.h>
+import "C"
+import (
+	"errors"
+	"fmt"
+	"runtime"
+)
+
+// GPUInfo is a snapshot of one GPU as reported by the native nvml_gpu library.
+// GetGPUInfo fills it by copying the fields of the C gpu_info_t struct.
+type GPUInfo struct {
+	Index        uint32 // Device index the snapshot was requested for
+	Name         string // Device name reported by the native library
+	Utilization  uint32 // GPU utilization percentage (0-100)
+	MemoryUsed   uint64 // Memory used in bytes
+	MemoryTotal  uint64 // Total memory in bytes
+	Temperature  uint32 // Temperature in Celsius
+	PowerDraw    uint32 // Power draw in milliwatts
+	ClockSM      uint32 // SM clock in MHz
+	ClockMemory  uint32 // Memory clock in MHz
+	PCIeGen      uint32 // PCIe generation
+	PCIeWidth    uint32 // PCIe link width
+	UUID         string // GPU UUID
+	VBIOSVersion string // VBIOS version
+}
+
+var (
+	// nvmlInitialized is set by InitNVML on success and cleared by
+	// ShutdownNVML. The query helpers read it to decide whether to initialize
+	// lazily. It is accessed without any synchronization.
+	nvmlInitialized = false
+)
+
+// InitNVML initializes the NVML library through the native gpu_init call and
+// records success in nvmlInitialized.
+//
+// On failure it returns an error carrying the message from gpu_last_error.
+// The native library keeps that message in thread-local storage, so the
+// goroutine is pinned to its OS thread for the duration of the call; without
+// the pin the scheduler could move it to another thread between the failing
+// call and the error lookup, yielding an empty or unrelated message.
+func InitNVML() error {
+	runtime.LockOSThread()
+	defer runtime.UnlockOSThread()
+
+	if rc := C.gpu_init(); rc != 0 {
+		return errors.New(C.GoString(C.gpu_last_error()))
+	}
+	nvmlInitialized = true
+	return nil
+}
+
+// ShutdownNVML releases NVML if InitNVML previously succeeded and clears the
+// initialized flag; when NVML was never initialized it does nothing.
+func ShutdownNVML() {
+	if !nvmlInitialized {
+		return
+	}
+	C.gpu_shutdown()
+	nvmlInitialized = false
+}
+
+// IsNVMLAvailable reports whether the native library's runtime probe
+// (gpu_is_available) says NVML can be used.
+func IsNVMLAvailable() bool {
+	probe := C.gpu_is_available()
+	return probe == 1
+}
+
+// GetGPUCount returns the number of GPUs reported by NVML, initializing NVML
+// first if that has not happened yet.
+//
+// It returns an error when initialization fails or when the native
+// gpu_get_count call reports a negative value. The goroutine is pinned to its
+// OS thread so that gpu_last_error reads the thread-local message written by
+// the failing call rather than another thread's buffer.
+func GetGPUCount() (int, error) {
+	runtime.LockOSThread()
+	defer runtime.UnlockOSThread()
+
+	if !nvmlInitialized {
+		if err := InitNVML(); err != nil {
+			return 0, err
+		}
+	}
+
+	count := C.gpu_get_count()
+	if count < 0 {
+		return 0, errors.New(C.GoString(C.gpu_last_error()))
+	}
+	return int(count), nil
+}
+
+// GetGPUInfo returns a snapshot of the GPU at the given index, initializing
+// NVML first if that has not happened yet.
+//
+// It returns an error when initialization fails or when the native
+// gpu_get_info call returns non-zero. The goroutine is pinned to its OS thread
+// so that gpu_last_error reads the thread-local message written by the failing
+// call rather than another thread's buffer.
+func GetGPUInfo(index uint32) (*GPUInfo, error) {
+	runtime.LockOSThread()
+	defer runtime.UnlockOSThread()
+
+	if !nvmlInitialized {
+		if err := InitNVML(); err != nil {
+			return nil, err
+		}
+	}
+
+	var cInfo C.gpu_info_t
+	if rc := C.gpu_get_info(C.uint32_t(index), &cInfo); rc != 0 {
+		return nil, errors.New(C.GoString(C.gpu_last_error()))
+	}
+
+	// The fixed-size C char arrays are copied into Go strings, so the result
+	// does not reference cInfo after this function returns.
+	return &GPUInfo{
+		Index:        uint32(cInfo.index),
+		Name:         C.GoString(&cInfo.name[0]),
+		Utilization:  uint32(cInfo.utilization),
+		MemoryUsed:   uint64(cInfo.memory_used),
+		MemoryTotal:  uint64(cInfo.memory_total),
+		Temperature:  uint32(cInfo.temperature),
+		PowerDraw:    uint32(cInfo.power_draw),
+		ClockSM:      uint32(cInfo.clock_sm),
+		ClockMemory:  uint32(cInfo.clock_memory),
+		PCIeGen:      uint32(cInfo.pcie_gen),
+		PCIeWidth:    uint32(cInfo.pcie_width),
+		UUID:         C.GoString(&cInfo.uuid[0]),
+		VBIOSVersion: C.GoString(&cInfo.vbios_version[0]),
+	}, nil
+}
+
+// GetAllGPUInfo collects a GPUInfo snapshot for every device index from 0 up
+// to the count returned by GetGPUCount. The first failure aborts the scan and
+// is returned wrapped with the index of the offending GPU.
+func GetAllGPUInfo() ([]*GPUInfo, error) {
+	total, err := GetGPUCount()
+	if err != nil {
+		return nil, err
+	}
+
+	infos := make([]*GPUInfo, total)
+	for idx := range infos {
+		gpu, err := GetGPUInfo(uint32(idx))
+		if err != nil {
+			return nil, fmt.Errorf("failed to get GPU %d info: %w", idx, err)
+		}
+		infos[idx] = gpu
+	}
+	return infos, nil
+}
+
+// GetGPUUtilization returns the current utilization percentage of the GPU at
+// the given index, initializing NVML first if that has not happened yet.
+//
+// It returns an error when initialization fails or when the native
+// gpu_get_utilization call returns non-zero. The goroutine is pinned to its OS
+// thread so that gpu_last_error reads the thread-local message written by the
+// failing call rather than another thread's buffer.
+func GetGPUUtilization(index uint32) (uint32, error) {
+	runtime.LockOSThread()
+	defer runtime.UnlockOSThread()
+
+	if !nvmlInitialized {
+		if err := InitNVML(); err != nil {
+			return 0, err
+		}
+	}
+
+	var utilization C.uint32_t
+	if rc := C.gpu_get_utilization(C.uint32_t(index), &utilization); rc != 0 {
+		return 0, errors.New(C.GoString(C.gpu_last_error()))
+	}
+	return uint32(utilization), nil
+}
+
+// GetGPUMemory returns the used and total memory, in bytes, of the GPU at the
+// given index, initializing NVML first if that has not happened yet.
+//
+// It returns an error when initialization fails or when the native
+// gpu_get_memory call returns non-zero. The goroutine is pinned to its OS
+// thread so that gpu_last_error reads the thread-local message written by the
+// failing call rather than another thread's buffer.
+func GetGPUMemory(index uint32) (used uint64, total uint64, err error) {
+	runtime.LockOSThread()
+	defer runtime.UnlockOSThread()
+
+	if !nvmlInitialized {
+		// A distinct name avoids shadowing the named result err.
+		if initErr := InitNVML(); initErr != nil {
+			return 0, 0, initErr
+		}
+	}
+
+	var cUsed, cTotal C.uint64_t
+	if rc := C.gpu_get_memory(C.uint32_t(index), &cUsed, &cTotal); rc != 0 {
+		return 0, 0, errors.New(C.GoString(C.gpu_last_error()))
+	}
+	return uint64(cUsed), uint64(cTotal), nil
+}
+
+// GetGPUTemperature returns the current temperature, in Celsius, of the GPU at
+// the given index, initializing NVML first if that has not happened yet.
+//
+// It returns an error when initialization fails or when the native
+// gpu_get_temperature call returns non-zero. The goroutine is pinned to its OS
+// thread so that gpu_last_error reads the thread-local message written by the
+// failing call rather than another thread's buffer.
+func GetGPUTemperature(index uint32) (uint32, error) {
+	runtime.LockOSThread()
+	defer runtime.UnlockOSThread()
+
+	if !nvmlInitialized {
+		if err := InitNVML(); err != nil {
+			return 0, err
+		}
+	}
+
+	var temp C.uint32_t
+	if rc := C.gpu_get_temperature(C.uint32_t(index), &temp); rc != 0 {
+		return 0, errors.New(C.GoString(C.gpu_last_error()))
+	}
+	return uint32(temp), nil
+}
--- a/internal/worker/gpu_nvml_stub.go
+++ b/internal/worker/gpu_nvml_stub.go
@ -0,0 +1,42 @@
+//go:build !cgo || !native_libs
+// +build !cgo !native_libs
+
+package worker
+
+import "errors"
+
+// Stub implementations used whenever the native NVML bindings in
+// gpu_nvml_native.go are not compiled in, i.e. when cgo is disabled or the
+// native_libs build tag is absent. The build constraint above is the exact
+// complement of that file's constraint, so exactly one of the two is built.
+
+// errNVMLNotBuilt is returned by every stub that would otherwise query NVML.
+var errNVMLNotBuilt = errors.New("NVML requires native_libs build tag")
+
+// GPUInfo mirrors the GPUInfo type declared in gpu_nvml_native.go so that the
+// stub signatures below compile when that file is excluded from the build.
+type GPUInfo struct {
+	Index        uint32
+	Name         string
+	Utilization  uint32 // GPU utilization percentage (0-100)
+	MemoryUsed   uint64 // Memory used in bytes
+	MemoryTotal  uint64 // Total memory in bytes
+	Temperature  uint32 // Temperature in Celsius
+	PowerDraw    uint32 // Power draw in milliwatts
+	ClockSM      uint32 // SM clock in MHz
+	ClockMemory  uint32 // Memory clock in MHz
+	PCIeGen      uint32 // PCIe generation
+	PCIeWidth    uint32 // PCIe link width
+	UUID         string // GPU UUID
+	VBIOSVersion string // VBIOS version
+}
+
+// InitNVML always fails in stub builds.
+func InitNVML() error {
+	return errNVMLNotBuilt
+}
+
+// ShutdownNVML is a no-op in stub builds.
+func ShutdownNVML() {}
+
+// IsNVMLAvailable always reports false in stub builds.
+func IsNVMLAvailable() bool {
+	return false
+}
+
+// GetGPUCount always fails in stub builds.
+func GetGPUCount() (int, error) {
+	return 0, errNVMLNotBuilt
+}
+
+// GetGPUInfo always fails in stub builds.
+func GetGPUInfo(index uint32) (*GPUInfo, error) {
+	return nil, errNVMLNotBuilt
+}
+
+// GetAllGPUInfo always fails in stub builds.
+func GetAllGPUInfo() ([]*GPUInfo, error) {
+	return nil, errNVMLNotBuilt
+}
+
+// GetGPUUtilization always fails in stub builds.
+func GetGPUUtilization(index uint32) (uint32, error) {
+	return 0, errNVMLNotBuilt
+}
+
+// GetGPUMemory always fails in stub builds.
+func GetGPUMemory(index uint32) (used uint64, total uint64, err error) {
+	return 0, 0, errNVMLNotBuilt
+}
+
+// GetGPUTemperature always fails in stub builds.
+func GetGPUTemperature(index uint32) (uint32, error) {
+	return 0, errNVMLNotBuilt
+}
--- a/native/CMakeLists.txt
+++ b/native/CMakeLists.txt
@ -71,6 +71,7 @@ enable_testing()
 add_subdirectory(common)
 add_subdirectory(queue_index)
 add_subdirectory(dataset_hash)
+add_subdirectory(nvml_gpu)

 # Tests from root tests/ directory
 if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/tests)
@ -119,4 +120,5 @@ endif()
 add_custom_target(all_native_libs DEPENDS 
    queue_index 
    dataset_hash
+    nvml_gpu
 )
--- a/native/nvml_gpu/CMakeLists.txt
+++ b/native/nvml_gpu/CMakeLists.txt
@ -0,0 +1,35 @@
+# Shared library wrapping the NVIDIA Management Library behind a small C API
+# (see nvml_gpu.h).
+add_library(nvml_gpu SHARED
+    nvml_gpu.cpp
+)
+
+# Export this directory so dependents can include nvml_gpu.h.
+target_include_directories(nvml_gpu PUBLIC
+    ${CMAKE_CURRENT_SOURCE_DIR}
+)
+
+# Find NVML library
+# Only the library is searched for here; no include directory for nvml.h is
+# added, so the header must already be on the compiler's search path.
+find_library(NVML_LIBRARY nvidia-ml 
+    PATHS 
+        /usr/lib/x86_64-linux-gnu
+        /usr/local/cuda/lib64
+        /usr/lib64
+        /usr/lib
+        /opt/cuda/lib64
+    DOC "NVIDIA Management Library"
+)
+
+if(NVML_LIBRARY)
+    target_link_libraries(nvml_gpu PRIVATE ${NVML_LIBRARY})
+    message(STATUS "Found NVML: ${NVML_LIBRARY}")
+else()
+    message(WARNING "NVML library not found. GPU monitoring will be disabled.")
+    # Create stub library that always returns unavailable
+    # NOTE(review): nvml_gpu.cpp includes <nvml.h> and calls NVML functions
+    # unconditionally and never tests NVML_STUB, so this definition alone does
+    # not appear to produce a working stub - confirm and add the #ifdef branch.
+    target_compile_definitions(nvml_gpu PRIVATE NVML_STUB)
+endif()
+
+# Version the shared object from the enclosing project and build it as
+# position-independent C11/C++17 code.
+set_target_properties(nvml_gpu PROPERTIES
+    VERSION ${PROJECT_VERSION}
+    SOVERSION ${PROJECT_VERSION_MAJOR}
+    POSITION_INDEPENDENT_CODE ON
+    C_STANDARD 11
+    CXX_STANDARD 17
+)
--- a/native/nvml_gpu/nvml_gpu.cpp
+++ b/native/nvml_gpu/nvml_gpu.cpp
@ -0,0 +1,209 @@
+#include "nvml_gpu.h"
+#include <nvml.h>
+#include <stdio.h>
+#include <string.h>
+
+// Thread-local error buffer
+static __thread char last_error_buffer[256] = {0};
+
+static void set_error(const char* msg) {
+    strncpy(last_error_buffer, msg, sizeof(last_error_buffer) - 1);
+    last_error_buffer[sizeof(last_error_buffer) - 1] = '\0';
+}
+
+int gpu_init(void) {
+    nvmlReturn_t result = nvmlInit();
+    if (result != NVML_SUCCESS) {
+        set_error(nvmlErrorString(result));
+        return -1;
+    }
+    return 0;
+}
+
+// Shut NVML down. The status returned by nvmlShutdown is ignored, so this
+// function never reports failure.
+void gpu_shutdown(void) {
+    nvmlShutdown();
+}
+
+// Return the number of devices NVML reports, or -1 after recording the NVML
+// error string when the query fails.
+int gpu_get_count(void) {
+    unsigned int device_count = 0;
+    const nvmlReturn_t status = nvmlDeviceGetCount(&device_count);
+    if (status == NVML_SUCCESS) {
+        return (int)device_count;
+    }
+    set_error(nvmlErrorString(status));
+    return -1;
+}
+
+// Fill *info with a snapshot of the GPU at `index`.
+//
+// Returns 0 on success. Returns -1, with the reason available through
+// gpu_last_error, when info is null, the device handle cannot be obtained, or
+// the device name cannot be read. Every other field is best-effort: a metric
+// that NVML cannot provide is left at 0, and uuid / vbios_version fall back to
+// the string "unknown".
+int gpu_get_info(uint32_t index, gpu_info_t* info) {
+    if (!info) {
+        set_error("null info pointer");
+        return -1;
+    }
+
+    // Start from an all-zero struct so unavailable metrics read as 0.
+    memset(info, 0, sizeof(gpu_info_t));
+    info->index = index;
+
+    nvmlDevice_t device;
+    nvmlReturn_t result = nvmlDeviceGetHandleByIndex(index, &device);
+    if (result != NVML_SUCCESS) {
+        set_error(nvmlErrorString(result));
+        return -1;
+    }
+
+    // Name is the only mandatory attribute.
+    result = nvmlDeviceGetName(device, info->name, sizeof(info->name));
+    if (result != NVML_SUCCESS) {
+        set_error(nvmlErrorString(result));
+        return -1;
+    }
+
+    // UUID and VBIOS version are non-critical; substitute a placeholder.
+    result = nvmlDeviceGetUUID(device, info->uuid, sizeof(info->uuid));
+    if (result != NVML_SUCCESS) {
+        strcpy(info->uuid, "unknown");
+    }
+
+    result = nvmlDeviceGetVbiosVersion(device, info->vbios_version, sizeof(info->vbios_version));
+    if (result != NVML_SUCCESS) {
+        strcpy(info->vbios_version, "unknown");
+    }
+
+    // Utilization
+    nvmlUtilization_t utilization;
+    if (nvmlDeviceGetUtilizationRates(device, &utilization) == NVML_SUCCESS) {
+        info->utilization = utilization.gpu;
+    }
+
+    // Memory
+    nvmlMemory_t memory;
+    if (nvmlDeviceGetMemoryInfo(device, &memory) == NVML_SUCCESS) {
+        info->memory_used = memory.used;
+        info->memory_total = memory.total;
+    }
+
+    // Scalar metrics go through a local unsigned int, the type NVML expects,
+    // instead of handing NVML a pointer to a uint32_t struct field.
+    unsigned int value = 0;
+
+    // Temperature
+    if (nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &value) == NVML_SUCCESS) {
+        info->temperature = value;
+    }
+
+    // Power draw (milliwatts)
+    if (nvmlDeviceGetPowerUsage(device, &value) == NVML_SUCCESS) {
+        info->power_draw = value;
+    }
+
+    // Clocks
+    if (nvmlDeviceGetClock(device, NVML_CLOCK_SM, NVML_CLOCK_ID_CURRENT, &value) == NVML_SUCCESS) {
+        info->clock_sm = value;
+    }
+    if (nvmlDeviceGetClock(device, NVML_CLOCK_MEM, NVML_CLOCK_ID_CURRENT, &value) == NVML_SUCCESS) {
+        info->clock_memory = value;
+    }
+
+    // Current PCIe link generation and width. Previously these fields were
+    // always left at 0.
+    if (nvmlDeviceGetCurrPcieLinkGeneration(device, &value) == NVML_SUCCESS) {
+        info->pcie_gen = value;
+    }
+    if (nvmlDeviceGetCurrPcieLinkWidth(device, &value) == NVML_SUCCESS) {
+        info->pcie_width = value;
+    }
+
+    return 0;
+}
+
+// Write the GPU utilization rate of device `index` to *utilization.
+// Returns 0 on success; returns -1 after recording an error when the output
+// pointer is null or either NVML call fails.
+int gpu_get_utilization(uint32_t index, uint32_t* utilization) {
+    if (utilization == nullptr) {
+        set_error("null utilization pointer");
+        return -1;
+    }
+
+    nvmlDevice_t handle;
+    nvmlUtilization_t rates;
+    nvmlReturn_t status = nvmlDeviceGetHandleByIndex(index, &handle);
+    if (status == NVML_SUCCESS) {
+        status = nvmlDeviceGetUtilizationRates(handle, &rates);
+    }
+    if (status != NVML_SUCCESS) {
+        set_error(nvmlErrorString(status));
+        return -1;
+    }
+
+    *utilization = rates.gpu;
+    return 0;
+}
+
+// Write the used and total memory of device `index` to *used and *total.
+// Returns 0 on success; returns -1 after recording an error when either
+// output pointer is null or either NVML call fails.
+int gpu_get_memory(uint32_t index, uint64_t* used, uint64_t* total) {
+    if (used == nullptr || total == nullptr) {
+        set_error("null pointer");
+        return -1;
+    }
+
+    nvmlDevice_t handle;
+    nvmlMemory_t usage;
+    nvmlReturn_t status = nvmlDeviceGetHandleByIndex(index, &handle);
+    if (status == NVML_SUCCESS) {
+        status = nvmlDeviceGetMemoryInfo(handle, &usage);
+    }
+    if (status != NVML_SUCCESS) {
+        set_error(nvmlErrorString(status));
+        return -1;
+    }
+
+    *used = usage.used;
+    *total = usage.total;
+    return 0;
+}
+
+// Write the NVML_TEMPERATURE_GPU sensor reading of device `index` to *temp.
+// Returns 0 on success; returns -1 after recording an error when the output
+// pointer is null or either NVML call fails.
+int gpu_get_temperature(uint32_t index, uint32_t* temp) {
+    if (temp == nullptr) {
+        set_error("null temp pointer");
+        return -1;
+    }
+
+    nvmlDevice_t handle;
+    unsigned int reading;
+    nvmlReturn_t status = nvmlDeviceGetHandleByIndex(index, &handle);
+    if (status == NVML_SUCCESS) {
+        status = nvmlDeviceGetTemperature(handle, NVML_TEMPERATURE_GPU, &reading);
+    }
+    if (status != NVML_SUCCESS) {
+        set_error(nvmlErrorString(status));
+        return -1;
+    }
+
+    *temp = reading;
+    return 0;
+}
+
+// Probe for NVML by initializing it and, on success, shutting it down again.
+// Returns 1 when initialization succeeded, 0 otherwise. No error message is
+// recorded on failure.
+int gpu_is_available(void) {
+    if (nvmlInit() != NVML_SUCCESS) {
+        return 0;
+    }
+    nvmlShutdown();
+    return 1;
+}
+
+// Return the calling thread's error buffer. The pointer refers to internal
+// thread-local storage (never null, empty string if no error was recorded on
+// this thread) and its contents change on the next recorded error.
+const char* gpu_last_error(void) {
+    return last_error_buffer;
+}
--- a/native/nvml_gpu/nvml_gpu.h
+++ b/native/nvml_gpu/nvml_gpu.h
@ -0,0 +1,66 @@
+#ifndef NVML_GPU_H
+#define NVML_GPU_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// GPU information structure filled by gpu_get_info.
+// Numeric metrics read as 0 when the value could not be obtained; uuid and
+// vbios_version hold "unknown" in that case.
+typedef struct {
+    uint32_t index;              // Device index that was queried
+    char name[256];              // NUL-terminated device name
+    uint32_t utilization;        // GPU utilization percentage (0-100)
+    uint64_t memory_used;        // Memory used in bytes
+    uint64_t memory_total;       // Total memory in bytes
+    uint32_t temperature;        // Temperature in Celsius
+    uint32_t power_draw;         // Power draw in milliwatts
+    uint32_t clock_sm;           // SM clock in MHz
+    uint32_t clock_memory;       // Memory clock in MHz
+    uint32_t pcie_gen;           // PCIe generation
+    uint32_t pcie_width;         // PCIe link width
+    char uuid[80];               // GPU UUID (NUL-terminated)
+    char vbios_version[32];      // VBIOS version (NUL-terminated)
+} gpu_info_t;
+
+// Initialize NVML library
+// Returns 0 on success, non-zero on failure (reason via gpu_last_error)
+int gpu_init(void);
+
+// Shutdown NVML library
+void gpu_shutdown(void);
+
+// Get number of GPUs
+// Returns -1 on error (reason via gpu_last_error), >= 0 on success
+int gpu_get_count(void);
+
+// Get GPU info by index
+// Returns 0 on success, non-zero on failure (null info, unknown device, or
+// unreadable device name; reason via gpu_last_error)
+int gpu_get_info(uint32_t index, gpu_info_t* info);
+
+// Get current utilization for a GPU
+// Returns 0 on success, non-zero on failure (reason via gpu_last_error)
+int gpu_get_utilization(uint32_t index, uint32_t* utilization);
+
+// Get memory info for a GPU (both values in bytes)
+// Returns 0 on success, non-zero on failure (reason via gpu_last_error)
+int gpu_get_memory(uint32_t index, uint64_t* used, uint64_t* total);
+
+// Get temperature for a GPU (Celsius)
+// Returns 0 on success, non-zero on failure (reason via gpu_last_error)
+int gpu_get_temperature(uint32_t index, uint32_t* temp);
+
+// Check if NVML is available (runtime detection)
+// Performs an NVML init followed by a shutdown as the probe.
+// Returns 1 if available, 0 if not
+int gpu_is_available(void);
+
+// Get last error message.
+// The message is stored per thread: call this on the same thread that made the
+// failing call. The returned pointer refers to internal storage and must not
+// be freed.
+const char* gpu_last_error(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // NVML_GPU_H