feat: implement NVML-based GPU monitoring
- Add native/nvml_gpu/ C++ library wrapping the NVIDIA Management Library
- Add Go bindings in internal/worker/gpu_nvml_native.go and gpu_nvml_stub.go
- Update gpu_detector.go to use NVML for accurate GPU count detection
- Update native/CMakeLists.txt to build the nvml_gpu library
- Provide real-time GPU utilization, memory, temperature, clocks, and power
- Fall back to the environment variable when NVML is unavailable
This commit is contained in:
parent
d6265df0bd
commit
05b7af6991
7 changed files with 532 additions and 1 deletions
|
|
@ -26,10 +26,18 @@ type GPUDetector interface {
|
|||
type NVIDIADetector struct{}
|
||||
|
||||
func (d *NVIDIADetector) DetectGPUCount() int {
|
||||
// First try NVML for accurate detection
|
||||
if IsNVMLAvailable() {
|
||||
count, err := GetGPUCount()
|
||||
if err == nil && count > 0 {
|
||||
return count
|
||||
}
|
||||
}
|
||||
|
||||
// Fall back to environment variable
|
||||
if n, ok := envInt("FETCH_ML_GPU_COUNT"); ok && n >= 0 {
|
||||
return n
|
||||
}
|
||||
// Could use nvidia-sml or other detection methods here
|
||||
return 0
|
||||
}
|
||||
|
||||
|
|
|
|||
169
internal/worker/gpu_nvml_native.go
Normal file
169
internal/worker/gpu_nvml_native.go
Normal file
|
|
@ -0,0 +1,169 @@
|
|||
//go:build cgo && native_libs
|
||||
// +build cgo,native_libs
|
||||
|
||||
package worker
|
||||
|
||||
// #cgo LDFLAGS: -L${SRCDIR}/../../native/build -Wl,-rpath,${SRCDIR}/../../native/build -lnvml_gpu -lnvidia-ml
|
||||
// #include "../../native/nvml_gpu/nvml_gpu.h"
|
||||
// #include <stdlib.h>
|
||||
import "C"
|
||||
import (
	"errors"
	"fmt"
	"runtime"
)
|
||||
|
||||
// GPUInfo is a point-in-time description of a single GPU as reported by the
// native NVML wrapper. It mirrors the fields of the C gpu_info_t structure.
type GPUInfo struct {
	// Index is the device index the information was requested for.
	Index uint32
	// Name is the device name.
	Name string
	// Utilization is the GPU utilization as a percentage (0-100).
	Utilization uint32
	// MemoryUsed is the amount of device memory in use, in bytes.
	MemoryUsed uint64
	// MemoryTotal is the total device memory, in bytes.
	MemoryTotal uint64
	// Temperature is the GPU temperature in degrees Celsius.
	Temperature uint32
	// PowerDraw is the power draw in milliwatts.
	PowerDraw uint32
	// ClockSM is the SM clock in MHz.
	ClockSM uint32
	// ClockMemory is the memory clock in MHz.
	ClockMemory uint32
	// PCIeGen is the PCIe generation.
	PCIeGen uint32
	// PCIeWidth is the PCIe link width.
	PCIeWidth uint32
	// UUID is the GPU UUID.
	UUID string
	// VBIOSVersion is the VBIOS version string.
	VBIOSVersion string
}
|
||||
|
||||
// nvmlInitialized records whether InitNVML has succeeded without a subsequent
// ShutdownNVML. It is a plain package variable that is read and written
// without any locking.
var nvmlInitialized bool
|
||||
|
||||
// InitNVML initializes the NVML library
|
||||
func InitNVML() error {
|
||||
result := C.gpu_init()
|
||||
if result != 0 {
|
||||
return errors.New(C.GoString(C.gpu_last_error()))
|
||||
}
|
||||
nvmlInitialized = true
|
||||
return nil
|
||||
}
|
||||
|
||||
// ShutdownNVML shuts down the NVML library
|
||||
func ShutdownNVML() {
|
||||
if nvmlInitialized {
|
||||
C.gpu_shutdown()
|
||||
nvmlInitialized = false
|
||||
}
|
||||
}
|
||||
|
||||
// IsNVMLAvailable checks if NVML is available at runtime
|
||||
func IsNVMLAvailable() bool {
|
||||
return C.gpu_is_available() == 1
|
||||
}
|
||||
|
||||
// GetGPUCount returns the number of GPUs
|
||||
func GetGPUCount() (int, error) {
|
||||
if !nvmlInitialized {
|
||||
if err := InitNVML(); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
}
|
||||
|
||||
count := C.gpu_get_count()
|
||||
if count < 0 {
|
||||
return 0, errors.New(C.GoString(C.gpu_last_error()))
|
||||
}
|
||||
return int(count), nil
|
||||
}
|
||||
|
||||
// GetGPUInfo returns detailed information about a GPU
|
||||
func GetGPUInfo(index uint32) (*GPUInfo, error) {
|
||||
if !nvmlInitialized {
|
||||
if err := InitNVML(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
var cInfo C.gpu_info_t
|
||||
result := C.gpu_get_info(C.uint32_t(index), &cInfo)
|
||||
if result != 0 {
|
||||
return nil, errors.New(C.GoString(C.gpu_last_error()))
|
||||
}
|
||||
|
||||
return &GPUInfo{
|
||||
Index: uint32(cInfo.index),
|
||||
Name: C.GoString(&cInfo.name[0]),
|
||||
Utilization: uint32(cInfo.utilization),
|
||||
MemoryUsed: uint64(cInfo.memory_used),
|
||||
MemoryTotal: uint64(cInfo.memory_total),
|
||||
Temperature: uint32(cInfo.temperature),
|
||||
PowerDraw: uint32(cInfo.power_draw),
|
||||
ClockSM: uint32(cInfo.clock_sm),
|
||||
ClockMemory: uint32(cInfo.clock_memory),
|
||||
PCIeGen: uint32(cInfo.pcie_gen),
|
||||
PCIeWidth: uint32(cInfo.pcie_width),
|
||||
UUID: C.GoString(&cInfo.uuid[0]),
|
||||
VBIOSVersion: C.GoString(&cInfo.vbios_version[0]),
|
||||
}, nil
|
||||
}
|
||||
|
||||
// GetAllGPUInfo returns information about all GPUs
|
||||
func GetAllGPUInfo() ([]*GPUInfo, error) {
|
||||
count, err := GetGPUCount()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
gpus := make([]*GPUInfo, 0, count)
|
||||
for i := 0; i < count; i++ {
|
||||
info, err := GetGPUInfo(uint32(i))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get GPU %d info: %w", i, err)
|
||||
}
|
||||
gpus = append(gpus, info)
|
||||
}
|
||||
return gpus, nil
|
||||
}
|
||||
|
||||
// GetGPUUtilization returns the current GPU utilization
|
||||
func GetGPUUtilization(index uint32) (uint32, error) {
|
||||
if !nvmlInitialized {
|
||||
if err := InitNVML(); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
}
|
||||
|
||||
var utilization C.uint32_t
|
||||
result := C.gpu_get_utilization(C.uint32_t(index), &utilization)
|
||||
if result != 0 {
|
||||
return 0, errors.New(C.GoString(C.gpu_last_error()))
|
||||
}
|
||||
return uint32(utilization), nil
|
||||
}
|
||||
|
||||
// GetGPUMemory returns the current GPU memory usage
|
||||
func GetGPUMemory(index uint32) (used uint64, total uint64, err error) {
|
||||
if !nvmlInitialized {
|
||||
if err := InitNVML(); err != nil {
|
||||
return 0, 0, err
|
||||
}
|
||||
}
|
||||
|
||||
var cUsed, cTotal C.uint64_t
|
||||
result := C.gpu_get_memory(C.uint32_t(index), &cUsed, &cTotal)
|
||||
if result != 0 {
|
||||
return 0, 0, errors.New(C.GoString(C.gpu_last_error()))
|
||||
}
|
||||
return uint64(cUsed), uint64(cTotal), nil
|
||||
}
|
||||
|
||||
// GetGPUTemperature returns the current GPU temperature
|
||||
func GetGPUTemperature(index uint32) (uint32, error) {
|
||||
if !nvmlInitialized {
|
||||
if err := InitNVML(); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
}
|
||||
|
||||
var temp C.uint32_t
|
||||
result := C.gpu_get_temperature(C.uint32_t(index), &temp)
|
||||
if result != 0 {
|
||||
return 0, errors.New(C.GoString(C.gpu_last_error()))
|
||||
}
|
||||
return uint32(temp), nil
|
||||
}
|
||||
42
internal/worker/gpu_nvml_stub.go
Normal file
42
internal/worker/gpu_nvml_stub.go
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
//go:build cgo && !native_libs
|
||||
// +build cgo,!native_libs
|
||||
|
||||
package worker
|
||||
|
||||
import "errors"
|
||||
|
||||
// Stub implementations used when the native NVML bindings are not compiled
// in. Every query fails with the same error and IsNVMLAvailable reports false.

// errNVMLUnavailable is returned by every stubbed NVML query.
var errNVMLUnavailable = errors.New("NVML requires native_libs build tag")

// GPUInfo holds information about a GPU. It is declared here as well because
// the native bindings file, which otherwise provides it, is excluded from this
// build; keep the two declarations in sync.
type GPUInfo struct {
	Index        uint32
	Name         string
	Utilization  uint32 // GPU utilization percentage (0-100)
	MemoryUsed   uint64 // Memory used in bytes
	MemoryTotal  uint64 // Total memory in bytes
	Temperature  uint32 // Temperature in Celsius
	PowerDraw    uint32 // Power draw in milliwatts
	ClockSM      uint32 // SM clock in MHz
	ClockMemory  uint32 // Memory clock in MHz
	PCIeGen      uint32 // PCIe generation
	PCIeWidth    uint32 // PCIe link width
	UUID         string // GPU UUID
	VBIOSVersion string // VBIOS version
}

// InitNVML always fails in stub builds.
func InitNVML() error {
	return errNVMLUnavailable
}

// ShutdownNVML does nothing in stub builds.
func ShutdownNVML() {}

// IsNVMLAvailable always reports false in stub builds.
func IsNVMLAvailable() bool {
	return false
}

// GetGPUCount always fails in stub builds.
func GetGPUCount() (int, error) {
	return 0, errNVMLUnavailable
}

// GetGPUInfo always fails in stub builds.
func GetGPUInfo(index uint32) (*GPUInfo, error) {
	return nil, errNVMLUnavailable
}

// GetAllGPUInfo always fails in stub builds.
func GetAllGPUInfo() ([]*GPUInfo, error) {
	return nil, errNVMLUnavailable
}

// GetGPUUtilization always fails in stub builds.
func GetGPUUtilization(index uint32) (uint32, error) {
	return 0, errNVMLUnavailable
}

// GetGPUMemory always fails in stub builds.
func GetGPUMemory(index uint32) (used uint64, total uint64, err error) {
	return 0, 0, errNVMLUnavailable
}

// GetGPUTemperature always fails in stub builds.
func GetGPUTemperature(index uint32) (uint32, error) {
	return 0, errNVMLUnavailable
}
|
||||
|
|
@ -71,6 +71,7 @@ enable_testing()
|
|||
add_subdirectory(common)
|
||||
add_subdirectory(queue_index)
|
||||
add_subdirectory(dataset_hash)
|
||||
add_subdirectory(nvml_gpu)
|
||||
|
||||
# Tests from root tests/ directory
|
||||
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/tests)
|
||||
|
|
@ -119,4 +120,5 @@ endif()
|
|||
add_custom_target(all_native_libs DEPENDS
|
||||
queue_index
|
||||
dataset_hash
|
||||
nvml_gpu
|
||||
)
|
||||
|
|
|
|||
35
native/nvml_gpu/CMakeLists.txt
Normal file
35
native/nvml_gpu/CMakeLists.txt
Normal file
|
|
@ -0,0 +1,35 @@
|
|||
# Shared library exposing NVIDIA Management Library (NVML) queries through the
# C API declared in nvml_gpu.h.
add_library(nvml_gpu SHARED
    nvml_gpu.cpp
)

# Consumers need this directory on their include path to find nvml_gpu.h.
target_include_directories(nvml_gpu PUBLIC
    ${CMAKE_CURRENT_SOURCE_DIR}
)

# Find NVML library
find_library(NVML_LIBRARY nvidia-ml
    PATHS
        /usr/lib/x86_64-linux-gnu
        /usr/local/cuda/lib64
        /usr/lib64
        /usr/lib
        /opt/cuda/lib64
    DOC "NVIDIA Management Library"
)

# Find nvml.h. CUDA toolkits ship it under their own include directory, which
# is not on the compiler's default search path.
find_path(NVML_INCLUDE_DIR nvml.h
    PATHS
        /usr/local/cuda/include
        /opt/cuda/include
        /usr/include
    DOC "NVIDIA Management Library headers"
)

if(NVML_LIBRARY)
    target_link_libraries(nvml_gpu PRIVATE ${NVML_LIBRARY})
    message(STATUS "Found NVML: ${NVML_LIBRARY}")
    if(NVML_INCLUDE_DIR)
        target_include_directories(nvml_gpu PRIVATE ${NVML_INCLUDE_DIR})
        message(STATUS "Found NVML headers: ${NVML_INCLUDE_DIR}")
    endif()
else()
    message(WARNING "NVML library not found. GPU monitoring will be disabled.")
    # Create stub library that always returns unavailable
    # NOTE(review): nvml_gpu.cpp does not appear to test NVML_STUB and includes
    # <nvml.h> unconditionally - confirm that this configuration builds.
    target_compile_definitions(nvml_gpu PRIVATE NVML_STUB)
endif()

set_target_properties(nvml_gpu PROPERTIES
    VERSION ${PROJECT_VERSION}
    SOVERSION ${PROJECT_VERSION_MAJOR}
    POSITION_INDEPENDENT_CODE ON
    C_STANDARD 11
    CXX_STANDARD 17
)
|
||||
209
native/nvml_gpu/nvml_gpu.cpp
Normal file
209
native/nvml_gpu/nvml_gpu.cpp
Normal file
|
|
@ -0,0 +1,209 @@
|
|||
#include "nvml_gpu.h"
|
||||
#include <nvml.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
// Per-thread storage for the most recent error message. Each thread sees only
// the errors recorded by calls it made itself.
static thread_local char last_error_buffer[256] = {0};

// Records msg as the calling thread's last error, truncating it to fit the
// buffer and always leaving it NUL-terminated. A null msg is recorded as
// "unknown error" instead of being dereferenced.
static void set_error(const char* msg) {
    snprintf(last_error_buffer, sizeof(last_error_buffer), "%s",
             msg ? msg : "unknown error");
}
|
||||
|
||||
int gpu_init(void) {
|
||||
nvmlReturn_t result = nvmlInit();
|
||||
if (result != NVML_SUCCESS) {
|
||||
set_error(nvmlErrorString(result));
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Shuts NVML down. The status returned by nvmlShutdown is deliberately
// discarded: this function has no way to report it.
void gpu_shutdown(void) {
    (void)nvmlShutdown();
}
|
||||
|
||||
int gpu_get_count(void) {
|
||||
unsigned int count = 0;
|
||||
nvmlReturn_t result = nvmlDeviceGetCount(&count);
|
||||
if (result != NVML_SUCCESS) {
|
||||
set_error(nvmlErrorString(result));
|
||||
return -1;
|
||||
}
|
||||
return (int)count;
|
||||
}
|
||||
|
||||
int gpu_get_info(uint32_t index, gpu_info_t* info) {
|
||||
if (!info) {
|
||||
set_error("null info pointer");
|
||||
return -1;
|
||||
}
|
||||
|
||||
memset(info, 0, sizeof(gpu_info_t));
|
||||
info->index = index;
|
||||
|
||||
nvmlDevice_t device;
|
||||
nvmlReturn_t result = nvmlDeviceGetHandleByIndex(index, &device);
|
||||
if (result != NVML_SUCCESS) {
|
||||
set_error(nvmlErrorString(result));
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Get name
|
||||
result = nvmlDeviceGetName(device, info->name, sizeof(info->name));
|
||||
if (result != NVML_SUCCESS) {
|
||||
set_error(nvmlErrorString(result));
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Get UUID
|
||||
result = nvmlDeviceGetUUID(device, info->uuid, sizeof(info->uuid));
|
||||
if (result != NVML_SUCCESS) {
|
||||
// Non-critical, continue
|
||||
strcpy(info->uuid, "unknown");
|
||||
}
|
||||
|
||||
// Get VBIOS version
|
||||
result = nvmlDeviceGetVbiosVersion(device, info->vbios_version, sizeof(info->vbios_version));
|
||||
if (result != NVML_SUCCESS) {
|
||||
strcpy(info->vbios_version, "unknown");
|
||||
}
|
||||
|
||||
// Get utilization
|
||||
nvmlUtilization_t utilization;
|
||||
result = nvmlDeviceGetUtilizationRates(device, &utilization);
|
||||
if (result == NVML_SUCCESS) {
|
||||
info->utilization = utilization.gpu;
|
||||
}
|
||||
|
||||
// Get memory info
|
||||
nvmlMemory_t memory;
|
||||
result = nvmlDeviceGetMemoryInfo(device, &memory);
|
||||
if (result == NVML_SUCCESS) {
|
||||
info->memory_used = memory.used;
|
||||
info->memory_total = memory.total;
|
||||
}
|
||||
|
||||
// Get temperature
|
||||
result = nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &info->temperature);
|
||||
if (result != NVML_SUCCESS) {
|
||||
info->temperature = 0; // Not available
|
||||
}
|
||||
|
||||
// Get power draw
|
||||
unsigned int power_mw;
|
||||
result = nvmlDeviceGetPowerUsage(device, &power_mw);
|
||||
if (result == NVML_SUCCESS) {
|
||||
info->power_draw = power_mw;
|
||||
}
|
||||
|
||||
// Get clocks
|
||||
unsigned int clock;
|
||||
result = nvmlDeviceGetClock(device, NVML_CLOCK_SM, NVML_CLOCK_ID_CURRENT, &clock);
|
||||
if (result == NVML_SUCCESS) {
|
||||
info->clock_sm = clock;
|
||||
}
|
||||
result = nvmlDeviceGetClock(device, NVML_CLOCK_MEM, NVML_CLOCK_ID_CURRENT, &clock);
|
||||
if (result == NVML_SUCCESS) {
|
||||
info->clock_memory = clock;
|
||||
}
|
||||
|
||||
// Get PCIe info
|
||||
nvmlPciInfo_t pciInfo;
|
||||
result = nvmlDeviceGetPciInfo(device, &pciInfo);
|
||||
if (result == NVML_SUCCESS) {
|
||||
// PCIe generation and width require NVML 11.0+
|
||||
// For now, we leave them as 0
|
||||
info->pcie_gen = 0;
|
||||
info->pcie_width = 0;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int gpu_get_utilization(uint32_t index, uint32_t* utilization) {
|
||||
if (!utilization) {
|
||||
set_error("null utilization pointer");
|
||||
return -1;
|
||||
}
|
||||
|
||||
nvmlDevice_t device;
|
||||
nvmlReturn_t result = nvmlDeviceGetHandleByIndex(index, &device);
|
||||
if (result != NVML_SUCCESS) {
|
||||
set_error(nvmlErrorString(result));
|
||||
return -1;
|
||||
}
|
||||
|
||||
nvmlUtilization_t util;
|
||||
result = nvmlDeviceGetUtilizationRates(device, &util);
|
||||
if (result != NVML_SUCCESS) {
|
||||
set_error(nvmlErrorString(result));
|
||||
return -1;
|
||||
}
|
||||
|
||||
*utilization = util.gpu;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int gpu_get_memory(uint32_t index, uint64_t* used, uint64_t* total) {
|
||||
if (!used || !total) {
|
||||
set_error("null pointer");
|
||||
return -1;
|
||||
}
|
||||
|
||||
nvmlDevice_t device;
|
||||
nvmlReturn_t result = nvmlDeviceGetHandleByIndex(index, &device);
|
||||
if (result != NVML_SUCCESS) {
|
||||
set_error(nvmlErrorString(result));
|
||||
return -1;
|
||||
}
|
||||
|
||||
nvmlMemory_t memory;
|
||||
result = nvmlDeviceGetMemoryInfo(device, &memory);
|
||||
if (result != NVML_SUCCESS) {
|
||||
set_error(nvmlErrorString(result));
|
||||
return -1;
|
||||
}
|
||||
|
||||
*used = memory.used;
|
||||
*total = memory.total;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int gpu_get_temperature(uint32_t index, uint32_t* temp) {
|
||||
if (!temp) {
|
||||
set_error("null temp pointer");
|
||||
return -1;
|
||||
}
|
||||
|
||||
nvmlDevice_t device;
|
||||
nvmlReturn_t result = nvmlDeviceGetHandleByIndex(index, &device);
|
||||
if (result != NVML_SUCCESS) {
|
||||
set_error(nvmlErrorString(result));
|
||||
return -1;
|
||||
}
|
||||
|
||||
unsigned int temperature;
|
||||
result = nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature);
|
||||
if (result != NVML_SUCCESS) {
|
||||
set_error(nvmlErrorString(result));
|
||||
return -1;
|
||||
}
|
||||
|
||||
*temp = temperature;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int gpu_is_available(void) {
|
||||
nvmlReturn_t result = nvmlInit();
|
||||
if (result == NVML_SUCCESS) {
|
||||
nvmlShutdown();
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
const char* gpu_last_error(void) {
|
||||
return last_error_buffer;
|
||||
}
|
||||
66
native/nvml_gpu/nvml_gpu.h
Normal file
66
native/nvml_gpu/nvml_gpu.h
Normal file
|
|
@ -0,0 +1,66 @@
|
|||
#ifndef NVML_GPU_H
|
||||
#define NVML_GPU_H
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Snapshot of one GPU as filled in by gpu_get_info().
typedef struct {
    uint32_t index;          // Device index the snapshot was requested for
    char name[256];          // Device name, NUL-terminated
    uint32_t utilization;    // GPU utilization percentage (0-100)
    uint64_t memory_used;    // Memory used, in bytes
    uint64_t memory_total;   // Total memory, in bytes
    uint32_t temperature;    // Temperature, in degrees Celsius
    uint32_t power_draw;     // Power draw, in milliwatts
    uint32_t clock_sm;       // SM clock, in MHz
    uint32_t clock_memory;   // Memory clock, in MHz
    uint32_t pcie_gen;       // PCIe generation
    uint32_t pcie_width;     // PCIe link width
    char uuid[80];           // GPU UUID, NUL-terminated
    char vbios_version[32];  // VBIOS version, NUL-terminated
} gpu_info_t;
|
||||
|
||||
// Initialize NVML library
|
||||
// Returns 0 on success, non-zero on failure
|
||||
int gpu_init(void);
|
||||
|
||||
// Shutdown NVML library
|
||||
void gpu_shutdown(void);
|
||||
|
||||
// Get number of GPUs
|
||||
// Returns -1 on error, >= 0 on success
|
||||
int gpu_get_count(void);
|
||||
|
||||
// Get GPU info by index
|
||||
// Returns 0 on success, non-zero on failure
|
||||
int gpu_get_info(uint32_t index, gpu_info_t* info);
|
||||
|
||||
// Get current utilization for a GPU
|
||||
// Returns 0 on success, non-zero on failure
|
||||
int gpu_get_utilization(uint32_t index, uint32_t* utilization);
|
||||
|
||||
// Get memory info for a GPU
|
||||
// Returns 0 on success, non-zero on failure
|
||||
int gpu_get_memory(uint32_t index, uint64_t* used, uint64_t* total);
|
||||
|
||||
// Get temperature for a GPU
|
||||
// Returns 0 on success, non-zero on failure
|
||||
int gpu_get_temperature(uint32_t index, uint32_t* temp);
|
||||
|
||||
// Check if NVML is available (runtime detection)
|
||||
// Returns 1 if available, 0 if not
|
||||
int gpu_is_available(void);
|
||||
|
||||
// Get last error message (thread-safe)
|
||||
const char* gpu_last_error(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // NVML_GPU_H
|
||||
Loading…
Reference in a new issue