- Add native/nvml_gpu/ C++ library wrapping NVIDIA Management Library
- Add Go bindings in internal/worker/gpu_nvml_native.go and gpu_nvml_stub.go
- Update gpu_detector.go to use NVML for accurate GPU count detection
- Update native/CMakeLists.txt to build nvml_gpu library
- Provides real-time GPU utilization, memory, temperature, clocks, power
- Falls back to environment variable when NVML unavailable
209 lines
5.2 KiB
C++
209 lines
5.2 KiB
C++
#include "nvml_gpu.h"
|
|
#include <nvml.h>
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
|
|
// Thread-local buffer holding the most recent error message recorded by the
// calling thread. It is always NUL-terminated.
static __thread char last_error_buffer[256] = {0};

/*
 * Record `msg` as the calling thread's last error.
 *
 * The message is truncated to fit the buffer and is always NUL-terminated.
 * A NULL `msg` is stored as "unknown error" instead of being dereferenced.
 */
static void set_error(const char* msg) {
    if (msg == NULL) {
        msg = "unknown error";
    }

    // snprintf bounds the write and terminates the string in a single step,
    // which avoids the strncpy + manual-terminator pattern.
    snprintf(last_error_buffer, sizeof(last_error_buffer), "%s", msg);
}
|
|
|
|
int gpu_init(void) {
|
|
nvmlReturn_t result = nvmlInit();
|
|
if (result != NVML_SUCCESS) {
|
|
set_error(nvmlErrorString(result));
|
|
return -1;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
/* Shut NVML down. The status returned by nvmlShutdown() is discarded. */
void gpu_shutdown(void) {
    (void)nvmlShutdown();
}
|
|
|
|
int gpu_get_count(void) {
|
|
unsigned int count = 0;
|
|
nvmlReturn_t result = nvmlDeviceGetCount(&count);
|
|
if (result != NVML_SUCCESS) {
|
|
set_error(nvmlErrorString(result));
|
|
return -1;
|
|
}
|
|
return (int)count;
|
|
}
|
|
|
|
/*
 * Fill `info` with a snapshot of the GPU at `index`.
 *
 * Required data (failure returns -1 and records the NVML error string via
 * set_error()):
 *   - the device handle for `index`
 *   - the device name
 *
 * Optional data (failure is tolerated and the function still returns 0):
 *   - UUID and VBIOS version: set to "unknown" when unavailable
 *   - utilization, memory, temperature, power draw, SM/memory clocks and the
 *     current PCIe link generation/width: left at 0 when unavailable
 *
 * Parameters:
 *   index - NVML device index to query.
 *   info  - destination record; must not be NULL. It is zeroed before any
 *           field is populated.
 *
 * Returns 0 on success, -1 on failure.
 */
int gpu_get_info(uint32_t index, gpu_info_t* info) {
    if (!info) {
        set_error("null info pointer");
        return -1;
    }

    // Start from an all-zero record so every optional metric that cannot be
    // read below is reported as 0.
    memset(info, 0, sizeof(gpu_info_t));
    info->index = index;

    nvmlDevice_t device;
    nvmlReturn_t result = nvmlDeviceGetHandleByIndex(index, &device);
    if (result != NVML_SUCCESS) {
        set_error(nvmlErrorString(result));
        return -1;
    }

    // Name is mandatory: without it the record is treated as unusable.
    result = nvmlDeviceGetName(device, info->name, sizeof(info->name));
    if (result != NVML_SUCCESS) {
        set_error(nvmlErrorString(result));
        return -1;
    }

    // UUID is non-critical; fall back to a placeholder. The write is bounded
    // by the destination size rather than relying on the buffer being large
    // enough for an unchecked strcpy.
    result = nvmlDeviceGetUUID(device, info->uuid, sizeof(info->uuid));
    if (result != NVML_SUCCESS) {
        snprintf(info->uuid, sizeof(info->uuid), "%s", "unknown");
    }

    // VBIOS version is non-critical as well.
    result = nvmlDeviceGetVbiosVersion(device, info->vbios_version, sizeof(info->vbios_version));
    if (result != NVML_SUCCESS) {
        snprintf(info->vbios_version, sizeof(info->vbios_version), "%s", "unknown");
    }

    // GPU utilization as reported by NVML.
    nvmlUtilization_t utilization;
    result = nvmlDeviceGetUtilizationRates(device, &utilization);
    if (result == NVML_SUCCESS) {
        info->utilization = utilization.gpu;
    }

    // Used and total device memory.
    nvmlMemory_t memory;
    result = nvmlDeviceGetMemoryInfo(device, &memory);
    if (result == NVML_SUCCESS) {
        info->memory_used = memory.used;
        info->memory_total = memory.total;
    }

    // Temperature of the GPU sensor; reset to 0 if the query fails so a
    // partially written value is never exposed.
    result = nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &info->temperature);
    if (result != NVML_SUCCESS) {
        info->temperature = 0; // Not available
    }

    // Power draw, stored exactly as NVML reports it (milliwatts per the NVML
    // documentation).
    unsigned int power_mw = 0;
    result = nvmlDeviceGetPowerUsage(device, &power_mw);
    if (result == NVML_SUCCESS) {
        info->power_draw = power_mw;
    }

    // Current SM and memory clocks.
    unsigned int clock = 0;
    result = nvmlDeviceGetClock(device, NVML_CLOCK_SM, NVML_CLOCK_ID_CURRENT, &clock);
    if (result == NVML_SUCCESS) {
        info->clock_sm = clock;
    }
    result = nvmlDeviceGetClock(device, NVML_CLOCK_MEM, NVML_CLOCK_ID_CURRENT, &clock);
    if (result == NVML_SUCCESS) {
        info->clock_memory = clock;
    }

    // Current PCIe link generation and width. These are read through the
    // dedicated link queries; the values stay 0 (from the memset above) when
    // the device or driver does not support them.
    unsigned int pcie_value = 0;
    result = nvmlDeviceGetCurrPcieLinkGeneration(device, &pcie_value);
    if (result == NVML_SUCCESS) {
        info->pcie_gen = pcie_value;
    }
    result = nvmlDeviceGetCurrPcieLinkWidth(device, &pcie_value);
    if (result == NVML_SUCCESS) {
        info->pcie_width = pcie_value;
    }

    return 0;
}
|
|
|
|
int gpu_get_utilization(uint32_t index, uint32_t* utilization) {
|
|
if (!utilization) {
|
|
set_error("null utilization pointer");
|
|
return -1;
|
|
}
|
|
|
|
nvmlDevice_t device;
|
|
nvmlReturn_t result = nvmlDeviceGetHandleByIndex(index, &device);
|
|
if (result != NVML_SUCCESS) {
|
|
set_error(nvmlErrorString(result));
|
|
return -1;
|
|
}
|
|
|
|
nvmlUtilization_t util;
|
|
result = nvmlDeviceGetUtilizationRates(device, &util);
|
|
if (result != NVML_SUCCESS) {
|
|
set_error(nvmlErrorString(result));
|
|
return -1;
|
|
}
|
|
|
|
*utilization = util.gpu;
|
|
return 0;
|
|
}
|
|
|
|
int gpu_get_memory(uint32_t index, uint64_t* used, uint64_t* total) {
|
|
if (!used || !total) {
|
|
set_error("null pointer");
|
|
return -1;
|
|
}
|
|
|
|
nvmlDevice_t device;
|
|
nvmlReturn_t result = nvmlDeviceGetHandleByIndex(index, &device);
|
|
if (result != NVML_SUCCESS) {
|
|
set_error(nvmlErrorString(result));
|
|
return -1;
|
|
}
|
|
|
|
nvmlMemory_t memory;
|
|
result = nvmlDeviceGetMemoryInfo(device, &memory);
|
|
if (result != NVML_SUCCESS) {
|
|
set_error(nvmlErrorString(result));
|
|
return -1;
|
|
}
|
|
|
|
*used = memory.used;
|
|
*total = memory.total;
|
|
return 0;
|
|
}
|
|
|
|
int gpu_get_temperature(uint32_t index, uint32_t* temp) {
|
|
if (!temp) {
|
|
set_error("null temp pointer");
|
|
return -1;
|
|
}
|
|
|
|
nvmlDevice_t device;
|
|
nvmlReturn_t result = nvmlDeviceGetHandleByIndex(index, &device);
|
|
if (result != NVML_SUCCESS) {
|
|
set_error(nvmlErrorString(result));
|
|
return -1;
|
|
}
|
|
|
|
unsigned int temperature;
|
|
result = nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature);
|
|
if (result != NVML_SUCCESS) {
|
|
set_error(nvmlErrorString(result));
|
|
return -1;
|
|
}
|
|
|
|
*temp = temperature;
|
|
return 0;
|
|
}
|
|
|
|
int gpu_is_available(void) {
|
|
nvmlReturn_t result = nvmlInit();
|
|
if (result == NVML_SUCCESS) {
|
|
nvmlShutdown();
|
|
return 1;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
const char* gpu_last_error(void) {
|
|
return last_error_buffer;
|
|
}
|