fetch_ml/native/nvml_gpu/nvml_gpu.cpp
Jeremie Fraeys c56e53cb52
fix: NVML stub support for systems without NVIDIA drivers
- Add stub implementation in nvml_gpu.cpp when NVML not available
- CMakeLists.txt checks for both NVML library and headers
- Build succeeds on macOS/non-NVIDIA systems with stub
- Runtime detection via gpu_is_available() prevents runtime errors
2026-02-21 15:16:54 -05:00

227 lines
5.8 KiB
C++

#include "nvml_gpu.h"
#include <string.h>
#ifdef NVML_STUB
// Stub implementation when NVML is not available
// Stub: NVML is not compiled in, so initialization always reports failure (-1).
int gpu_init(void) {
    const int status = -1;
    return status;
}
// Stub: nothing is ever initialized in this build, so shutdown does nothing.
void gpu_shutdown(void) {
    return;
}
// Stub: the device count cannot be queried without NVML; always returns -1.
int gpu_get_count(void) {
    const int status = -1;
    return status;
}
// Stub: no device information is available without NVML.
// Always returns -1 and leaves `*info` untouched.
int gpu_get_info(uint32_t index, gpu_info_t* info) {
    (void)index;
    (void)info;
    return -1;
}
// Stub: utilization cannot be read without NVML.
// Always returns -1 and leaves `*utilization` untouched.
int gpu_get_utilization(uint32_t index, uint32_t* utilization) {
    (void)index;
    (void)utilization;
    return -1;
}
// Stub: memory usage cannot be read without NVML.
// Always returns -1 and leaves `*used` and `*total` untouched.
int gpu_get_memory(uint32_t index, uint64_t* used, uint64_t* total) {
    (void)index;
    (void)used;
    (void)total;
    return -1;
}
// Stub: temperature cannot be read without NVML.
// Always returns -1 and leaves `*temp` untouched.
int gpu_get_temperature(uint32_t index, uint32_t* temp) {
    (void)index;
    (void)temp;
    return -1;
}
// Stub: reports that no GPU support is available (always 0) so callers can
// detect this build at runtime.
int gpu_is_available(void) {
    const int available = 0;
    return available;
}
// Stub: every failure in this build has the same cause, so the error text is
// a fixed string.
const char* gpu_last_error(void) {
    static const char message[] = "NVML not available";
    return message;
}
#else
// Full NVML implementation
#include <nvml.h>
#include <stdio.h>
// Per-thread buffer holding the most recent error message; it is what
// gpu_last_error() returns. Starts out as the empty string.
static __thread char last_error_buffer[256] = {0};

// Record `msg` as the calling thread's last error.
// The text is truncated to fit the buffer and is always NUL-terminated.
// A NULL `msg` is recorded as "unknown error" instead of being dereferenced.
static void set_error(const char* msg) {
    if (msg == NULL) {
        msg = "unknown error";
    }
    // snprintf bounds the copy and terminates the string in one step.
    snprintf(last_error_buffer, sizeof(last_error_buffer), "%s", msg);
}
int gpu_init(void) {
nvmlReturn_t result = nvmlInit();
if (result != NVML_SUCCESS) {
set_error(nvmlErrorString(result));
return -1;
}
return 0;
}
// Shut NVML down. The status returned by nvmlShutdown() is deliberately
// discarded: this function has no way to report it.
void gpu_shutdown(void) {
    (void)nvmlShutdown();
}
int gpu_get_count(void) {
unsigned int count = 0;
nvmlReturn_t result = nvmlDeviceGetCount(&count);
if (result != NVML_SUCCESS) {
set_error(nvmlErrorString(result));
return -1;
}
return (int)count;
}
// Fill `*info` with a snapshot of the GPU at `index`.
//
// `*info` is zeroed first, so every field whose query fails is left at 0;
// uuid and vbios_version fall back to the string "unknown" instead.
// Only a NULL `info`, a failed device-handle lookup, or a failed name query is
// treated as an error: the message is stored for gpu_last_error() and -1 is
// returned. Returns 0 otherwise, even if some optional queries failed.
int gpu_get_info(uint32_t index, gpu_info_t* info) {
    if (!info) {
        set_error("null info pointer");
        return -1;
    }

    memset(info, 0, sizeof(*info));
    info->index = index;

    nvmlDevice_t device;
    nvmlReturn_t result = nvmlDeviceGetHandleByIndex(index, &device);
    if (result != NVML_SUCCESS) {
        set_error(nvmlErrorString(result));
        return -1;
    }

    // The device name is the only mandatory attribute.
    result = nvmlDeviceGetName(device, info->name, sizeof(info->name));
    if (result != NVML_SUCCESS) {
        set_error(nvmlErrorString(result));
        return -1;
    }

    // UUID and VBIOS version are optional. The fallback text is written with a
    // bounded snprintf so it cannot overrun the destination field.
    if (nvmlDeviceGetUUID(device, info->uuid, sizeof(info->uuid)) != NVML_SUCCESS) {
        snprintf(info->uuid, sizeof(info->uuid), "%s", "unknown");
    }
    if (nvmlDeviceGetVbiosVersion(device, info->vbios_version,
                                  sizeof(info->vbios_version)) != NVML_SUCCESS) {
        snprintf(info->vbios_version, sizeof(info->vbios_version), "%s", "unknown");
    }

    // GPU utilization (the `gpu` member of the NVML utilization rates).
    nvmlUtilization_t utilization;
    if (nvmlDeviceGetUtilizationRates(device, &utilization) == NVML_SUCCESS) {
        info->utilization = utilization.gpu;
    }

    // Memory usage.
    nvmlMemory_t memory;
    if (nvmlDeviceGetMemoryInfo(device, &memory) == NVML_SUCCESS) {
        info->memory_used = memory.used;
        info->memory_total = memory.total;
    }

    // Temperature is read into a local so a failed query cannot leave a
    // partial value in the struct; on failure the field stays 0.
    unsigned int temperature = 0;
    if (nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature) == NVML_SUCCESS) {
        info->temperature = temperature;
    }

    // Power draw, stored exactly as nvmlDeviceGetPowerUsage reports it.
    // NOTE(review): NVML documents this value as milliwatts — confirm that is
    // the unit gpu_info_t.power_draw is expected to carry.
    unsigned int power_mw = 0;
    if (nvmlDeviceGetPowerUsage(device, &power_mw) == NVML_SUCCESS) {
        info->power_draw = power_mw;
    }

    // Current SM and memory clocks.
    unsigned int clock = 0;
    if (nvmlDeviceGetClock(device, NVML_CLOCK_SM, NVML_CLOCK_ID_CURRENT, &clock) == NVML_SUCCESS) {
        info->clock_sm = clock;
    }
    if (nvmlDeviceGetClock(device, NVML_CLOCK_MEM, NVML_CLOCK_ID_CURRENT, &clock) == NVML_SUCCESS) {
        info->clock_memory = clock;
    }

    // pcie_gen and pcie_width are not queried; they stay 0 from the memset.
    // NOTE(review): nvmlDeviceGetCurrPcieLinkGeneration / ...LinkWidth look
    // like the right source if these fields should be populated — confirm.

    return 0;
}
int gpu_get_utilization(uint32_t index, uint32_t* utilization) {
if (!utilization) {
set_error("null utilization pointer");
return -1;
}
nvmlDevice_t device;
nvmlReturn_t result = nvmlDeviceGetHandleByIndex(index, &device);
if (result != NVML_SUCCESS) {
set_error(nvmlErrorString(result));
return -1;
}
nvmlUtilization_t util;
result = nvmlDeviceGetUtilizationRates(device, &util);
if (result != NVML_SUCCESS) {
set_error(nvmlErrorString(result));
return -1;
}
*utilization = util.gpu;
return 0;
}
int gpu_get_memory(uint32_t index, uint64_t* used, uint64_t* total) {
if (!used || !total) {
set_error("null pointer");
return -1;
}
nvmlDevice_t device;
nvmlReturn_t result = nvmlDeviceGetHandleByIndex(index, &device);
if (result != NVML_SUCCESS) {
set_error(nvmlErrorString(result));
return -1;
}
nvmlMemory_t memory;
result = nvmlDeviceGetMemoryInfo(device, &memory);
if (result != NVML_SUCCESS) {
set_error(nvmlErrorString(result));
return -1;
}
*used = memory.used;
*total = memory.total;
return 0;
}
int gpu_get_temperature(uint32_t index, uint32_t* temp) {
if (!temp) {
set_error("null temp pointer");
return -1;
}
nvmlDevice_t device;
nvmlReturn_t result = nvmlDeviceGetHandleByIndex(index, &device);
if (result != NVML_SUCCESS) {
set_error(nvmlErrorString(result));
return -1;
}
unsigned int temperature;
result = nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature);
if (result != NVML_SUCCESS) {
set_error(nvmlErrorString(result));
return -1;
}
*temp = temperature;
return 0;
}
int gpu_is_available(void) {
nvmlReturn_t result = nvmlInit();
if (result == NVML_SUCCESS) {
nvmlShutdown();
return 1;
}
return 0;
}
// Return the calling thread's most recent error message. The buffer is
// thread-local and starts out empty, so this is "" until an error is stored.
// The pointer refers to internal storage that later errors overwrite.
const char* gpu_last_error(void) {
    const char* message = last_error_buffer;
    return message;
}
#endif // NVML_STUB