/*
 * fetch_ml/native/nvml_gpu/nvml_gpu.h
 * Jeremie Fraeys 05b7af6991
 * feat: implement NVML-based GPU monitoring
 * - Add native/nvml_gpu/ C++ library wrapping NVIDIA Management Library
 * - Add Go bindings in internal/worker/gpu_nvml_native.go and gpu_nvml_stub.go
 * - Update gpu_detector.go to use NVML for accurate GPU count detection
 * - Update native/CMakeLists.txt to build nvml_gpu library
 * - Provides real-time GPU utilization, memory, temperature, clocks, power
 * - Falls back to environment variable when NVML unavailable
 * 2026-02-21 15:16:09 -05:00
 *
 * 66 lines
 * 1.8 KiB
 * C
 */

#ifndef NVML_GPU_H
#define NVML_GPU_H
#include <stddef.h>
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
// GPU information structure
// Record describing one GPU: identity strings plus telemetry values. It is
// the output type of gpu_get_info(). Units are given per field below.
// The text fields are fixed-size char arrays; whether they are always
// NUL-terminated is decided by the implementation, which is not visible in
// this header — TODO confirm before treating them as C strings.
// NOTE(review): field order and sizes are presumably mirrored by the Go
// bindings, so changing the layout would be an ABI break — confirm.
typedef struct {
uint32_t index; // Device index (presumably the index passed to gpu_get_info — confirm)
char name[256]; // Device name (256-byte buffer)
uint32_t utilization; // GPU utilization percentage (0-100)
uint64_t memory_used; // Memory used in bytes
uint64_t memory_total; // Total memory in bytes
uint32_t temperature; // Temperature in Celsius
uint32_t power_draw; // Power draw in milliwatts
uint32_t clock_sm; // SM clock in MHz
uint32_t clock_memory; // Memory clock in MHz
uint32_t pcie_gen; // PCIe generation (current vs. maximum not stated — TODO confirm)
uint32_t pcie_width; // PCIe link width (current vs. maximum not stated — TODO confirm)
char uuid[80]; // GPU UUID (80-byte buffer)
char vbios_version[32]; // VBIOS version (32-byte buffer)
} gpu_info_t;
// Initialize NVML library
// Takes no arguments.
// Returns 0 on success, non-zero on failure
// This header does not define specific failure codes; gpu_last_error()
// presumably describes the most recent failure — confirm.
// NOTE(review): presumably this must succeed before the gpu_get_* queries
// are used, and should be paired with gpu_shutdown() — confirm against the
// implementation.
int gpu_init(void);
// Shutdown NVML library
// Counterpart of gpu_init(). Returns nothing, so a failure during shutdown
// cannot be reported through this call's return value.
// NOTE(review): whether it is safe to call this without a prior successful
// gpu_init(), or to call it more than once, is not stated here — confirm.
void gpu_shutdown(void);
// Get number of GPUs
// Returns -1 on error, >= 0 on success
// Unlike the other functions in this header (0 = success, non-zero =
// failure), the return value here is the count itself: 0 is a valid result
// meaning "no GPUs", and only a negative value signals an error. Check for
// the error case before using the result as an unsigned index bound.
int gpu_get_count(void);
// Get GPU info by index
//   index: which GPU to query (presumably 0-based and less than
//          gpu_get_count() — TODO confirm)
//   info:  output; receives the gpu_info_t record for that GPU
// Returns 0 on success, non-zero on failure
// NOTE(review): the contents of *info after a failure, and whether a NULL
// info is rejected or dereferenced, are not specified in this header —
// confirm against the implementation.
int gpu_get_info(uint32_t index, gpu_info_t* info);
// Get current utilization for a GPU
//   index:       which GPU to query (presumably 0-based — TODO confirm)
//   utilization: output; receives the utilization value (presumably a
//                percentage 0-100, matching gpu_info_t.utilization — confirm)
// Returns 0 on success, non-zero on failure
// NOTE(review): behavior for a NULL output pointer is not specified here.
int gpu_get_utilization(uint32_t index, uint32_t* utilization);
// Get memory info for a GPU
//   index: which GPU to query (presumably 0-based — TODO confirm)
//   used:  output; receives the amount of memory in use
//   total: output; receives the total amount of memory
//          (both presumably in bytes, matching gpu_info_t.memory_used /
//          memory_total — confirm)
// Returns 0 on success, non-zero on failure
// NOTE(review): whether either output pointer may be NULL is not specified
// here.
int gpu_get_memory(uint32_t index, uint64_t* used, uint64_t* total);
// Get temperature for a GPU
//   index: which GPU to query (presumably 0-based — TODO confirm)
//   temp:  output; receives the temperature (presumably in Celsius,
//          matching gpu_info_t.temperature — confirm)
// Returns 0 on success, non-zero on failure
// NOTE(review): behavior for a NULL output pointer is not specified here.
int gpu_get_temperature(uint32_t index, uint32_t* temp);
// Check if NVML is available (runtime detection)
// Takes no arguments.
// Returns 1 if available, 0 if not
// The result is a plain int used as a boolean; no error code is reported.
// NOTE(review): whether this call requires gpu_init() first, or performs
// any initialization itself, is not stated in this header — confirm.
int gpu_is_available(void);
// Get last error message (thread-safe)
// Returns a pointer to a read-only string (const char*); callers must not
// write through it.
// NOTE(review): ownership and lifetime of the returned string are not
// stated here — presumably it is owned by the library and must not be
// freed, and may be overwritten by a later failing call; confirm.
// NOTE(review): what is returned when no error has occurred (NULL vs. an
// empty string) is not specified; the mechanism behind the "thread-safe"
// claim is likewise not visible in this header.
const char* gpu_last_error(void);
#ifdef __cplusplus
}
#endif
#endif // NVML_GPU_H