- Add native/nvml_gpu/ C++ library wrapping the NVIDIA Management Library (NVML)
- Add Go bindings in internal/worker/gpu_nvml_native.go and gpu_nvml_stub.go
- Update gpu_detector.go to use NVML for accurate GPU count detection
- Update native/CMakeLists.txt to build the nvml_gpu library
- Provides real-time GPU utilization, memory, temperature, clocks, and power
- Falls back to an environment variable when NVML is unavailable
66 lines
1.8 KiB
C
66 lines
1.8 KiB
C
#ifndef NVML_GPU_H
|
|
#define NVML_GPU_H
|
|
|
|
#include <stddef.h>
|
|
#include <stdint.h>
|
|
|
|
#ifdef __cplusplus
|
|
extern "C" {
|
|
#endif
|
|
|
|
// GPU information structure: one record describing a single GPU.
// gpu_get_info() takes a pointer to one of these.
// NOTE(review): the Go bindings presumably mirror this layout — confirm
// before reordering fields or changing any array size.
typedef struct {
    uint32_t index;          // GPU index (presumably the index passed to gpu_get_info() — confirm)
    char name[256];          // Device name buffer (presumably NUL-terminated — confirm)
    uint32_t utilization;    // GPU utilization percentage (0-100)
    uint64_t memory_used;    // Memory used in bytes
    uint64_t memory_total;   // Total memory in bytes
    uint32_t temperature;    // Temperature in Celsius
    uint32_t power_draw;     // Power draw in milliwatts
    uint32_t clock_sm;       // SM clock in MHz
    uint32_t clock_memory;   // Memory clock in MHz
    uint32_t pcie_gen;       // PCIe generation
    uint32_t pcie_width;     // PCIe link width
    char uuid[80];           // GPU UUID
    char vbios_version[32];  // VBIOS version
} gpu_info_t;
|
|
|
|
// Initialize the NVML library.
// Returns 0 on success, non-zero on failure.
// NOTE(review): presumably gpu_last_error() describes the failure — confirm
// against the implementation.
int gpu_init(void);
|
|
|
|
// Shut down the NVML library.
// No status is reported to the caller.
void gpu_shutdown(void);
|
|
|
|
// Get the number of GPUs.
// Returns the count (>= 0) on success, -1 on error.
int gpu_get_count(void);
|
|
|
|
// Get GPU info by index
|
|
// Returns 0 on success, non-zero on failure
|
|
int gpu_get_info(uint32_t index, gpu_info_t* info);
|
|
|
|
// Get the current utilization for a GPU.
//   index:       which GPU to query.
//   utilization: receives the utilization value (presumably a 0-100
//                percentage, matching gpu_info_t.utilization — confirm).
// Returns 0 on success, non-zero on failure.
int gpu_get_utilization(uint32_t index, uint32_t *utilization);
|
|
|
|
// Get memory info for a GPU.
//   index: which GPU to query.
//   used:  receives the memory in use (presumably bytes, matching
//          gpu_info_t.memory_used — confirm).
//   total: receives the total memory (presumably bytes, matching
//          gpu_info_t.memory_total — confirm).
// Returns 0 on success, non-zero on failure.
int gpu_get_memory(uint32_t index, uint64_t *used, uint64_t *total);
|
|
|
|
// Get the temperature for a GPU.
//   index: which GPU to query.
//   temp:  receives the temperature (presumably Celsius, matching
//          gpu_info_t.temperature — confirm).
// Returns 0 on success, non-zero on failure.
int gpu_get_temperature(uint32_t index, uint32_t *temp);
|
|
|
|
// Check whether NVML is available (runtime detection).
// Returns 1 if available, 0 if not.
int gpu_is_available(void);
|
|
|
|
// Get the last error message (thread-safe).
// NOTE(review): ownership and lifetime of the returned string are not stated
// in this header — presumably the caller must not free or modify it; confirm
// against the implementation.
const char *gpu_last_error(void);
|
|
|
|
#ifdef __cplusplus
|
|
}
|
|
#endif
|
|
|
|
#endif // NVML_GPU_H
|