//go:build cgo && native_libs && linux // +build cgo,native_libs,linux package worker // #cgo LDFLAGS: -L${SRCDIR}/../../native/build -Wl,-rpath,${SRCDIR}/../../native/build -lnvml_gpu -lnvidia-ml // #include "../../native/nvml_gpu/nvml_gpu.h" // #include import "C" import ( "errors" "fmt" ) // GPUInfo holds information about a GPU type GPUInfo struct { Index uint32 Name string Utilization uint32 // GPU utilization percentage (0-100) MemoryUsed uint64 // Memory used in bytes MemoryTotal uint64 // Total memory in bytes Temperature uint32 // Temperature in Celsius PowerDraw uint32 // Power draw in milliwatts ClockSM uint32 // SM clock in MHz ClockMemory uint32 // Memory clock in MHz PCIeGen uint32 // PCIe generation PCIeWidth uint32 // PCIe link width UUID string // GPU UUID VBIOSVersion string // VBIOS version } var ( nvmlInitialized = false ) // InitNVML initializes the NVML library func InitNVML() error { result := C.gpu_init() if result != 0 { return errors.New(C.GoString(C.gpu_last_error())) } nvmlInitialized = true return nil } // ShutdownNVML shuts down the NVML library func ShutdownNVML() { if nvmlInitialized { C.gpu_shutdown() nvmlInitialized = false } } // IsNVMLAvailable checks if NVML is available at runtime func IsNVMLAvailable() bool { return C.gpu_is_available() == 1 } // GetGPUCount returns the number of GPUs func GetGPUCount() (int, error) { if !nvmlInitialized { if err := InitNVML(); err != nil { return 0, err } } count := C.gpu_get_count() if count < 0 { return 0, errors.New(C.GoString(C.gpu_last_error())) } return int(count), nil } // GetGPUInfo returns detailed information about a GPU func GetGPUInfo(index uint32) (*GPUInfo, error) { if !nvmlInitialized { if err := InitNVML(); err != nil { return nil, err } } var cInfo C.gpu_info_t result := C.gpu_get_info(C.uint32_t(index), &cInfo) if result != 0 { return nil, errors.New(C.GoString(C.gpu_last_error())) } return &GPUInfo{ Index: uint32(cInfo.index), Name: C.GoString(&cInfo.name[0]), Utilization: uint32(cInfo.utilization), MemoryUsed: uint64(cInfo.memory_used), MemoryTotal: uint64(cInfo.memory_total), Temperature: uint32(cInfo.temperature), PowerDraw: uint32(cInfo.power_draw), ClockSM: uint32(cInfo.clock_sm), ClockMemory: uint32(cInfo.clock_memory), PCIeGen: uint32(cInfo.pcie_gen), PCIeWidth: uint32(cInfo.pcie_width), UUID: C.GoString(&cInfo.uuid[0]), VBIOSVersion: C.GoString(&cInfo.vbios_version[0]), }, nil } // GetAllGPUInfo returns information about all GPUs func GetAllGPUInfo() ([]*GPUInfo, error) { count, err := GetGPUCount() if err != nil { return nil, err } gpus := make([]*GPUInfo, 0, count) for i := 0; i < count; i++ { info, err := GetGPUInfo(uint32(i)) if err != nil { return nil, fmt.Errorf("failed to get GPU %d info: %w", i, err) } gpus = append(gpus, info) } return gpus, nil } // GetGPUUtilization returns the current GPU utilization func GetGPUUtilization(index uint32) (uint32, error) { if !nvmlInitialized { if err := InitNVML(); err != nil { return 0, err } } var utilization C.uint32_t result := C.gpu_get_utilization(C.uint32_t(index), &utilization) if result != 0 { return 0, errors.New(C.GoString(C.gpu_last_error())) } return uint32(utilization), nil } // GetGPUMemory returns the current GPU memory usage func GetGPUMemory(index uint32) (used uint64, total uint64, err error) { if !nvmlInitialized { if err := InitNVML(); err != nil { return 0, 0, err } } var cUsed, cTotal C.uint64_t result := C.gpu_get_memory(C.uint32_t(index), &cUsed, &cTotal) if result != 0 { return 0, 0, errors.New(C.GoString(C.gpu_last_error())) } return uint64(cUsed), uint64(cTotal), nil } // GetGPUTemperature returns the current GPU temperature func GetGPUTemperature(index uint32) (uint32, error) { if !nvmlInitialized { if err := InitNVML(); err != nil { return 0, err } } var temp C.uint32_t result := C.gpu_get_temperature(C.uint32_t(index), &temp) if result != 0 { return 0, errors.New(C.GoString(C.gpu_last_error())) } return uint32(temp), nil }