fetch_ml/cli/src/native/nvml.zig
Jeremie Fraeys 6faa13aabf
refactor(cli): Remove progress UI and update native/server code
Delete cli/src/ui/progress.zig (removing progress bars/spinners)

Update native GPU detection modules

Update server experiment API
2026-02-23 14:12:48 -05:00

372 lines
12 KiB
Zig

const std = @import("std");
const builtin = @import("builtin");
// NVML dynamic loader for the CLI.
// Pure Zig implementation built on dlopen/LoadLibrary, so there is no
// build-time dependency on the NVIDIA SDK.

/// Minimal platform-specific wrapper around the OS dynamic loader.
///
/// Both variants expose the same three operations:
/// - `open(path)` loads a shared library, failing with `error.LibraryNotFound`
///   when the loader cannot open it;
/// - `lookup(name)` resolves an exported symbol, returning null when absent;
/// - `close()` releases the library handle.
const DynLib = switch (builtin.os.tag) {
    .windows => struct {
        handle: std.os.windows.HMODULE,

        // Declared directly (like the dlopen family in the other branch) so
        // this loader does not depend on the signatures of std's Windows
        // wrappers.
        extern "kernel32" fn LoadLibraryW(lpLibFileName: [*:0]const u16) callconv(std.os.windows.WINAPI) ?std.os.windows.HMODULE;
        extern "kernel32" fn GetProcAddress(hModule: std.os.windows.HMODULE, lpProcName: [*:0]const u8) callconv(std.os.windows.WINAPI) ?*anyopaque;
        extern "kernel32" fn FreeLibrary(hModule: std.os.windows.HMODULE) callconv(std.os.windows.WINAPI) c_int;

        /// Longest library path or symbol name accepted, excluding the
        /// terminator. Names are copied into stack buffers of this size.
        const max_name_len = 1023;

        /// Load the library at `path` (UTF-8). The path is handed to the
        /// loader unchanged, so a bare file name uses the normal DLL search
        /// order.
        fn open(path: []const u8) !@This() {
            // A UTF-16 encoding never needs more code units than the UTF-8
            // input has bytes, so this check guarantees the buffer is large
            // enough.
            if (path.len > max_name_len) return error.NameTooLong;
            var wide: [max_name_len + 1]u16 = undefined;
            const len = std.unicode.utf8ToUtf16Le(wide[0..max_name_len], path) catch
                return error.InvalidLibraryPath;
            wide[len] = 0;
            const handle = LoadLibraryW(wide[0..len :0].ptr) orelse return error.LibraryNotFound;
            return .{ .handle = handle };
        }

        /// Release the library handle. The result of FreeLibrary is ignored.
        fn close(self: *@This()) void {
            _ = FreeLibrary(self.handle);
        }

        /// Resolve the exported symbol `name`; null when it is not exported
        /// or the name does not fit the internal buffer.
        fn lookup(self: @This(), name: []const u8) ?*anyopaque {
            if (name.len > max_name_len) return null;
            // GetProcAddress needs a NUL-terminated string; `name` is a slice.
            var buf: [max_name_len + 1]u8 = undefined;
            @memcpy(buf[0..name.len], name);
            buf[name.len] = 0;
            return GetProcAddress(self.handle, buf[0..name.len :0].ptr);
        }
    },
    else => struct {
        handle: *anyopaque,

        // Extern declarations for the libc dynamic loader.
        extern "c" fn dlopen(pathname: [*:0]const u8, mode: c_int) ?*anyopaque;
        extern "c" fn dlsym(handle: *anyopaque, symbol: [*:0]const u8) ?*anyopaque;
        extern "c" fn dlclose(handle: *anyopaque) c_int;

        const RTLD_NOW = 2;

        /// Load the library at `path` with RTLD_NOW binding.
        fn open(path: []const u8) !@This() {
            // dlopen needs a NUL-terminated string; make a temporary copy.
            const c_path = try std.heap.c_allocator.dupeZ(u8, path);
            defer std.heap.c_allocator.free(c_path);
            const handle = dlopen(c_path.ptr, RTLD_NOW) orelse return error.LibraryNotFound;
            return .{ .handle = handle };
        }

        /// Release the library handle. The result of dlclose is ignored.
        fn close(self: *@This()) void {
            _ = dlclose(self.handle);
        }

        /// Resolve the exported symbol `name`; null when it is not exported
        /// or the temporary NUL-terminated copy cannot be allocated.
        fn lookup(self: @This(), name: []const u8) ?*anyopaque {
            const c_name = std.heap.c_allocator.dupeZ(u8, name) catch return null;
            defer std.heap.c_allocator.free(c_name);
            return dlsym(self.handle, c_name.ptr);
        }
    },
};
// NVML type definitions (mirrors nvml.h)
/// Status code returned by every NVML entry point declared in this file;
/// callers compare it against `NVML_SUCCESS`.
pub const nvmlReturn_t = c_int;
/// Opaque device handle, filled in by `nvmlDeviceGetHandleByIndex_v2` and
/// passed to the per-device query functions.
pub const nvmlDevice_t = *anyopaque;
/// Output structure of `nvmlDeviceGetUtilizationRates`.
/// `extern struct` keeps the C field layout.
pub const nvmlUtilization_t = extern struct {
// GPU utilization; copied into `GPUInfo.utilization` and displayed with a `%` suffix.
gpu: c_uint,
// Not read anywhere in this file; presumably memory utilization — confirm against nvml.h.
memory: c_uint,
};
/// Output structure of `nvmlDeviceGetMemoryInfo`.
/// `extern struct` keeps the C field layout.
pub const nvmlMemory_t = extern struct {
// Copied into `GPUInfo.memory_total`; the formatter divides by 1024*1024 and labels the result MB.
total: c_ulonglong,
// Not read anywhere in this file.
free: c_ulonglong,
// Copied into `GPUInfo.memory_used`; same scaling as `total` when displayed.
used: c_ulonglong,
};
// NVML constants (values mirror the enums in nvml.h)
/// `nvmlReturn_t` value that signals success.
const NVML_SUCCESS = 0;
/// `nvmlTemperatureSensors_t` selector for the GPU temperature sensor.
const NVML_TEMPERATURE_GPU = 0;
/// `nvmlClockType_t` selector for the SM clock domain.
/// (Value 0 is NVML_CLOCK_GRAPHICS, not SM.)
const NVML_CLOCK_SM = 1;
/// `nvmlClockType_t` selector for the memory clock domain.
const NVML_CLOCK_MEM = 2;
// NVML function types.
// Several entry points share one C signature, so the common shapes are named
// once and the per-function aliases refer to them.

/// `nvmlReturn_t f(void)`
const NoArgFn = *const fn () callconv(.C) nvmlReturn_t;
/// `nvmlReturn_t f(nvmlDevice_t, char *buffer, unsigned int length)`
const DeviceStringFn = *const fn (nvmlDevice_t, [*]u8, c_uint) callconv(.C) nvmlReturn_t;
/// `nvmlReturn_t f(nvmlDevice_t, unsigned int selector, unsigned int *out)`
const DeviceSelectedUintFn = *const fn (nvmlDevice_t, c_uint, *c_uint) callconv(.C) nvmlReturn_t;

// Library lifecycle.
const nvmlInit_v2_fn = NoArgFn;
const nvmlShutdown_fn = NoArgFn;

// Device enumeration.
const nvmlDeviceGetCount_fn = *const fn (*c_uint) callconv(.C) nvmlReturn_t;
const nvmlDeviceGetHandleByIndex_v2_fn = *const fn (c_uint, *nvmlDevice_t) callconv(.C) nvmlReturn_t;

// Per-device string queries.
const nvmlDeviceGetName_fn = DeviceStringFn;
const nvmlDeviceGetUUID_fn = DeviceStringFn;
const nvmlDeviceGetVbiosVersion_fn = DeviceStringFn;

// Per-device numeric queries.
const nvmlDeviceGetUtilizationRates_fn = *const fn (nvmlDevice_t, *nvmlUtilization_t) callconv(.C) nvmlReturn_t;
const nvmlDeviceGetMemoryInfo_fn = *const fn (nvmlDevice_t, *nvmlMemory_t) callconv(.C) nvmlReturn_t;
const nvmlDeviceGetPowerUsage_fn = *const fn (nvmlDevice_t, *c_uint) callconv(.C) nvmlReturn_t;
const nvmlDeviceGetTemperature_fn = DeviceSelectedUintFn;
const nvmlDeviceGetClockInfo_fn = DeviceSelectedUintFn;
/// Snapshot of one GPU as collected by `NVML.getGPUInfo`.
/// Numeric fields stay 0 and string fields stay empty when the corresponding
/// NVML query is unavailable or fails.
pub const GPUInfo = struct {
// Device index that was passed to `getGPUInfo`.
index: u32,
// NUL-terminated device name written by nvmlDeviceGetName.
name: [256:0]u8,
// GPU utilization; displayed with a `%` suffix.
utilization: u32,
// Memory figures from nvmlDeviceGetMemoryInfo; the formatter divides by
// 1024*1024 and labels the result MB, so these are presumably bytes.
memory_used: u64,
memory_total: u64,
// Reading of the NVML_TEMPERATURE_GPU sensor; displayed as °C.
temperature: u32,
// Value from nvmlDeviceGetPowerUsage; the formatter divides by 1000 to
// display watts, so this is presumably milliwatts.
power_draw: u32,
// Clock queried with NVML_CLOCK_SM; displayed as MHz.
clock_sm: u32,
// Clock queried with NVML_CLOCK_MEM; presumably also MHz — not displayed in this file.
clock_memory: u32,
// NUL-terminated string written by nvmlDeviceGetUUID.
uuid: [64:0]u8,
// NUL-terminated string written by nvmlDeviceGetVbiosVersion.
vbios_version: [32:0]u8,
};
/// Handle to a dynamically loaded NVML library together with its resolved
/// entry points.
///
/// Obtain one with `NVML.load()` and release it with `unload()`.
pub const NVML = struct {
    lib: DynLib,
    /// True once `nvmlInit_v2` has succeeded; cleared again by `unload`.
    available: bool,

    // Required entry points: `load` fails when any of these is missing.
    init: nvmlInit_v2_fn,
    shutdown: nvmlShutdown_fn,
    get_count: nvmlDeviceGetCount_fn,
    get_handle_by_index: nvmlDeviceGetHandleByIndex_v2_fn,

    // Optional entry points: null when the library does not export them.
    get_name: ?nvmlDeviceGetName_fn,
    get_utilization: ?nvmlDeviceGetUtilizationRates_fn,
    get_memory: ?nvmlDeviceGetMemoryInfo_fn,
    get_temperature: ?nvmlDeviceGetTemperature_fn,
    get_power_usage: ?nvmlDeviceGetPowerUsage_fn,
    get_clock: ?nvmlDeviceGetClockInfo_fn,
    get_uuid: ?nvmlDeviceGetUUID_fn,
    get_vbios: ?nvmlDeviceGetVbiosVersion_fn,

    /// NUL-terminated description of the most recent failure; empty when no
    /// failure has been recorded.
    last_error: [256:0]u8,

    /// Look up `name` in `lib` and cast the address to the function pointer
    /// type `Fn`. Returns null when the symbol is not exported.
    fn resolve(comptime Fn: type, lib: DynLib, name: []const u8) ?Fn {
        const address = lib.lookup(name) orelse return null;
        // The loader hands back an untyped, byte-aligned pointer; function
        // pointer types may demand a stricter alignment.
        const typed: Fn = @ptrCast(@alignCast(address));
        return typed;
    }

    /// Load NVML dynamically and initialize it.
    ///
    /// Returns null when the platform is neither Windows nor Linux, or when
    /// none of the candidate library names can be opened (no NVIDIA driver).
    ///
    /// Errors:
    /// - `InitNotFound`, `ShutdownNotFound`, `GetCountNotFound`,
    ///   `GetHandleNotFound`: a required symbol is missing from the library.
    /// - `NVMLInitFailed`: `nvmlInit_v2` returned a non-success status.
    ///
    /// On every error path the library handle is closed again.
    pub fn load() !?NVML {
        // Candidate library names, tried in order.
        const lib_names = switch (builtin.os.tag) {
            .windows => &[_][]const u8{
                "nvml.dll",
                "C:\\Windows\\System32\\nvml.dll",
            },
            .linux => &[_][]const u8{
                "libnvidia-ml.so.1",
                "libnvidia-ml.so",
            },
            else => return null, // NVML not supported on other platforms
        };

        var lib: DynLib = for (lib_names) |name| {
            if (DynLib.open(name)) |opened| {
                break opened;
            } else |_| {}
        } else return null; // NVML not available (no NVIDIA driver)
        // From here on every failure must release the handle again.
        errdefer lib.close();

        var nvml: NVML = .{
            .lib = lib,
            .available = false,
            .init = resolve(nvmlInit_v2_fn, lib, "nvmlInit_v2") orelse
                return error.InitNotFound,
            .shutdown = resolve(nvmlShutdown_fn, lib, "nvmlShutdown") orelse
                return error.ShutdownNotFound,
            .get_count = resolve(nvmlDeviceGetCount_fn, lib, "nvmlDeviceGetCount") orelse
                return error.GetCountNotFound,
            .get_handle_by_index = resolve(nvmlDeviceGetHandleByIndex_v2_fn, lib, "nvmlDeviceGetHandleByIndex_v2") orelse
                return error.GetHandleNotFound,
            .get_name = resolve(nvmlDeviceGetName_fn, lib, "nvmlDeviceGetName"),
            .get_utilization = resolve(nvmlDeviceGetUtilizationRates_fn, lib, "nvmlDeviceGetUtilizationRates"),
            .get_memory = resolve(nvmlDeviceGetMemoryInfo_fn, lib, "nvmlDeviceGetMemoryInfo"),
            .get_temperature = resolve(nvmlDeviceGetTemperature_fn, lib, "nvmlDeviceGetTemperature"),
            .get_power_usage = resolve(nvmlDeviceGetPowerUsage_fn, lib, "nvmlDeviceGetPowerUsage"),
            .get_clock = resolve(nvmlDeviceGetClockInfo_fn, lib, "nvmlDeviceGetClockInfo"),
            .get_uuid = resolve(nvmlDeviceGetUUID_fn, lib, "nvmlDeviceGetUUID"),
            .get_vbios = resolve(nvmlDeviceGetVbiosVersion_fn, lib, "nvmlDeviceGetVbiosVersion"),
            // Start with an empty message so `getLastError` never reads
            // uninitialized memory.
            .last_error = std.mem.zeroes([256:0]u8),
        };

        // Initialize NVML. The handle is discarded on failure, so there is
        // nobody to read an error message; the error code is the report.
        if (nvml.init() != NVML_SUCCESS) {
            return error.NVMLInitFailed;
        }
        nvml.available = true;
        return nvml;
    }

    /// Shut NVML down (if it was initialized) and close the library.
    /// The handle must not be used afterwards.
    pub fn unload(self: *NVML) void {
        if (self.available) {
            _ = self.shutdown();
            self.available = false;
        }
        self.lib.close();
    }

    /// Report whether NVML was initialized successfully.
    pub fn isAvailable(self: NVML) bool {
        return self.available;
    }

    /// Return the most recent error message (empty if none was recorded).
    /// The slice points into this handle and is valid until the next failing
    /// call on it. Takes a pointer so the slice never refers to a temporary
    /// copy of the handle.
    pub fn getLastError(self: *const NVML) []const u8 {
        return std.mem.sliceTo(&self.last_error, 0);
    }

    /// Store `msg` as the last error, truncated to fit the buffer.
    fn setError(self: *NVML, msg: []const u8) void {
        @memset(&self.last_error, 0);
        const len = @min(msg.len, self.last_error.len - 1);
        @memcpy(self.last_error[0..len], msg[0..len]);
    }

    /// Get the number of GPUs reported by NVML.
    /// Fails with `GetCountFailed` (and records a message) when the query
    /// returns a non-success status.
    pub fn getGPUCount(self: *NVML) !u32 {
        var count: c_uint = 0;
        if (self.get_count(&count) != NVML_SUCCESS) {
            self.setError("Failed to get GPU count");
            return error.GetCountFailed;
        }
        return @intCast(count);
    }

    /// Collect a snapshot of the GPU at `index`.
    ///
    /// Only the device-handle lookup is mandatory: it fails with
    /// `GetHandleFailed` and records a message. Every other query is
    /// best-effort; a missing or failing query leaves its field at 0 or as an
    /// empty string.
    pub fn getGPUInfo(self: *NVML, index: u32) !GPUInfo {
        var device: nvmlDevice_t = undefined;
        if (self.get_handle_by_index(index, &device) != NVML_SUCCESS) {
            self.setError("Failed to get device handle");
            return error.GetHandleFailed;
        }

        var info: GPUInfo = .{
            .index = index,
            .name = std.mem.zeroes([256:0]u8),
            .utilization = 0,
            .memory_used = 0,
            .memory_total = 0,
            .temperature = 0,
            .power_draw = 0,
            .clock_sm = 0,
            .clock_memory = 0,
            .uuid = std.mem.zeroes([64:0]u8),
            .vbios_version = std.mem.zeroes([32:0]u8),
        };

        // String queries: pass the capacity without the sentinel slot so the
        // terminator is never overwritten, and clear the buffer on failure so
        // a partially written result is not reported.
        if (self.get_name) |func| {
            if (func(device, &info.name, info.name.len) != NVML_SUCCESS) {
                @memset(&info.name, 0);
            }
        }
        if (self.get_uuid) |func| {
            if (func(device, &info.uuid, info.uuid.len) != NVML_SUCCESS) {
                @memset(&info.uuid, 0);
            }
        }
        if (self.get_vbios) |func| {
            if (func(device, &info.vbios_version, info.vbios_version.len) != NVML_SUCCESS) {
                @memset(&info.vbios_version, 0);
            }
        }

        // Utilization
        if (self.get_utilization) |func| {
            var util: nvmlUtilization_t = undefined;
            if (func(device, &util) == NVML_SUCCESS) {
                info.utilization = @intCast(util.gpu);
            }
        }

        // Memory
        if (self.get_memory) |func| {
            var mem: nvmlMemory_t = undefined;
            if (func(device, &mem) == NVML_SUCCESS) {
                info.memory_used = mem.used;
                info.memory_total = mem.total;
            }
        }

        // Temperature
        if (self.get_temperature) |func| {
            var temp: c_uint = 0;
            if (func(device, NVML_TEMPERATURE_GPU, &temp) == NVML_SUCCESS) {
                info.temperature = @intCast(temp);
            }
        }

        // Power usage
        if (self.get_power_usage) |func| {
            var power: c_uint = 0;
            if (func(device, &power) == NVML_SUCCESS) {
                info.power_draw = @intCast(power);
            }
        }

        // Clocks
        if (self.get_clock) |func| {
            var clock: c_uint = 0;
            if (func(device, NVML_CLOCK_SM, &clock) == NVML_SUCCESS) {
                info.clock_sm = @intCast(clock);
            }
            if (func(device, NVML_CLOCK_MEM, &clock) == NVML_SUCCESS) {
                info.clock_memory = @intCast(clock);
            }
        }

        return info;
    }

    /// Collect snapshots of all GPUs.
    ///
    /// Caller owns the returned slice and frees it with `allocator`.
    /// Fails if the count query, the allocation, or any device-handle lookup
    /// fails; nothing is leaked in that case.
    pub fn getAllGPUInfo(self: *NVML, allocator: std.mem.Allocator) ![]GPUInfo {
        const count = try self.getGPUCount();
        if (count == 0) return &[_]GPUInfo{};

        const gpus = try allocator.alloc(GPUInfo, count);
        errdefer allocator.free(gpus);
        for (gpus, 0..) |*slot, i| {
            slot.* = try self.getGPUInfo(@intCast(i));
        }
        return gpus;
    }
};
// Convenience functions for simple use cases
/// Probe for a usable NVML installation.
///
/// Loads NVML into a temporary handle, reads its availability flag and
/// unloads it again. Any load error, and the "no library" case, both yield
/// false.
pub fn isNVMLAvailable() bool {
    const loaded = NVML.load() catch return false;
    var handle = loaded orelse return false;
    defer handle.unload();
    return handle.isAvailable();
}
/// Format GPU info as a human-readable multi-line report.
///
/// The report starts with a heading and a 50-character separator line,
/// followed by one paragraph per GPU. Power and SM clock lines are only
/// printed when their value is non-zero.
///
/// Caller owns the returned slice and frees it with `allocator`.
/// Fails only when an allocation fails.
pub fn formatGPUInfo(allocator: std.mem.Allocator, gpus: []const GPUInfo) ![]u8 {
    var buf = std.ArrayList(u8).init(allocator);
    // Only needed on failure: on success the buffer is handed to the caller.
    errdefer buf.deinit();
    const writer = buf.writer();

    try writer.writeAll("GPU Status (NVML)\n");
    // Separator under the heading. The repeated string must be non-empty,
    // otherwise nothing is written.
    try writer.writeAll("=" ** 50);
    try writer.writeAll("\n\n");

    for (gpus) |gpu| {
        const name = std.mem.sliceTo(&gpu.name, 0);
        try writer.print("GPU {d}: {s}\n", .{ gpu.index, name });
        try writer.print("\tUtilization: {d}%\n", .{gpu.utilization});
        try writer.print("\tMemory: {d}/{d} MB\n", .{
            gpu.memory_used / 1024 / 1024,
            gpu.memory_total / 1024 / 1024,
        });
        try writer.print("\tTemperature: {d}°C\n", .{gpu.temperature});
        if (gpu.power_draw > 0) {
            // The stored value is divided by 1000 to display watts.
            try writer.print("\tPower: {d:.1} W\n", .{@as(f64, @floatFromInt(gpu.power_draw)) / 1000.0});
        }
        if (gpu.clock_sm > 0) {
            try writer.print("\tSM Clock: {d} MHz\n", .{gpu.clock_sm});
        }
        try writer.writeAll("\n");
    }

    return buf.toOwnedSlice();
}