// Source note: extracted from a commit that deletes cli/src/ui/progress.zig
// (removing progress bars/spinners), updates the native GPU detection
// modules, and updates the server experiment API.
const std = @import("std");
|
|
const builtin = @import("builtin");
|
|
|
|
/// NVML Dynamic Loader for CLI
|
|
/// Pure Zig implementation using dlopen/LoadLibrary
|
|
/// No build-time dependency on NVIDIA SDK
|
|
|
|
// Platform-specific dynamic loading.
// The switch is resolved at compile time, so only the branch matching the
// target OS is ever analyzed/compiled.
const DynLib = switch (builtin.os.tag) {
    .windows => struct {
        handle: std.os.windows.HMODULE,

        /// Open a DLL by path. Returns error.LibraryNotFound when the
        /// library cannot be loaded.
        fn open(path: []const u8) !@This() {
            const wide_path = try std.os.windows.sliceToPrefixedFileW(path);
            const handle = std.os.windows.LoadLibraryW(&wide_path.data) orelse return error.LibraryNotFound;
            return .{ .handle = handle };
        }

        /// Release the library handle. Must not fail.
        fn close(self: *@This()) void {
            _ = std.os.windows.FreeLibrary(self.handle);
        }

        /// Look up an exported symbol; returns null when not found.
        // NOTE(review): Win32 GetProcAddress takes a NUL-terminated name.
        // The callers in this file pass string literals (which carry a
        // sentinel), but a plain `[]const u8` slice drops it — confirm this
        // wrapper's signature against the targeted Zig std version.
        fn lookup(self: @This(), name: []const u8) ?*anyopaque {
            return std.os.windows.GetProcAddress(self.handle, name);
        }
    },
    else => struct {
        handle: *anyopaque,

        // Extern declarations for the libc dynamic-loader API.
        extern "c" fn dlopen(pathname: [*:0]const u8, mode: c_int) ?*anyopaque;
        extern "c" fn dlsym(handle: *anyopaque, symbol: [*:0]const u8) ?*anyopaque;
        extern "c" fn dlclose(handle: *anyopaque) c_int;

        // RTLD_NOW: resolve all undefined symbols at load time.
        const RTLD_NOW = 2;

        /// Open a shared library by path. Returns error.LibraryNotFound
        /// when dlopen fails (e.g. no NVIDIA driver installed).
        fn open(path: []const u8) !@This() {
            // dlopen requires a NUL-terminated path. std.cstr.addNullByte
            // was removed from the standard library; Allocator.dupeZ is the
            // supported replacement.
            const c_path = try std.heap.c_allocator.dupeZ(u8, path);
            defer std.heap.c_allocator.free(c_path);
            const handle = dlopen(c_path.ptr, RTLD_NOW) orelse return error.LibraryNotFound;
            return .{ .handle = handle };
        }

        /// Release the library handle. Must not fail.
        fn close(self: *@This()) void {
            _ = dlclose(self.handle);
        }

        /// Look up an exported symbol; returns null when not found or when
        /// the temporary NUL-terminated copy of the name cannot be made.
        fn lookup(self: @This(), name: []const u8) ?*anyopaque {
            const c_name = std.heap.c_allocator.dupeZ(u8, name) catch return null;
            defer std.heap.c_allocator.free(c_name);
            return dlsym(self.handle, c_name.ptr);
        }
    },
};
|
|
|
|
// NVML type definitions (mirrors nvml.h)

// Return code of every NVML call; 0 (NVML_SUCCESS) means success.
pub const nvmlReturn_t = c_int;

// Opaque device handle owned by the NVML library.
pub const nvmlDevice_t = *anyopaque;

// GPU / memory-controller utilization, in percent (see nvml.h).
pub const nvmlUtilization_t = extern struct {
    gpu: c_uint,
    memory: c_uint,
};

// Device memory accounting, in bytes (see nvml.h).
pub const nvmlMemory_t = extern struct {
    total: c_ulonglong,
    free: c_ulonglong,
    used: c_ulonglong,
};

// NVML constants (subset of the enums in nvml.h used by this module).
const NVML_SUCCESS = 0;
// nvmlTemperatureSensors_t: the on-die GPU sensor.
const NVML_TEMPERATURE_GPU = 0;
// nvmlClockType_t: SM (graphics) clock and memory clock.
const NVML_CLOCK_SM = 0;
const NVML_CLOCK_MEM = 1;

// NVML function types — C-callconv function-pointer signatures matching the
// symbols resolved at runtime by NVML.load(). The `[*]u8, c_uint` pairs are
// caller-provided output buffers with their capacity.
const nvmlInit_v2_fn = *const fn () callconv(.C) nvmlReturn_t;
const nvmlShutdown_fn = *const fn () callconv(.C) nvmlReturn_t;
const nvmlDeviceGetCount_fn = *const fn (*c_uint) callconv(.C) nvmlReturn_t;
const nvmlDeviceGetHandleByIndex_v2_fn = *const fn (c_uint, *nvmlDevice_t) callconv(.C) nvmlReturn_t;
const nvmlDeviceGetName_fn = *const fn (nvmlDevice_t, [*]u8, c_uint) callconv(.C) nvmlReturn_t;
const nvmlDeviceGetUtilizationRates_fn = *const fn (nvmlDevice_t, *nvmlUtilization_t) callconv(.C) nvmlReturn_t;
const nvmlDeviceGetMemoryInfo_fn = *const fn (nvmlDevice_t, *nvmlMemory_t) callconv(.C) nvmlReturn_t;
const nvmlDeviceGetTemperature_fn = *const fn (nvmlDevice_t, c_uint, *c_uint) callconv(.C) nvmlReturn_t;
const nvmlDeviceGetPowerUsage_fn = *const fn (nvmlDevice_t, *c_uint) callconv(.C) nvmlReturn_t;
const nvmlDeviceGetClockInfo_fn = *const fn (nvmlDevice_t, c_uint, *c_uint) callconv(.C) nvmlReturn_t;
const nvmlDeviceGetUUID_fn = *const fn (nvmlDevice_t, [*]u8, c_uint) callconv(.C) nvmlReturn_t;
const nvmlDeviceGetVbiosVersion_fn = *const fn (nvmlDevice_t, [*]u8, c_uint) callconv(.C) nvmlReturn_t;
|
|
|
|
/// GPU information structure — a plain-value snapshot of one device's state
/// as filled in by NVML.getGPUInfo(). Fields left at zero when the
/// corresponding optional NVML query is unavailable or fails.
pub const GPUInfo = struct {
    // Device index as passed to nvmlDeviceGetHandleByIndex_v2.
    index: u32,
    // NUL-terminated device name; read with std.mem.sliceTo(&name, 0).
    name: [256:0]u8,
    // GPU utilization in percent (nvmlUtilization_t.gpu).
    utilization: u32,
    // Memory usage in bytes (from nvmlMemory_t).
    memory_used: u64,
    memory_total: u64,
    // On-die GPU temperature (units as reported by NVML).
    temperature: u32,
    // Raw power reading as returned by nvmlDeviceGetPowerUsage
    // (divided by 1000 for display elsewhere in this file).
    power_draw: u32,
    // SM and memory clocks (units as reported by nvmlDeviceGetClockInfo).
    clock_sm: u32,
    clock_memory: u32,
    // NUL-terminated device UUID string.
    uuid: [64:0]u8,
    // NUL-terminated VBIOS version string.
    vbios_version: [32:0]u8,
};
|
|
|
|
/// NVML handle with loaded functions.
/// Obtain via `NVML.load()`; release with `unload()`.
pub const NVML = struct {
    lib: DynLib,
    available: bool,

    // Required entry points — load() fails if any of these is missing.
    init: nvmlInit_v2_fn,
    shutdown: nvmlShutdown_fn,
    get_count: nvmlDeviceGetCount_fn,
    get_handle_by_index: nvmlDeviceGetHandleByIndex_v2_fn,

    // Optional entry points — null when the loaded library does not
    // export them; callers must null-check before use.
    get_name: ?nvmlDeviceGetName_fn,
    get_utilization: ?nvmlDeviceGetUtilizationRates_fn,
    get_memory: ?nvmlDeviceGetMemoryInfo_fn,
    get_temperature: ?nvmlDeviceGetTemperature_fn,
    get_power_usage: ?nvmlDeviceGetPowerUsage_fn,
    get_clock: ?nvmlDeviceGetClockInfo_fn,
    get_uuid: ?nvmlDeviceGetUUID_fn,
    get_vbios: ?nvmlDeviceGetVbiosVersion_fn,

    // NUL-terminated buffer holding the most recent error message.
    last_error: [256:0]u8,

    /// Load NVML dynamically.
    /// Returns null when NVML is simply not present (no NVIDIA driver, or
    /// an unsupported platform) — the "no GPU" case, not an error.
    /// Returns an error when the library was found but is unusable.
    pub fn load() !?NVML {
        var nvml: NVML = undefined;
        // Zero the error buffer and availability flag up front so
        // getLastError() never reads undefined memory, even immediately
        // after a successful load.
        nvml.last_error = std.mem.zeroes([256:0]u8);
        nvml.available = false;

        // Try platform-specific library names.
        const lib_names = switch (builtin.os.tag) {
            .windows => &[_][]const u8{
                "nvml.dll",
                "C:\\Windows\\System32\\nvml.dll",
            },
            .linux => &[_][]const u8{
                "libnvidia-ml.so.1",
                "libnvidia-ml.so",
            },
            else => return null, // NVML not supported on other platforms
        };

        // Try each candidate until one loads.
        var loaded = false;
        for (lib_names) |name| {
            if (DynLib.open(name)) |lib| {
                nvml.lib = lib;
                loaded = true;
                break;
            } else |_| continue;
        }

        if (!loaded) {
            return null; // NVML not available (no NVIDIA driver)
        }

        // From here on the open library handle must not leak: close it if
        // any later step returns an error. (Previously the early
        // `error.*NotFound` returns below leaked the handle.)
        errdefer nvml.lib.close();

        // Load required functions.
        nvml.init = @ptrCast(nvml.lib.lookup("nvmlInit_v2") orelse return error.InitNotFound);
        nvml.shutdown = @ptrCast(nvml.lib.lookup("nvmlShutdown") orelse return error.ShutdownNotFound);
        nvml.get_count = @ptrCast(nvml.lib.lookup("nvmlDeviceGetCount") orelse return error.GetCountNotFound);
        nvml.get_handle_by_index = @ptrCast(nvml.lib.lookup("nvmlDeviceGetHandleByIndex_v2") orelse return error.GetHandleNotFound);

        // Load optional functions (null when the driver doesn't export them).
        nvml.get_name = @ptrCast(nvml.lib.lookup("nvmlDeviceGetName"));
        nvml.get_utilization = @ptrCast(nvml.lib.lookup("nvmlDeviceGetUtilizationRates"));
        nvml.get_memory = @ptrCast(nvml.lib.lookup("nvmlDeviceGetMemoryInfo"));
        nvml.get_temperature = @ptrCast(nvml.lib.lookup("nvmlDeviceGetTemperature"));
        nvml.get_power_usage = @ptrCast(nvml.lib.lookup("nvmlDeviceGetPowerUsage"));
        nvml.get_clock = @ptrCast(nvml.lib.lookup("nvmlDeviceGetClockInfo"));
        nvml.get_uuid = @ptrCast(nvml.lib.lookup("nvmlDeviceGetUUID"));
        nvml.get_vbios = @ptrCast(nvml.lib.lookup("nvmlDeviceGetVbiosVersion"));

        // Initialize NVML; the errdefer above closes the library on failure.
        const result = nvml.init();
        if (result != NVML_SUCCESS) {
            nvml.setError("NVML initialization failed");
            return error.NVMLInitFailed;
        }

        nvml.available = true;
        return nvml;
    }

    /// Unload NVML: shut the library down (if it was initialized) and
    /// close the handle. Deallocation must not fail, so this returns void.
    pub fn unload(self: *NVML) void {
        if (self.available) {
            _ = self.shutdown();
        }
        self.lib.close();
    }

    /// Check if NVML is available (i.e. load() completed successfully).
    pub fn isAvailable(self: NVML) bool {
        return self.available;
    }

    /// Get the last error message recorded by setError().
    /// Returns an empty slice when no error has occurred.
    pub fn getLastError(self: NVML) []const u8 {
        return std.mem.sliceTo(&self.last_error, 0);
    }

    /// Record an error message, truncating to the buffer capacity and
    /// always leaving at least one NUL terminator.
    fn setError(self: *NVML, msg: []const u8) void {
        @memset(&self.last_error, 0);
        const len = @min(msg.len, self.last_error.len - 1);
        @memcpy(self.last_error[0..len], msg[0..len]);
    }

    /// Get number of GPUs visible to NVML.
    pub fn getGPUCount(self: *NVML) !u32 {
        var count: c_uint = 0;
        const result = self.get_count(&count);
        if (result != NVML_SUCCESS) {
            self.setError("Failed to get GPU count");
            return error.GetCountFailed;
        }
        return @intCast(count);
    }

    /// Get GPU info by index. Only the device handle is required; each
    /// metric below is best-effort: if the corresponding optional function
    /// is missing or its call fails, the field keeps its zero default.
    pub fn getGPUInfo(self: *NVML, index: u32) !GPUInfo {
        var info: GPUInfo = .{
            .index = index,
            .name = std.mem.zeroes([256:0]u8),
            .utilization = 0,
            .memory_used = 0,
            .memory_total = 0,
            .temperature = 0,
            .power_draw = 0,
            .clock_sm = 0,
            .clock_memory = 0,
            .uuid = std.mem.zeroes([64:0]u8),
            .vbios_version = std.mem.zeroes([32:0]u8),
        };

        var device: nvmlDevice_t = undefined;
        var result = self.get_handle_by_index(index, &device);
        if (result != NVML_SUCCESS) {
            self.setError("Failed to get device handle");
            return error.GetHandleFailed;
        }

        // Get name (best-effort; result ignored, buffer stays zeroed on failure).
        if (self.get_name) |func| {
            _ = func(device, &info.name, @sizeOf(@TypeOf(info.name)));
        }

        // Get utilization.
        if (self.get_utilization) |func| {
            var util: nvmlUtilization_t = undefined;
            result = func(device, &util);
            if (result == NVML_SUCCESS) {
                info.utilization = @intCast(util.gpu);
            }
        }

        // Get memory.
        if (self.get_memory) |func| {
            var mem: nvmlMemory_t = undefined;
            result = func(device, &mem);
            if (result == NVML_SUCCESS) {
                info.memory_used = mem.used;
                info.memory_total = mem.total;
            }
        }

        // Get temperature (on-die GPU sensor).
        if (self.get_temperature) |func| {
            var temp: c_uint = 0;
            result = func(device, NVML_TEMPERATURE_GPU, &temp);
            if (result == NVML_SUCCESS) {
                info.temperature = @intCast(temp);
            }
        }

        // Get power usage (raw NVML value; displayed as value/1000 elsewhere).
        if (self.get_power_usage) |func| {
            var power: c_uint = 0;
            result = func(device, &power);
            if (result == NVML_SUCCESS) {
                info.power_draw = @intCast(power);
            }
        }

        // Get SM and memory clocks (one query per clock domain).
        if (self.get_clock) |func| {
            var clock: c_uint = 0;
            result = func(device, NVML_CLOCK_SM, &clock);
            if (result == NVML_SUCCESS) {
                info.clock_sm = @intCast(clock);
            }
            result = func(device, NVML_CLOCK_MEM, &clock);
            if (result == NVML_SUCCESS) {
                info.clock_memory = @intCast(clock);
            }
        }

        // Get UUID (best-effort).
        if (self.get_uuid) |func| {
            _ = func(device, &info.uuid, @sizeOf(@TypeOf(info.uuid)));
        }

        // Get VBIOS version (best-effort).
        if (self.get_vbios) |func| {
            _ = func(device, &info.vbios_version, @sizeOf(@TypeOf(info.vbios_version)));
        }

        return info;
    }

    /// Get info for all GPUs. Caller owns the returned slice and must free
    /// it with `allocator.free` (including the zero-GPU case).
    pub fn getAllGPUInfo(self: *NVML, allocator: std.mem.Allocator) ![]GPUInfo {
        const count = try self.getGPUCount();
        // Allocate even when count == 0 so the caller can free the result
        // unconditionally. (Returning `&[_]GPUInfo{}` handed out a pointer
        // to a comptime constant, which does not coerce to a mutable
        // `[]GPUInfo` and could not be passed to allocator.free.)
        const gpus = try allocator.alloc(GPUInfo, count);
        errdefer allocator.free(gpus);

        for (0..count) |i| {
            gpus[i] = try self.getGPUInfo(@intCast(i));
        }

        return gpus;
    }
};
|
|
|
|
// Convenience functions for simple use cases
|
|
|
|
/// Quick availability probe: loads NVML, queries the flag, and unloads
/// again. Any load failure — error or absent driver — reports false.
pub fn isNVMLAvailable() bool {
    const load_result = NVML.load() catch return false;
    var handle = load_result orelse return false;
    defer handle.unload();
    return handle.isAvailable();
}
|
|
|
|
/// Format GPU info as a human-readable report.
/// Caller owns the returned buffer and must free it with `allocator`.
pub fn formatGPUInfo(allocator: std.mem.Allocator, gpus: []const GPUInfo) ![]u8 {
    var out = std.ArrayList(u8).init(allocator);
    // toOwnedSlice() below empties the list, so this deinit is a no-op on
    // success and a cleanup on the error path.
    defer out.deinit();
    const w = out.writer();

    // Report header with a decorative rule.
    try w.writeAll("GPU Status (NVML)\n");
    try w.writeAll("═" ** 50);
    try w.writeAll("\n\n");

    // One paragraph per device; zero-valued optional metrics are skipped.
    for (gpus) |device| {
        const device_name = std.mem.sliceTo(&device.name, 0);
        try w.print("GPU {d}: {s}\n", .{ device.index, device_name });
        try w.print("\tUtilization: {d}%\n", .{device.utilization});
        try w.print("\tMemory: {d}/{d} MB\n", .{
            device.memory_used / 1024 / 1024,
            device.memory_total / 1024 / 1024,
        });
        try w.print("\tTemperature: {d}°C\n", .{device.temperature});
        if (device.power_draw > 0) {
            // Raw power value scaled by 1/1000 for display.
            try w.print("\tPower: {d:.1} W\n", .{@as(f64, @floatFromInt(device.power_draw)) / 1000.0});
        }
        if (device.clock_sm > 0) {
            try w.print("\tSM Clock: {d} MHz\n", .{device.clock_sm});
        }
        try w.writeAll("\n");
    }

    return out.toOwnedSlice();
}
|