From be39b37aec9001b50a8c534f261aef16b00ad992 Mon Sep 17 00:00:00 2001 From: Jeremie Fraeys Date: Sat, 21 Feb 2026 17:59:59 -0500 Subject: [PATCH] feat: native GPU detection and NVML bridge for macOS and Linux - Add dynamic NVML loading for Linux GPU detection - Add macOS GPU detection via IOKit framework - Add Zig NVML wrapper for cross-platform GPU queries - Update native bridge to support platform-specific GPU libs - Add CMake support for NVML dynamic library --- cli/src/native/macos_gpu.zig | 262 +++++++++++++++++ cli/src/native/nvml.zig | 372 +++++++++++++++++++++++++ internal/queue/native_queue_stub.go | 4 +- internal/worker/config.go | 7 +- internal/worker/gpu_detector.go | 8 + internal/worker/gpu_macos.go | 279 +++++++++++++++++++ internal/worker/gpu_macos_stub.go | 41 +++ internal/worker/gpu_nvml_native.go | 4 +- internal/worker/gpu_nvml_stub.go | 25 +- internal/worker/native_bridge.go | 5 - internal/worker/native_bridge_libs.go | 13 +- internal/worker/native_bridge_nocgo.go | 5 - native/nvml_gpu/CMakeLists.txt | 6 +- native/nvml_gpu/nvml_dynamic.c | 272 ++++++++++++++++++ native/nvml_gpu/nvml_dynamic.h | 53 ++++ 15 files changed, 1321 insertions(+), 35 deletions(-) create mode 100644 cli/src/native/macos_gpu.zig create mode 100644 cli/src/native/nvml.zig create mode 100644 internal/worker/gpu_macos.go create mode 100644 internal/worker/gpu_macos_stub.go create mode 100644 native/nvml_gpu/nvml_dynamic.c create mode 100644 native/nvml_gpu/nvml_dynamic.h diff --git a/cli/src/native/macos_gpu.zig b/cli/src/native/macos_gpu.zig new file mode 100644 index 0000000..29dc8a0 --- /dev/null +++ b/cli/src/native/macos_gpu.zig @@ -0,0 +1,262 @@ +const std = @import("std"); +const builtin = @import("builtin"); + +/// macOS GPU Monitoring for Development Mode +/// Uses system_profiler and powermetrics for GPU info +/// Only available on macOS +const c = @cImport({ + @cInclude("sys/types.h"); + @cInclude("sys/sysctl.h"); +}); + +/// GPU information structure for macOS 
+pub const MacOSGPUInfo = struct { + index: u32, + name: [256:0]u8, + chipset_model: [256:0]u8, + vram_mb: u32, + is_integrated: bool, + // Performance metrics (if available via powermetrics) + utilization_percent: ?u32, + temperature_celsius: ?u32, + power_mw: ?u32, +}; + +/// Detect if running on Apple Silicon +pub fn isAppleSilicon() bool { + if (builtin.os.tag != .macos) return false; + + var buf: [64]u8 = undefined; + var len: usize = buf.len; + const mib = [_]c_int{ c.CTL_HW, c.HW_MACHINE }; + + const result = c.sysctl(&mib[0], 2, &buf[0], &len, null, 0); + if (result != 0) return false; + + const machine = std.mem.sliceTo(&buf, 0); + return std.mem.startsWith(u8, machine, "arm64") or + std.mem.startsWith(u8, machine, "Apple"); +} + +/// Get GPU count on macOS +pub fn getGPUCount() u32 { + if (builtin.os.tag != .macos) return 0; + + // Run system_profiler to check for GPUs + const result = runSystemProfiler() catch return 0; + defer std.heap.raw_c_allocator.free(result); + + // Parse output for GPU entries + var lines = std.mem.splitScalar(u8, result, '\n'); + var count: u32 = 0; + while (lines.next()) |line| { + if (std.mem.indexOf(u8, line, "Chipset Model") != null) { + count += 1; + } + } + + return count; +} + +/// Run system_profiler SPDisplaysDataType +fn runSystemProfiler() ![]u8 { + const argv = [_][]const u8{ + "system_profiler", + "SPDisplaysDataType", + "-json", + }; + + var child = std.process.Child.init(&argv, std.heap.page_allocator); + child.stdout_behavior = .Pipe; + child.stderr_behavior = .Ignore; + + try child.spawn(); + defer child.kill() catch {}; + + const stdout = child.stdout.?.reader(); + const output = try stdout.readAllAlloc(std.heap.page_allocator, 1024 * 1024); + + const term = try child.wait(); + if (term != .Exited or term.Exited != 0) { + return error.CommandFailed; + } + + return output; +} + +/// Parse GPU info from system_profiler JSON output +pub fn parseGPUInfo(allocator: std.mem.Allocator, json_output: []const u8) 
![]MacOSGPUInfo { + // Simple parser for system_profiler JSON + // Format: {"SPDisplaysDataType": [{"sppci_model":"...", "sppci_vram":"...", ...}, ...]} + + var gpus = std.ArrayList(MacOSGPUInfo).init(allocator); + defer gpus.deinit(); + + // Parse JSON - look for _items array + const items_key = "_items"; + if (std.mem.indexOf(u8, json_output, items_key)) |items_start| { + const rest = json_output[items_start..]; + // Find array start + if (std.mem.indexOf(u8, rest, "[")) |array_start| { + const array = rest[array_start..]; + // Simple heuristic: find objects between { and } + var i: usize = 0; + while (i < array.len) { + if (array[i] == '{') { + // Found object start + if (findObjectEnd(array[i..])) |obj_end| { + const obj = array[i .. i + obj_end]; + if (try parseGPUObject(obj)) |gpu| { + try gpus.append(gpu); + } + i += obj_end; + continue; + } + } + i += 1; + } + } + } + + return gpus.toOwnedSlice(); +} + +fn findObjectEnd(json: []const u8) ?usize { + var depth: i32 = 0; + var in_string = false; + var i: usize = 0; + while (i < json.len) : (i += 1) { + const char = json[i]; + if (char == '"' and (i == 0 or json[i - 1] != '\\')) { + in_string = !in_string; + } else if (!in_string) { + if (char == '{') { + depth += 1; + } else if (char == '}') { + depth -= 1; + if (depth == 0) { + return i + 1; + } + } + } + } + return null; +} + +fn parseGPUObject(json: []const u8) !?MacOSGPUInfo { + var gpu = MacOSGPUInfo{ + .index = 0, + .name = std.mem.zeroes([256:0]u8), + .chipset_model = std.mem.zeroes([256:0]u8), + .vram_mb = 0, + .is_integrated = false, + .utilization_percent = null, + .temperature_celsius = null, + .power_mw = null, + }; + + // Extract sppci_model + if (extractJsonString(json, "sppci_model")) |model| { + const len = @min(model.len, 255); + @memcpy(gpu.chipset_model[0..len], model[0..len]); + @memcpy(gpu.name[0..len], model[0..len]); + } + + // Extract sppci_vram + if (extractJsonString(json, "sppci_vram_shared")) |_| { + gpu.is_integrated = true; + 
gpu.vram_mb = 0; // Shared memory + } else if (extractJsonString(json, "sppci_vram")) |vram| { + // Parse "16384 MB" -> 16384 + var it = std.mem.splitScalar(u8, vram, ' '); + if (it.next()) |num_str| { + gpu.vram_mb = std.fmt.parseInt(u32, num_str, 10) catch 0; + } + } + + // Check if it's a valid GPU entry + if (gpu.chipset_model[0] == 0) { + return null; + } + + return gpu; +} + +fn extractJsonString(json: []const u8, key: []const u8) ?[]const u8 { + const key_quoted = std.fmt.allocPrint(std.heap.page_allocator, "\"{s}\"", .{key}) catch return null; + defer std.heap.page_allocator.free(key_quoted); + + if (std.mem.indexOf(u8, json, key_quoted)) |key_pos| { + const after_key = json[key_pos + key_quoted.len ..]; + // Find value start (skip : and whitespace) + var i: usize = 0; + while (i < after_key.len and (after_key[i] == ':' or after_key[i] == ' ' or after_key[i] == '\t' or after_key[i] == '\n')) : (i += 1) {} + + if (i < after_key.len and after_key[i] == '"') { + // String value + const str_start = i + 1; + var str_end = str_start; + while (str_end < after_key.len and after_key[str_end] != '"') : (str_end += 1) {} + return after_key[str_start..str_end]; + } + } + return null; +} + +/// Format GPU info for display +pub fn formatMacOSGPUInfo(allocator: std.mem.Allocator, gpus: []const MacOSGPUInfo) ![]u8 { + var buf = std.ArrayList(u8).init(allocator); + defer buf.deinit(); + + const writer = buf.writer(); + + if (gpus.len == 0) { + try writer.writeAll("GPU Status (macOS)\n"); + try writer.writeAll("═" ** 50); + try writer.writeAll("\n\nNo GPUs detected\n"); + return buf.toOwnedSlice(); + } + + try writer.writeAll("GPU Status (macOS"); + if (isAppleSilicon()) { + try writer.writeAll(" - Apple Silicon"); + } + try writer.writeAll(")\n"); + try writer.writeAll("═" ** 50); + try writer.writeAll("\n\n"); + + for (gpus) |gpu| { + const name = std.mem.sliceTo(&gpu.name, 0); + const model = std.mem.sliceTo(&gpu.chipset_model, 0); + + try writer.print("🎮 GPU {d}: {s}\n", 
.{ gpu.index, name }); + if (!std.mem.eql(u8, model, name)) { + try writer.print(" Model: {s}\n", .{model}); + } + if (gpu.is_integrated) { + try writer.writeAll(" Type: Integrated (Unified Memory)\n"); + } else { + try writer.print(" VRAM: {d} MB\n", .{gpu.vram_mb}); + } + if (gpu.utilization_percent) |util| { + try writer.print(" Utilization: {d}%\n", .{util}); + } + if (gpu.temperature_celsius) |temp| { + try writer.print(" Temperature: {d}°C\n", .{temp}); + } + if (gpu.power_mw) |power| { + try writer.print(" Power: {d:.1f} W\n", .{@as(f64, @floatFromInt(power)) / 1000.0}); + } + try writer.writeAll("\n"); + } + + try writer.writeAll("💡 Note: Detailed GPU metrics require powermetrics (sudo)\n"); + + return buf.toOwnedSlice(); +} + +/// Quick check for GPU availability on macOS +pub fn isMacOSGPUAvailable() bool { + if (builtin.os.tag != .macos) return false; + return getGPUCount() > 0; +} diff --git a/cli/src/native/nvml.zig b/cli/src/native/nvml.zig new file mode 100644 index 0000000..5616db4 --- /dev/null +++ b/cli/src/native/nvml.zig @@ -0,0 +1,372 @@ +const std = @import("std"); +const builtin = @import("builtin"); + +/// NVML Dynamic Loader for CLI +/// Pure Zig implementation using dlopen/LoadLibrary +/// No build-time dependency on NVIDIA SDK + +// Platform-specific dynamic loading +const DynLib = switch (builtin.os.tag) { + .windows => struct { + handle: std.os.windows.HMODULE, + + fn open(path: []const u8) !@This() { + const wide_path = try std.os.windows.sliceToPrefixedFileW(path); + const handle = std.os.windows.LoadLibraryW(&wide_path.data) orelse return error.LibraryNotFound; + return .{ .handle = handle }; + } + + fn close(self: *@This()) void { + _ = std.os.windows.FreeLibrary(self.handle); + } + + fn lookup(self: @This(), name: []const u8) ?*anyopaque { + return std.os.windows.GetProcAddress(self.handle, name); + } + }, + else => struct { + handle: *anyopaque, + + // Extern declarations for dlopen/dlsym + extern "c" fn dlopen(pathname: 
[*:0]const u8, mode: c_int) ?*anyopaque; + extern "c" fn dlsym(handle: *anyopaque, symbol: [*:0]const u8) ?*anyopaque; + extern "c" fn dlclose(handle: *anyopaque) c_int; + + const RTLD_NOW = 2; + + fn open(path: []const u8) !@This() { + const c_path = try std.cstr.addNullByte(std.heap.c_allocator, path); + defer std.heap.c_allocator.free(c_path); + const handle = dlopen(c_path.ptr, RTLD_NOW) orelse return error.LibraryNotFound; + return .{ .handle = handle }; + } + + fn close(self: *@This()) void { + _ = dlclose(self.handle); + } + + fn lookup(self: @This(), name: []const u8) ?*anyopaque { + const c_name = std.cstr.addNullByte(std.heap.c_allocator, name) catch return null; + defer std.heap.c_allocator.free(c_name); + return dlsym(self.handle, c_name.ptr); + } + }, +}; + +// NVML type definitions (mirrors nvml.h) +pub const nvmlReturn_t = c_int; +pub const nvmlDevice_t = *anyopaque; + +pub const nvmlUtilization_t = extern struct { + gpu: c_uint, + memory: c_uint, +}; + +pub const nvmlMemory_t = extern struct { + total: c_ulonglong, + free: c_ulonglong, + used: c_ulonglong, +}; + +// NVML constants +const NVML_SUCCESS = 0; +const NVML_TEMPERATURE_GPU = 0; +const NVML_CLOCK_SM = 0; +const NVML_CLOCK_MEM = 1; + +// NVML function types +const nvmlInit_v2_fn = *const fn () callconv(.C) nvmlReturn_t; +const nvmlShutdown_fn = *const fn () callconv(.C) nvmlReturn_t; +const nvmlDeviceGetCount_fn = *const fn (*c_uint) callconv(.C) nvmlReturn_t; +const nvmlDeviceGetHandleByIndex_v2_fn = *const fn (c_uint, *nvmlDevice_t) callconv(.C) nvmlReturn_t; +const nvmlDeviceGetName_fn = *const fn (nvmlDevice_t, [*]u8, c_uint) callconv(.C) nvmlReturn_t; +const nvmlDeviceGetUtilizationRates_fn = *const fn (nvmlDevice_t, *nvmlUtilization_t) callconv(.C) nvmlReturn_t; +const nvmlDeviceGetMemoryInfo_fn = *const fn (nvmlDevice_t, *nvmlMemory_t) callconv(.C) nvmlReturn_t; +const nvmlDeviceGetTemperature_fn = *const fn (nvmlDevice_t, c_uint, *c_uint) callconv(.C) nvmlReturn_t; +const 
nvmlDeviceGetPowerUsage_fn = *const fn (nvmlDevice_t, *c_uint) callconv(.C) nvmlReturn_t; +const nvmlDeviceGetClockInfo_fn = *const fn (nvmlDevice_t, c_uint, *c_uint) callconv(.C) nvmlReturn_t; +const nvmlDeviceGetUUID_fn = *const fn (nvmlDevice_t, [*]u8, c_uint) callconv(.C) nvmlReturn_t; +const nvmlDeviceGetVbiosVersion_fn = *const fn (nvmlDevice_t, [*]u8, c_uint) callconv(.C) nvmlReturn_t; + +/// GPU information structure +pub const GPUInfo = struct { + index: u32, + name: [256:0]u8, + utilization: u32, + memory_used: u64, + memory_total: u64, + temperature: u32, + power_draw: u32, + clock_sm: u32, + clock_memory: u32, + uuid: [64:0]u8, + vbios_version: [32:0]u8, +}; + +/// NVML handle with loaded functions +pub const NVML = struct { + lib: DynLib, + available: bool, + + // Function pointers + init: nvmlInit_v2_fn, + shutdown: nvmlShutdown_fn, + get_count: nvmlDeviceGetCount_fn, + get_handle_by_index: nvmlDeviceGetHandleByIndex_v2_fn, + get_name: ?nvmlDeviceGetName_fn, + get_utilization: ?nvmlDeviceGetUtilizationRates_fn, + get_memory: ?nvmlDeviceGetMemoryInfo_fn, + get_temperature: ?nvmlDeviceGetTemperature_fn, + get_power_usage: ?nvmlDeviceGetPowerUsage_fn, + get_clock: ?nvmlDeviceGetClockInfo_fn, + get_uuid: ?nvmlDeviceGetUUID_fn, + get_vbios: ?nvmlDeviceGetVbiosVersion_fn, + + last_error: [256:0]u8, + + /// Load NVML dynamically + pub fn load() !?NVML { + var nvml: NVML = undefined; + + // Try platform-specific library names + const lib_names = switch (builtin.os.tag) { + .windows => &[_][]const u8{ + "nvml.dll", + "C:\\Windows\\System32\\nvml.dll", + }, + .linux => &[_][]const u8{ + "libnvidia-ml.so.1", + "libnvidia-ml.so", + }, + else => return null, // NVML not supported on other platforms + }; + + // Try to load library + var loaded = false; + for (lib_names) |name| { + if (DynLib.open(name)) |lib| { + nvml.lib = lib; + loaded = true; + break; + } else |_| continue; + } + + if (!loaded) { + return null; // NVML not available (no NVIDIA driver) + } + + // 
Load required functions + nvml.init = @ptrCast(nvml.lib.lookup("nvmlInit_v2") orelse return error.InitNotFound); + nvml.shutdown = @ptrCast(nvml.lib.lookup("nvmlShutdown") orelse return error.ShutdownNotFound); + nvml.get_count = @ptrCast(nvml.lib.lookup("nvmlDeviceGetCount") orelse return error.GetCountNotFound); + nvml.get_handle_by_index = @ptrCast(nvml.lib.lookup("nvmlDeviceGetHandleByIndex_v2") orelse return error.GetHandleNotFound); + + // Load optional functions + nvml.get_name = @ptrCast(nvml.lib.lookup("nvmlDeviceGetName")); + nvml.get_utilization = @ptrCast(nvml.lib.lookup("nvmlDeviceGetUtilizationRates")); + nvml.get_memory = @ptrCast(nvml.lib.lookup("nvmlDeviceGetMemoryInfo")); + nvml.get_temperature = @ptrCast(nvml.lib.lookup("nvmlDeviceGetTemperature")); + nvml.get_power_usage = @ptrCast(nvml.lib.lookup("nvmlDeviceGetPowerUsage")); + nvml.get_clock = @ptrCast(nvml.lib.lookup("nvmlDeviceGetClockInfo")); + nvml.get_uuid = @ptrCast(nvml.lib.lookup("nvmlDeviceGetUUID")); + nvml.get_vbios = @ptrCast(nvml.lib.lookup("nvmlDeviceGetVbiosVersion")); + + // Initialize NVML + const result = nvml.init(); + if (result != NVML_SUCCESS) { + nvml.setError("NVML initialization failed"); + nvml.lib.close(); + return error.NVMLInitFailed; + } + + nvml.available = true; + return nvml; + } + + /// Unload NVML + pub fn unload(self: *NVML) void { + if (self.available) { + _ = self.shutdown(); + } + self.lib.close(); + } + + /// Check if NVML is available + pub fn isAvailable(self: NVML) bool { + return self.available; + } + + /// Get last error message + pub fn getLastError(self: NVML) []const u8 { + return std.mem.sliceTo(&self.last_error, 0); + } + + fn setError(self: *NVML, msg: []const u8) void { + @memset(&self.last_error, 0); + const len = @min(msg.len, self.last_error.len - 1); + @memcpy(self.last_error[0..len], msg[0..len]); + } + + /// Get number of GPUs + pub fn getGPUCount(self: *NVML) !u32 { + var count: c_uint = 0; + const result = self.get_count(&count); + if 
(result != NVML_SUCCESS) { + self.setError("Failed to get GPU count"); + return error.GetCountFailed; + } + return @intCast(count); + } + + /// Get GPU info by index + pub fn getGPUInfo(self: *NVML, index: u32) !GPUInfo { + var info: GPUInfo = .{ + .index = index, + .name = std.mem.zeroes([256:0]u8), + .utilization = 0, + .memory_used = 0, + .memory_total = 0, + .temperature = 0, + .power_draw = 0, + .clock_sm = 0, + .clock_memory = 0, + .uuid = std.mem.zeroes([64:0]u8), + .vbios_version = std.mem.zeroes([32:0]u8), + }; + + var device: nvmlDevice_t = undefined; + var result = self.get_handle_by_index(index, &device); + if (result != NVML_SUCCESS) { + self.setError("Failed to get device handle"); + return error.GetHandleFailed; + } + + // Get name + if (self.get_name) |func| { + _ = func(device, &info.name, @sizeOf(@TypeOf(info.name))); + } + + // Get utilization + if (self.get_utilization) |func| { + var util: nvmlUtilization_t = undefined; + result = func(device, &util); + if (result == NVML_SUCCESS) { + info.utilization = @intCast(util.gpu); + } + } + + // Get memory + if (self.get_memory) |func| { + var mem: nvmlMemory_t = undefined; + result = func(device, &mem); + if (result == NVML_SUCCESS) { + info.memory_used = mem.used; + info.memory_total = mem.total; + } + } + + // Get temperature + if (self.get_temperature) |func| { + var temp: c_uint = 0; + result = func(device, NVML_TEMPERATURE_GPU, &temp); + if (result == NVML_SUCCESS) { + info.temperature = @intCast(temp); + } + } + + // Get power usage + if (self.get_power_usage) |func| { + var power: c_uint = 0; + result = func(device, &power); + if (result == NVML_SUCCESS) { + info.power_draw = @intCast(power); + } + } + + // Get clocks + if (self.get_clock) |func| { + var clock: c_uint = 0; + result = func(device, NVML_CLOCK_SM, &clock); + if (result == NVML_SUCCESS) { + info.clock_sm = @intCast(clock); + } + result = func(device, NVML_CLOCK_MEM, &clock); + if (result == NVML_SUCCESS) { + info.clock_memory = 
@intCast(clock); + } + } + + // Get UUID + if (self.get_uuid) |func| { + _ = func(device, &info.uuid, @sizeOf(@TypeOf(info.uuid))); + } + + // Get VBIOS version + if (self.get_vbios) |func| { + _ = func(device, &info.vbios_version, @sizeOf(@TypeOf(info.vbios_version))); + } + + return info; + } + + /// Get info for all GPUs + pub fn getAllGPUInfo(self: *NVML, allocator: std.mem.Allocator) ![]GPUInfo { + const count = try self.getGPUCount(); + if (count == 0) return &[_]GPUInfo{}; + + var gpus = try allocator.alloc(GPUInfo, count); + errdefer allocator.free(gpus); + + for (0..count) |i| { + gpus[i] = try self.getGPUInfo(@intCast(i)); + } + + return gpus; + } +}; + +// Convenience functions for simple use cases + +/// Quick check if NVML is available (creates and destroys temporary handle) +pub fn isNVMLAvailable() bool { + if (NVML.load()) |maybe_nvml| { + if (maybe_nvml) |nvml| { + var nvml_mut = nvml; + defer nvml_mut.unload(); + return nvml_mut.isAvailable(); + } + } else |_| {} + return false; +} + +/// Format GPU info as string for display +pub fn formatGPUInfo(allocator: std.mem.Allocator, gpus: []const GPUInfo) ![]u8 { + var buf = std.ArrayList(u8).init(allocator); + defer buf.deinit(); + + const writer = buf.writer(); + + try writer.writeAll("GPU Status (NVML)\n"); + try writer.writeAll("═" ** 50); + try writer.writeAll("\n\n"); + + for (gpus) |gpu| { + const name = std.mem.sliceTo(&gpu.name, 0); + try writer.print("🎮 GPU {d}: {s}\n", .{ gpu.index, name }); + try writer.print(" Utilization: {d}%\n", .{gpu.utilization}); + try writer.print(" Memory: {d}/{d} MB\n", .{ + gpu.memory_used / 1024 / 1024, + gpu.memory_total / 1024 / 1024, + }); + try writer.print(" Temperature: {d}°C\n", .{gpu.temperature}); + if (gpu.power_draw > 0) { + try writer.print(" Power: {d:.1} W\n", .{@as(f64, @floatFromInt(gpu.power_draw)) / 1000.0}); + } + if (gpu.clock_sm > 0) { + try writer.print(" SM Clock: {d} MHz\n", .{gpu.clock_sm}); + } + try writer.writeAll("\n"); + } + + return 
buf.toOwnedSlice(); +} diff --git a/internal/queue/native_queue_stub.go b/internal/queue/native_queue_stub.go index 6a58b8c..0ceecb9 100644 --- a/internal/queue/native_queue_stub.go +++ b/internal/queue/native_queue_stub.go @@ -1,5 +1,5 @@ -//go:build !native_libs -// +build !native_libs +//go:build !cgo || !native_libs +// +build !cgo !native_libs package queue diff --git a/internal/worker/config.go b/internal/worker/config.go index 1f018fc..6bf0b4e 100644 --- a/internal/worker/config.go +++ b/internal/worker/config.go @@ -380,19 +380,16 @@ func (c *Config) Validate() error { // - UUID-style gpu_visible_device_ids is NVIDIA-only. vendor := strings.ToLower(strings.TrimSpace(c.GPUVendor)) if len(c.GPUVisibleDevices) > 0 && len(c.GPUVisibleDeviceIDs) > 0 { - return fmt.Errorf("gpu_visible_devices and gpu_visible_device_ids are mutually exclusive") - } - if len(c.GPUVisibleDeviceIDs) > 0 { if vendor != string(GPUTypeNVIDIA) { return fmt.Errorf( - "gpu_visible_device_ids is only supported when gpu_vendor is %q", + "visible_device_ids is only supported when gpu_vendor is %q", string(GPUTypeNVIDIA), ) } for _, id := range c.GPUVisibleDeviceIDs { id = strings.TrimSpace(id) if id == "" { - return fmt.Errorf("gpu_visible_device_ids contains an empty value") + return fmt.Errorf("visible_device_ids contains an empty value") } if !strings.HasPrefix(id, "GPU-") { return fmt.Errorf("gpu_visible_device_ids values must start with %q, got %q", "GPU-", id) diff --git a/internal/worker/gpu_detector.go b/internal/worker/gpu_detector.go index 61693e6..987cf88 100644 --- a/internal/worker/gpu_detector.go +++ b/internal/worker/gpu_detector.go @@ -98,6 +98,14 @@ type AppleDetector struct { } func (d *AppleDetector) DetectGPUCount() int { + // First try actual macOS GPU detection + if IsMacOS() { + count, err := GetMacOSGPUCount() + if err == nil && count > 0 { + return count + } + } + if n, ok := envInt("FETCH_ML_GPU_COUNT"); ok && n >= 0 { return n } diff --git 
a/internal/worker/gpu_macos.go b/internal/worker/gpu_macos.go new file mode 100644 index 0000000..d1a764f --- /dev/null +++ b/internal/worker/gpu_macos.go @@ -0,0 +1,279 @@ +//go:build darwin +// +build darwin + +package worker + +import ( + "bufio" + "context" + "encoding/json" + "fmt" + "os/exec" + "regexp" + "runtime" + "strconv" + "strings" + "time" +) + +// MacOSGPUInfo holds information about a macOS GPU +type MacOSGPUInfo struct { + Index uint32 `json:"index"` + Name string `json:"name"` + ChipsetModel string `json:"chipset_model"` + VRAM_MB uint32 `json:"vram_mb"` + IsIntegrated bool `json:"is_integrated"` + IsAppleSilicon bool `json:"is_apple_silicon"` + // Real-time metrics from powermetrics (if available) + UtilizationPercent uint32 `json:"utilization_percent,omitempty"` + PowerMW uint32 `json:"power_mw,omitempty"` + TemperatureC uint32 `json:"temperature_c,omitempty"` +} + +// PowermetricsData holds GPU metrics from powermetrics +type PowermetricsData struct { + GPUUtilization float64 + GPUPower float64 + GPUTemperature float64 + HasData bool +} + +// IsMacOS returns true if running on macOS +func IsMacOS() bool { + return runtime.GOOS == "darwin" +} + +// IsAppleSilicon checks if running on Apple Silicon +func IsAppleSilicon() bool { + if runtime.GOOS != "darwin" { + return false + } + // Check machine hardware name + out, err := exec.Command("uname", "-m").Output() + if err != nil { + return false + } + return strings.TrimSpace(string(out)) == "arm64" +} + +// GetMacOSGPUCount returns the number of GPUs on macOS +func GetMacOSGPUCount() (int, error) { + if runtime.GOOS != "darwin" { + return 0, fmt.Errorf("not running on macOS") + } + + // Use system_profiler to get GPU count + cmd := exec.Command("system_profiler", "SPDisplaysDataType", "-json") + out, err := cmd.Output() + if err != nil { + // Fall back to gfxutil if system_profiler fails + return getGPUCountViaGfxutil() + } + + // Parse JSON output + var data map[string]interface{} + if err := 
json.Unmarshal(out, &data); err != nil { + return 0, err + } + + // Extract display items + if spData, ok := data["SPDisplaysDataType"].([]interface{}); ok { + return len(spData), nil + } + + return 0, nil +} + +// getGPUCountViaGfxutil uses gfxutil to count GPUs (fallback) +func getGPUCountViaGfxutil() (int, error) { + // gfxutil is available on macOS + cmd := exec.Command("gfxutil", "-f", "display") + out, err := cmd.Output() + if err != nil { + return 0, err + } + + // Count display paths (one per GPU typically) + lines := strings.Split(strings.TrimSpace(string(out)), "\n") + count := 0 + for _, line := range lines { + if strings.Contains(line, "Display") { + count++ + } + } + return count, nil +} + +// GetMacOSGPUInfo returns detailed information about macOS GPUs +func GetMacOSGPUInfo() ([]MacOSGPUInfo, error) { + if runtime.GOOS != "darwin" { + return nil, fmt.Errorf("not running on macOS") + } + + cmd := exec.Command("system_profiler", "SPDisplaysDataType", "-json") + out, err := cmd.Output() + if err != nil { + return nil, err + } + + var data map[string]interface{} + if err := json.Unmarshal(out, &data); err != nil { + return nil, err + } + + spData, ok := data["SPDisplaysDataType"].([]interface{}) + if !ok { + return []MacOSGPUInfo{}, nil + } + + isAppleSilicon := IsAppleSilicon() + var gpus []MacOSGPUInfo + + for i, item := range spData { + if gpuData, ok := item.(map[string]interface{}); ok { + info := MacOSGPUInfo{ + Index: uint32(i), + IsAppleSilicon: isAppleSilicon, + } + + // Extract chipset model + if model, ok := gpuData["sppci_model"].(string); ok { + info.ChipsetModel = model + info.Name = model + } + + // Check for shared memory (integrated GPU) + if _, ok := gpuData["sppci_vram_shared"]; ok { + info.IsIntegrated = true + } + + // Extract VRAM + if vram, ok := gpuData["sppci_vram"].(string); ok { + // Parse "16384 MB" + parts := strings.Fields(vram) + if len(parts) >= 1 { + if mb, err := strconv.ParseUint(parts[0], 10, 32); err == nil { + 
info.VRAM_MB = uint32(mb) + } + } + } + + gpus = append(gpus, info) + } + } + + return gpus, nil +} + +// GetPowermetricsData tries to get real-time GPU metrics from powermetrics +// Requires sudo access. Returns empty data if not available. +func GetPowermetricsData() (*PowermetricsData, error) { + // powermetrics requires sudo, so this may fail + ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) + defer cancel() + + cmd := exec.CommandContext(ctx, "powermetrics", "--samplers", "gpu_power", "-n", "1", "-i", "100") + out, err := cmd.Output() + if err != nil { + // powermetrics not available or no permission + return &PowermetricsData{HasData: false}, nil + } + + data := &PowermetricsData{HasData: false} + + // Parse powermetrics output + // Example: "GPU Power: 5000 mW" or "GPU utilization: 45%" + scanner := bufio.NewScanner(strings.NewReader(string(out))) + for scanner.Scan() { + line := scanner.Text() + + // Parse GPU utilization + if strings.Contains(line, "GPU utilization") || strings.Contains(line, "GPU active") { + re := regexp.MustCompile(`(\d+(?:\.\d+)?)\s*%`) + if matches := re.FindStringSubmatch(line); len(matches) > 1 { + if util, err := strconv.ParseFloat(matches[1], 64); err == nil { + data.GPUUtilization = util + data.HasData = true + } + } + } + + // Parse GPU power + if strings.Contains(line, "GPU Power") || strings.Contains(line, "GPU power") { + re := regexp.MustCompile(`(\d+(?:\.\d+)?)\s*mW`) + if matches := re.FindStringSubmatch(line); len(matches) > 1 { + if power, err := strconv.ParseFloat(matches[1], 64); err == nil { + data.GPUPower = power + data.HasData = true + } + } + } + + // Parse GPU temperature (if available) + if strings.Contains(line, "GPU Temperature") || strings.Contains(line, "GPU temp") { + re := regexp.MustCompile(`(\d+(?:\.\d+)?)\s*C`) + if matches := re.FindStringSubmatch(line); len(matches) > 1 { + if temp, err := strconv.ParseFloat(matches[1], 64); err == nil { + data.GPUTemperature = temp + 
data.HasData = true + } + } + } + } + + return data, nil +} + +// FormatMacOSGPUStatus formats GPU status for display +func FormatMacOSGPUStatus() (string, error) { + gpus, err := GetMacOSGPUInfo() + if err != nil { + return "", err + } + + // Try to get real-time metrics from powermetrics + powermetrics, _ := GetPowermetricsData() + + if len(gpus) == 0 { + return "GPU info unavailable\n\nRun on a system with NVIDIA GPU or macOS", nil + } + + var b strings.Builder + + if IsAppleSilicon() { + b.WriteString("GPU Status (macOS - Apple Silicon)\n") + } else { + b.WriteString("GPU Status (macOS)\n") + } + b.WriteString(strings.Repeat("═", 50) + "\n\n") + + for _, gpu := range gpus { + fmt.Fprintf(&b, "🎮 GPU %d: %s\n", gpu.Index, gpu.Name) + if gpu.IsAppleSilicon { + b.WriteString(" Type: Apple Silicon (Unified Memory)\n") + } else if gpu.IsIntegrated { + b.WriteString(" Type: Integrated (Shared Memory)\n") + } else { + fmt.Fprintf(&b, " VRAM: %d MB\n", gpu.VRAM_MB) + } + + // Display powermetrics data if available + if powermetrics != nil && powermetrics.HasData { + if powermetrics.GPUUtilization > 0 { + b.WriteString(fmt.Sprintf(" Utilization: %.1f%%\n", powermetrics.GPUUtilization)) + } + if powermetrics.GPUPower > 0 { + b.WriteString(fmt.Sprintf(" Power: %.1f W\n", powermetrics.GPUPower/1000)) + } + if powermetrics.GPUTemperature > 0 { + b.WriteString(fmt.Sprintf(" Temperature: %.0f°C\n", powermetrics.GPUTemperature)) + } + } + b.WriteString("\n") + } + + if powermetrics == nil || !powermetrics.HasData { + b.WriteString("💡 Note: Run with sudo for real-time GPU metrics via powermetrics\n") + } + return b.String(), nil +} diff --git a/internal/worker/gpu_macos_stub.go b/internal/worker/gpu_macos_stub.go new file mode 100644 index 0000000..c59e683 --- /dev/null +++ b/internal/worker/gpu_macos_stub.go @@ -0,0 +1,41 @@ +//go:build !darwin +// +build !darwin + +package worker + +import "errors" + +// MacOSGPUInfo placeholder for non-macOS builds +type MacOSGPUInfo struct { 
+ Index uint32 + Name string + ChipsetModel string + VRAM_MB uint32 + IsIntegrated bool + IsAppleSilicon bool +} + +// IsMacOS returns false on non-macOS +func IsMacOS() bool { + return false +} + +// IsAppleSilicon returns false on non-macOS +func IsAppleSilicon() bool { + return false +} + +// GetMacOSGPUCount returns error on non-macOS +func GetMacOSGPUCount() (int, error) { + return 0, errors.New("macOS GPU monitoring only available on macOS") +} + +// GetMacOSGPUInfo returns error on non-macOS +func GetMacOSGPUInfo() ([]MacOSGPUInfo, error) { + return nil, errors.New("macOS GPU monitoring only available on macOS") +} + +// FormatMacOSGPUStatus returns error on non-macOS +func FormatMacOSGPUStatus() (string, error) { + return "", errors.New("macOS GPU monitoring only available on macOS") +} diff --git a/internal/worker/gpu_nvml_native.go b/internal/worker/gpu_nvml_native.go index 2feb72a..7390251 100644 --- a/internal/worker/gpu_nvml_native.go +++ b/internal/worker/gpu_nvml_native.go @@ -1,5 +1,5 @@ -//go:build cgo && native_libs -// +build cgo,native_libs +//go:build cgo && native_libs && linux +// +build cgo,native_libs,linux package worker diff --git a/internal/worker/gpu_nvml_stub.go b/internal/worker/gpu_nvml_stub.go index 337a7f8..47779ff 100644 --- a/internal/worker/gpu_nvml_stub.go +++ b/internal/worker/gpu_nvml_stub.go @@ -1,11 +1,26 @@ -//go:build cgo && !native_libs -// +build cgo,!native_libs +//go:build !cgo || !native_libs || !linux +// +build !cgo !native_libs !linux package worker import "errors" -// Stub implementations when native_libs build tag is not present +// GPUInfo provides comprehensive GPU information +type GPUInfo struct { + Index uint32 + Name string + Utilization uint32 + MemoryUsed uint64 + MemoryTotal uint64 + Temperature uint32 + PowerDraw uint32 + ClockSM uint32 + ClockMemory uint32 + PCIeGen uint32 + PCIeWidth uint32 + UUID string + VBIOSVersion string +} func InitNVML() error { return errors.New("NVML requires native_libs 
build tag") @@ -18,10 +33,10 @@ func IsNVMLAvailable() bool { } func GetGPUCount() (int, error) { - return 0, errors.New("NVML requires native_libs build tag") + return 0, nil } -func GetGPUInfo(index uint32) (*GPUInfo, error) { +func GetGPUInfo(index uint32) (*GPUInfo, error) { // <-- was missing return nil, errors.New("NVML requires native_libs build tag") } diff --git a/internal/worker/native_bridge.go b/internal/worker/native_bridge.go index 29597ab..500e827 100644 --- a/internal/worker/native_bridge.go +++ b/internal/worker/native_bridge.go @@ -15,11 +15,6 @@ func init() { log.Printf("[native] Native libraries disabled (build with -tags native_libs to enable)") } -// dirOverallSHA256HexNative is not available without native_libs build tag. -func dirOverallSHA256HexNative(_ string) (string, error) { - return "", errors.New("native hash requires native_libs build tag") -} - // HashFilesBatchNative is not available without native_libs build tag. func HashFilesBatchNative(paths []string) ([]string, error) { return nil, errors.New("native batch hash requires native_libs build tag") diff --git a/internal/worker/native_bridge_libs.go b/internal/worker/native_bridge_libs.go index cff1f87..58bbe3b 100644 --- a/internal/worker/native_bridge_libs.go +++ b/internal/worker/native_bridge_libs.go @@ -3,7 +3,8 @@ package worker -// #cgo LDFLAGS: -L${SRCDIR}/../../native/build -Wl,-rpath,${SRCDIR}/../../native/build -ldataset_hash +// #cgo darwin LDFLAGS: -L${SRCDIR}/../../native/build -Wl,-rpath,${SRCDIR}/../../native/build -ldataset_hash +// #cgo linux LDFLAGS: -L${SRCDIR}/../../native/build -Wl,-rpath,${SRCDIR}/../../native/build -ldataset_hash -lnvml_gpu -lnvidia-ml // #include "../../native/dataset_hash/dataset_hash.h" // #include import "C" @@ -25,8 +26,6 @@ var ( ctxInitTime time.Time ) -// getHashContext returns a cached hash context, initializing it once. -// Context reuse eliminates 5-20ms of thread pool creation per hash operation. 
func getHashContext() *C.fh_context_t { hashCtxOnce.Do(func() { start := time.Now() @@ -38,9 +37,8 @@ func getHashContext() *C.fh_context_t { return hashCtx } -// dirOverallSHA256HexNative implementation with native library. func dirOverallSHA256HexNative(root string) (string, error) { - ctx := getHashContext() // Reuse cached context: ~0.1μs vs 5-20ms + ctx := getHashContext() croot := C.CString(root) defer C.free(unsafe.Pointer(croot)) @@ -58,28 +56,23 @@ func dirOverallSHA256HexNative(root string) (string, error) { return C.GoString(result), nil } -// GetSIMDImplName returns the native SHA256 implementation name. func GetSIMDImplName() string { return C.GoString(C.fh_get_simd_impl_name()) } -// HasSIMDSHA256 returns true if SIMD SHA256 is available. func HasSIMDSHA256() bool { return C.fh_has_simd_sha256() == 1 } -// ScanArtifactsNative falls back to Go implementation. func ScanArtifactsNative(runDir string) (*manifest.Artifacts, error) { return ScanArtifacts(runDir) } -// ExtractTarGzNative falls back to Go implementation. func ExtractTarGzNative(archivePath, dstDir string) error { return ExtractTarGz(archivePath, dstDir) } // DirOverallSHA256HexNative exports the native hash implementation for benchmarks. -// This allows explicit native library usage when -tags native_libs is enabled. func DirOverallSHA256HexNative(root string) (string, error) { return dirOverallSHA256HexNative(root) } diff --git a/internal/worker/native_bridge_nocgo.go b/internal/worker/native_bridge_nocgo.go index 10e4be1..ed5de83 100644 --- a/internal/worker/native_bridge_nocgo.go +++ b/internal/worker/native_bridge_nocgo.go @@ -9,11 +9,6 @@ import ( "github.com/jfraeys/fetch_ml/internal/manifest" ) -// dirOverallSHA256HexNative is not available without CGO. -func dirOverallSHA256HexNative(root string) (string, error) { - return "", errors.New("native hash requires CGO") -} - // HashFilesBatchNative is not available without CGO. 
func HashFilesBatchNative(paths []string) ([]string, error) { return nil, errors.New("native batch hash requires CGO") diff --git a/native/nvml_gpu/CMakeLists.txt b/native/nvml_gpu/CMakeLists.txt index e4ee3b5..0fbe4f0 100644 --- a/native/nvml_gpu/CMakeLists.txt +++ b/native/nvml_gpu/CMakeLists.txt @@ -31,7 +31,11 @@ if(NVML_LIBRARY AND NVML_INCLUDE_DIR) message(STATUS "Found NVML: ${NVML_LIBRARY}") message(STATUS "NVML include: ${NVML_INCLUDE_DIR}") else() - message(WARNING "NVML not found. GPU monitoring will be disabled.") + if(CMAKE_SYSTEM_NAME MATCHES "Linux") + message(WARNING "NVML not found. NVIDIA GPU monitoring will be disabled.") + else() + message(STATUS "NVML not available on ${CMAKE_SYSTEM_NAME}. Using platform-specific GPU monitoring.") + endif() # Create stub library target_compile_definitions(nvml_gpu PRIVATE NVML_STUB) endif() diff --git a/native/nvml_gpu/nvml_dynamic.c b/native/nvml_gpu/nvml_dynamic.c new file mode 100644 index 0000000..15c457a --- /dev/null +++ b/native/nvml_gpu/nvml_dynamic.c @@ -0,0 +1,272 @@ +#include "nvml_dynamic.h" +#include <stdlib.h> +#include <string.h> + +#ifdef _WIN32 +#include <windows.h> +#else +#include <dlfcn.h> +#endif + +// NVML type definitions (from nvml.h) +typedef int nvmlReturn_t; +typedef void* nvmlDevice_t; +typedef struct { + unsigned int gpu; + unsigned int memory; +} nvmlUtilization_t; +typedef struct { + unsigned long long total; + unsigned long long free; + unsigned long long used; +} nvmlMemory_t; + +// Function pointer types +typedef nvmlReturn_t (*nvmlInit_v2_fn)(void); +typedef nvmlReturn_t (*nvmlShutdown_fn)(void); +typedef nvmlReturn_t (*nvmlSystemGetDriverVersion_fn)(char*, unsigned int); +typedef nvmlReturn_t (*nvmlDeviceGetCount_fn)(unsigned int*); +typedef nvmlReturn_t (*nvmlDeviceGetHandleByIndex_v2_fn)(unsigned int, nvmlDevice_t*); +typedef nvmlReturn_t (*nvmlDeviceGetName_fn)(nvmlDevice_t, char*, unsigned int); +typedef nvmlReturn_t (*nvmlDeviceGetUtilizationRates_fn)(nvmlDevice_t, nvmlUtilization_t*); +typedef nvmlReturn_t 
(*nvmlDeviceGetMemoryInfo_fn)(nvmlDevice_t, nvmlMemory_t*); +typedef nvmlReturn_t (*nvmlDeviceGetTemperature_fn)(nvmlDevice_t, unsigned int, unsigned int*); +typedef nvmlReturn_t (*nvmlDeviceGetPowerUsage_fn)(nvmlDevice_t, unsigned int*); +typedef nvmlReturn_t (*nvmlDeviceGetClockInfo_fn)(nvmlDevice_t, unsigned int, unsigned int*); +typedef nvmlReturn_t (*nvmlDeviceGetPcieThroughput_fn)(nvmlDevice_t, unsigned int, unsigned int*); +typedef nvmlReturn_t (*nvmlDeviceGetUUID_fn)(nvmlDevice_t, char*, unsigned int); +typedef nvmlReturn_t (*nvmlDeviceGetVbiosVersion_fn)(nvmlDevice_t, char*, unsigned int); + +// NVML constants +#define NVML_SUCCESS 0 +#define NVML_TEMPERATURE_GPU 0 +#define NVML_CLOCK_SM 0 +#define NVML_CLOCK_MEM 1 +#define NVML_PCIE_UTIL_TX_BYTES 0 +#define NVML_PCIE_UTIL_RX_BYTES 1 + +struct nvml_dynamic { + void* handle; + char last_error[256]; + int available; + + // Function pointers + nvmlInit_v2_fn init; + nvmlShutdown_fn shutdown; + nvmlSystemGetDriverVersion_fn get_driver_version; + nvmlDeviceGetCount_fn get_count; + nvmlDeviceGetHandleByIndex_v2_fn get_handle_by_index; + nvmlDeviceGetName_fn get_name; + nvmlDeviceGetUtilizationRates_fn get_utilization; + nvmlDeviceGetMemoryInfo_fn get_memory; + nvmlDeviceGetTemperature_fn get_temperature; + nvmlDeviceGetPowerUsage_fn get_power_usage; + nvmlDeviceGetClockInfo_fn get_clock; + nvmlDeviceGetUUID_fn get_uuid; + nvmlDeviceGetVbiosVersion_fn get_vbios; +}; + +static void set_error(nvml_dynamic_t* nvml, const char* msg) { + if (nvml) { + strncpy(nvml->last_error, msg, sizeof(nvml->last_error) - 1); + nvml->last_error[sizeof(nvml->last_error) - 1] = '\0'; + } +} + +#ifdef _WIN32 +static void* load_lib(const char* name) { + return LoadLibraryA(name); +} +static void* get_sym(void* handle, const char* name) { + return (void*)GetProcAddress((HMODULE)handle, name); +} +static void close_lib(void* handle) { + FreeLibrary((HMODULE)handle); +} +#else +static void* load_lib(const char* name) { + return 
dlopen(name, RTLD_NOW); +} +static void* get_sym(void* handle, const char* name) { + return dlsym(handle, name); +} +static void close_lib(void* handle) { + dlclose(handle); +} +#endif + +nvml_dynamic_t* nvml_load(void) { + nvml_dynamic_t* nvml = (nvml_dynamic_t*)calloc(1, sizeof(nvml_dynamic_t)); + if (!nvml) return NULL; + + // Try to load NVML library +#ifdef _WIN32 + nvml->handle = load_lib("nvml.dll"); + if (!nvml->handle) { + nvml->handle = load_lib("C:\\Windows\\System32\\nvml.dll"); + } +#else + nvml->handle = load_lib("libnvidia-ml.so.1"); + if (!nvml->handle) { + nvml->handle = load_lib("libnvidia-ml.so"); + } +#endif + + if (!nvml->handle) { + set_error(nvml, "NVML library not found - NVIDIA driver may not be installed"); + nvml->available = 0; + return nvml; + } + + // Load function pointers + nvml->init = (nvmlInit_v2_fn)get_sym(nvml->handle, "nvmlInit_v2"); + nvml->shutdown = (nvmlShutdown_fn)get_sym(nvml->handle, "nvmlShutdown"); + nvml->get_driver_version = (nvmlSystemGetDriverVersion_fn)get_sym(nvml->handle, "nvmlSystemGetDriverVersion"); + nvml->get_count = (nvmlDeviceGetCount_fn)get_sym(nvml->handle, "nvmlDeviceGetCount"); + nvml->get_handle_by_index = (nvmlDeviceGetHandleByIndex_v2_fn)get_sym(nvml->handle, "nvmlDeviceGetHandleByIndex_v2"); + nvml->get_name = (nvmlDeviceGetName_fn)get_sym(nvml->handle, "nvmlDeviceGetName"); + nvml->get_utilization = (nvmlDeviceGetUtilizationRates_fn)get_sym(nvml->handle, "nvmlDeviceGetUtilizationRates"); + nvml->get_memory = (nvmlDeviceGetMemoryInfo_fn)get_sym(nvml->handle, "nvmlDeviceGetMemoryInfo"); + nvml->get_temperature = (nvmlDeviceGetTemperature_fn)get_sym(nvml->handle, "nvmlDeviceGetTemperature"); + nvml->get_power_usage = (nvmlDeviceGetPowerUsage_fn)get_sym(nvml->handle, "nvmlDeviceGetPowerUsage"); + nvml->get_clock = (nvmlDeviceGetClockInfo_fn)get_sym(nvml->handle, "nvmlDeviceGetClockInfo"); + nvml->get_uuid = (nvmlDeviceGetUUID_fn)get_sym(nvml->handle, "nvmlDeviceGetUUID"); + nvml->get_vbios = 
(nvmlDeviceGetVbiosVersion_fn)get_sym(nvml->handle, "nvmlDeviceGetVbiosVersion"); + + // Check required functions + if (!nvml->init || !nvml->shutdown || !nvml->get_count || !nvml->get_handle_by_index) { + set_error(nvml, "Failed to load required NVML functions"); + close_lib(nvml->handle); + nvml->handle = NULL; + nvml->available = 0; + return nvml; + } + + // Initialize NVML + nvmlReturn_t result = nvml->init(); + if (result != NVML_SUCCESS) { + set_error(nvml, "Failed to initialize NVML"); + close_lib(nvml->handle); + nvml->handle = NULL; + nvml->available = 0; + return nvml; + } + + nvml->available = 1; + return nvml; +} + +void nvml_unload(nvml_dynamic_t* nvml) { + if (!nvml) return; + if (nvml->handle) { + if (nvml->shutdown) { + nvml->shutdown(); + } + close_lib(nvml->handle); + } + free(nvml); +} + +int nvml_is_available(const nvml_dynamic_t* nvml) { + return nvml ? nvml->available : 0; +} + +const char* nvml_last_error(const nvml_dynamic_t* nvml) { + return nvml ? nvml->last_error : "NULL nvml handle"; +} + +int nvml_get_gpu_count(nvml_dynamic_t* nvml) { + if (!nvml || !nvml->available || !nvml->get_count) { + return -1; + } + unsigned int count = 0; + nvmlReturn_t result = nvml->get_count(&count); + if (result != NVML_SUCCESS) { + set_error(nvml, "Failed to get GPU count"); + return -1; + } + return (int)count; +} + +int nvml_get_gpu_info(nvml_dynamic_t* nvml, uint32_t index, gpu_info_t* info) { + if (!nvml || !nvml->available || !info) { + return -1; + } + + memset(info, 0, sizeof(*info)); + info->index = index; + + nvmlDevice_t device; + nvmlReturn_t result = nvml->get_handle_by_index(index, &device); + if (result != NVML_SUCCESS) { + set_error(nvml, "Failed to get device handle"); + return -1; + } + + // Get name + if (nvml->get_name) { + nvml->get_name(device, info->name, sizeof(info->name)); + } + + // Get utilization + if (nvml->get_utilization) { + nvmlUtilization_t util; + result = nvml->get_utilization(device, &util); + if (result == 
NVML_SUCCESS) { + info->utilization = util.gpu; + } + } + + // Get memory + if (nvml->get_memory) { + nvmlMemory_t mem; + result = nvml->get_memory(device, &mem); + if (result == NVML_SUCCESS) { + info->memory_used = mem.used; + info->memory_total = mem.total; + } + } + + // Get temperature + if (nvml->get_temperature) { + unsigned int temp; + result = nvml->get_temperature(device, NVML_TEMPERATURE_GPU, &temp); + if (result == NVML_SUCCESS) { + info->temperature = temp; + } + } + + // Get power usage + if (nvml->get_power_usage) { + unsigned int power; + result = nvml->get_power_usage(device, &power); + if (result == NVML_SUCCESS) { + info->power_draw = power; + } + } + + // Get clocks + if (nvml->get_clock) { + unsigned int clock; + result = nvml->get_clock(device, NVML_CLOCK_SM, &clock); + if (result == NVML_SUCCESS) { + info->clock_sm = clock; + } + result = nvml->get_clock(device, NVML_CLOCK_MEM, &clock); + if (result == NVML_SUCCESS) { + info->clock_memory = clock; + } + } + + // Get UUID + if (nvml->get_uuid) { + nvml->get_uuid(device, info->uuid, sizeof(info->uuid)); + } + + // Get VBIOS version + if (nvml->get_vbios) { + nvml->get_vbios(device, info->vbios_version, sizeof(info->vbios_version)); + } + + return 0; +} diff --git a/native/nvml_gpu/nvml_dynamic.h b/native/nvml_gpu/nvml_dynamic.h new file mode 100644 index 0000000..3b72fb0 --- /dev/null +++ b/native/nvml_gpu/nvml_dynamic.h @@ -0,0 +1,53 @@ +#ifndef NVML_DYNAMIC_H +#define NVML_DYNAMIC_H + +#include <stdint.h> +#include <stddef.h> + +#ifdef __cplusplus +extern "C" { +#endif + +// Opaque handle +typedef struct nvml_dynamic nvml_dynamic_t; + +// GPU info structure +typedef struct { + uint32_t index; + char name[256]; + uint32_t utilization; // GPU utilization (0-100) + uint64_t memory_used; // Memory used in bytes + uint64_t memory_total; // Total memory in bytes + uint32_t temperature; // Temperature in Celsius + uint32_t power_draw; // Power draw in milliwatts + uint32_t clock_sm; // SM clock in MHz + uint32_t 
clock_memory; // Memory clock in MHz + uint32_t pcie_gen; // PCIe generation + uint32_t pcie_width; // PCIe link width + char uuid[64]; // GPU UUID + char vbios_version[32]; // VBIOS version +} gpu_info_t; + +// Load NVML dynamically (returns NULL if not available) +nvml_dynamic_t* nvml_load(void); + +// Unload NVML and free resources +void nvml_unload(nvml_dynamic_t* nvml); + +// Check if NVML is available and loaded +int nvml_is_available(const nvml_dynamic_t* nvml); + +// Get number of GPUs (-1 on error) +int nvml_get_gpu_count(nvml_dynamic_t* nvml); + +// Get GPU info by index (returns 0 on success) +int nvml_get_gpu_info(nvml_dynamic_t* nvml, uint32_t index, gpu_info_t* info); + +// Get last error message +const char* nvml_last_error(const nvml_dynamic_t* nvml); + +#ifdef __cplusplus +} +#endif + +#endif // NVML_DYNAMIC_H