Radix/runtimes/cuda/cuda.zig

const std = @import("std");
const builtin = @import("builtin");

const asynk = @import("async");
const bazel_builtin = @import("bazel_builtin");
const c = @import("c");
const pjrt = @import("pjrt");
const runfiles = @import("runfiles");
const stdx = @import("stdx");

/// Path fragment looked for (case-insensitively) inside `LD_LIBRARY_PATH`;
/// when it is present, `load` logs a warning that includes this value.
const nvidiaLibsPath = "/cuda/";

/// Logger for this file, scoped to `zml/runtime/cuda`.
const log = std.log.scoped(.@"zml/runtime/cuda");

/// Returns true when the `c` module declares `ZML_RUNTIME_CUDA`.
///
/// The answer only depends on declarations, so it can be evaluated at
/// compile time (as `load` does).
pub fn isEnabled() bool {
    const cuda_flag_declared = @hasDecl(c, "ZML_RUNTIME_CUDA");
    return cuda_flag_declared;
}

/// Returns true when `/dev/nvidiactl` can be accessed in read-only mode.
/// Any error reported by the access check is treated as "no device".
fn hasNvidiaDevice() bool {
    return if (asynk.File.access("/dev/nvidiactl", .{ .mode = .read_only })) |_| true else |_| false;
}

/// Returns true when the `LD_LIBRARY_PATH` environment variable is set and
/// contains `nvidiaLibsPath`, compared case-insensitively.
fn hasCudaPathInLDPath() bool {
    if (std.c.getenv("LD_LIBRARY_PATH")) |raw_value| {
        // `getenv` hands back a NUL-terminated C string; view it as a slice.
        const search_path = std.mem.span(raw_value);
        return std.ascii.indexOfIgnoreCase(search_path, nvidiaLibsPath) != null;
    }
    return false;
}

/// Appends ` --xla_gpu_cuda_data_dir=<sandbox>` to the `XLA_FLAGS` environment
/// variable of the current process, keeping whatever flags were already set.
///
/// `allocator` is only used for temporary strings, which are released before
/// returning; `setenv` stores its own copy of the value.
///
/// Returns `error.OutOfMemory` when a temporary string cannot be allocated or
/// when `setenv` fails (with a fixed, valid variable name, running out of
/// memory is the only failure `setenv` can report).
fn setupXlaGpuCudaDirFlag(allocator: std.mem.Allocator, sandbox: []const u8) !void {
    const existing_flags: ?[]u8 = std.process.getEnvVarOwned(allocator, "XLA_FLAGS") catch |err| switch (err) {
        // An allocation failure must not be mistaken for "variable not set".
        error.OutOfMemory => return error.OutOfMemory,
        // Variable missing (or otherwise unreadable): start from empty flags.
        else => null,
    };
    defer if (existing_flags) |flags| allocator.free(flags);

    const current_flags: []const u8 = existing_flags orelse "";
    const new_xla_flagsZ = try std.fmt.allocPrintZ(allocator, "{s} --xla_gpu_cuda_data_dir={s}", .{ current_flags, sandbox });
    defer allocator.free(new_xla_flagsZ);

    // Do not ignore a failed export: without the flag the plugin would be
    // loaded with no CUDA data dir configured.
    if (c.setenv("XLA_FLAGS", new_xla_flagsZ, 1) != 0) {
        return error.OutOfMemory;
    }
}

/// Locates the CUDA sandbox in the runfiles and loads the PJRT plugin from it.
///
/// Steps, in order:
///   1. Bail out with `error.Unavailable` when the CUDA runtime is not enabled,
///      the target OS is not Linux, or `/dev/nvidiactl` is not accessible.
///   2. Warn when `LD_LIBRARY_PATH` contains `nvidiaLibsPath`.
///   3. Resolve `libpjrt_cuda/sandbox` through the runfiles
///      (`error.FileNotFound` when it cannot be resolved).
///   4. Point XLA at the sandbox through `XLA_FLAGS`.
///   5. `dlopen` `lib/libnvToolsExt.so.1` from the sandbox
///      (`error.DlError` on failure).
///   6. Load `lib/libpjrt_cuda.so` from the sandbox via `pjrt.Api.loadFrom`.
///
/// Panics when the runfiles themselves cannot be found.
pub fn load() !*const pjrt.Api {
    // Both conditions are known at compile time.
    if (comptime (!isEnabled() or builtin.os.tag != .linux)) return error.Unavailable;
    if (!hasNvidiaDevice()) return error.Unavailable;

    if (hasCudaPathInLDPath()) {
        log.warn("Detected {s} in LD_LIBRARY_PATH. This can lead to undefined behaviors and crashes", .{nvidiaLibsPath});
    }

    // Everything allocated below is scratch data released on return.
    var arena = std.heap.ArenaAllocator.init(std.heap.c_allocator);
    defer arena.deinit();
    const scratch = arena.allocator();

    var all_runfiles = try runfiles.Runfiles.create(.{ .allocator = scratch }) orelse
        stdx.debug.panic("Unable to find runfiles", .{});
    const repo_runfiles = all_runfiles.withSourceRepo(bazel_builtin.current_repository);

    var sandbox_buf: [std.fs.max_path_bytes]u8 = undefined;
    const sandbox_path = try repo_runfiles.rlocation("libpjrt_cuda/sandbox", &sandbox_buf) orelse {
        log.err("Failed to find sandbox path for CUDA runtime", .{});
        return error.FileNotFound;
    };

    // CUDA path has to be set _before_ loading the PJRT plugin.
    // See https://github.com/openxla/xla/issues/21428
    try setupXlaGpuCudaDirFlag(scratch, sandbox_path);

    {
        // Open libnvToolsExt with global symbol visibility before the plugin is loaded.
        var nvtx_path_buf: [std.fs.max_path_bytes]u8 = undefined;
        const nvtx_path = try stdx.fs.path.bufJoinZ(&nvtx_path_buf, &.{ sandbox_path, "lib", "libnvToolsExt.so.1" });
        if (std.c.dlopen(nvtx_path, .{ .NOW = true, .GLOBAL = true }) == null) {
            log.err("Unable to dlopen libnvToolsExt.so.1: {s}", .{std.c.dlerror().?});
            return error.DlError;
        }
    }

    var plugin_path_buf: [std.fs.max_path_bytes]u8 = undefined;
    const plugin_path = try stdx.fs.path.bufJoinZ(&plugin_path_buf, &.{ sandbox_path, "lib", "libpjrt_cuda.so" });
    return asynk.callBlocking(pjrt.Api.loadFrom, .{plugin_path});
}
Fix CUDA and ROCm sandbox discovery, update epoll libxev patch to prevent high CPU usage, enable XLA GPU latency‑hiding scheduler, and upgrade cuDNN to 9.6.0. 2024-01-15 09:41:42 +00:00			`const std = @import("std");`
workspace: run buildifier, drop rules_uv, refactor tools/hf dependencies 2025-07-16 10:01:41 +00:00			`const builtin = @import("builtin");`
Revert CUDA PJRT plugin version to 0.4.38 to address performance regression on XLA master. 2024-03-05 17:04:42 +00:00
Introduce a thin abstraction layer between ZML and PJRT to manage plugin loading decisions, enable compile‑time detection of linked runtimes, and handle cases such as libtpu blocking metadata access. 2023-05-15 09:36:41 +00:00			`const asynk = @import("async");`
Revert CUDA PJRT plugin version to 0.4.38 to address performance regression on XLA master. 2024-03-05 17:04:42 +00:00			`const bazel_builtin = @import("bazel_builtin");`
Upgrade XLA to version 20250122.0-cc075be, switch to nvptx compiler and nvlink with nvjitlink support, add warning for CUDA path in LD_LIBRARY_PATH, and revert the previous CUDA sandbox fix. 2024-02-06 09:31:48 +00:00			`const c = @import("c");`
Revert CUDA PJRT plugin version to 0.4.38 to address performance regression on XLA master. 2024-03-05 17:04:42 +00:00			`const pjrt = @import("pjrt");`
			`const runfiles = @import("runfiles");`
			`const stdx = @import("stdx");`
Upgrade XLA to version 20250122.0-cc075be, switch to nvptx compiler and nvlink with nvjitlink support, add warning for CUDA path in LD_LIBRARY_PATH, and revert the previous CUDA sandbox fix. 2024-02-06 09:31:48 +00:00
Revert CUDA PJRT plugin version to 0.4.38 to address performance regression on XLA master. 2024-03-05 17:04:42 +00:00			`const nvidiaLibsPath = "/cuda/";`

			`const log = std.log.scoped(.@"zml/runtime/cuda");`
Introduce a thin abstraction layer between ZML and PJRT to manage plugin loading decisions, enable compile‑time detection of linked runtimes, and handle cases such as libtpu blocking metadata access. 2023-05-15 09:36:41 +00:00
			`pub fn isEnabled() bool {`
			`return @hasDecl(c, "ZML_RUNTIME_CUDA");`
			`}`

			`fn hasNvidiaDevice() bool {`
Update CUDA runtime sandboxing and dynamic symbol renaming, switch to pre‑built jax‑cuda‑pjrt plugin, and bump CUDA to 12.6.2 and cuDNN to 9.5.1. 2023-09-14 13:28:25 +00:00			`asynk.File.access("/dev/nvidiactl", .{ .mode = .read_only }) catch return false;`
Introduce a thin abstraction layer between ZML and PJRT to manage plugin loading decisions, enable compile‑time detection of linked runtimes, and handle cases such as libtpu blocking metadata access. 2023-05-15 09:36:41 +00:00			`return true;`
			`}`

Upgrade XLA to version 20250122.0-cc075be, switch to nvptx compiler and nvlink with nvjitlink support, add warning for CUDA path in LD_LIBRARY_PATH, and revert the previous CUDA sandbox fix. 2024-02-06 09:31:48 +00:00			`fn hasCudaPathInLDPath() bool {`
workspace: run buildifier, drop rules_uv, refactor tools/hf dependencies 2025-07-16 10:01:41 +00:00			`const ldLibraryPath = std.c.getenv("LD_LIBRARY_PATH") orelse return false;`
Revert CUDA PJRT plugin version to 0.4.38 to address performance regression on XLA master. 2024-03-05 17:04:42 +00:00			`return std.ascii.indexOfIgnoreCase(std.mem.span(ldLibraryPath), nvidiaLibsPath) != null;`
			`}`

runtimes/cuda: sandbox CUDA dependencies by removing them from the leaf binary, sandboxing the dependency graph, marking dlopen direct dependencies as NEEDED, setting RPATH to the sandbox, loading the PJRT plugin from the sandbox, and enabling weak CUDA symbols without direct linking. 2025-03-26 11:18:29 +00:00			`fn setupXlaGpuCudaDirFlag(allocator: std.mem.Allocator, sandbox: []const u8) !void {`
			`const xla_flags = std.process.getEnvVarOwned(allocator, "XLA_FLAGS") catch "";`
			`const new_xla_flagsZ = try std.fmt.allocPrintZ(allocator, "{s} --xla_gpu_cuda_data_dir={s}", .{ xla_flags, sandbox });`
Revert CUDA PJRT plugin version to 0.4.38 to address performance regression on XLA master. 2024-03-05 17:04:42 +00:00
			`_ = c.setenv("XLA_FLAGS", new_xla_flagsZ, 1);`
Fix CUDA and ROCm sandbox discovery, update epoll libxev patch to prevent high CPU usage, enable XLA GPU latency‑hiding scheduler, and upgrade cuDNN to 9.6.0. 2024-01-15 09:41:42 +00:00			`}`

Introduce a thin abstraction layer between ZML and PJRT to manage plugin loading decisions, enable compile‑time detection of linked runtimes, and handle cases such as libtpu blocking metadata access. 2023-05-15 09:36:41 +00:00			`pub fn load() !*const pjrt.Api {`
			`if (comptime !isEnabled()) {`
			`return error.Unavailable;`
			`}`
			`if (comptime builtin.os.tag != .linux) {`
			`return error.Unavailable;`
			`}`
			`if (!hasNvidiaDevice()) {`
			`return error.Unavailable;`
			`}`
Upgrade XLA to version 20250122.0-cc075be, switch to nvptx compiler and nvlink with nvjitlink support, add warning for CUDA path in LD_LIBRARY_PATH, and revert the previous CUDA sandbox fix. 2024-02-06 09:31:48 +00:00			`if (hasCudaPathInLDPath()) {`
Revert CUDA PJRT plugin version to 0.4.38 to address performance regression on XLA master. 2024-03-05 17:04:42 +00:00			`log.warn("Detected {s} in LD_LIBRARY_PATH. This can lead to undefined behaviors and crashes", .{nvidiaLibsPath});`
Upgrade XLA to version 20250122.0-cc075be, switch to nvptx compiler and nvlink with nvjitlink support, add warning for CUDA path in LD_LIBRARY_PATH, and revert the previous CUDA sandbox fix. 2024-02-06 09:31:48 +00:00			`}`
Fix CUDA and ROCm sandbox discovery, update epoll libxev patch to prevent high CPU usage, enable XLA GPU latency‑hiding scheduler, and upgrade cuDNN to 9.6.0. 2024-01-15 09:41:42 +00:00
runtimes/cuda: sandbox CUDA dependencies by removing them from the leaf binary, sandboxing the dependency graph, marking dlopen direct dependencies as NEEDED, setting RPATH to the sandbox, loading the PJRT plugin from the sandbox, and enabling weak CUDA symbols without direct linking. 2025-03-26 11:18:29 +00:00			`var arena = std.heap.ArenaAllocator.init(std.heap.c_allocator);`
			`defer arena.deinit();`

			`var r_ = try runfiles.Runfiles.create(.{ .allocator = arena.allocator() }) orelse {`
			`stdx.debug.panic("Unable to find runfiles", .{});`
			`};`

			`const source_repo = bazel_builtin.current_repository;`
			`const r = r_.withSourceRepo(source_repo);`

			`var path_buf: [std.fs.max_path_bytes]u8 = undefined;`
			`const sandbox_path = try r.rlocation("libpjrt_cuda/sandbox", &path_buf) orelse {`
			`log.err("Failed to find sandbox path for CUDA runtime", .{});`
			`return error.FileNotFound;`
			`};`

Revert CUDA PJRT plugin version to 0.4.38 to address performance regression on XLA master. 2024-03-05 17:04:42 +00:00			`// CUDA path has to be set _before_ loading the PJRT plugin.`
			`// See https://github.com/openxla/xla/issues/21428`
runtimes/cuda: sandbox CUDA dependencies by removing them from the leaf binary, sandboxing the dependency graph, marking dlopen direct dependencies as NEEDED, setting RPATH to the sandbox, loading the PJRT plugin from the sandbox, and enabling weak CUDA symbols without direct linking. 2025-03-26 11:18:29 +00:00			`try setupXlaGpuCudaDirFlag(arena.allocator(), sandbox_path);`
Revert CUDA PJRT plugin version to 0.4.38 to address performance regression on XLA master. 2024-03-05 17:04:42 +00:00
runtimes/cuda: sandbox CUDA dependencies by removing them from the leaf binary, sandboxing the dependency graph, marking dlopen direct dependencies as NEEDED, setting RPATH to the sandbox, loading the PJRT plugin from the sandbox, and enabling weak CUDA symbols without direct linking. 2025-03-26 11:18:29 +00:00			`{`
			`var lib_path_buf: [std.fs.max_path_bytes]u8 = undefined;`
			`const path = try stdx.fs.path.bufJoinZ(&lib_path_buf, &.{ sandbox_path, "lib", "libnvToolsExt.so.1" });`
			`_ = std.c.dlopen(path, .{ .NOW = true, .GLOBAL = true }) orelse {`
			`log.err("Unable to dlopen libnvToolsExt.so.1: {s}", .{std.c.dlerror().?});`
			`return error.DlError;`
			`};`
			`}`

			`return blk: {`
			`var lib_path_buf: [std.fs.max_path_bytes]u8 = undefined;`
			`const path = try stdx.fs.path.bufJoinZ(&lib_path_buf, &.{ sandbox_path, "lib", "libpjrt_cuda.so" });`
			`break :blk asynk.callBlocking(pjrt.Api.loadFrom, .{path});`
			`};`
Introduce a thin abstraction layer between ZML and PJRT to manage plugin loading decisions, enable compile‑time detection of linked runtimes, and handle cases such as libtpu blocking metadata access. 2023-05-15 09:36:41 +00:00			`}`