Fix CUDA and ROCm sandbox discovery, update epoll libxev patch to prevent high CPU usage, enable XLA GPU latency‑hiding scheduler, and upgrade cuDNN to 9.6.0.

2024-01-15 09:41:42 +00:00 · 2024-01-15 09:41:42 +00:00 · 434cee3a6c
commit 434cee3a6c
parent 5b8e42f9a9
21 changed files with 4157 additions and 8602 deletions
--- a/MODULE.bazel
+++ b/MODULE.bazel
@ -71,7 +71,7 @@ cuda = use_extension("//runtimes/cuda:cuda.bzl", "cuda_packages")
 use_repo(cuda, "libpjrt_cuda")

 rocm = use_extension("//runtimes/rocm:rocm.bzl", "rocm_packages")
-use_repo(rocm, "libpjrt_rocm")
+use_repo(rocm, "libpjrt_rocm", "hipblaslt", "rocblas")

 tpu = use_extension("//runtimes/tpu:tpu.bzl", "tpu_packages")
 use_repo(tpu, "libpjrt_tpu")
@ -84,7 +84,7 @@ use_repo(zls, "zls_aarch64-macos", "zls_x86_64-linux")

 register_toolchains("//third_party/zls:all")

-bazel_dep(name = "libxev", version = "20241119.0-6afcde9")
+bazel_dep(name = "libxev", version = "20241208.0-db6a52b")
 bazel_dep(name = "llvm-raw", version = "20250102.0-f739aa4")

 llvm = use_extension("@llvm-raw//utils/bazel:extension.bzl", "llvm")
--- a/runtimes/cuda/BUILD.bazel
+++ b/runtimes/cuda/BUILD.bazel
@ -27,6 +27,8 @@ zig_library(
        "//runtimes:cuda.enabled": [
            ":libpjrt_cuda",
            "//async",
+            "//stdx",
+            "@rules_zig//zig/runfiles",
        ],
        "//conditions:default": [":empty"],
    }),
--- a/runtimes/cuda/cuda.bzl
+++ b/runtimes/cuda/cuda.bzl
@ -5,7 +5,7 @@ load("//bazel:http_deb_archive.bzl", "http_deb_archive")
 ARCH = "linux-x86_64"

 CUDA_VERSION = "12.6.3"
-CUDNN_VERSION = "9.5.1"
+CUDNN_VERSION = "9.6.0"

 def _filegroup(name, srcs):
    return """\
--- a/runtimes/cuda/cuda.zig
+++ b/runtimes/cuda/cuda.zig
@ -1,7 +1,12 @@
-const builtin = @import("builtin");
+const std = @import("std");
+
 const asynk = @import("async");
-const pjrt = @import("pjrt");
+const bazel_builtin = @import("bazel_builtin");
+const builtin = @import("builtin");
 const c = @import("c");
+const pjrt = @import("pjrt");
+const runfiles = @import("runfiles");
+const stdx = @import("stdx");

 pub fn isEnabled() bool {
    return @hasDecl(c, "ZML_RUNTIME_CUDA");
@ -12,6 +17,23 @@ fn hasNvidiaDevice() bool {
    return true;
 }

+fn setupXlaGpuCudaDirFlag() !void {
+    var arena = std.heap.ArenaAllocator.init(std.heap.c_allocator);
+    defer arena.deinit();
+
+    var r_ = try runfiles.Runfiles.create(.{ .allocator = arena.allocator() }) orelse {
+        stdx.debug.panic("Unable to find CUDA directory", .{});
+    };
+
+    const source_repo = bazel_builtin.current_repository;
+    const r = r_.withSourceRepo(source_repo);
+    const cuda_data_dir = (try r.rlocationAlloc(arena.allocator(), "libpjrt_cuda/sandbox")).?;
+    const xla_flags = std.process.getEnvVarOwned(arena.allocator(), "XLA_FLAGS") catch "";
+    const new_xla_flagsZ = try std.fmt.allocPrintZ(arena.allocator(), "--xla_gpu_cuda_data_dir={s} {s}", .{ cuda_data_dir, xla_flags });
+
+    _ = c.setenv("XLA_FLAGS", new_xla_flagsZ, 1);
+}
+
 pub fn load() !*const pjrt.Api {
    if (comptime !isEnabled()) {
        return error.Unavailable;
@ -23,5 +45,9 @@ pub fn load() !*const pjrt.Api {
        return error.Unavailable;
    }

+    // CUDA path has to be set _before_ loading the PJRT plugin.
+    // See https://github.com/openxla/xla/issues/21428
+    try setupXlaGpuCudaDirFlag();
+
    return try pjrt.Api.loadFrom("libpjrt_cuda.so");
 }
--- a/runtimes/cuda/cudnn.redistrib_9.5.1.json
+++ b/runtimes/cuda/cudnn.redistrib_9.5.1.json
@ -1,77 +0,0 @@
-{
-    "release_date": "2024-10-25",
-    "release_label": "9.5.1",
-    "release_product": "cudnn",
-    "cudnn": {
-        "name": "NVIDIA CUDA Deep Neural Network library",
-        "license": "cudnn",
-        "license_path": "cudnn/LICENSE.txt",
-        "version": "9.5.1.17",
-        "linux-x86_64": {
-            "cuda11": {
-                "relative_path": "cudnn/linux-x86_64/cudnn-linux-x86_64-9.5.1.17_cuda11-archive.tar.xz",
-                "sha256": "b1f5050cd2bfd7fa9d3d0dd00d417cc2124692d8421295e12f841be6c8e3a426",
-                "md5": "5da3b0533fcd3d6a9020d08f3b78ddba",
-                "size": "736935276"
-            },
-            "cuda12": {
-                "relative_path": "cudnn/linux-x86_64/cudnn-linux-x86_64-9.5.1.17_cuda12-archive.tar.xz",
-                "sha256": "35dd20b9c68324ae1288ac36f66ab1f318d2bfecfafb703a82617aa283272be4",
-                "md5": "a8604f6b80f42ec60e98ba9c8f681572",
-                "size": "744697316"
-            }
-        },
-        "cuda_variant": [
-            "11",
-            "12"
-        ],
-        "linux-sbsa": {
-            "cuda11": {
-                "relative_path": "cudnn/linux-sbsa/cudnn-linux-sbsa-9.5.1.17_cuda11-archive.tar.xz",
-                "sha256": "ad68d12ee351b5f3478078fc8188eefb8712721c3e501c9345ec5ffb0b85fae8",
-                "md5": "a9438457a47b2bca7951a19736e8d4e8",
-                "size": "735387008"
-            },
-            "cuda12": {
-                "relative_path": "cudnn/linux-sbsa/cudnn-linux-sbsa-9.5.1.17_cuda12-archive.tar.xz",
-                "sha256": "340c49b32c133b0321c5c5b00d14fb64887dcac83ee8fd24195d9191061f1ad7",
-                "md5": "83c9f3f9eddadd0c1941d7f3e763174c",
-                "size": "743147752"
-            }
-        },
-        "windows-x86_64": {
-            "cuda11": {
-                "relative_path": "cudnn/windows-x86_64/cudnn-windows-x86_64-9.5.1.17_cuda11-archive.zip",
-                "sha256": "8318e93ab017af2356d3b6cf35aab2238e2a51c426450842eb4ade12e4619bbb",
-                "md5": "b7c456ddab820ec335a724be7a969091",
-                "size": "554195447"
-            },
-            "cuda12": {
-                "relative_path": "cudnn/windows-x86_64/cudnn-windows-x86_64-9.5.1.17_cuda12-archive.zip",
-                "sha256": "3a4cecc8b6d6aa7f6777620e6f2c129b76be635357c4506f2c4ccdbe0e2a1641",
-                "md5": "fda9196a60fb8e2b4c78e8a19ff056a3",
-                "size": "557597538"
-            }
-        },
-        "linux-aarch64": {
-            "cuda12": {
-                "relative_path": "cudnn/linux-aarch64/cudnn-linux-aarch64-9.5.1.17_cuda12-archive.tar.xz",
-                "sha256": "0099b8e4081ac146f802e769cdd30d9e01a289ea0fd056e64e44297a13e1aa0c",
-                "md5": "9d20deeb313a05c442fbff036ca29581",
-                "size": "780854928"
-            }
-        }
-    },
-    "cudnn_samples": {
-        "name": "NVIDIA cuDNN samples",
-        "license": "cudnn",
-        "license_path": "cudnn_samples/LICENSE.txt",
-        "version": "9.5.1.17",
-        "source": {
-            "relative_path": "cudnn_samples/source/cudnn_samples-source-9.5.1.17-archive.tar.xz",
-            "sha256": "bb79dc528c6a3b2a019a60d4af13cb4cb3d56146b692b3f3badec3fd8bfc98e7",
-            "md5": "76fe86423261f1ae984b00b1de2e40f3",
-            "size": "1664836"
-        }
-    }
-}
--- a/runtimes/cuda/cudnn.redistrib_9.6.0.json
+++ b/runtimes/cuda/cudnn.redistrib_9.6.0.json
@ -0,0 +1,77 @@
+{
+    "release_date": "2024-12-02",
+    "release_label": "9.6.0",
+    "release_product": "cudnn",
+    "cudnn": {
+        "name": "NVIDIA CUDA Deep Neural Network library",
+        "license": "cudnn",
+        "license_path": "cudnn/LICENSE.txt",
+        "version": "9.6.0.74",
+        "linux-x86_64": {
+            "cuda11": {
+                "relative_path": "cudnn/linux-x86_64/cudnn-linux-x86_64-9.6.0.74_cuda11-archive.tar.xz",
+                "sha256": "9717b0022d4f5ea88ccd9796bef7ad1cc5d04b3bd53f690041767aabfb98d14d",
+                "md5": "aa68c3eda5ad616c0eead9f646d4885b",
+                "size": "655044868"
+            },
+            "cuda12": {
+                "relative_path": "cudnn/linux-x86_64/cudnn-linux-x86_64-9.6.0.74_cuda12-archive.tar.xz",
+                "sha256": "72595f0d17d952cf568c1d76e370a9c303bb08c2f80888a8cf33e316a65d46a8",
+                "md5": "16afe7a88f576525d45d9e00c4ac4277",
+                "size": "662367552"
+            }
+        },
+        "cuda_variant": [
+            "11",
+            "12"
+        ],
+        "linux-sbsa": {
+            "cuda11": {
+                "relative_path": "cudnn/linux-sbsa/cudnn-linux-sbsa-9.6.0.74_cuda11-archive.tar.xz",
+                "sha256": "5f7440fd8269f7a7986bff89dd6924e4145644a94958ab49f146b6f8c0230d46",
+                "md5": "b125b9988b0c71592561163e6c64d8c6",
+                "size": "653894704"
+            },
+            "cuda12": {
+                "relative_path": "cudnn/linux-sbsa/cudnn-linux-sbsa-9.6.0.74_cuda12-archive.tar.xz",
+                "sha256": "f71fb008833fa92f9eac02c0b786a21f5e383470235ddeb1eee98fe370148ace",
+                "md5": "9a35d77abda9f279b3fe32023d3b2e47",
+                "size": "661271556"
+            }
+        },
+        "windows-x86_64": {
+            "cuda11": {
+                "relative_path": "cudnn/windows-x86_64/cudnn-windows-x86_64-9.6.0.74_cuda11-archive.zip",
+                "sha256": "388cce5d31919ef41a4231d21ed3fb7b60609d0bd6baaf6173cd9583a00b25c9",
+                "md5": "6a8c3bea2bad7ca83f356a7832f3671f",
+                "size": "492545214"
+            },
+            "cuda12": {
+                "relative_path": "cudnn/windows-x86_64/cudnn-windows-x86_64-9.6.0.74_cuda12-archive.zip",
+                "sha256": "65ca0f2d77a46de1def35e289780b8d8729ef2fa39cf8dd0c8448e381dd2978c",
+                "md5": "b969339363b43cc80f6184929a6633fa",
+                "size": "495911494"
+            }
+        },
+        "linux-aarch64": {
+            "cuda12": {
+                "relative_path": "cudnn/linux-aarch64/cudnn-linux-aarch64-9.6.0.74_cuda12-archive.tar.xz",
+                "sha256": "6f907bf97731d30ffd55dcc53fe8aa666b583b2c0c6b20e88c7341f98bb0b594",
+                "md5": "751f516ce47fb0e504b878bfc97176e4",
+                "size": "766134984"
+            }
+        }
+    },
+    "cudnn_samples": {
+        "name": "NVIDIA cuDNN samples",
+        "license": "cudnn",
+        "license_path": "cudnn_samples/LICENSE.txt",
+        "version": "9.6.0.74",
+        "source": {
+            "relative_path": "cudnn_samples/source/cudnn_samples-source-9.6.0.74-archive.tar.xz",
+            "sha256": "2cad2fb38ef359a1956daf73b6b4c0faf826a865fcc3dc791437ca83863a6cb9",
+            "md5": "8e46375025bdd14ba55a695e12ee3694",
+            "size": "1667016"
+        }
+    }
+}
--- a/runtimes/rocm/BUILD.bazel
+++ b/runtimes/rocm/BUILD.bazel
@ -1,8 +1,23 @@
 load("@rules_zig//zig:defs.bzl", "zig_library")

-filegroup(
-    name = "zmlxrocm_srcs",
-    srcs = ["zmlxrocm.cc"],
+cc_library(
+    name = "zmlxrocm_lib",
+    srcs = ["zmlxrocm.c"],
+    linkopts = [
+        "-lc",
+        "-ldl",
+    ],
+)
+
+cc_shared_library(
+    name = "zmlxrocm_so",
+    shared_lib_name = "libzmlxrocm.so.0",
+    deps = [":zmlxrocm_lib"],
+)
+
+cc_import(
+    name = "zmlxrocm",
+    shared_library = ":zmlxrocm_so",
    visibility = ["@libpjrt_rocm//:__subpackages__"],
 )

@ -37,6 +52,8 @@ zig_library(
        "//runtimes:rocm.enabled": [
            ":libpjrt_rocm",
            "//async",
+            "//stdx",
+            "@rules_zig//zig/runfiles",
        ],
        "//conditions:default": [":empty"],
    }),
--- a/runtimes/rocm/gfx.bzl
+++ b/runtimes/rocm/gfx.bzl
@ -40,10 +40,8 @@ bytecode_select = rule(
    },
 )

-
 def if_gfx(gfx, value):
    return select({
        "@zml//runtimes/rocm:_{}".format(gfx): value,
        "//conditions:default": [],
    })
-
--- a/runtimes/rocm/libpjrt_rocm.BUILD.bazel
+++ b/runtimes/rocm/libpjrt_rocm.BUILD.bazel
@ -7,7 +7,7 @@ string_list_flag(
    build_setting_default = ["all"],
    visibility = [
        "@rocblas//:__subpackages__",
-        "@hipblaslt-dev//:__subpackages__",
+        "@hipblaslt//:__subpackages__",
    ],
 )

@ -38,7 +38,6 @@ cc_library(
        "-lc",
        "-ldl",
    ],
-    deps = ["@bazel_tools//tools/cpp/runfiles"],
 )

 cc_shared_library(
@ -58,18 +57,23 @@ cc_import(
        ":sandbox",
        "@rocblas//:runfiles",
    ] + select({
-        ":_hipblaslt": ["@hipblaslt-dev//:runfiles"],
+        ":_hipblaslt": ["@hipblaslt//:runfiles"],
        "//conditions:default": [],
    }),
    shared_library = "libpjrt_rocm.so",
    visibility = ["//visibility:public"],
    deps = [
-        ":zmlxrocm",
        "@comgr//:amd_comgr",
        "@hip-runtime-amd//:amdhip",
        "@hipblaslt",
        "@hsa-amd-aqlprofile//:hsa-amd-aqlprofile",
        "@hsa-rocr//:hsa-runtime",
+        "@libdrm-amdgpu",
+        "@libdrm",
+        "@libelf",
+        "@libnuma",
+        "@libtinfo",
+        "@libzstd",
        "@miopen-hip//:MIOpen",
        "@rccl",
        "@rocblas",
@ -77,12 +81,7 @@ cc_import(
        "@rocm-smi-lib//:rocm_smi",
        "@rocprofiler-register",
        "@roctracer",
-        "@libelf",
-        "@libdrm",
-        "@libnuma",
-        "@libzstd",
-        "@libdrm-amdgpu",
-        "@libtinfo",
        "@zlib1g",
+        "@zml//runtimes/rocm:zmlxrocm",
    ],
 )
--- a/runtimes/rocm/packages.amd64.txt
+++ b/runtimes/rocm/packages.amd64.txt
--- a/runtimes/rocm/rocm.bzl
+++ b/runtimes/rocm/rocm.bzl
@ -2,9 +2,9 @@ load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
 load("//bazel:dpkg.bzl", "dpkg")
 load("//bazel:http_deb_archive.bzl", "http_deb_archive")

-ROCM_VERSION = "6.2.2"
+ROCM_VERSION = "6.3.1"
 BASE_URL = "https://repo.radeon.com/rocm/apt/{}".format(ROCM_VERSION)
-STRIP_PREFIX = "opt/rocm-6.2.2"
+STRIP_PREFIX = "opt/rocm-6.3.1"

 def pkg_kwargs(pkg):
    return dict(
@ -127,9 +127,17 @@ cc_import(
 )

 bytecode_select(
-    name = "runfiles",
+    name = "bytecodes",
    bytecodes = glob(["lib/rocblas/library/*"]),
    enabled_gfx = "@libpjrt_rocm//:gfx",
+)
+
+filegroup(
+    name = "runfiles",
+    srcs = [
+        ":bytecodes",
+        "lib/rocblas/library/TensileManifest.txt",
+    ],
    visibility = ["@libpjrt_rocm//:__subpackages__"],
 )
 """,
@ -148,6 +156,7 @@ cc_import(
 """,
    "hipblaslt": """\
 load("@zml//bazel:cc_import.bzl", "cc_import")
+load("@zml//runtimes/rocm:gfx.bzl", "bytecode_select")
 cc_import(
    name = "hipblaslt",
    shared_library = "lib/libhipblaslt.so.0",
@ -157,9 +166,6 @@ cc_import(
    },
    visibility = ["@libpjrt_rocm//:__subpackages__"],
 )
-""",
-    "hipblaslt-dev": """\
-load("@zml//runtimes/rocm:gfx.bzl", "bytecode_select")

 bytecode_select(
    name = "bytecodes",
@ -173,8 +179,9 @@ bytecode_select(
 filegroup(
    name = "runfiles",
    srcs = [
-        "lib/hipblaslt/library/hipblasltExtOpLibrary.dat",
        ":bytecodes",
+        "lib/hipblaslt/library/hipblasltExtOpLibrary.dat",
+        "lib/hipblaslt/library/TensileManifest.txt",
    ],
    visibility = ["@libpjrt_rocm//:__subpackages__"],
 )
--- a/runtimes/rocm/rocm.zig
+++ b/runtimes/rocm/rocm.zig
@ -1,7 +1,12 @@
 const builtin = @import("builtin");
+const std = @import("std");
+
 const asynk = @import("async");
-const pjrt = @import("pjrt");
+const bazel_builtin = @import("bazel_builtin");
 const c = @import("c");
+const pjrt = @import("pjrt");
+const runfiles = @import("runfiles");
+const stdx = @import("stdx");

 pub fn isEnabled() bool {
    return @hasDecl(c, "ZML_RUNTIME_ROCM");
@ -14,6 +19,44 @@ fn hasRocmDevices() bool {
    return true;
 }

+fn setupRocmEnv() !void {
+    var arena = std.heap.ArenaAllocator.init(std.heap.c_allocator);
+    defer arena.deinit();
+
+    const paths = .{
+        .{ "HIPBLASLT_EXT_OP_LIBRARY_PATH", "hipblaslt/lib/hipblaslt/library/hipblasltExtOpLibrary.dat", false },
+        .{ "HIPBLASLT_TENSILE_LIBPATH", "hipblaslt/lib/hipblaslt/library/TensileManifest.txt", true },
+        .{ "ROCBLAS_TENSILE_LIBPATH", "rocblas/lib/rocblas/library/TensileManifest.txt", true },
+        .{ "ROCM_PATH", "libpjrt_rocm/sandbox", false },
+    };
+
+    const r = blk: {
+        var r_ = try runfiles.Runfiles.create(.{ .allocator = arena.allocator() }) orelse {
+            stdx.debug.panic("Unable to find Runfiles directory", .{});
+        };
+        const source_repo = bazel_builtin.current_repository;
+        break :blk r_.withSourceRepo(source_repo);
+    };
+
+    inline for (paths) |path| {
+        const name = path[0];
+        const rpath = path[1];
+        const dirname = path[2];
+
+        var real_path = r.rlocationAlloc(arena.allocator(), rpath) catch null orelse {
+            stdx.debug.panic("Unable to find " ++ name ++ " in " ++ bazel_builtin.current_repository, .{});
+        };
+
+        if (dirname) {
+            real_path = std.fs.path.dirname(real_path) orelse {
+                stdx.debug.panic("Unable to dirname on {s}", .{real_path});
+            };
+        }
+
+        _ = c.setenv(name, try arena.allocator().dupeZ(u8, real_path), 1);
+    }
+}
+
 pub fn load() !*const pjrt.Api {
    if (comptime !isEnabled()) {
        return error.Unavailable;
@ -25,5 +68,7 @@ pub fn load() !*const pjrt.Api {
        return error.Unavailable;
    }

+    try setupRocmEnv();
+
    return try pjrt.Api.loadFrom("libpjrt_rocm.so");
 }
--- a/runtimes/rocm/zmlxrocm.cc
+++ b/runtimes/rocm/zmlxrocm.cc
@ -1,39 +1,9 @@
 #include <dlfcn.h>
 #include <errno.h>
 #include <stdlib.h>
+#include <string.h>

-#include <fstream>
-#include <iostream>
-#include <string>
-
-#include "tools/cpp/runfiles/runfiles.h"
-
-static void setup_runfiles(int argc, char **argv) __attribute__((constructor))
-{
-    using bazel::tools::cpp::runfiles::Runfiles;
-    auto runfiles = std::unique_ptr<Runfiles>(Runfiles::Create(argv[0], BAZEL_CURRENT_REPOSITORY));
-
-    auto HIPBLASLT_EXT_OP_LIBRARY_PATH =
-        runfiles->Rlocation("hipblaslt-dev/lib/hipblaslt/library/hipblasltExtOpLibrary.dat");
-    if (HIPBLASLT_EXT_OP_LIBRARY_PATH != "")
-    {
-        setenv("HIPBLASLT_EXT_OP_LIBRARY_PATH", HIPBLASLT_EXT_OP_LIBRARY_PATH.c_str(), 1);
-    }
-
-    auto HIPBLASLT_TENSILE_LIBPATH = runfiles->Rlocation("hipblaslt-dev/lib/hipblaslt/library");
-    if (HIPBLASLT_TENSILE_LIBPATH != "")
-    {
-        setenv("HIPBLASLT_TENSILE_LIBPATH", HIPBLASLT_TENSILE_LIBPATH.c_str(), 1);
-    }
-
-    auto ROCBLAS_TENSILE_LIBPATH = runfiles->Rlocation("rocblas/lib/rocblas/library");
-    setenv("ROCBLAS_TENSILE_LIBPATH", ROCBLAS_TENSILE_LIBPATH.c_str(), 1);
-
-    auto ROCM_PATH = runfiles->Rlocation("libpjrt_rocm/sandbox");
-    setenv("ROCM_PATH", ROCM_PATH.c_str(), 1);
-}
-
-extern "C" void *zmlxrocm_dlopen(const char *filename, int flags) __attribute__((visibility("default")))
+void *zmlxrocm_dlopen(const char *filename, int flags) __attribute__((visibility("default")))
 {
    if (filename != NULL)
    {
--- a/third_party/modules/libxev/20241208.0-db6a52b/MODULE.bazel
+++ b/third_party/modules/libxev/20241208.0-db6a52b/MODULE.bazel
@ -0,0 +1,7 @@
+module(
+    name = "libxev",
+    version = "20241208.0-db6a52b",
+    compatibility_level = 1,
+)
+
+bazel_dep(name = "rules_zig", version = "20240904.0-010da15")
--- a/third_party/modules/libxev/20241208.0-db6a52b/overlay/BUILD.bazel
+++ b/third_party/modules/libxev/20241208.0-db6a52b/overlay/BUILD.bazel
@ -0,0 +1,13 @@
+load("@rules_zig//zig:defs.bzl", "zig_library")
+
+zig_library(
+    name = "xev",
+    srcs = glob([
+        "src/*.zig",
+        "src/backend/*.zig",
+        "src/linux/*.zig",
+        "src/watcher/*.zig",
+    ]),
+    main = "main2.zig",
+    visibility = ["//visibility:public"],
+)
--- a/third_party/modules/libxev/20241208.0-db6a52b/overlay/MODULE.bazel
+++ b/third_party/modules/libxev/20241208.0-db6a52b/overlay/MODULE.bazel
@ -0,0 +1,7 @@
+module(
+    name = "libxev",
+    version = "20241208.0-db6a52b",
+    compatibility_level = 1,
+)
+
+bazel_dep(name = "rules_zig", version = "20240904.0-010da15")
--- a/third_party/modules/libxev/20241208.0-db6a52b/overlay/main2.zig
+++ b/third_party/modules/libxev/20241208.0-db6a52b/overlay/main2.zig
@ -0,0 +1,22 @@
+const builtin = @import("builtin");
+const root = @import("root");
+
+const main = @import("src/main.zig");
+
+pub const ThreadPool = main.ThreadPool;
+pub const stream = main.stream;
+
+pub const Options = struct {
+    linux_backend: main.Backend = .epoll,
+};
+
+pub const options: Options = if (@hasDecl(root, "xev_options")) root.xev_options else .{};
+
+const default: main.Backend = switch (builtin.os.tag) {
+    .ios, .macos => .kqueue,
+    .linux => options.linux_backend,
+    .wasi => .wasi_poll,
+    .windows => .iocp,
+    else => @compileError("Unsupported OS"),
+};
+pub usingnamespace default.Api();
--- a/third_party/modules/libxev/20241208.0-db6a52b/patches/128.patch
+++ b/third_party/modules/libxev/20241208.0-db6a52b/patches/128.patch
@ -0,0 +1,159 @@
+From 0d1c2f8258072148459d3114b9ccaf43c02e0958 Mon Sep 17 00:00:00 2001
+From: Steeve Morin <steeve@zml.ai>
+Date: Tue, 19 Nov 2024 16:14:14 +0100
+Subject: [PATCH 1/2] backend/epoll: implement eventfd wakeup notification
+
+Tries to mimic what happens in backend/kqueue.
+
+Closes #4
+---
+ src/backend/epoll.zig | 42 ++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 42 insertions(+)
+
+diff --git a/src/backend/epoll.zig b/src/backend/epoll.zig
+index ae4ec7d..f44d326 100644
+--- a/src/backend/epoll.zig
+++ b/src/backend/epoll.zig
+@@ -21,6 +21,12 @@ pub const Loop = struct {
+ 
+     fd: posix.fd_t,
+ 
+    /// The eventfd that this epoll queue always has a filter for. Writing
+    /// an empty message to this eventfd can be used to wake up the loop
+    /// at any time. Waking up the loop via this eventfd won't trigger any
+    /// particular completion, it just forces tick to cycle.
+    eventfd: xev.Async,
+
+     /// The number of active completions. This DOES NOT include completions that
+     /// are queued in the submissions queue.
+     active: usize = 0,
+@@ -56,8 +62,12 @@ pub const Loop = struct {
+     } = .{},
+ 
+     pub fn init(options: xev.Options) !Loop {
+        var eventfd = try xev.Async.init();
+        errdefer eventfd.deinit();
+
+         var res: Loop = .{
+             .fd = try posix.epoll_create1(std.os.linux.EPOLL.CLOEXEC),
+            .eventfd = eventfd,
+             .thread_pool = options.thread_pool,
+             .thread_pool_completions = undefined,
+             .cached_now = undefined,
+@@ -68,6 +78,7 @@ pub const Loop = struct {
+ 
+     pub fn deinit(self: *Loop) void {
+         posix.close(self.fd);
+        self.eventfd.deinit();
+     }
+ 
+     /// Run the event loop. See RunMode documentation for details on modes.
+@@ -262,9 +273,26 @@ pub const Loop = struct {
+         // Initialize
+         if (!self.flags.init) {
+             self.flags.init = true;
+
+             if (self.thread_pool != null) {
+                 self.thread_pool_completions.init();
+             }
+
+            var ev: linux.epoll_event = .{
+                .events = linux.EPOLL.IN | linux.EPOLL.RDHUP,
+                .data = .{ .ptr = 0 },
+            };
+            posix.epoll_ctl(
+                self.fd,
+                linux.EPOLL.CTL_ADD,
+                self.eventfd.fd,
+                &ev,
+            ) catch |err| {
+                // We reset initialization because we can't do anything
+                // safely unless we get this mach port registered!
+                self.flags.init = false;
+                return err;
+            };
+         }
+ 
+         // Submit all the submissions. We copy the submission queue so that
+@@ -369,6 +397,10 @@ pub const Loop = struct {
+ 
+             // Process all our events and invoke their completion handlers
+             for (events[0..n]) |ev| {
+                // Zero data values are internal events that we do nothing
+                // on such as the eventfd wakeup.
+                if (ev.data.ptr == 0) continue;
+
+                 const c: *Completion = @ptrFromInt(@as(usize, @intCast(ev.data.ptr)));
+ 
+                 // We get the fd and mark this as in progress we can properly
+@@ -415,6 +447,7 @@ pub const Loop = struct {
+         const pool = self.thread_pool orelse return error.ThreadPoolRequired;
+ 
+         // Setup our completion state so that thread_perform can do stuff
+        c.task_loop = self;
+         c.task_completions = &self.thread_pool_completions;
+         c.task = .{ .callback = Loop.thread_perform };
+ 
+@@ -436,6 +469,14 @@ pub const Loop = struct {
+ 
+         // Add to our completion queue
+         c.task_completions.push(c);
+
+        // Wake up our main loop
+        c.task_loop.wakeup() catch {};
+    }
+
+    /// Sends an empty message to this loop's eventfd so that it wakes up.
+    fn wakeup(self: *Loop) !void {
+        try self.eventfd.notify();
+     }
+ 
+     fn start(self: *Loop, completion: *Completion) void {
+@@ -800,6 +841,7 @@ pub const Completion = struct {
+     /// reliable way to get access to the loop and shouldn't be used
+     /// except internally.
+     task: ThreadPool.Task = undefined,
+    task_loop: *Loop = undefined,
+     task_completions: *Loop.TaskCompletionQueue = undefined,
+     task_result: Result = undefined,
+ 
+
+From 38d4dbed71a732b0fc30c1181354ad9d53919402 Mon Sep 17 00:00:00 2001
+From: Corentin Godeau <corentin@zml.ai>
+Date: Tue, 14 Jan 2025 14:43:54 +0000
+Subject: [PATCH 2/2] backend/epoll: read the wakeup eventfd to avoid being
+ awaken again
+
+---
+ src/backend/epoll.zig | 11 +++++++----
+ 1 file changed, 7 insertions(+), 4 deletions(-)
+
+diff --git a/src/backend/epoll.zig b/src/backend/epoll.zig
+index f44d326..f84c687 100644
+--- a/src/backend/epoll.zig
+++ b/src/backend/epoll.zig
+@@ -280,7 +280,7 @@ pub const Loop = struct {
+ 
+             var ev: linux.epoll_event = .{
+                 .events = linux.EPOLL.IN | linux.EPOLL.RDHUP,
+-                .data = .{ .ptr = 0 },
+                .data = .{ .fd = self.eventfd.fd },
+             };
+             posix.epoll_ctl(
+                 self.fd,
+@@ -397,9 +397,12 @@ pub const Loop = struct {
+ 
+             // Process all our events and invoke their completion handlers
+             for (events[0..n]) |ev| {
+-                // Zero data values are internal events that we do nothing
+-                // on such as the eventfd wakeup.
+-                if (ev.data.ptr == 0) continue;
+                // Handle wakeup eventfd
+                if (ev.data.fd == self.eventfd.fd) {
+                    var buffer: u64 = undefined;
+                    _ = posix.read(self.eventfd.fd, std.mem.asBytes(&buffer)) catch {};
+                    continue;
+                }
+ 
+                 const c: *Completion = @ptrFromInt(@as(usize, @intCast(ev.data.ptr)));
+ 
--- a/third_party/modules/libxev/20241208.0-db6a52b/source.json
+++ b/third_party/modules/libxev/20241208.0-db6a52b/source.json
@ -0,0 +1,14 @@
+{
+    "strip_prefix": "libxev-db6a52bafadf00360e675fefa7926e8e6c0e9931",
+    "url": "https://github.com/zml/libxev/archive/db6a52bafadf00360e675fefa7926e8e6c0e9931.tar.gz",
+    "integrity": "sha256-4GT5wkfkZnIjNv20yDiWEzHAhbIiwHHJfS7A4u/LoNQ=",
+    "overlay": {
+        "MODULE.bazel": "",
+        "BUILD.bazel": "",
+        "main2.zig": ""
+    },
+    "patches": {
+        "128.patch": ""
+    },
+    "patch_strip": 1
+}
--- a/third_party/modules/libxev/metadata.json
+++ b/third_party/modules/libxev/metadata.json
@ -12,7 +12,9 @@
    ],
    "versions": [
        "20240825.0-dbe2291",
-        "20240910.0-a2d9b31"
+        "20240910.0-a2d9b31",
+        "20241119.0-6afcde9",
+        "20241208.0-db6a52b"
    ],
    "yanked_versions": {}
 }
--- a/zml/module.zig
+++ b/zml/module.zig
@ -2,7 +2,6 @@ const std = @import("std");

 const asynk = @import("async");
 const dialect = @import("mlir/dialects");
-const runfiles = @import("runfiles");
 const stdx = @import("stdx");
 const xla_pb = @import("//xla:xla_proto");

@ -902,10 +901,11 @@ fn compileModuleToPjrtExecutable(arena: std.mem.Allocator, platform: Platform, m
        }
    }
    switch (platform.target) {
-        .cuda => cuda_dir: {
-            // NVIDIA recommends to disable Triton GEMM on JAX:
+        .cuda => {
+            // NVIDIA recommends these settings
            // https://github.com/NVIDIA/JAX-Toolbox?tab=readme-ov-file#environment-variables
            setFlag(&options, "xla_gpu_enable_triton_gemm", false);
+            setFlag(&options, "xla_gpu_enable_latency_hiding_scheduler", true);
            //  setFlag(&options, "xla_gpu_enable_cudnn_fmha", true);
            //  setFlag(&options, "xla_gpu_fused_attention_use_cudnn_rng", true);
            //  setFlag(&options, "xla_gpu_enable_cudnn_layer_norm", true);
@ -913,17 +913,6 @@ fn compileModuleToPjrtExecutable(arena: std.mem.Allocator, platform: Platform, m
            //  setFlag(&options, "xla_gpu_enable_dynamic_slice_fusion", true);
            //  setFlag(&options, "xla_gpu_enable_while_loop_double_buffering", true);
            //  setFlag(&options, "xla_gpu_use_runtime_fusion", true);
-            //  setFlag(&options, "xla_gpu_enable_latency_hiding_scheduler", true);
-            var r_ = try runfiles.Runfiles.create(.{ .allocator = arena }) orelse {
-                log.warn("Bazel runfile not found !", .{});
-                break :cuda_dir;
-            };
-            defer r_.deinit(arena);
-            const source_repo = @import("bazel_builtin").current_repository;
-            const r = r_.withSourceRepo(source_repo);
-            const cuda_data_dir = (try r.rlocationAlloc(arena, "libpjrt_cuda/sandbox")).?;
-            log.info("xla_gpu_cuda_data_dir: {s}", .{cuda_data_dir});
-            setFlag(&options, "xla_gpu_cuda_data_dir", cuda_data_dir);
        },
        .rocm => {
            // Disable Triton GEMM on ROCM. For some reason it's much, much slower when