From 675cefaf2670bb2bc60d79ce9dd97b6ad78bceaf Mon Sep 17 00:00:00 2001
From: Tarry Singh <tarry.singh@deepkapha.com>
Date: Mon, 20 Oct 2025 10:24:27 +0000
Subject: [PATCH] Patch http_deb_archive rule and improve stdx utilities and
 ZML callback handling

---
 bazel/http_deb_archive.bzl |  1 +
 stdx/stdx.zig              | 10 ++++++++++
 zml/callback.zig           |  8 +++++++-
 3 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/bazel/http_deb_archive.bzl b/bazel/http_deb_archive.bzl
index f847e6f..91b52f5 100644
--- a/bazel/http_deb_archive.bzl
+++ b/bazel/http_deb_archive.bzl
@@ -39,6 +39,7 @@ http_deb_archive = repository_rule(
         "strip_prefix": attr.string(),
         "build_file": attr.label(allow_single_file = True),
         "build_file_content": attr.string(),
+        "patches": attr.label_list(allow_files = True),
         "workspace_file": attr.label(allow_single_file = True),
         "workspace_file_content": attr.string(),
     },
diff --git a/stdx/stdx.zig b/stdx/stdx.zig
index 4698349..3ec8b8b 100644
--- a/stdx/stdx.zig
+++ b/stdx/stdx.zig
@@ -24,3 +24,13 @@ pub inline fn stackSlice(comptime max_len: usize, T: type, len: usize) []T {
 }
 
 pub const noalloc: std.mem.Allocator = if (builtin.mode == .ReleaseFast) undefined else std.testing.failing_allocator;
+
+pub fn pinToCore(core_id: usize) void {
+    if (builtin.os.tag == .linux) {
+        const CPUSet = std.bit_set.ArrayBitSet(usize, std.os.linux.CPU_SETSIZE * @sizeOf(usize));
+
+        var set: CPUSet = .initEmpty();
+        set.set(core_id);
+        std.os.linux.sched_setaffinity(0, @ptrCast(&set.masks)) catch {};
+    }
+}
diff --git a/zml/callback.zig b/zml/callback.zig
index 5af96d1..ea7a4b2 100644
--- a/zml/callback.zig
+++ b/zml/callback.zig
@@ -120,7 +120,13 @@ pub fn call(
 pub const Config = struct {
     output_operand_aliases: []const i64 = &.{},
     copy_inputs_to_host_pinned: bool = false,
-    // TODO: document precisely what `command_buffer_compatible` is doing and its limitations.
+
+    /// Indicates that the handler is compatible with command buffers (ie Cuda graphs).
+    /// The Cuda backend traces the execution of the callback and records it as a Cuda graph.
+    /// To be compatible with CUDA graphs, the callback need:
+    ///   1. to not use any synchronous operations
+    ///   2. to not allocate any new host/device memory
+    /// See: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#prohibited-and-unhandled-operations.
     traits: pjrt.ffi.HandlerTraits = .{ .command_buffer_compatible = false },
     // TODO: handle sharded inputs
     has_side_effect: bool = true,