Implement buffer‑ID based loading by moving tensor ID handling into BufferStore, fix zml.call tag hashing, and expose CPU device count.

This commit is contained in:
Tarry Singh 2025-08-28 14:39:21 +00:00
parent 6e7617918d
commit 7913c00d70
13 changed files with 272 additions and 131 deletions

View File

@ -1,5 +1,6 @@
const std = @import("std"); const std = @import("std");
/// Properly format a slice of numbers.
pub fn slice(any_slice: anytype) FmtSlice(std.meta.Elem(@TypeOf(any_slice))) { pub fn slice(any_slice: anytype) FmtSlice(std.meta.Elem(@TypeOf(any_slice))) {
return .{ .slice = any_slice }; return .{ .slice = any_slice };
} }
@ -115,3 +116,57 @@ pub fn formatComplexSlice(values: anytype, spec: std.fmt.Number, writer: *std.Io
pub fn formatBoolSlice(values: anytype, spec: std.fmt.Number, writer: *std.Io.Writer) !void { pub fn formatBoolSlice(values: anytype, spec: std.fmt.Number, writer: *std.Io.Writer) !void {
return try formatSliceCustom(formatBool, values, spec, writer); return try formatSliceCustom(formatBool, values, spec, writer);
} }
/// Format a struct using `format` method of subfields when possible.
/// Returns a formatter wrapper meant to be passed as a `{f}` argument,
/// e.g. `log.info("{f}", .{fmt.any(value)})`.
pub fn any(any_val: anytype) FmtAny(@TypeOf(any_val)) {
return .{ .data = any_val };
}
/// Formatter wrapper type returned by `any`.
/// Its `format` method delegates to `printValue`, capping recursion at the
/// global `std.options.fmt_max_depth`.
fn FmtAny(Data: type) type {
return struct {
data: Data,
pub inline fn format(self: @This(), writer: *std.io.Writer) std.io.Writer.Error!void {
try printValue(writer, .{}, self.data, std.options.fmt_max_depth);
}
};
}
/// Fix up of std.io.Writer.printValue that uses `format` method of subfields when possible.
/// Recurses into struct fields so any field exposing a `format` (or
/// `formatNumber`) method is rendered through it; everything else falls back
/// to the stock `Writer.printValue`.
fn printValue(
w: *std.io.Writer,
options: std.fmt.Options,
value: anytype,
max_depth: usize,
) std.io.Writer.Error!void {
const T = @TypeOf(value);
// Prefer the type's own formatting methods when available.
if (std.meta.hasMethod(T, "format")) {
return try value.format(w);
}
if (std.meta.hasMethod(T, "formatNumber")) {
return try value.formatNumber(w, options.toNumber(.decimal, .lower));
}
// Depth guard: elide deeper nesting instead of recursing forever.
if (max_depth == 0) {
try w.writeAll(".{ ... }");
return;
}
switch (@typeInfo(T)) {
.@"struct" => |info| {
try w.writeAll(".{ ");
inline for (info.fields, 0..) |f, i| {
if (i > 0) try w.writeAll(", ");
// Tuples have positional fields: skip the `.name = ` prefix.
if (!info.is_tuple) {
try w.writeByte('.');
try w.writeAll(f.name);
try w.writeAll(" = ");
}
try printValue(w, options, @field(value, f.name), max_depth - 1);
}
try w.writeAll(" }");
},
// Non-struct values: delegate to the stock implementation.
inline else => try w.printValue("any", options, value, max_depth - 1),
}
}

View File

@ -1,3 +1,6 @@
const std = @import("std");
const builtin = @import("builtin");
pub const BoundedArray = @import("bounded_array.zig").BoundedArray; pub const BoundedArray = @import("bounded_array.zig").BoundedArray;
pub const BoundedArrayAligned = @import("bounded_array.zig").BoundedArrayAligned; pub const BoundedArrayAligned = @import("bounded_array.zig").BoundedArrayAligned;
pub const debug = @import("debug.zig"); pub const debug = @import("debug.zig");
@ -11,7 +14,6 @@ pub const queue = @import("queue.zig");
pub const time = @import("time.zig"); pub const time = @import("time.zig");
test { test {
const std = @import("std");
std.testing.refAllDecls(@This()); std.testing.refAllDecls(@This());
} }
@ -20,3 +22,5 @@ pub inline fn stackSlice(comptime max_len: usize, T: type, len: usize) []T {
var storage: [max_len]T = undefined; var storage: [max_len]T = undefined;
return storage[0..len]; return storage[0..len];
} }
/// Allocator for call sites that must not allocate: in debug/test builds any
/// allocation fails loudly via `std.testing.failing_allocator`; in ReleaseFast
/// it is `undefined`. NOTE(review): assumes ReleaseFast code paths never
/// actually allocate through it — confirm at call sites.
pub const noalloc: std.mem.Allocator = if (builtin.mode == .ReleaseFast) undefined else std.testing.failing_allocator;

View File

@ -43,8 +43,8 @@ pub fn detectFormatAndOpen(allocator: std.mem.Allocator, model_path: []const u8)
/// whose shape is read from the "a.b" tensor. /// whose shape is read from the "a.b" tensor.
/// * If `Model` contains a list of layers, then the field: /// * If `Model` contains a list of layers, then the field:
/// `Model.layers[2].a.b` will be populated from the "layers.2.a.b" tensor. /// `Model.layers[2].a.b` will be populated from the "layers.2.a.b" tensor.
pub fn populateModel(comptime Model: type, allocator: std.mem.Allocator, buffer_store: BufferStore) !Model { pub fn populateModel(comptime Model: type, allocator: std.mem.Allocator, store: BufferStore) !Model {
return populateModelWithPrefix(Model, allocator, buffer_store, ""); return populateModelWithPrefix(Model, allocator, store, "");
} }
/// Creates a Model struct with tensor shapes read from the given TensorStore, /// Creates a Model struct with tensor shapes read from the given TensorStore,
@ -62,8 +62,7 @@ pub fn populateModelWithPrefix(comptime Model: type, allocator: std.mem.Allocato
try prefix_builder.push(allocator, prefix); try prefix_builder.push(allocator, prefix);
defer prefix_builder.deinit(allocator); defer prefix_builder.deinit(allocator);
const unique_id = zml.Tensor._reserveIdRange(@intCast(store.buffers.count())); const ok = _populateStruct(allocator, &prefix_builder, store, &model, true) catch |err| {
const ok = _populateStruct(allocator, &prefix_builder, unique_id, store, &model, true) catch |err| {
std.debug.panic("Can't populate model of type {s}: {s}", .{ @typeName(type), @errorName(err) }); std.debug.panic("Can't populate model of type {s}: {s}", .{ @typeName(type), @errorName(err) });
}; };
if (!ok) return error.TensorNotFound; if (!ok) return error.TensorNotFound;
@ -74,17 +73,28 @@ pub fn populateModelWithPrefix(comptime Model: type, allocator: std.mem.Allocato
pub const BufferStore = struct { pub const BufferStore = struct {
pub const Buffers = std.StringArrayHashMapUnmanaged(HostBuffer); pub const Buffers = std.StringArrayHashMapUnmanaged(HostBuffer);
pub const Metadatas = std.StringArrayHashMapUnmanaged(Metadata); pub const Metadatas = std.StringArrayHashMapUnmanaged(Metadata);
var _unique_store_id: std.atomic.Value(u64) = .init(0);
const _store_id_range: u64 = 1024 * 1024 * 1024;
arena: std.heap.ArenaAllocator, arena: std.heap.ArenaAllocator,
files: []MemoryMappedFile = &.{}, files: []MemoryMappedFile = &.{},
buffers: Buffers = .{}, buffers: Buffers = .{},
_metadata: Metadatas = .{}, _metadata: Metadatas = .{},
_unique_id: u64,
/// Create an empty BufferStore. Takes owneship of the given files. /// Create an empty BufferStore.
pub fn init(allocator: std.mem.Allocator, files: []const MemoryMappedFile) error{OutOfMemory}!BufferStore { /// Takes owneship of the given files.
var self: zml.aio.BufferStore = .{ pub fn init(allocator: std.mem.Allocator) BufferStore {
return .{
.arena = std.heap.ArenaAllocator.init(allocator), .arena = std.heap.ArenaAllocator.init(allocator),
._unique_id = _unique_store_id.fetchAdd(_store_id_range, .monotonic),
}; };
}
/// Create an empty BufferStore.
/// Takes ownership of the given files.
pub fn initWithFiles(allocator: std.mem.Allocator, files: []const MemoryMappedFile) error{OutOfMemory}!BufferStore {
var self: BufferStore = .init(allocator);
self.files = try self.arena.allocator().dupe(MemoryMappedFile, files); self.files = try self.arena.allocator().dupe(MemoryMappedFile, files);
return self; return self;
} }
@ -100,6 +110,61 @@ pub const BufferStore = struct {
return self.buffers.get(key); return self.buffers.get(key);
} }
/// Loads onto `platform` the host buffer backing the tensor `x`.
/// `x` must have been created by `getTensor`/`getTensorOrNull` on this exact
/// store object: its `.buffer_id` encodes this store's id base plus the
/// buffer's index in `buffers`.
pub fn loadBufferById(self: BufferStore, x: zml.Tensor, platform: zml.Platform) !zml.Buffer {
    var host_buffer: zml.HostBuffer = switch (x._id) {
        .buffer_id => |id| hb: {
            // Ids handed out by `getTensor` are `_unique_id + entry_idx` with
            // `entry_idx < buffers.count()`, so validate against the actual
            // number of buffers rather than the whole reserved id range.
            // This also rejects ids that would index past `values()`.
            if (id < self._unique_id or self._unique_id + self.buffers.count() <= id) {
                @panic("`store.loadBufferById()` only works on Tensor created by `store.getTensor()`, using the same store object.");
            }
            break :hb self.buffers.values()[id - self._unique_id];
        },
        else => @panic("`store.loadBufferById()` only works on Tensor created by `store.getTensor()`"),
    };
    // Use the sharding information stored in the tensor.
    host_buffer._shape = x.shape();
    return try host_buffer.toDevice(platform);
}
/// Creates a bufferized version of a model from the given BufferStore.
///
/// This will represent the weights of the model, loaded on a specific platform.
/// It can be used with a `module.Exe` (a compiled version of the same Model), to make a
/// `module.ExeWithWeights` ready to be called.
///
/// Every Tensor inside `model` must come from `getTensor` on this same store
/// (see `loadBufferById`); a failed device transfer panics rather than
/// returning an error, because the mapping callback cannot propagate errors.
pub fn loadModelById(self: BufferStore, Model: type, allocator: std.mem.Allocator, model: Model, platform: zml.Platform) !zml.Bufferized(Model) {
const Ctx = struct {
platform: *const zml.Platform,
store: *const BufferStore,
// mapAlloc callback: maps each Tensor of `model` to a device Buffer.
pub fn cb(ctx: @This(), x: zml.Tensor) zml.Buffer {
return ctx.store.loadBufferById(x, ctx.platform.*) catch @panic("Failed to load buffer to device");
}
};
var res: zml.Bufferized(Model) = undefined;
try zml.meta.mapAlloc(Ctx.cb, allocator, .{ .platform = &platform, .store = &self }, model, &res);
return res;
}
/// Like `getTensorOrNull`, but panics when `key` is missing, after logging
/// similarly-named buffer keys to help diagnose typos.
pub fn getTensor(self: BufferStore, key: []const u8) zml.Tensor {
return self.getTensorOrNull(key) orelse {
log.err("Tensor not found: {s}", .{key});
// Best-effort suggestions before aborting; the allocator choice only
// matters for the short-lived dedup set.
self.findSimilarBufferKeys(std.heap.smp_allocator, key);
@panic("Tensor not found");
};
}
/// Returns a Tensor whose shape matches the host buffer stored under `key`,
/// or null if the key is absent. The tensor id encodes this store's id base
/// plus the buffer index, so `loadBufferById` can recover the backing data.
pub fn getTensorOrNull(self: BufferStore, key: []const u8) ?zml.Tensor {
    // `orelse return null` replaces the previous `return if (...) else return null`,
    // which nested a redundant `return` inside the returned expression.
    const entry_idx = self.buffers.getIndex(key) orelse return null;
    return .{
        ._shape = self.buffers.values()[entry_idx].shape(),
        ._id = .{ .buffer_id = self._unique_id + entry_idx },
        ._donation = .input_buffer,
    };
}
/// Count layers starting with the given prefix. /// Count layers starting with the given prefix.
pub fn countLayers(self: BufferStore, prefix: []const u8) usize { pub fn countLayers(self: BufferStore, prefix: []const u8) usize {
// Note: This is kinda inefficient // Note: This is kinda inefficient
@ -140,6 +205,58 @@ pub const BufferStore = struct {
return null; return null;
} }
/// Assists in debugging `BufferNotFound` errors.
/// This is useful when a buffer key is not found and you want to identify
/// possible alternatives (or typos). Purely diagnostic: logs up to a few
/// candidate keys, never fails (allocation errors in the dedup set are
/// swallowed via `catch continue`).
pub fn findSimilarBufferKeys(store: BufferStore, tmp_alloc: std.mem.Allocator, original_key: []const u8) void {
    // Strip a trailing ".weight"/".bias" so sibling parameters of the same
    // layer are suggested too.
    // Bug fix: the list previously started with the empty suffix "", which
    // `endsWith` always matches, so the stripping was dead code. The
    // "no suffix" fallback is now the implicit default of `base_key`.
    const suffixes = [_][]const u8{ ".weight", ".bias" };
    var base_key = original_key;
    for (suffixes) |suffix| {
        if (std.mem.endsWith(u8, original_key, suffix)) {
            base_key = original_key[0 .. original_key.len - suffix.len];
            break;
        }
    }

    // Tracks keys already logged so the two passes don't repeat them.
    var shown_keys = std.StringHashMap(void).init(tmp_alloc);
    defer shown_keys.deinit();

    // first test: look for keys sharing the stripped prefix
    var matches: usize = 0;
    var it = store.buffers.iterator();
    while (it.next()) |entry| {
        const key = entry.key_ptr.*;
        if (std.mem.startsWith(u8, key, base_key)) {
            if (matches == 0) log.warn("Similar buffers found:", .{});
            if (!shown_keys.contains(key)) {
                log.warn(" - {s}: {f}", .{ key, entry.value_ptr.*.shape() });
                shown_keys.put(key, {}) catch continue;
                matches += 1;
            }
        }
    }

    // second test: progressive partial matches on each dotted component,
    // stopping at the first component that yields anything (capped at 5).
    if (matches == 0) {
        var components = std.mem.splitScalar(u8, base_key, '.');
        while (components.next()) |component| {
            matches = 0;
            it = store.buffers.iterator();
            while (it.next()) |entry| {
                const key = entry.key_ptr.*;
                if (std.mem.indexOf(u8, key, component) != null and !shown_keys.contains(key)) {
                    if (matches == 0) log.warn("Partial matches for '{s}':", .{component});
                    log.warn(" - {s}: {f}", .{ key, entry.value_ptr.*.shape() });
                    shown_keys.put(key, {}) catch continue;
                    matches += 1;
                    if (matches >= 5) break;
                }
            }
            if (matches > 0) break;
        }
    }
}
}; };
pub const Metadata = union(enum) { pub const Metadata = union(enum) {
@ -260,17 +377,35 @@ pub const MemoryMappedFile = struct {
/// Helper handling prefix building. /// Helper handling prefix building.
/// ///
/// This allows to easily push/pop prefixes and handles the generation of the string with the correct format. /// This allows to easily push/pop prefixes and handles the generation of the string with the correct format.
const PrefixBuilder = struct { pub const PrefixBuilder = struct {
/// Stores the computed prefix. /// Stores the computed prefix.
data: std.ArrayListUnmanaged(u8) = .{}, data: std.ArrayList(u8) = .{},
/// Stack storing the size of the intermediary prefix. /// Stack storing the size of the intermediary prefix.
subprefixes: std.ArrayListUnmanaged(u32) = .{}, subprefixes: std.ArrayList(u32) = .{},
/// Create a PrefixBuilder with `capacity` bytes pre-reserved for the prefix
/// string, so later `push` calls with a non-growing allocator (see `concat`)
/// are unlikely to need growth.
pub fn initCapacity(allocator: std.mem.Allocator, capacity: usize) !PrefixBuilder {
return .{
.data = try .initCapacity(allocator, capacity),
// One u32 stack slot per ~4 prefix bytes is a sizing heuristic.
.subprefixes = try .initCapacity(allocator, @divFloor(capacity, 4)),
};
}
pub fn deinit(self: *PrefixBuilder, allocator: std.mem.Allocator) void { pub fn deinit(self: *PrefixBuilder, allocator: std.mem.Allocator) void {
self.data.deinit(allocator); self.data.deinit(allocator);
self.subprefixes.deinit(allocator); self.subprefixes.deinit(allocator);
} }
/// The accumulated prefix as a string slice.
/// Points into internal storage: invalidated by any later push that grows.
pub fn items(self: PrefixBuilder) []const u8 {
return self.data.items;
}
/// Returns the current prefix extended with `prefix`, leaving the builder's
/// state unchanged, without allocating (`stdx.noalloc`).
/// NOTE(review): assumes enough capacity was pre-reserved (e.g. via
/// `initCapacity`); otherwise the push fails and hits `unreachable` — confirm.
/// The returned slice points into internal storage and is only valid until
/// the next push (`pop` shrinks lengths but retains capacity).
pub fn concat(self: *PrefixBuilder, prefix: []const u8) []const u8 {
self.push(stdx.noalloc, prefix) catch unreachable;
const res = self.items();
self.pop();
return res;
}
pub fn push(self: *PrefixBuilder, allocator: std.mem.Allocator, prefix: []const u8) !void { pub fn push(self: *PrefixBuilder, allocator: std.mem.Allocator, prefix: []const u8) !void {
const old_len: u32 = @intCast(self.data.items.len); const old_len: u32 = @intCast(self.data.items.len);
try self.subprefixes.append(allocator, old_len); try self.subprefixes.append(allocator, old_len);
@ -301,13 +436,20 @@ const PrefixBuilder = struct {
const last_prefix_len = self.subprefixes.pop() orelse unreachable; const last_prefix_len = self.subprefixes.pop() orelse unreachable;
self.data.shrinkRetainingCapacity(last_prefix_len); self.data.shrinkRetainingCapacity(last_prefix_len);
} }
/// Snapshot of the current state: { prefix string length, stack depth }.
pub fn checkpoint(self: PrefixBuilder) [2]usize {
return .{ self.data.items.len, self.subprefixes.items.len };
}
/// Rolls back to a previous `checkpoint`, undoing all pushes since then.
/// Only shrinks lengths; capacity (and thus memory) is retained.
pub fn restore(self: *PrefixBuilder, ckpt: [2]usize) void {
self.data.items.len, self.subprefixes.items.len = ckpt;
}
}; };
fn _populateStruct( fn _populateStruct(
allocator: std.mem.Allocator, allocator: std.mem.Allocator,
prefix_builder: *PrefixBuilder, prefix_builder: *PrefixBuilder,
unique_id: u64, store: BufferStore,
buffer_store: BufferStore,
obj: anytype, obj: anytype,
required: bool, required: bool,
) !bool { ) !bool {
@ -322,17 +464,12 @@ fn _populateStruct(
const prefix = prefix_builder.data.items; const prefix = prefix_builder.data.items;
if (T == zml.Tensor) { if (T == zml.Tensor) {
return if (buffer_store.buffers.getIndex(prefix)) |entry_idx| { return if (store.getTensorOrNull(prefix)) |tensor| {
const buffer = buffer_store.get(prefix).?; obj.* = tensor;
obj.* = zml.Tensor{
._shape = buffer.shape(),
._id = .{ .buffer_id = unique_id + entry_idx },
._donation = .input_buffer,
};
return true; return true;
} else { } else {
if (required) { if (required) {
log.err("Tensor not found: {s} ({d})", .{ prefix, buffer_store.buffers.count() }); log.err("Tensor not found: {s} ({d})", .{ prefix, store.buffers.count() });
} }
return false; return false;
}; };
@ -343,14 +480,14 @@ fn _populateStruct(
if (ptr_info.size == .slice) { if (ptr_info.size == .slice) {
obj.* = &.{}; obj.* = &.{};
const len = buffer_store.countLayers(prefix); const len = store.countLayers(prefix);
if (len > 0) { if (len > 0) {
obj.* = try allocator.alloc(ptr_info.child, len); obj.* = try allocator.alloc(ptr_info.child, len);
for (obj.*, 0..) |*value, i| { for (obj.*, 0..) |*value, i| {
try prefix_builder.pushDigit(allocator, i); try prefix_builder.pushDigit(allocator, i);
defer prefix_builder.pop(); defer prefix_builder.pop();
const found = try _populateStruct(allocator, prefix_builder, unique_id, buffer_store, value, required); const found = try _populateStruct(allocator, prefix_builder, store, value, required);
if (!found) { if (!found) {
log.err("Not able to load {s} as {s}", .{ prefix_builder.data.items, @typeName(ptr_info.child) }); log.err("Not able to load {s} as {s}", .{ prefix_builder.data.items, @typeName(ptr_info.child) });
return false; return false;
@ -369,7 +506,7 @@ fn _populateStruct(
for (obj, 0..) |*value, i| { for (obj, 0..) |*value, i| {
try prefix_builder.pushDigit(allocator, i); try prefix_builder.pushDigit(allocator, i);
defer prefix_builder.pop(); defer prefix_builder.pop();
const found = try _populateStruct(allocator, prefix_builder, unique_id, buffer_store, value, required); const found = try _populateStruct(allocator, prefix_builder, store, value, required);
if (!found) { if (!found) {
log.err("Not able to load {s} as {s}", .{ prefix_builder.data.items, @typeName(arr_info.child) }); log.err("Not able to load {s} as {s}", .{ prefix_builder.data.items, @typeName(arr_info.child) });
return false; return false;
@ -386,15 +523,20 @@ fn _populateStruct(
var has_default = false; var has_default = false;
if (field.default_value_ptr) |_| has_default = true; if (field.default_value_ptr) |_| has_default = true;
const field_found = try _populateStruct(allocator, prefix_builder, unique_id, buffer_store, &@field(obj, field.name), required and !has_default);
partial_struct = partial_struct or field_found; if (zml.meta.Contains(field.type, zml.Tensor)) {
if (!field_found) { const field_found = try _populateStruct(allocator, prefix_builder, store, &@field(obj, field.name), required and !has_default);
if (field.default_value_ptr) |v| { partial_struct = partial_struct or field_found;
@field(obj, field.name) = @as(*const field.type, @ptrCast(@alignCast(v))).*; if (!field_found) {
} else { if (field.default_value_ptr) |v| {
if (partial_struct) { @field(obj, field.name) = @as(*const field.type, @ptrCast(@alignCast(v))).*;
log.warn("Incomplete metadata '{0s}': {1s}. Missing field: '{2s}'. '{0s}' will be ignored.", .{ prefix, @typeName(T), field.name }); } else {
obj.* = undefined; if (partial_struct) {
log.warn("Incomplete struct '{0s}': {1s}. Missing field: '{2s}'. '{0s}' will be ignored.", .{ prefix, @typeName(T), field.name });
obj.* = undefined;
return false;
}
return false; return false;
} }
@ -406,7 +548,7 @@ fn _populateStruct(
}, },
.optional => |opt_info| { .optional => |opt_info| {
obj.* = @as(opt_info.child, undefined); obj.* = @as(opt_info.child, undefined);
const found = try _populateStruct(allocator, prefix_builder, unique_id, buffer_store, &(obj.*.?), false); const found = try _populateStruct(allocator, prefix_builder, store, &(obj.*.?), false);
if (!found) obj.* = null; if (!found) obj.* = null;
return true; return true;
}, },
@ -449,7 +591,7 @@ test populateModel {
var arena_state = std.heap.ArenaAllocator.init(std.testing.allocator); var arena_state = std.heap.ArenaAllocator.init(std.testing.allocator);
defer arena_state.deinit(); defer arena_state.deinit();
var store: BufferStore = .{ .arena = arena_state }; var store: BufferStore = .init(arena_state.allocator());
try store.buffers.ensureUnusedCapacity(arena_state.allocator(), 16); try store.buffers.ensureUnusedCapacity(arena_state.allocator(), 16);
store.buffers.putAssumeCapacity("a", Model._newHostBuffer(10)); store.buffers.putAssumeCapacity("a", Model._newHostBuffer(10));
store.buffers.putAssumeCapacity("b.a", Model._newHostBuffer(20)); store.buffers.putAssumeCapacity("b.a", Model._newHostBuffer(20));
@ -494,11 +636,11 @@ test populateModel {
pub fn loadBuffers( pub fn loadBuffers(
comptime Model: type, comptime Model: type,
init_args: if (@hasDecl(Model, "init")) stdx.meta.Tail(stdx.meta.FnArgs(Model.init)) else void, init_args: if (@hasDecl(Model, "init")) stdx.meta.Tail(stdx.meta.FnArgs(Model.init)) else void,
buffer_store: BufferStore, store: BufferStore,
allocator: std.mem.Allocator, allocator: std.mem.Allocator,
platform: zml.Platform, platform: zml.Platform,
) !zml.Bufferized(Model) { ) !zml.Bufferized(Model) {
return loadBuffersWithPrefix(Model, init_args, buffer_store, allocator, platform, ""); return loadBuffersWithPrefix(Model, init_args, store, allocator, platform, "");
} }
/// Creates a bufferized version of a Model from the given BufferStore with a specified prefix. /// Creates a bufferized version of a Model from the given BufferStore with a specified prefix.
@ -512,7 +654,7 @@ pub fn loadBuffers(
pub fn loadBuffersWithPrefix( pub fn loadBuffersWithPrefix(
comptime Model: type, comptime Model: type,
init_args: if (@hasDecl(Model, "init")) stdx.meta.Tail(stdx.meta.FnArgs(Model.init)) else void, init_args: if (@hasDecl(Model, "init")) stdx.meta.Tail(stdx.meta.FnArgs(Model.init)) else void,
buffer_store: BufferStore, store: BufferStore,
allocator: std.mem.Allocator, allocator: std.mem.Allocator,
platform: zml.Platform, platform: zml.Platform,
prefix: []const u8, prefix: []const u8,
@ -522,14 +664,14 @@ pub fn loadBuffersWithPrefix(
const arena = arena_state.allocator(); const arena = arena_state.allocator();
// Get model structure with tensor shapes from the buffer store with prefix // Get model structure with tensor shapes from the buffer store with prefix
var model: Model = try zml.aio.populateModelWithPrefix(Model, arena, buffer_store, prefix); var model: Model = try zml.aio.populateModelWithPrefix(Model, arena, store, prefix);
// If the Model has a "init" function, call it with the given parameters. // If the Model has a "init" function, call it with the given parameters.
if (@hasDecl(Model, "init")) { if (@hasDecl(Model, "init")) {
@call(.auto, Model.init, .{&model} ++ init_args); @call(.auto, Model.init, .{&model} ++ init_args);
} }
return loadModelBuffersWithPrefix(Model, model, buffer_store, allocator, platform, prefix); return loadModelBuffersWithPrefix(Model, model, store, allocator, platform, prefix);
} }
/// Creates a bufferized version of a Model from the given BufferStore. For details about /// Creates a bufferized version of a Model from the given BufferStore. For details about
@ -541,11 +683,11 @@ pub fn loadBuffersWithPrefix(
pub fn loadModelBuffers( pub fn loadModelBuffers(
comptime Model: type, comptime Model: type,
model: Model, model: Model,
buffer_store: BufferStore, store: BufferStore,
allocator: std.mem.Allocator, allocator: std.mem.Allocator,
platform: zml.Platform, platform: zml.Platform,
) !zml.Bufferized(Model) { ) !zml.Bufferized(Model) {
return try loadModelBuffersWithPrefix(Model, model, buffer_store, allocator, platform, ""); return try loadModelBuffersWithPrefix(Model, model, store, allocator, platform, "");
} }
/// Creates a bufferized version of a Model from the given BufferStore and the given prefix. /// Creates a bufferized version of a Model from the given BufferStore and the given prefix.
@ -557,7 +699,7 @@ pub fn loadModelBuffers(
pub fn loadModelBuffersWithPrefix( pub fn loadModelBuffersWithPrefix(
comptime Model: type, comptime Model: type,
model: Model, model: Model,
buffer_store: BufferStore, store: BufferStore,
allocator: std.mem.Allocator, allocator: std.mem.Allocator,
platform: zml.Platform, platform: zml.Platform,
prefix: []const u8, prefix: []const u8,
@ -576,7 +718,7 @@ pub fn loadModelBuffersWithPrefix(
try prefix_builder.push(allocator, prefix); try prefix_builder.push(allocator, prefix);
defer prefix_builder.deinit(allocator); defer prefix_builder.deinit(allocator);
try visitStructAndLoadBuffer(allocator, &prefix_builder, buffer_store, &res, platform); try visitStructAndLoadBuffer(allocator, &prefix_builder, store, &res, platform);
return res; return res;
} }
@ -590,58 +732,6 @@ pub fn unloadBuffers(model: anytype) void {
}).cb, {}, model); }).cb, {}, model);
} }
/// Assists in debuggigng `BufferNotFound` error
/// This is useful when a buffer key is not found and you want to identify possible alternatives (or typos)
fn findSimilarBufferKeys(original_key: []const u8, store: BufferStore, temp_allocator: std.mem.Allocator) void {
const suffixes = [_][]const u8{ "", ".weight", ".bias" };
var shown_keys = std.StringHashMap(void).init(temp_allocator);
defer shown_keys.deinit();
// remove suffix .weight and .bias
var base_key = original_key;
for (suffixes) |suffix| {
if (std.mem.endsWith(u8, original_key, suffix)) {
base_key = original_key[0 .. original_key.len - suffix.len];
break;
}
}
// first test: look for exact matches
var matches: usize = 0;
var it = store.buffers.iterator();
while (it.next()) |entry| {
const key = entry.key_ptr.*;
if (std.mem.startsWith(u8, key, base_key)) {
if (matches == 0) log.warn("Similar buffers found:", .{});
if (!shown_keys.contains(key)) {
log.warn(" - {s}: {f}", .{ key, entry.value_ptr.*.shape() });
shown_keys.put(key, {}) catch continue;
matches += 1;
}
}
}
// second test: progressive partial matches
if (matches == 0) {
var components = std.mem.splitScalar(u8, base_key, '.');
while (components.next()) |component| {
matches = 0;
it = store.buffers.iterator();
while (it.next()) |entry| {
const key = entry.key_ptr.*;
if (std.mem.indexOf(u8, key, component) != null and !shown_keys.contains(key)) {
if (matches == 0) log.warn("Partial matches for '{s}':", .{component});
log.warn(" - {s}: {f}", .{ key, entry.value_ptr.*.shape() });
shown_keys.put(key, {}) catch continue;
matches += 1;
if (matches >= 5) break;
}
}
if (matches > 0) break;
}
}
}
/// deinit all buffers in the given struct /// deinit all buffers in the given struct
pub fn awaitAll(buffers: anytype) !void { pub fn awaitAll(buffers: anytype) !void {
zml.meta.visit((struct { zml.meta.visit((struct {
@ -651,7 +741,7 @@ pub fn awaitAll(buffers: anytype) !void {
}).cb, {}, buffers); }).cb, {}, buffers);
} }
fn visitStructAndLoadBuffer(allocator: std.mem.Allocator, prefix_builder: *PrefixBuilder, buffer_store: BufferStore, obj: anytype, platform: zml.Platform) !void { fn visitStructAndLoadBuffer(allocator: std.mem.Allocator, prefix_builder: *PrefixBuilder, store: BufferStore, obj: anytype, platform: zml.Platform) !void {
const err_msg = "visitStructAndLoadBuffer must be called with a pointer to type. Received "; const err_msg = "visitStructAndLoadBuffer must be called with a pointer to type. Received ";
const type_info, const T = switch (@typeInfo(@TypeOf(obj))) { const type_info, const T = switch (@typeInfo(@TypeOf(obj))) {
.pointer => |ptr_info| switch (ptr_info.size) { .pointer => |ptr_info| switch (ptr_info.size) {
@ -663,7 +753,7 @@ fn visitStructAndLoadBuffer(allocator: std.mem.Allocator, prefix_builder: *Prefi
const prefix = prefix_builder.data.items; const prefix = prefix_builder.data.items;
if (T == zml.Buffer) { if (T == zml.Buffer) {
return if (buffer_store.get(prefix)) |host_buffer| { return if (store.get(prefix)) |host_buffer| {
// obj._shape has been set inside `loadModelBuffersWithPrefix`, before calling us. // obj._shape has been set inside `loadModelBuffersWithPrefix`, before calling us.
var buf_with_metadata = host_buffer; var buf_with_metadata = host_buffer;
log.debug("Loading buffer {s} ({f})", .{ prefix, obj._shape }); log.debug("Loading buffer {s} ({f})", .{ prefix, obj._shape });
@ -673,7 +763,7 @@ fn visitStructAndLoadBuffer(allocator: std.mem.Allocator, prefix_builder: *Prefi
} else { } else {
log.err("Buffer not found: {s}", .{prefix}); log.err("Buffer not found: {s}", .{prefix});
findSimilarBufferKeys(prefix, buffer_store, allocator); store.findSimilarBufferKeys(allocator, prefix);
return error.BufferNotFound; return error.BufferNotFound;
}; };
@ -686,7 +776,7 @@ fn visitStructAndLoadBuffer(allocator: std.mem.Allocator, prefix_builder: *Prefi
try prefix_builder.pushDigit(allocator, i); try prefix_builder.pushDigit(allocator, i);
defer prefix_builder.pop(); defer prefix_builder.pop();
try visitStructAndLoadBuffer(allocator, prefix_builder, buffer_store, value, platform); try visitStructAndLoadBuffer(allocator, prefix_builder, store, value, platform);
} }
} else stdx.debug.compileError("type not supported by visitStructAndLoadBuffer: {}", .{T}); } else stdx.debug.compileError("type not supported by visitStructAndLoadBuffer: {}", .{T});
}, },
@ -694,7 +784,7 @@ fn visitStructAndLoadBuffer(allocator: std.mem.Allocator, prefix_builder: *Prefi
for (obj, 0..) |*value, i| { for (obj, 0..) |*value, i| {
try prefix_builder.pushDigit(allocator, i); try prefix_builder.pushDigit(allocator, i);
defer prefix_builder.pop(); defer prefix_builder.pop();
try visitStructAndLoadBuffer(allocator, prefix_builder, buffer_store, value, platform); try visitStructAndLoadBuffer(allocator, prefix_builder, store, value, platform);
} }
}, },
@ -704,12 +794,12 @@ fn visitStructAndLoadBuffer(allocator: std.mem.Allocator, prefix_builder: *Prefi
try prefix_builder.push(allocator, field.name); try prefix_builder.push(allocator, field.name);
defer prefix_builder.pop(); defer prefix_builder.pop();
try visitStructAndLoadBuffer(allocator, prefix_builder, buffer_store, &@field(obj, field.name), platform); try visitStructAndLoadBuffer(allocator, prefix_builder, store, &@field(obj, field.name), platform);
} }
}, },
.optional => { .optional => {
if (obj.*) |*obj_val| { if (obj.*) |*obj_val| {
try visitStructAndLoadBuffer(allocator, prefix_builder, buffer_store, obj_val, platform); try visitStructAndLoadBuffer(allocator, prefix_builder, store, obj_val, platform);
} }
}, },
else => {}, else => {},

View File

@ -13,9 +13,7 @@ const StringBuilder = std.ArrayListUnmanaged(u8);
const log = std.log.scoped(.@"zml/io"); const log = std.log.scoped(.@"zml/io");
pub fn open(allocator: std.mem.Allocator, path: []const u8) !zml.aio.BufferStore { pub fn open(allocator: std.mem.Allocator, path: []const u8) !zml.aio.BufferStore {
var res: zml.aio.BufferStore = .{ var res: zml.aio.BufferStore = .init(allocator);
.arena = std.heap.ArenaAllocator.init(allocator),
};
errdefer res.arena.deinit(); errdefer res.arena.deinit();
const arena = res.arena.allocator(); const arena = res.arena.allocator();

View File

@ -25,9 +25,7 @@ const TinyLlamaConfig = extern struct {
/// For convenience we use the same layer names /// For convenience we use the same layer names
/// than the one used by the Llama-3.1 models. /// than the one used by the Llama-3.1 models.
pub fn open(allocator: std.mem.Allocator, model_path: []const u8) !zml.aio.BufferStore { pub fn open(allocator: std.mem.Allocator, model_path: []const u8) !zml.aio.BufferStore {
var res: zml.aio.BufferStore = .{ var res: zml.aio.BufferStore = .init(allocator);
.arena = std.heap.ArenaAllocator.init(allocator),
};
errdefer res.arena.deinit(); errdefer res.arena.deinit();
const arena = res.arena.allocator(); const arena = res.arena.allocator();

View File

@ -36,7 +36,7 @@ pub fn open(allocator: std.mem.Allocator, path: []const u8) !zml.aio.BufferStore
const py_values = try eval.evaluate(tmp_alloc, ops, true); const py_values = try eval.evaluate(tmp_alloc, ops, true);
// file ownership is transferred to the BufferStore // file ownership is transferred to the BufferStore
var res = try zml.aio.BufferStore.init(allocator, &.{torch_file.mmap_file}); var res = try zml.aio.BufferStore.initWithFiles(allocator, &.{mmap_file});
try torch_file.parseModel(py_values, &res); try torch_file.parseModel(py_values, &res);
return res; return res;
} }

View File

@ -480,7 +480,7 @@ test "Read pickle (zipped)" {
// torch.save({ "model": model, "tensor": tensor}, "simple.pt") // torch.save({ "model": model, "tensor": tensor}, "simple.pt")
const file = try asynk.File.open("zml/aio/torch/simple.pt", .{ .mode = .read_only }); const file = try asynk.File.open("zml/aio/torch/simple.pt", .{ .mode = .read_only });
const mmap_file = try zml.aio.MemoryMappedFile.init(file); const mmap_file = try zml.aio.MemoryMappedFile.init(file);
var store = try zml.aio.BufferStore.init(testing.allocator, &.{mmap_file}); var store = try zml.aio.BufferStore.initWithFiles(testing.allocator, &.{mmap_file});
defer store.deinit(); defer store.deinit();
{ {

View File

@ -77,7 +77,7 @@ pub fn compileModel(
) !FnExe(func) { ) !FnExe(func) {
const ModelT = ModuleSignature(func).ModelT; const ModelT = ModuleSignature(func).ModelT;
const name = @typeName(ModelT) ++ ".forward"; const name = @typeName(ModelT) ++ ".forward";
log.info("Compiling {s} with {}", .{ name, args_shapes }); log.info("Compiling {s} with {f}", .{ name, stdx.fmt.any(args_shapes) });
var context = try CompilationContext.init(allocator, name, platform); var context = try CompilationContext.init(allocator, name, platform);
defer context.deinit(); defer context.deinit();
@ -209,9 +209,9 @@ pub const BaseExe = struct {
var execute_context: ?*pjrt.ExecuteContext = null; var execute_context: ?*pjrt.ExecuteContext = null;
if (platform.pjrt_api.ffi()) |ffi| { if (platform.pjrt_api.ffi()) |ffi| {
log.info("Created context execution {*} for {*}", .{ execute_context, exe });
execute_context = try platform.pjrt_api.createExecuteContext(); execute_context = try platform.pjrt_api.createExecuteContext();
try callback.bindInternalCallbacks(allocator, platform, ffi, execute_context.?); try callback.bindInternalCallbacks(allocator, platform, ffi, execute_context.?);
// log.info("Created context execution {*} for {*}", .{ execute_context, exe });
} }
return .{ return .{

View File

@ -141,6 +141,7 @@ test MapType {
/// Any `To` struct inside `from` will be copied over to the target. /// Any `To` struct inside `from` will be copied over to the target.
pub fn mapAlloc(comptime cb: anytype, allocator: std.mem.Allocator, ctx: FnParam(cb, 0), from: anytype, to: anytype) !void { pub fn mapAlloc(comptime cb: anytype, allocator: std.mem.Allocator, ctx: FnParam(cb, 0), from: anytype, to: anytype) !void {
// TODO: handle tuple to slice conversion // TODO: handle tuple to slice conversion
// TODO: handle error bubbling up
const From = FnParam(cb, 1); const From = FnParam(cb, 1);
const To = stdx.meta.FnResult(cb); const To = stdx.meta.FnResult(cb);
const FromStruct = @TypeOf(from); const FromStruct = @TypeOf(from);

View File

@ -1129,7 +1129,7 @@ pub fn hashShape(hasher: *std.hash.Wyhash, shape: Shape) void {
hash(hasher, shape._dtype, .Shallow); hash(hasher, shape._dtype, .Shallow);
hash(hasher, shape._sharding_info, .Shallow); hash(hasher, shape._sharding_info, .Shallow);
for (shape.tags()) |tag| { for (shape.tags()) |tag| {
hash(hasher, @intFromPtr(tag), .Shallow); hashArray(hasher, std.mem.span(tag), .Shallow);
} }
} }

View File

@ -82,9 +82,7 @@ pub const Platform = struct {
}; };
const _CreateOptions = struct { const _CreateOptions = struct {
// XLA CPU client doesn't read options cpu: Cpu = .{ .device_count = 4 },
// https://github.com/openxla/xla/blob/42496a28c374bd35f493cc5dbde74805407245dc/xla/pjrt/c/pjrt_c_api_cpu_internal.cc#L33-L46
cpu: struct {} = .{},
// bump memory fraction from XLA defaults of 75% to 90%. // bump memory fraction from XLA defaults of 75% to 90%.
// Even on a 8GB GPU it should leave enough space for the Cuda driver // Even on a 8GB GPU it should leave enough space for the Cuda driver
@ -94,6 +92,14 @@ const _CreateOptions = struct {
tpu: struct {} = .{}, tpu: struct {} = .{},
neuron: struct {} = .{}, neuron: struct {} = .{},
pub const Cpu = struct {
device_count: u32,
fn writeNamedValues(self: Cpu, values: *std.ArrayListUnmanaged(pjrt.NamedValue)) void {
values.appendAssumeCapacity(pjrt.NamedValue.from("cpu_device_count", @as(i64, self.device_count)));
}
};
pub const Cuda = struct { pub const Cuda = struct {
allocator: Allocator = .{ .bfc = .{} }, allocator: Allocator = .{ .bfc = .{} },
// TODO support all of https://github.com/openxla/xla/blob/3d31c48c719d331d432132b3e0c2c5ce52650675/xla/pjrt/c/pjrt_c_api_gpu_internal.cc#L76-L86 // TODO support all of https://github.com/openxla/xla/blob/3d31c48c719d331d432132b3e0c2c5ce52650675/xla/pjrt/c/pjrt_c_api_gpu_internal.cc#L76-L86
@ -118,7 +124,7 @@ const _CreateOptions = struct {
}; };
}; };
pub fn writeNamedValues(self: Cuda, values: *std.ArrayListUnmanaged(pjrt.NamedValue)) void { fn writeNamedValues(self: Cuda, values: *std.ArrayListUnmanaged(pjrt.NamedValue)) void {
switch (self.allocator) { switch (self.allocator) {
.platform => { .platform => {
values.appendAssumeCapacity(pjrt.NamedValue.fromString("allocator", "platform")); values.appendAssumeCapacity(pjrt.NamedValue.fromString("allocator", "platform"));
@ -142,6 +148,7 @@ const _CreateOptions = struct {
var values = std.ArrayListUnmanaged(pjrt.NamedValue).fromOwnedSlice(out); var values = std.ArrayListUnmanaged(pjrt.NamedValue).fromOwnedSlice(out);
values.shrinkRetainingCapacity(0); values.shrinkRetainingCapacity(0);
switch (target) { switch (target) {
.cpu => self.cpu.writeNamedValues(&values),
.cuda => self.cuda.writeNamedValues(&values), .cuda => self.cuda.writeNamedValues(&values),
inline else => |t| { inline else => |t| {
stdx.debug.assertComptime(@hasField(_CreateOptions, @tagName(t)), "zml.platform.CreateOptions doesn't list target {s}", .{@tagName(t)}); stdx.debug.assertComptime(@hasField(_CreateOptions, @tagName(t)), "zml.platform.CreateOptions doesn't list target {s}", .{@tagName(t)});

View File

@ -269,18 +269,6 @@ pub const Tensor = struct {
return res; return res;
} }
var _global_tensor_counter: u64 = 0;
/// Internal use
pub fn _reserveIdRange(len: u32) u64 {
return @atomicRmw(u64, &_global_tensor_counter, .Add, len, .seq_cst);
}
/// Internal use
pub fn setUniqueId(self: *Tensor) void {
self._id = .{ .buffer_id = _reserveIdRange(1) };
}
/// Returns a Tensor containing the absolute value of each element of the input Tensor. /// Returns a Tensor containing the absolute value of each element of the input Tensor.
pub fn abs(self: Tensor) Tensor { pub fn abs(self: Tensor) Tensor {
const loc = self.getContext().mlirCtx().location(@src()); const loc = self.getContext().mlirCtx().location(@src());

View File

@ -298,7 +298,7 @@ test testLayer {
}; };
// create a buffer store containing the activations: // create a buffer store containing the activations:
var activations = try zml.aio.BufferStore.init(std.testing.allocator, &.{}); var activations = zml.aio.BufferStore.init(std.testing.allocator);
defer activations.deinit(); defer activations.deinit();
{ {
const input = zml.HostBuffer.fromArray(&[2]f32{ 1, -1 }); const input = zml.HostBuffer.fromArray(&[2]f32{ 1, -1 });