zml: clean up dead and commented-out code; note that `copySlice` is currently broken and pending reimplementation

2023-02-08 17:13:47 +00:00 · 2023-02-08 17:13:47 +00:00 · be6328813d
commit be6328813d
parent 058e1415fa
13 changed files with 4 additions and 834 deletions
--- a/mlir/dialects/stablehlo.zig
+++ b/mlir/dialects/stablehlo.zig
@ -288,79 +288,6 @@ fn elementTypeOrSelf(typ: mlir.Type) mlir.Type {
    } else typ;
 }
 /// Builds a `stablehlo.scatter` operation whose update computation is produced by `update_fn`.
 ///
 /// A block with `2 * inputs.len` arguments is opened; for every input, two rank-0 tensor
 /// arguments of that input's element type are declared (one in each half of the argument list).
 /// `update_fn` is then invoked, while the block is open, with `update_ctx`, `ctx`, the first half
 /// of the block arguments and the second half of the block arguments. Its returned operation is
 /// discarded here.
 ///
 /// The scratch buffers hold at most 32 block arguments, so `inputs.len` must not exceed 16.
 /// Failure to open the block is treated as unreachable.
 pub fn scatter(
    ctx: mlir.Context,
    // operands
    inputs: []const mlir.Value,
    scatter_indices: mlir.Value,
    updates: []const mlir.Value,
    // update computation; `update_ctx` is forwarded as the first argument of `update_fn`
    update_ctx: anytype,
    update_fn: fn (anytype, mlir.Context, []const mlir.Value, []const mlir.Value) mlir.Operation,
    // values for the op attributes
    args: struct {
        update_window_dims: []const i64,
        inserted_window_dims: []const i64,
        input_batching_dims: []const i64,
        scatter_indices_batching_dims: []const i64,
        scatter_dims_to_operand_dims: []const i64,
        index_vector_dim: i64,
        indices_are_sorted: bool = false,
        unique_indices: bool = false,
    },
    // location attached to the created op
    location: mlir.Location,
 ) mlir.Operation {
    // Fixed capacity of the stack buffers below (same scheme as `reduce`).
    // TODO(rene): where does this 32 come from?
    const max_block_args = 32;
    const n_inputs = inputs.len;
    // One argument per input in each half of the block signature.
    const n_block_args = 2 * n_inputs;

    const unknown_locs = [_]mlir.Location{mlir.Location.unknown(ctx)} ** max_block_args;
    const block_locs: []const mlir.Location = unknown_locs[0..n_block_args];

    var block_arg_types: [max_block_args]mlir.Type = undefined;
    for (inputs, 0..) |input, idx| {
        const elem_type = elementTypeOrSelf(input.getType());
        const scalar_type = mlir.RankedTensorType.init(&.{}, elem_type).as(mlir.Type).?;
        block_arg_types[idx] = scalar_type;
        block_arg_types[n_inputs + idx] = scalar_type;
    }

    var block = mlir.Block.open(block_arg_types[0..n_block_args], block_locs) catch unreachable;
    {
        defer block.close();

        // First half: arguments [0, n_inputs); second half: arguments [n_inputs, 2 * n_inputs).
        var block_args: [max_block_args]mlir.Value = undefined;
        for (0..n_inputs) |idx| {
            block_args[idx] = block.argument(idx);
            block_args[n_inputs + idx] = block.argument(n_inputs + idx);
        }
        _ = update_fn(update_ctx, ctx, block_args[0..n_inputs], block_args[n_inputs..n_block_args]);
    }

    const dimension_numbers = ScatterDimensionNumbersAttribute.init(
        ctx,
        args.update_window_dims,
        args.inserted_window_dims,
        args.input_batching_dims,
        args.scatter_indices_batching_dims,
        args.scatter_dims_to_operand_dims,
        args.index_vector_dim,
    ).as(mlir.Attribute).?;
    const sorted_attr = mlir.BoolAttribute.init(ctx, args.indices_are_sorted).as(mlir.Attribute).?;
    const unique_attr = mlir.BoolAttribute.init(ctx, args.unique_indices).as(mlir.Attribute).?;

    return mlir.Operation.make(ctx, "stablehlo.scatter", .{
        .variadic_operands = &.{ inputs, &.{scatter_indices}, updates },
        .block = block,
        .attributes = &.{
            .{ "scatter_dimension_numbers", dimension_numbers },
            .{ "indices_are_sorted", sorted_attr },
            .{ "unique_indices", unique_attr },
        },
        .result_type_inference = true,
        .location = location,
    });
 }
 pub fn iota(ctx: mlir.Context, dimension: i64, result_type: mlir.Type, location: mlir.Location) mlir.Operation {
    return mlir.Operation.make(ctx, "stablehlo.iota", .{
        .operands = &.{},
@ -439,66 +366,6 @@ pub fn reduce(
    });
 }
 /// Bundle of integer arrays describing a windowed reduction.
 /// NOTE(review): the only consumer visible here is the commented-out `reduce_window` builder,
 /// which forwards these values as attributes of a `stablehlo.reduce_window` op; the per-field
 /// notes below are derived from that disabled code — confirm before relying on them.
 pub const ReduceWindowOpts = struct {
    /// Forwarded as the `window_dimensions` i64 array attribute.
    window_dimensions: []const i64,
    /// Forwarded as the `window_strides` i64 array attribute.
    window_strides: []const i64,
    /// Forwarded as the `base_dilations` i64 array attribute.
    base_dilations: []const i64,
    /// Forwarded as the `window_dilations` i64 array attribute.
    window_dilations: []const i64,
    /// Raw element values of the `padding` attribute (reinterpreted as bytes by the builder).
    padding_values: []const i64,
    /// Dimensions of the i64 tensor type that `padding_values` is interpreted with.
    padding_shape: []const i64,
 };
 // pub fn reduce_window(
 //     ctx: mlir.Context,
 //     inputs: []const mlir.Value,
 //     init_values: []const mlir.Value,
 //     opts: ReduceWindowOpts,
 //     blkctx: anytype,
 //     blkfn: fn (anytype, mlir.Context, []const mlir.Value, []const mlir.Value) mlir.Operation,
 //     location: mlir.Location,
 // ) mlir.Operation {
 //     // TODO: move to ops.zig, and refactor similar to `reduce`
 //     const MaxBlockArguments = 32;
 //     const block_n_args = inputs.len + init_values.len;
 //     const locations = ([_]mlir.Location{mlir.Location.unknown(ctx)} ** MaxBlockArguments)[0..block_n_args];
 //     var reduce_elem_types: [MaxBlockArguments]mlir.Type = undefined;
 //     for (inputs, 0..) |input, i| {
 //         const arg_type = mlir.RankedTensorType.init(&.{}, elementTypeOrSelf(input.getType())).as(mlir.Type).?;
 //         reduce_elem_types[i] = arg_type;
 //         reduce_elem_types[inputs.len + i] = arg_type;
 //     }
 //     const module = @import("../module.zig");
 //     const comp = module.getCompilationContext();
 //     var block = comp.openBlock(reduce_elem_types[0..block_n_args], locations) catch unreachable;
 //     {
 //         defer comp.closeBlock(block);
 //         var block_inputs: [MaxBlockArguments / 2]mlir.Value = undefined;
 //         var block_accs: [MaxBlockArguments / 2]mlir.Value = undefined;
 //         for (0..inputs.len) |i| {
 //             block_inputs[i] = block.argument(i);
 //             block_accs[i] = block.argument(inputs.len + i);
 //         }
 //         _ = blkfn(blkctx, ctx, block_inputs[0..inputs.len], block_accs[0..init_values.len]);
 //     }
 //     const pad_shape = mlir.RankedTensorType.init(opts.padding_shape, DataType.i64.mlirType(ctx)).as(mlir.Type).?;
 //     return mlir.Operation.make(ctx, "stablehlo.reduce_window", .{
 //         .variadic_operands = &.{ inputs, init_values },
 //         .result_type_inference = true,
 //         .blocks = &.{block},
 //         .attributes = &.{
 //             .{ "window_dimensions", mlir.DenseArrayAttribute(.i64).init(ctx, opts.window_dimensions).as(mlir.Attribute).? },
 //             .{ "window_strides", mlir.DenseArrayAttribute(.i64).init(ctx, opts.window_strides).as(mlir.Attribute).? },
 //             .{ "base_dilations", mlir.DenseArrayAttribute(.i64).init(ctx, opts.base_dilations).as(mlir.Attribute).? },
 //             .{ "window_dilations", mlir.DenseArrayAttribute(.i64).init(ctx, opts.window_dilations).as(mlir.Attribute).? },
 //             .{ "padding", mlir.DenseIntOrFPElementsAttribute(.i64).init(pad_shape, std.mem.sliceAsBytes(opts.padding_values)).as(mlir.Attribute).? },
 //         },
 //         .location = location,
 //     });
 // }
 pub fn sort(
    ctx: mlir.Context,
    inputs: []const mlir.Value,
--- a/mlir/mlir.zig
+++ b/mlir/mlir.zig
@ -691,10 +691,6 @@ pub const OperationState = struct {
        c.mlirOperationStateAddOwnedRegions(self.innerPtr(), @intCast(regions.len), @ptrCast(regions.ptr));
    }
    // pub fn addSuccessor(self: *Self, successor: Operation) void {
    //     c.mlirOperationStateAddSuccessors(self.innerPtr(), 1, &[_]c.MlirOperation{successor.inner()});
    // }
    pub fn addAttribute(self: *Self, ctx: Context, name: [:0]const u8, attr: Attribute) void {
        c.mlirOperationStateAddAttributes(self.innerPtr(), 1, @ptrCast(&.{
            .{
@ -745,9 +741,9 @@ pub const DictionaryAttribute = struct {
        return NamedAttribute.wrap(c.mlirDictionaryAttrGetElement(self.inner(), @intCast(pos)));
    }
-    // pub fn getByName(self: Self, name: [:0]const u8) ?NamedAttribute {
+    pub fn getByName(self: Self, name: [:0]const u8) ?NamedAttribute {
-    //     return NamedAttribute.wrapOr(c.mlirDictionaryAttrGetElementByName(self.inner(), name));
+        return NamedAttribute.wrapOr(c.mlirDictionaryAttrGetElementByName(self.inner(), name));
-    // }
+    }
 };
 pub const Operation = struct {
@ -1519,276 +1515,6 @@ pub const DialectHandle = struct {
    }
 };
 // pub const AnyQuantizedType = MlirWrapperType(c.MlirType, .{
 //     .is_a_fn = c.mlirTypeIsAAnyQuantizedType,
 //     .is_null_fn = c.mlirTypeIsNull,
 //     .dump_fn = c.mlirTypeDump,
 //     .equal_fn = c.mlirTypeEqual,
 // }, struct {
 //     const Self = AnyQuantizedType;
 //     pub fn init(
 //         flags: quant.QuantizationFlags,
 //         storageType: Type,
 //         expressedType: Type,
 //         storageTypeMin: i64,
 //         storageTypeMax: i64,
 //     ) Self {
 //         return Self.wrap(c.mlirAnyQuantizedTypeGet(
 //             @intCast(@intFromEnum(flags)),
 //             storageType.inner(),
 //             expressedType.inner(),
 //             storageTypeMin,
 //             storageTypeMax,
 //         ));
 //     }
 //     pub fn getExpressedType(self: Self) Type {
 //         return Type.wrap(c.mlirQuantizedTypeGetExpressedType(self.inner()));
 //     }
 //     pub fn getFlags(self: Self) quant.QuantizationFlags {
 //         return @enumFromInt(c.mlirQuantizedTypeGetFlags(self.inner()));
 //     }
 //     pub fn isSigned(self: Self) bool {
 //         return c.mlirQuantizedTypeIsSigned(self.inner());
 //     }
 //     pub fn getStorageType(self: Self) Type {
 //         return Type.wrap(c.mlirQuantizedTypeGetStorageType(self.inner()));
 //     }
 //     pub fn getStorageTypeMin(self: Self) i64 {
 //         return c.mlirQuantizedTypeGetStorageTypeMin(self.inner());
 //     }
 //     pub fn getStorageTypeMax(self: Self) i64 {
 //         return c.mlirQuantizedTypeGetStorageTypeMax(self.inner());
 //     }
 //     pub fn getStorageTypeIntegralWidth(self: Self) c_uint {
 //         return c.mlirQuantizedTypeGetStorageTypeIntegralWidth(self.inner());
 //     }
 //     pub fn getQuantizedElementType(self: Self) Type {
 //         return Type.wrap(c.mlirQuantizedTypeGetQuantizedElementType(self.inner()));
 //     }
 // });
 // pub const UniformQuantizedType = MlirWrapperType(c.MlirType, .{
 //     .is_a_fn = c.mlirTypeIsAUniformQuantizedType,
 //     .is_null_fn = c.mlirTypeIsNull,
 //     .dump_fn = c.mlirTypeDump,
 //     .equal_fn = c.mlirTypeEqual,
 // }, struct {
 //     const Self = AnyQuantizedType;
 //     pub fn init(
 //         flags: quant.QuantizationFlags,
 //         storageType: Type,
 //         expressedType: Type,
 //         scale: f64,
 //         zeroPoint: i64,
 //         storageTypeMin: i64,
 //         storageTypeMax: i64,
 //     ) Self {
 //         return Self.wrap(c.mlirUniformQuantizedTypeGet(
 //             @intCast(@intFromEnum(flags)),
 //             storageType.inner(),
 //             expressedType.inner(),
 //             scale,
 //             zeroPoint,
 //             storageTypeMin,
 //             storageTypeMax,
 //         ));
 //     }
 //     pub fn getExpressedType(self: Self) Type {
 //         return Type.wrap(c.mlirQuantizedTypeGetExpressedType(self.inner()));
 //     }
 //     pub fn getFlags(self: Self) quant.QuantizationFlags {
 //         return @enumFromInt(c.mlirQuantizedTypeGetFlags(self.inner()));
 //     }
 //     pub fn isSigned(self: Self) bool {
 //         return c.mlirQuantizedTypeIsSigned(self.inner());
 //     }
 //     pub fn getStorageType(self: Self) Type {
 //         return Type.wrap(c.mlirQuantizedTypeGetStorageType(self.inner()));
 //     }
 //     pub fn getStorageTypeMin(self: Self) i64 {
 //         return c.mlirQuantizedTypeGetStorageTypeMin(self.inner());
 //     }
 //     pub fn getStorageTypeMax(self: Self) i64 {
 //         return c.mlirQuantizedTypeGetStorageTypeMax(self.inner());
 //     }
 //     pub fn getStorageTypeIntegralWidth(self: Self) c_uint {
 //         return c.mlirQuantizedTypeGetStorageTypeIntegralWidth(self.inner());
 //     }
 //     pub fn getQuantizedElementType(self: Self) Type {
 //         return Type.wrap(c.mlirQuantizedTypeGetQuantizedElementType(self.inner()));
 //     }
 //     pub fn getScale(self: Self) f64 {
 //         return c.mlirUniformQuantizedTypeGetScale(self.inner());
 //     }
 //     pub fn getZeroPoint(self: Self) i64 {
 //         return c.mlirUniformQuantizedTypeGetZeroPoint(self.inner());
 //     }
 //     pub fn isFixedPoint(self: Self) bool {
 //         return c.mlirUniformQuantizedTypeIsFixedPoint(self.inner());
 //     }
 // });
 // pub const QuantizedPerAxisType = MlirWrapperType(c.MlirType, .{
 //     .is_a_fn = c.mlirTypeIsAUniformQuantizedPerAxisType,
 //     .is_null_fn = c.mlirTypeIsNull,
 //     .dump_fn = c.mlirTypeDump,
 //     .equal_fn = c.mlirTypeEqual,
 // }, struct {
 //     const Self = AnyQuantizedType;
 //     pub fn init(
 //         flags: quant.QuantizationFlags,
 //         storageType: Type,
 //         expressedType: Type,
 //         nDims: usize,
 //         scales: []f64,
 //         zeroPoints: []i64,
 //         quantizedDimension: i32,
 //         storageTypeMin: i64,
 //         storageTypeMax: i64,
 //     ) Self {
 //         std.debug.assert(scales.len == nDims);
 //         std.debug.assert(zeroPoints.len == nDims);
 //         return Self.wrap(c.mlirUniformQuantizedPerAxisTypeGet(
 //             @intCast(@intFromEnum(flags)),
 //             storageType.inner(),
 //             expressedType.inner(),
 //             @intCast(nDims),
 //             scales.ptr,
 //             zeroPoints.ptr,
 //             quantizedDimension,
 //             storageTypeMin,
 //             storageTypeMax,
 //         ));
 //     }
 //     pub fn getExpressedType(self: Self) Type {
 //         return Type.wrap(c.mlirQuantizedTypeGetExpressedType(self.inner()));
 //     }
 //     pub fn getFlags(self: Self) quant.QuantizationFlags {
 //         return @enumFromInt(c.mlirQuantizedTypeGetFlags(self.inner()));
 //     }
 //     pub fn isSigned(self: Self) bool {
 //         return c.mlirQuantizedTypeIsSigned(self.inner());
 //     }
 //     pub fn getStorageType(self: Self) Type {
 //         return Type.wrap(c.mlirQuantizedTypeGetStorageType(self.inner()));
 //     }
 //     pub fn getStorageTypeMin(self: Self) i64 {
 //         return c.mlirQuantizedTypeGetStorageTypeMin(self.inner());
 //     }
 //     pub fn getStorageTypeMax(self: Self) i64 {
 //         return c.mlirQuantizedTypeGetStorageTypeMax(self.inner());
 //     }
 //     pub fn getStorageTypeIntegralWidth(self: Self) c_uint {
 //         return c.mlirQuantizedTypeGetStorageTypeIntegralWidth(self.inner());
 //     }
 //     pub fn getQuantizedElementType(self: Self) Type {
 //         return Type.wrap(c.mlirQuantizedTypeGetQuantizedElementType(self.inner()));
 //     }
 //     pub fn getNumDims(self: Self) usize {
 //         return @intCast(c.mlirUniformQuantizedPerAxisTypeGetNumDims(self.inner()));
 //     }
 //     pub fn getScale(self: Self) f64 {
 //         return @intCast(c.mlirUniformQuantizedPerAxisTypeGetScale(self.inner()));
 //     }
 //     pub fn getZeroPoint(self: Self, pos: usize) i64 {
 //         return c.mlirUniformQuantizedPerAxisTypeGetZeroPoint(self.inner(), @intCast(pos));
 //     }
 //     pub fn getQuantizedDimension(self: Self) i32 {
 //         return c.mlirUniformQuantizedPerAxisTypeGetQuantizedDimension(self.inner());
 //     }
 //     pub fn isFixedPoint(self: Self) bool {
 //         return c.mlirUniformQuantizedPerAxisTypeIsFixedPoint(self.inner());
 //     }
 // });
 // pub const CalibratedQuantizedType = MlirWrapperType(c.MlirType, .{
 //     .is_a_fn = c.mlirTypeIsACalibratedQuantizedType,
 //     .is_null_fn = c.mlirTypeIsNull,
 //     .dump_fn = c.mlirTypeDump,
 //     .equal_fn = c.mlirTypeEqual,
 // }, struct {
 //     const Self = AnyQuantizedType;
 //     pub fn init(expressedType: Type, min: f64, max: f64) Self {
 //         return Self.wrap(c.mlirCalibratedQuantizedTypeGet(expressedType.inner(), min, max));
 //     }
 //     pub fn getExpressedType(self: Self) Type {
 //         return Type.wrap(c.mlirQuantizedTypeGetExpressedType(self.inner()));
 //     }
 //     pub fn getFlags(self: Self) quant.QuantizationFlags {
 //         return @enumFromInt(c.mlirQuantizedTypeGetFlags(self.inner()));
 //     }
 //     pub fn isSigned(self: Self) bool {
 //         return c.mlirQuantizedTypeIsSigned(self.inner());
 //     }
 //     pub fn getStorageType(self: Self) Type {
 //         return Type.wrap(c.mlirQuantizedTypeGetStorageType(self.inner()));
 //     }
 //     pub fn getStorageTypeMin(self: Self) i64 {
 //         return c.mlirQuantizedTypeGetStorageTypeMin(self.inner());
 //     }
 //     pub fn getStorageTypeMax(self: Self) i64 {
 //         return c.mlirQuantizedTypeGetStorageTypeMax(self.inner());
 //     }
 //     pub fn getStorageTypeIntegralWidth(self: Self) c_uint {
 //         return c.mlirQuantizedTypeGetStorageTypeIntegralWidth(self.inner());
 //     }
 //     pub fn getQuantizedElementType(self: Self) Type {
 //         return Type.wrap(c.mlirQuantizedTypeGetQuantizedElementType(self.inner()));
 //     }
 //     pub fn getMin(self: Self) f64 {
 //         return c.mlirCalibratedQuantizedTypeGetMin(self.inner());
 //     }
 //     pub fn getMax(self: Self) f64 {
 //         return c.mlirCalibratedQuantizedTypeGetMax(self.inner());
 //     }
 // });
 pub const ShapedType = struct {
    _inner: c.MlirType,
    pub usingnamespace MlirHelpers(ShapedType, .{
--- a/pjrt/pjrt.zig
+++ b/pjrt/pjrt.zig
@ -319,14 +319,6 @@ pub const Client = opaque {
        return Profiler.init(null, options);
    }
    // pub fn getGpuCustomCallRegistry(self: *const Client, api: *const Api) ?*GpuCustomCallRegistry {
    //     if (api.lookupExtension(c.PJRT_Gpu_Custom_Call, c.PJRT_Extension_Type_Gpu_Custom_Call)) |ext| {
    //         return .{ .custom_call_register = ext.custom_call.? };
    //     }
    //     log.warn("No Gpu Custom Call registry found for platform: {}", .{self});
    //     return null;
    // }
    pub fn deserializeAndLoad(self: *const Client, api: *const Api, bytes: []const u8) ApiError!*LoadedExecutable {
        const ret = try api.call(.PJRT_Executable_DeserializeAndLoad, .{
            .client = self.inner(),
@ -365,32 +357,6 @@ pub const Client = opaque {
    }
 };
 // // pub const CustomCallSignature = *const fn (*anyopaque, **anyopaque, [*c]const u8, usize) callconv(.C) void;
 // // pub const GpuCustomCallRegistry = struct {
 // //     custom_call_register: *const c.PJRT_Gpu_Register_Custom_Call,
 // //     pub fn registerCustomCall(self: GpuCustomCallRegistry, api: *const Api, api_version: usize, name: []const u8, func: CustomCallSignature) ApiError!void {
 // //         var ret = pjrtStruct(c.PJRT_Gpu_Register_Custom_Call_Args{
 // //             .function_name = name.ptr,
 // //             .function_name_size = name.len,
 // //             .api_version = @intCast(api_version),
 // //             .custom_call_function = @ptrCast(@constCast(func)),
 // //         });
 // //         const result = self.custom_call_register(&ret);
 // //         if (result) |pjrt_c_error| {
 // //             const pjrt_error = .{ .inner = pjrt_c_error };
 // //             log.err("{s}", .{pjrt_error.getMessage(api)});
 // //             return pjrt_error.getCode().toApiError();
 // //         }
 // //     }
 // // };
 // // const OldPjrtExtension = extern struct {
 // //     type: c.PJRT_Extension_Type,
 // //     next: [*]OldPjrtExtension,
 // // };
 pub const Device = opaque {
    const inner = InnerMixin(c.PJRT_Device).inner;
--- a/pjrt/profiler.zig
+++ b/pjrt/profiler.zig
@ -128,64 +128,6 @@ pub const Profiler = struct {
    }
 };
 // If this was working it would be a good alternative to xspace_to_json.cc
 // const xspace = @import("xspace.pb.zig");
 // pub fn printDataAsXSpace(allocator: std.mem.Allocator, data: []const u8) void {
 //     var arena = std.heap.ArenaAllocator.init(allocator);
 //     defer arena.deinit();
 //
 //     const space = xspace.XSpace.decode(data, arena.allocator()) catch |e| {
 //         std.log.err("Couldn't load profiling data: {}", .{e});
 //         return;
 //     };
 //
 //     for (space.errors.items) |err| {
 //         std.log.err("{s}", .{err.getSlice()});
 //     }
 //     for (space.warnings.items) |warning| {
 //         std.log.warn("{s}", .{warning.getSlice()});
 //     }
 //     for (space.hostnames.items) |host| {
 //         std.log.info("Profiled host {s}", .{host.getSlice()});
 //     }
 //     for (space.planes.items) |plane| {
 //         var event_metadata = std.hash_map.AutoHashMap(i64, xspace.XEventMetadata).init(arena.allocator());
 //         event_metadata.ensureTotalCapacity(@intCast(plane.event_metadata.items.len)) catch return;
 //         defer event_metadata.deinit();
 //         for (plane.event_metadata.items) |event_meta_entry| {
 //             if (event_meta_entry.value) |event_meta| {
 //                 event_metadata.putAssumeCapacity(event_meta.id, event_meta);
 //             }
 //         }
 //         std.log.info("Profiled device {s}", .{plane.name.getSlice()});
 //         for (plane.lines.items) |line| {
 //             std.log.info(
 //                 "{d} -> {d} xline {s} ({d} events)",
 //                 .{ line.timestamp_ns, line.duration_ps, line.name.getSlice(), line.events.items.len },
 //             );
 //             const ps_per_ns: i64 = 1000;
 //             var duration_ns: i64 = 0;
 //             var last_metadata_id: i64 = 0;
 //             for (line.events.items) |event| {
 //                 if (event.metadata_id != last_metadata_id and duration_ns != 0) {
 //                     const duration_us = @as(f32, @floatFromInt(duration_ns)) / std.time.ns_per_us;
 //                     const meta = event_metadata.get(event.metadata_id).?;
 //                     std.log.info("event {s}: {d:.1}μs", .{ meta.name.getSlice(), duration_us });
 //                     last_metadata_id = event.metadata_id;
 //                     duration_ns = 0;
 //                 }
 //                 duration_ns += @divFloor(event.duration_ps, ps_per_ns);
 //                 const duration_us = @as(f32, @floatFromInt(duration_ns)) / std.time.ns_per_us;
 //                 const meta = event_metadata.get(event.metadata_id).?;
 //                 std.log.info("event {s}: {d:.1}μs", .{ meta.name.getSlice(), duration_us });
 //             }
 //         }
 //     }
 // }
 const ProfilingData = union(enum) {
    owned: []const u8,
    external: []const u8,
--- a/zml/aio.zig
+++ b/zml/aio.zig
@ -300,23 +300,12 @@ fn _populateStruct(
                    log.warn("No layer found at {s}", .{prefix});
                }
                return true;
            } else if (ptr_info.size == .One) {
                //if (ptr_info.child != zml.Tensor and ptr_info.child != ?zml.Tensor) {
                //    // Note: should we recurse on all pointers ?
                //    log.warn("Not looking into: {any}", .{prefix});
                //    return false;
                //}
                //obj.* = try allocator.create(ptr_info.child);
                //return try _populateStruct(allocator, buffer_store, unique_id, prefix, obj.*, required);
            } else {
                std.log.err("{s} - {s}: {s} type not supported", .{ @src().fn_name, prefix, @typeName(T) });
                return false;
            }
        },
        .Struct => |struct_info| {
            // TODO(Corentin): See if we keep that
            //if (@hasDecl(T, "_zml_reader_skip_me_")) return false;
            var partial_struct = false;
            inline for (struct_info.fields) |field| {
                try prefix_builder.push(allocator, field.name);
@ -343,46 +332,12 @@ fn _populateStruct(
            }
            return true;
        },
        //.Array => |array_info| {
        //    var new_prefix = prefix;
        //    if (prefix.items.len > 0)
        //        new_prefix.appendAssumeCapacity('.');
        //    const len = new_prefix.items.len;
        //    for (obj, 0..) |*value, i| {
        //        new_prefix.items.len += std.fmt.formatIntBuf(new_prefix.unusedCapacitySlice(), i, 10, .lower, .{});
        //        const found = try _populateStruct(allocator, buffer_store, unique_id, new_prefix, value, required);
        //        if (!found) return false;
        //        new_prefix.shrinkRetainingCapacity(len);
        //    }
        //    const num_layers = buffer_store.numLayers(prefix.items);
        //    if (num_layers != array_info.len) {
        //        log.warn("Found {d} layers with prefix {s}, but only loaded {d}", .{ num_layers, prefix.items, array_info.len });
        //    }
        //    return true;
        //},
        .Optional => |opt_info| {
            obj.* = @as(opt_info.child, undefined);
            const found = try _populateStruct(allocator, prefix_builder, unique_id, buffer_store, &(obj.*.?), false);
            if (!found) obj.* = null;
            return true;
        },
        //.Union => |union_info| {
        //    // Note: the main issue here is that several fields could match but we only return the first one.
        //    inline for (union_info.fields) |field| {
        //        // interpret obj as a "field", and try to populate that.
        //        obj.* = @unionInit(T, field.name, undefined);
        //        const found = try _populateStruct(allocator, buffer_store, unique_id, prefix, &@field(obj.*, field.name), false);
        //        if (found) {
        //            std.log.info("Interpreted {s} as {s}", .{ prefix.items, @typeName(field.type) });
        //            return true;
        //        }
        //    }
        //    obj.* = undefined;
        //    if (required) {
        //        std.log.err("Not able to intepret {s} as any member of the union: {s}", .{ prefix.items, @typeName(T) });
        //    }
        //    return false;
        //},
        .Int => {
            obj.* = undefined;
            return true;
@ -540,9 +495,6 @@ fn visitStructAndLoadBuffer(allocator: std.mem.Allocator, prefix_builder: *Prefi
            } else return error.TypeNotSupported;
        },
        .Struct => |struct_info| {
            // TODO(Corentin): See if we keep that
            //if (@hasDecl(T, "_zml_reader_skip_me_")) return false;
            inline for (struct_info.fields) |field| {
                try prefix_builder.push(allocator, field.name);
                defer prefix_builder.pop();
@ -550,23 +502,6 @@ fn visitStructAndLoadBuffer(allocator: std.mem.Allocator, prefix_builder: *Prefi
                try visitStructAndLoadBuffer(allocator, prefix_builder, buffer_store, &@field(obj, field.name), platform);
            }
        },
        //.Array => |array_info| {
        //    var new_prefix = prefix;
        //    if (prefix.items.len > 0)
        //        new_prefix.appendAssumeCapacity('.');
        //    const len = new_prefix.items.len;
        //    for (obj, 0..) |*value, i| {
        //        new_prefix.items.len += std.fmt.formatIntBuf(new_prefix.unusedCapacitySlice(), i, 10, .lower, .{});
        //        const found = try _populateStruct(allocator, buffer_store, unique_id, new_prefix, value, required);
        //        if (!found) return false;
        //        new_prefix.shrinkRetainingCapacity(len);
        //    }
        //    const num_layers = buffer_store.numLayers(prefix.items);
        //    if (num_layers != array_info.len) {
        //        log.warn("Found {d} layers with prefix {s}, but only loaded {d}", .{ num_layers, prefix.items, array_info.len });
        //    }
        //    return true;
        //},
        .Optional => |opt_info| {
            var child = @as(opt_info.child, undefined);
            if (visitStructAndLoadBuffer(allocator, prefix_builder, buffer_store, &child, platform)) {
@ -576,23 +511,6 @@ fn visitStructAndLoadBuffer(allocator: std.mem.Allocator, prefix_builder: *Prefi
                else => return err,
            }
        },
        //.Union => |union_info| {
        //    // Note: the main issue here is that several fields could match but we only return the first one.
        //    inline for (union_info.fields) |field| {
        //        // interpret obj as a "field", and try to populate that.
        //        obj.* = @unionInit(T, field.name, undefined);
        //        const found = try _populateStruct(allocator, buffer_store, unique_id, prefix, &@field(obj.*, field.name), false);
        //        if (found) {
        //            std.log.info("Interpreted {s} as {s}", .{ prefix.items, @typeName(field.type) });
        //            return true;
        //        }
        //    }
        //    obj.* = undefined;
        //    if (required) {
        //        std.log.err("Not able to intepret {s} as any member of the union: {s}", .{ prefix.items, @typeName(T) });
        //    }
        //    return false;
        //},
        else => {},
    }
 }
--- a/zml/aio/torch/parser.zig
+++ b/zml/aio/torch/parser.zig
@ -95,10 +95,6 @@ pub const Decoder = struct {
    }
    fn parseOps(self: *Decoder, allocator: Allocator, seekable_stream: anytype) ![]PickleOp {
        // TODO(SuperAuguste): deflate using `std.compress.flate`'s `decompressor`
        // TODO(SuperAuguste): explore swapping in non-generic reader here instead of using switch(?)
        //                     not sure if that'd actually be beneficial in any way
        var iter = try std.zip.Iterator(@TypeOf(seekable_stream)).init(seekable_stream);
        var filename_buf: [std.fs.max_path_bytes]u8 = undefined;
        while (try iter.next()) |entry| {
--- a/zml/helpers.zig
+++ b/zml/helpers.zig
@ -49,10 +49,6 @@ pub fn collectDims(
                        expected_dim.* = DIM_MISMATCH;
                    }
                }
                // TODO: strict mode:
                // else if (mode == .strict) {
                //     @compileError("Found unexpected axis " ++ @tagName(a) ++ " when collecting " ++ @typeName(ShapeStruct(dims)));
                // }
            }
        }
    }).cb, &context, v);
--- a/zml/hostbuffer.zig
+++ b/zml/hostbuffer.zig
@ -190,126 +190,6 @@ pub const HostBuffer = struct {
        res._shape = self._shape.reshape(shape_);
        return res;
    }
    /// Describes which indices to keep along one axis when calling `copySlice`.
    pub const Slice = struct {
        /// NOTE(review): not read by `copySlice`; presumably meant to select one
        /// index along the axis — confirm before relying on it.
        single: ?i64 = null,
        /// First index kept. `copySlice` adds the axis dimension to negative values.
        start: i64 = 0,
        /// Exclusive end index. `null` means "up to the axis dimension".
        end: ?i64 = null,
        /// Distance between two consecutive kept indices.
        step: i64 = 1,
    };
    /// Slices `self` along a single `axis` and keeps every other axis whole.
    /// `axis` is resolved through `Shape.axis`; the copy itself is delegated to `copySlice`.
    pub inline fn copySlice1d(self: HostBuffer, allocator: std.mem.Allocator, axis: i8, _args: Slice) !HostBuffer {
        // Start from "keep everything" on each axis, then override the requested one.
        var per_axis: [5]Slice = [_]Slice{.{}} ** 5;
        const target_axis = self._shape.axis(axis);
        per_axis[target_axis] = _args;

        const rank_ = self._shape.rank();
        return self.copySlice(allocator, per_axis[0..rank_]);
    }
    /// Writes the byte strides of `shape` into a fixed-size array.
    /// Only the first `shape.rank()` entries are meaningful.
    fn copySliceByteStrides(shape: Shape) [Shape.MAX_RANK]usize {
        var res: [Shape.MAX_RANK]usize = undefined;
        const _strides = shape.computeStrides(shape.dtype().sizeOf());
        for (_strides.constSlice(), 0..) |stride, i| res[i] = @intCast(stride);
        return res;
    }

    /// Copies a strided sub-region of `self` into a freshly allocated `HostBuffer`.
    ///
    /// `slices[a]` describes the indices kept along axis `a`; axes beyond `slices.len`
    /// are kept whole. Negative `start` / `end` values are counted from the end of the
    /// axis. `Slice.single` is currently ignored.
    ///
    /// The result is allocated through `HostBuffer.empty(allocator, ...)` and its shape
    /// is the shape of `self` with each sliced dimension replaced by the number of
    /// selected indices. Only tensors of rank 5 or less are supported.
    ///
    /// Returns whatever error `HostBuffer.empty` returns.
    pub fn copySlice(self: HostBuffer, allocator: std.mem.Allocator, slices: []const Slice) !HostBuffer {
        const rk = self.rank();
        // Validate before indexing the 5-element scratch arrays below.
        meta.assert(rk <= 5, "copySlice only supports tensors of rank 5 or less. Received: {}", .{self});
        meta.assert(slices.len <= rk, "copySlice received {d} slices for a tensor of rank {d}", .{ slices.len, rk });

        const byte_size = self.dtype().sizeOf();
        var start_indices = [_]usize{0} ** 5;
        var strides_ = [_]usize{1} ** 5;
        const dims = self._shape.dims();
        var sh = self._shape;
        for (slices, 0..) |_args, a| {
            const dim = dims[a];
            const start = if (_args.start >= 0) _args.start else _args.start + dim;
            // Normalise a negative `end` the same way as `start`.
            const requested_end = _args.end orelse dim;
            const end = if (requested_end >= 0) requested_end else requested_end + dim;

            meta.assert(_args.step > 0, "copySlice expects a strictly positive step on axis {d}, got {d}", .{ a, _args.step });
            meta.assert(
                start >= 0 and start <= end and end <= dim,
                "copySlice range [{d}, {d}) is out of bounds for axis {d} of size {d}",
                .{ start, end, a, dim },
            );

            start_indices[a] = @intCast(start);
            strides_[a] = @intCast(_args.step);
            // Number of indices in `start, start + step, ...` that are < end.
            // An empty range (start == end) yields 0.
            sh._dims.set(a, @divFloor(end - start - 1, _args.step) + 1);
        }

        const raw_strides = copySliceByteStrides(self._shape);
        const result_tensor = try HostBuffer.empty(allocator, sh);
        // The destination strides must come from the *result* shape: the result is
        // densely packed and generally smaller than the source.
        const res_strides = copySliceByteStrides(sh);

        const src_data = self.data;
        const data_ = @constCast(result_tensor.data);

        if (rk == 0) {
            // Scalar: there is no axis to iterate over, copy the single element.
            @memcpy(data_[0..byte_size], src_data[0..byte_size]);
            return result_tensor;
        }

        // Elements are copied one by one; each loop level handles one axis and the
        // copy happens at the level matching the tensor rank.
        for (0..@intCast(sh.dim(0))) |j0| {
            const off0 = (j0 * strides_[0] + start_indices[0]) * raw_strides[0];
            const res_off0 = j0 * res_strides[0];
            if (rk == 1) {
                @memcpy(data_[res_off0..][0..byte_size], src_data[off0..][0..byte_size]);
                continue;
            }
            for (0..@intCast(sh.dim(1))) |j1| {
                const off1 = off0 + (j1 * strides_[1] + start_indices[1]) * raw_strides[1];
                const res_off1 = res_off0 + j1 * res_strides[1];
                if (rk == 2) {
                    @memcpy(data_[res_off1..][0..byte_size], src_data[off1..][0..byte_size]);
                    continue;
                }
                for (0..@intCast(sh.dim(2))) |j2| {
                    const off2 = off1 + (j2 * strides_[2] + start_indices[2]) * raw_strides[2];
                    const res_off2 = res_off1 + j2 * res_strides[2];
                    if (rk == 3) {
                        @memcpy(data_[res_off2..][0..byte_size], src_data[off2..][0..byte_size]);
                        continue;
                    }
                    for (0..@intCast(sh.dim(3))) |j3| {
                        const off3 = off2 + (j3 * strides_[3] + start_indices[3]) * raw_strides[3];
                        const res_off3 = res_off2 + j3 * res_strides[3];
                        if (rk == 4) {
                            @memcpy(data_[res_off3..][0..byte_size], src_data[off3..][0..byte_size]);
                            continue;
                        }
                        for (0..@intCast(sh.dim(4))) |j4| {
                            const off4 = off3 + (j4 * strides_[4] + start_indices[4]) * raw_strides[4];
                            const res_off4 = res_off3 + j4 * res_strides[4];
                            @memcpy(data_[res_off4..][0..byte_size], src_data[off4..][0..byte_size]);
                        }
                    }
                }
            }
        }
        return result_tensor;
    }
    test copySlice {
        // Everything allocated by the slicing helpers is released with the arena.
        var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
        defer arena.deinit();
        const alloc = arena.allocator();

        // 2x5 matrix holding 0..9 in row-major order.
        const input = HostBuffer.fromSlice(.{ 2, 5 }, &[_]f32{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 });

        // Keep only the first row.
        {
            const expected = [_]f32{ 0, 1, 2, 3, 4 };
            const first_row = try copySlice1d(input, alloc, 0, .{ .end = 1 });
            try std.testing.expectEqualSlices(f32, &expected, first_row.items(f32));
        }

        // Disabled: marked as failing by the original author.
        // Last two columns of every row.
        // {
        //     const res = try copySlice1d(input, alloc, -1, .{ .start = -2 });
        //     try std.testing.expectEqualSlices(f32, &.{ 3, 4, 8, 9 }, res.items(f32));
        // }

        // Disabled: marked as failing by the original author.
        // Every second column starting at column 1.
        // {
        //     const res = try copySlice1d(input, alloc, 1, .{ .start = 1, .step = 2 });
        //     try std.testing.expectEqualSlices(f32, &.{ 1, 3, 6, 8 }, res.items(f32));
        // }

        // Second row, every second column starting at column 1.
        {
            const expected = [_]f32{ 6, 8 };
            const picked = try copySlice(input, alloc, &.{ .{ .start = 1 }, .{ .start = 1, .step = 2 } });
            try std.testing.expectEqualSlices(f32, &expected, picked.items(f32));
        }
    }
 };
 fn parseArrayInfo(T: type) Shape {
--- a/zml/module.zig
+++ b/zml/module.zig
@ -746,9 +746,7 @@ fn compileInternal(
    var timer = std.time.Timer.start() catch null;
    const tensor_args = context.tensorFromShapes(ModuleSignature(func).ArgsT, arena, args);
-    // TODO: this is fast, doesn't make system call, and use mutable state.
+    // Run in a dedicated thread because compilation relies on `threadlocal`.
    // does it need to be async ?
    // const f = try CompilationContext.generateBytecode(context, arena, "main", func, &model, &tensor_args, .{ .add_donations_attributes = true });
    const f = try asynk.callGeneric(CompilationContext.generateBytecode, .{ context, arena, "main", func, &model, &tensor_args, .{ .add_donations_attributes = true } });
    context._module.getBody().appendOperation(f.mlir_fn);
--- a/zml/nn.zig
+++ b/zml/nn.zig
@ -218,13 +218,6 @@ test "real/img" {
    const platform = zml.testing.env();
    const Fns = struct {
        // fn testSplitMergeIsId(impl: RopeOpts.Implementation) Tensor {
        //     const x = Tensor.arange(.{ .end = 20 }, .f32).reshape(.{ 5, 4 });
        //     const real, const imag = splitRealImg(x, impl);
        //     const y = mergeRealImg(real, imag, impl);
        //     return y.cmp(.EQ, x).flatten(0).convert(.i32).sum(-1);
        // }
        fn testSplitMergeIsId(impl: RopeOpts.Implementation) Tensor {
            const x = Tensor.arange(.{ .end = 20 }, .f32).reshape(.{ 5, 4 });
            const real, const imag = splitRealImg(x, impl);
--- a/zml/ops.zig
+++ b/zml/ops.zig
@ -547,17 +547,6 @@ fn _BlockSign(comptime func: anytype, blk_type: BlockType) BlockSignature {
        if (i >= arg_start) {
            n_tensors += staticCountTensors(ArgType) orelse @compileError("Can't use " ++ @typeName(ArgType) ++ " in an MLIR function, because it has a variable number of tensors");
        }
        // if (arg.type) |ArgType| {
        //     full_args[i] = ArgType;
        //     if (i >= arg_start) {
        //         n_tensors += staticCountTensors(ArgType) orelse @compileError("Can't use " ++ @typeName(ArgType) ++ " in an MLIR function, because it has a variable number of tensors");
        //     }
        // } else {
        //     // anytype are considered to not have tensors.
        //     // violation of this will be detected when calling `compile()` but not at Zig compile time.
        //     full_args[i] = void;
        // }
    }
    const FullArgs = std.meta.Tuple(&full_args);
    const BlkCtx = switch (blk_type) {
--- a/zml/pjrtx.zig
+++ b/zml/pjrtx.zig
@ -167,34 +167,8 @@ pub const Client = opaque {
    /// Forwards to `getProfiler` on the wrapped client with the same `api` and `options`.
    pub fn getProfiler(self: *const Client, api: *const Api, options: pjrt.Profiler.Options) pjrt.Profiler {
        const wrapped = self.inner();
        return wrapped.getProfiler(api, options);
    }
    // pub fn getGpuCustomCallRegistry(self: Client) ?GpuCustomCallRegistry {
    //     return switch (self.inner) {
    //         inline else => |v, tag| if (v.getGpuCustomCallRegistry()) |registry| GpuCustomCallRegistry.wrap(tag, registry) else null,
    //     };
    // }
    // pub fn getGpuCustomCallRegistry(self: *const Client, api: *const Api) ?*GpuCustomCallRegistry {
    //     if (api.lookupExtension(c.PJRT_Gpu_Custom_Call, c.PJRT_Extension_Type_Gpu_Custom_Call)) |ext| {
    //         return .{ .custom_call_register = ext.custom_call.? };
    //     }
    //     log.warn("No Gpu Custom Call registry found for platform: {}", .{self});
    //     return null;
    // }
 };
 // pub const GpuCustomCallRegistry = struct {
 //     pub usingnamespace WrapperMixin(GpuCustomCallRegistry, pjrt.GpuCustomCallRegistry);
 //     inner: GpuCustomCallRegistry.UnionType,
 //     pub fn registerCustomCall(self: GpuCustomCallRegistry, api_version: usize, name: []const u8, func: pjrt.CustomCallSignature) ApiError!void {
 //         return switch (self.inner) {
 //             inline else => |v| v.registerCustomCall(api_version, name, func),
 //         };
 //     }
 // };
 pub const Buffer = opaque {
    const inner = InnerMixin(pjrt.Buffer).inner;
--- a/zml/shape.zig
+++ b/zml/shape.zig
@ -348,9 +348,6 @@ pub const Shape = struct {
        return self.dtype().sizeOf() * self.count();
    }
    // Aliases
    pub const numel = count;
    /// Compares the two shapes described, ignoring tagging.
    pub fn eql(self: Shape, other: Shape) bool {
        return std.mem.eql(i64, self.dims(), other.dims()) and self.dtype() == other.dtype();
@ -883,78 +880,6 @@ pub const Shape = struct {
        );
    }
    /// Parses an anytype argument of the form `val` or `.{ .a = val }`.
    /// Helps offering consistent API through ZML.
    // pub fn parseTaggedValue(
    //     T: type,
    //     default_tag: EnumLiteral,
    //     d: anytype,
    // ) struct { Tag, T } {
    //     const err_msg = "Expected one tagged dimension, received a tuple: " ++ @typeName(@TypeOf(d));
    //     return switch (@typeInfo(@TypeOf(d))) {
    //         .Int, .ComptimeInt => .{ toTag(default_tag), @intCast(d) },
    //         .Struct => |struct_info| {
    //             if (struct_info.fields.len != 1) @compileError(err_msg);
    //             const name = struct_info.fields[0].name;
    //             return .{ name.ptr, @intCast(@field(d, name)) };
    //         },
    //         else => @compileError(err_msg),
    //     };
    // }
    /// Parses a list of tags `.{ .a, .b, .c }` into a `[]Tag`
    // pub inline fn parseTagList(comptime axes_: anytype) []Tag {
    //     switch (@typeInfo(@TypeOf(axes_))) {
    //         .Struct, .Array => {
    //             var _tags: [axes_.len]Tag = undefined;
    //             inline for (axes_, &_tags) |a, *t| t.* = toTag(a);
    //             return &_tags;
    //         },
    //         else => @compileError("Expected a tuple of enum literal, but found " ++ @tagName(@TypeOf(axes))),
    //     }
    // }
    /// Parses a comptime struct into a struct similarly to Shape.init,
    /// but with a custom type in place of the `i64` dimensions.
    /// Helps offering consistent API through ZML.
    // pub fn parseShapedValue(T: type, value: anytype) struct {
    //     std.BoundedArray(Tag, MAX_RANK),
    //     std.BoundedArray(T, MAX_RANK),
    // } {
    //     const too_long_err = std.fmt.comptimePrint("Received too many axes, maximum supported is {d}", .{MAX_RANK});
    //     var _tags: [MAX_RANK]Tag = [_]Tag{TagUnknown} ** MAX_RANK;
    //     const struct_info = switch (@typeInfo(@TypeOf(value))) {
    //         .Struct => |struct_info| struct_info,
    //         else => return .{
    //             .{ .len = 0, .buffer = _tags },
    //             std.BoundedArray(T, MAX_RANK).fromSlice(value) catch @panic(too_long_err),
    //         },
    //     };
    //     meta.assertComptime(struct_info.fields.len <= MAX_RANK, too_long_err, .{});
    //     var values: std.BoundedArray(T, MAX_RANK) = .{};
    //     inline for (struct_info.fields) |field| {
    //         if (T == Tag) {
    //             values.appendAssumeCapacity(toTag(@field(value, field.name)));
    //         } else {
    //             // If you have an error here it means Zig wasn't able to convert between the
    //             // value you passed and the expected `T`.
    //             values.appendAssumeCapacity(@field(value, field.name));
    //         }
    //     }
    //     if (!struct_info.is_tuple) {
    //         inline for (struct_info.fields, 0..) |field, i| {
    //             _tags[i] = toTag(field);
    //         }
    //     }
    //     return .{
    //         .{ .len = struct_info.fields.len, .buffer = _tags },
    //         values,
    //     };
    // }
    fn intersectTags(a: []const Tag, b: []const Tag) TagsArray {
        var res = TagsArray.init(0) catch unreachable;
        for (a) |tag_| {