diff --git a/zml/aio.zig b/zml/aio.zig index 595a6b4..358903b 100644 --- a/zml/aio.zig +++ b/zml/aio.zig @@ -383,6 +383,18 @@ fn _populateStruct( return false; } }, + .Array => |arr_info| { + for (obj, 0..) |*value, i| { + try prefix_builder.pushDigit(allocator, i); + defer prefix_builder.pop(); + const found = try _populateStruct(allocator, prefix_builder, unique_id, buffer_store, value, required); + if (!found) { + log.err("Not able to load {s} as {s}", .{ prefix_builder.data.items, @typeName(arr_info.child) }); + return false; + } + } + return true; + }, .Struct => |struct_info| { var partial_struct = false; inline for (struct_info.fields) |field| { @@ -594,7 +606,7 @@ fn visitStructAndLoadBuffer(allocator: std.mem.Allocator, prefix_builder: *Prefi } else { return error.BufferNotFound; }; - } + } else if (T == zml.Shape) return; switch (type_info) { .Pointer => |ptr_info| { @@ -605,8 +617,16 @@ fn visitStructAndLoadBuffer(allocator: std.mem.Allocator, prefix_builder: *Prefi try visitStructAndLoadBuffer(allocator, prefix_builder, buffer_store, value, platform); } - } else return error.TypeNotSupported; + } else zml.meta.compileError("type not supported by visitStructAndLoadBuffer: {}", .{T}); }, + .Array => { + for (obj, 0..) |*value, i| { + try prefix_builder.pushDigit(allocator, i); + defer prefix_builder.pop(); + try visitStructAndLoadBuffer(allocator, prefix_builder, buffer_store, value, platform); + } + }, + .Struct => |struct_info| { inline for (struct_info.fields) |field| { if (field.is_comptime or @sizeOf(field.type) == 0) continue; diff --git a/zml/buffer.zig b/zml/buffer.zig index 9dc28c3..9205d8f 100644 --- a/zml/buffer.zig +++ b/zml/buffer.zig @@ -140,14 +140,20 @@ pub const Buffer = struct { try std.testing.expectEqual([_]u16{42} ** (4 * 3 * 2), y); } - /// Creates a Buffer as a view of memory visible from the device, + /// Creates a Buffer as a view of host memory visible from the device, /// thus avoiding a copy. 
/// - /// On CUDA, it also allows you to specify a host allocated slice as they seem to be accessible. - /// Be careful though, as it requires a specific alignment. - /// Also note that it might not work on all platforms, - /// could lead to crashes and is considerably slower. - pub fn asViewOf(platform: Platform, buf: HostBuffer) !Buffer { + /// Be careful though, as it requires a specific alignment + /// and it might not work on all platforms, + /// could lead to crashes and operations on the buffer will be slower. + /// Tested on Cuda 12.4. + pub fn asViewOfHostBuffer(platform: Platform, buf: HostBuffer) !Buffer { + return asViewOfDeviceBuffer(platform, buf.shape(), null, @constCast(@ptrCast(buf.data.ptr))); + } + + /// Creates a Buffer from a pointer into device memory. + /// This allows interfacing with other libraries producing buffers. + pub fn asViewOfDeviceBuffer(platform: Platform, shape_: Shape, stream: ?*const anyopaque, device_data: *anyopaque) !Buffer { const minor_to_major: [Shape.MAX_RANK]i64 = comptime blk: { var res: [Shape.MAX_RANK]i64 = undefined; for (0..Shape.MAX_RANK) |i| { @@ -156,26 +162,28 @@ break :blk res; }; + const device_bytes: [*]u8 = @ptrCast(device_data); const pjrt_buffer = try platform.pjrt_client.createViewOfDeviceBuffer(platform.pjrt_api, .{ - .data = buf.data, - .element_type = bufferTypeFromDtype(buf.shape().dtype()), - .dims = buf.shape().dims(), - // TODO: split in shards + .data = device_bytes[0..shape_.byteSize()], + .element_type = bufferTypeFromDtype(shape_.dtype()), + .dims = shape_.dims(), + // TODO: expose sharding in the API. 
.device = platform.getDevices()[0], .layout = .{ .Tiled = .{ - .minor_to_major = minor_to_major[Shape.MAX_RANK - buf.shape().rank() ..], + .minor_to_major = minor_to_major[Shape.MAX_RANK - shape_.rank() ..], .tile_dims = &.{}, .tile_dims_sizes = &.{}, }, }, + .stream = @bitCast(@as(usize, @intFromPtr(stream))), }); var shards: Shards = .{}; shards.appendAssumeCapacity(pjrt_buffer); return .{ ._api = platform.pjrt_api, - ._shape = buf.shape(), + ._shape = shape_, ._shards = shards, }; } diff --git a/zml/floats.zig b/zml/floats.zig index 5adb522..ef09fa0 100644 --- a/zml/floats.zig +++ b/zml/floats.zig @@ -213,6 +213,7 @@ test BFloat16 { try std.testing.expectEqual(BFloat16.fromF32(3.02344107628), BFloat16{ .sign = 0, .exponent = 127 + 1, .mantissa = 65 }); try std.testing.expectEqual(BFloat16.fromF32(1.0 / 128.0), BFloat16{ .sign = 0, .exponent = 127 - 7, .mantissa = 0 }); try std.testing.expectEqual(std.mem.toBytes(BFloat16.inf().neg()), [_]u8{ 0x80, 0xff }); + try std.testing.expectEqual(BFloat16.inf(), BFloat16.fromF32(std.math.inf(f32))); const lossless = [_]f32{ 0, -2, 1.0 / 128.0, -1e64, std.math.inf(f32) }; for (&lossless) |v| { diff --git a/zml/helpers.zig b/zml/helpers.zig index 1c01bd9..2856bf2 100644 --- a/zml/helpers.zig +++ b/zml/helpers.zig @@ -139,34 +139,3 @@ fn ShapeStruct(comptime dims: anytype) type { .is_tuple = false, } }); } - -/// Return a new struct with all tensors replaced by the output of the given function. -pub fn mapTensors(func: anytype, v: anytype, args: anytype) @TypeOf(v) { - const T = @TypeOf(v); - const type_info = @typeInfo(T); - if (T == Tensor) return @call(.auto, func, .{v} ++ args); - - return switch (type_info) { - .Pointer => @compileError("mapTensors only accept by value arguments. 
Received: " ++ @typeName(T)), - .Struct => |struct_info| { - var copy: T = v; - inline for (struct_info.fields) |feeld| { - if (feeld.is_comptime) continue; - if (@typeInfo(feeld.type) == .Pointer) { - @compileError("mapTensors doesn't follow pointers and don't accept struct containing them. Received: " ++ @typeName(T)); - } - @field(copy, feeld.name) = mapTensors(func, @field(v, feeld.name), args); - } - return copy; - }, - .Array => { - var res: T = undefined; - for (v, &res) |item, *r| { - r.* = mapTensors(func, item, args); - } - return res; - }, - .Union, .Optional => @compileError("mapTensors doesn't yet support " ++ @typeName(T)), - else => v, - }; -} diff --git a/zml/ops.zig b/zml/ops.zig index 8d65b18..f9044d8 100644 --- a/zml/ops.zig +++ b/zml/ops.zig @@ -303,7 +303,7 @@ pub fn for_(comptime func: anytype, blk_ctx: BlockSign(func).BlkCtx, num_steps_: return Tensor.constant(shape, x.dtype().zero()); } - fn wrapFirstStep(x: Tensor, tag_: @TypeOf(step_tag)) Tensor { + fn wrapFirstStep(tag_: @TypeOf(step_tag), x: Tensor) Tensor { var shape = x.shape(); shape._dims.insert(0, 1) catch unreachable; shape._tags.insert(0, tag_) catch unreachable; @@ -315,13 +315,14 @@ pub fn for_(comptime func: anytype, blk_ctx: BlockSign(func).BlkCtx, num_steps_: // it's only used to infer the output shapes. 
const first_step = @call(.auto, func, .{ blk_ctx, Tensor.scalar(0, .i32) }); log.debug("for_ first_step: {}", .{first_step}); + const allocator = CompilationContext.current()._allocator; // Optimize for small num reps if (num_steps == 1) { - // return helpers.mapTensors(ForBlk.wrapFirstStep, first_step, .{ step_tag }); - return first_step; + var res = first_step; + meta.mapAlloc(ForBlk.wrapFirstStep, allocator, step_tag, first_step, &res) catch unreachable; + return res; } - const allocator = CompilationContext.current()._allocator; if (num_steps <= 4) { var steps: [4]S.Return = undefined; steps[0] = first_step; @@ -368,16 +369,19 @@ test for_ { // Just one baby step { const squares = try zml.testing.compileAndCall(platform, Squares.forward, .{1}); + try zml.testing.expectEqualShapes(Shape.init(.{1}, .f32), squares.shape()); try std.testing.expectEqual(0, squares.getValue(f32)); } // Wow 4 in rows ! { const squares = try zml.testing.compileAndCall(platform, Squares.forward, .{4}); + try zml.testing.expectEqualShapes(Shape.init(.{4}, .f32), squares.shape()); try std.testing.expectEqual([_]f32{ 0, 1, 4, 9 }, try squares.getValue([4]f32)); } // AGI is coming, computing 10 squares as it's nothing. { const squares = try zml.testing.compileAndCall(platform, Squares.forward, .{10}); + try zml.testing.expectEqualShapes(Shape.init(.{10}, .f32), squares.shape()); try std.testing.expectEqual( [_]f32{ 0, 1, 4, 9, 16, 25, 36, 49, 64, 81 }, try squares.getValue([10]f32), diff --git a/zml/tensor.zig b/zml/tensor.zig index 69aeddd..8be051f 100644 --- a/zml/tensor.zig +++ b/zml/tensor.zig @@ -1908,9 +1908,9 @@ pub const Tensor = struct { } pub const Pad = struct { - low: i32 = 0, - high: i32 = 0, - interior: i32 = 0, + low: i64 = 0, + high: i64 = 0, + interior: i64 = 0, }; /// Pads the input Tensor with the given values. 
@@ -2542,6 +2542,26 @@ pub const Tensor = struct { try std.testing.expect(a.shape().eql(result.shape())); try std.testing.expectEqual(expected, result.getValue(@TypeOf(expected))); } + // Test with setting individual values (no batching) + { + const a_host = try zml.HostBuffer.arange(std.testing.allocator, .{ .end = 9 }, .i32); + const a = try zml.Buffer.from(platform, a_host); + defer a.deinit(); + a_host.deinit(std.testing.allocator); + + const scatter_indices = try zml.Buffer.fromArray(platform, [2][1]i32{ .{2}, .{7} }); + const updates = try zml.Buffer.fromArray(platform, [2]i32{ 20, 70 }); + + const expected = [9]i32{ 0, 1, 22, 3, 4, 5, 6, 77, 8 }; + const result = try zml.testing.compileAndCall(platform, Local.scatter, .{ + a, + a.shape().axes(.{0}), + scatter_indices.withTags(.{ .n, .coord }), + updates.withTags(.{.n}), + }); + try std.testing.expect(a.shape().eql(result.shape())); + try std.testing.expectEqual(expected, result.getValue(@TypeOf(expected))); + } { // Test with actual values and batching along axis .a const operand = try zml.Buffer.constant(platform, Shape.init(.{ .a = 2, .b = 3, .c = 4, .d = 2 }, .u16), 0);