const asynk = @import("async");
const std = @import("std");
const zml = @import("../../zml.zig");

const assert = std.debug.assert;
const log = std.log.scoped(.zml_io);

/// Errors that the GGUF reader can report.
pub const GgufErrors = error{
    ValueTypeMismatch,
    InvalidGguf,
    UnsupportedGgufType,
    EndOfMetadata,
    OutOfMemory,
};

// Enums and structures

/// On-disk tensor encodings, identified by their numeric id.
pub const TensorType = enum(u32) {
    f32 = 0,
    f16 = 1,
    q4_0 = 2,
    q4_1 = 3,
    deprecated_q4_2 = 4,
    deprecated_q4_3 = 5,
    q5_0 = 6,
    q5_1 = 7,
    q8_0 = 8,
    q8_1 = 9,
    // k-quantizations
    q2_k = 10,
    q3_k = 11,
    q4_k = 12,
    q5_k = 13,
    q6_k = 14,
    q8_k = 15,
    i8 = 16,
    i16 = 17,
    i32 = 18,

    /// Largest numeric id declared above.
    const MAX_KNOWN_ENUM = 18;

    /// Returns true for the quantized encodings listed in the switch below.
    pub fn canConvertQuant(self: TensorType) bool {
        switch (self) {
            .q2_k, .q4_0, .q4_1, .q4_k, .q6_k, .q8_0 => return true,
            else => return false,
        }
    }

    /// Maps the plain (non block-quantized) encodings to the matching
    /// `zml.DataType`; every other encoding yields `null`.
    pub fn toDtype(self: TensorType) ?zml.DataType {
        switch (self) {
            .f32 => return .f32,
            .f16 => return .f16,
            .i8 => return .i8,
            .i16 => return .i16,
            .i32 => return .i32,
            else => return null,
        }
    }

    /// Size of the matching `zml.DataType`.
    /// Unwraps `toDtype()` unconditionally, so it must only be called on
    /// encodings for which `toDtype()` is non-null.
    pub fn sizeOf(self: TensorType) usize {
        const dtype = self.toDtype().?;
        return dtype.sizeOf();
    }

    /// Looks up the block layout of this encoding in `TENSOR_TYPE_FEATURES`.
    pub fn getFeatures(t: TensorType) TensorTypeFeatures {
        switch (t) {
            // The table has one field per tag, named after the tag.
            inline else => |tag| return @field(TENSOR_TYPE_FEATURES, @tagName(tag)),
        }
    }
};

/// GGUF tensor type to features lookup table.
pub const TensorTypeFeatures = struct {
    /// Number of weights stored in one block.
    items_per_block: u29,
    /// Number of bytes one block occupies.
    bytes_per_block: u29,

    /// Integer base-2 logarithm of `bytes_per_block`.
    pub fn alignment(features: TensorTypeFeatures) u8 {
        const log2_bytes = std.math.log2_int(u29, features.bytes_per_block);
        return log2_bytes;
    }
};

/// One `TensorTypeFeatures` entry per `TensorType` tag.
/// The deprecated encodings carry an all-zero entry.
pub const TENSOR_TYPE_FEATURES: std.enums.EnumFieldStruct(TensorType, TensorTypeFeatures, null) = .{
    .f32 = .{ .items_per_block = 1, .bytes_per_block = @sizeOf(f32) },
    .f16 = .{ .items_per_block = 1, .bytes_per_block = @sizeOf(f16) },
    .q4_0 = .{ .items_per_block = 32, .bytes_per_block = 18 },
    .q4_1 = .{ .items_per_block = 32, .bytes_per_block = 20 },
    .deprecated_q4_2 = .{ .items_per_block = 0, .bytes_per_block = 0 },
    .deprecated_q4_3 = .{ .items_per_block = 0, .bytes_per_block = 0 },
    .q5_0 = .{ .items_per_block = 32, .bytes_per_block = 22 },
    .q5_1 = .{ .items_per_block = 32, .bytes_per_block = 24 },
    .q8_0 = .{ .items_per_block = 32, .bytes_per_block = 34 },
    .q8_1 = .{ .items_per_block = 32, .bytes_per_block = 40 },
    .q2_k = .{ .items_per_block = 256, .bytes_per_block = 82 },
    .q3_k = .{ .items_per_block = 256, .bytes_per_block = 110 },
    .q4_k = .{ .items_per_block = 256, .bytes_per_block = 144 },
    .q5_k = .{ .items_per_block = 256, .bytes_per_block = 176 },
    .q6_k = .{ .items_per_block = 256, .bytes_per_block = 210 },
    .q8_k = .{ .items_per_block = 256, .bytes_per_block = 292 },
    .i8 = .{ .items_per_block = 1, .bytes_per_block = @sizeOf(i8) },
    .i16 = .{ .items_per_block = 1, .bytes_per_block = @sizeOf(i16) },
    .i32 = .{ .items_per_block = 1, .bytes_per_block = @sizeOf(i32) },
};

/// Type id of a metadata value, as stored in the file.
pub const GgufValueType = enum(u32) {
    // 8-bit unsigned integer.
    uint8 = 0,
    // 8-bit signed integer.
    int8 = 1,
    // 16-bit unsigned little-endian integer.
    uint16 = 2,
    // 16-bit signed little-endian integer.
    int16 = 3,
    // 32-bit unsigned little-endian integer.
    uint32 = 4,
    // 32-bit signed little-endian integer.
    int32 = 5,
    // 32-bit IEEE754 floating point number.
    float32 = 6,
    // Boolean stored on 1 byte: 0 is false and 1 is true.
    // Anything else is invalid, and should be treated as either the model
    // being invalid or the reader being buggy.
    bool = 7,
    // UTF-8 string, not null-terminated, with its length prepended.
    string = 8,
    // Array of other values, with the length and type prepended.
    // Arrays can be nested, and the length of the array is the number of
    // elements in the array, not the number of bytes.
    array = 9,
    // 64-bit unsigned little-endian integer.
    uint64 = 10,
    // 64-bit signed little-endian integer.
    int64 = 11,
    // 64-bit IEEE754 floating point number.
    float64 = 12,
    // Special values used by the callbacks of gguf_do_with_value().
    array_start = 100,
    array_end = 101,
    // Allow other values in case GGUF adds more types without us noticing.
    _,

    /// In-memory size of one value of this type; for `.string` this is the
    /// size of a slice, not of the string content.
    /// Only scalar types and `.string` are handled: any other tag is
    /// `unreachable`.
    pub fn sizeOf(self: GgufValueType) usize {
        switch (self) {
            .uint8, .int8 => return @sizeOf(u8),
            .bool => return @sizeOf(bool),
            .uint16, .int16 => return @sizeOf(u16),
            .uint32, .int32 => return @sizeOf(u32),
            .float32 => return @sizeOf(f32),
            .uint64, .int64 => return @sizeOf(u64),
            .float64 => return @sizeOf(f64),
            .string => return @sizeOf([]u8),
            else => unreachable,
        }
    }

    /// Fails with `error.ValueTypeMismatch` when `T` is not the Zig type used
    /// to hold values of this GGUF type. Tags without a dedicated Zig type
    /// accept any `T`.
    pub fn arrayTypeCheck(self: GgufValueType, comptime T: type) !void {
        const type_matches = switch (self) {
            .string => T == []u8 or T == []const u8,
            .uint8 => T == u8,
            .int8 => T == i8,
            .uint16 => T == u16,
            .int16 => T == i16,
            .uint32 => T == u32,
            .int32 => T == i32,
            .float32 => T == f32,
            .bool => T == bool,
            .uint64 => T == u64,
            .int64 => T == i64,
            .float64 => T == f64,
            else => true,
        };
        if (!type_matches) return error.ValueTypeMismatch;
    }
};

/// Tag of `GgufValue`.
pub const ValueType = enum {
    uint8,
    int8,
    uint16,
    int16,
    uint32,
    int32,
    float32,
    uint64,
    int64,
    float64,
    boolval,
    string,
    array,
};

// Union of possible values.
pub const GgufValue = union(ValueType) {
    uint8: u8,
    int8: i8,
    uint16: u16,
    int16: i16,
    uint32: u32,
    int32: i32,
    float32: f32,
    uint64: u64,
    int64: i64,
    float64: f64,
    boolval: bool,
    string: []const u8,
    array: Array,

    pub const Array = struct {
        // Any value type is valid, including arrays.
        child: GgufValueType,
        // Number of elements, not bytes.
        len: usize,
        data: []u8,
    };

    /// Converts this value into the equivalent `zml.aio.Value`.
    /// Arrays keep their raw bytes and only have their item type translated;
    /// an array whose items are not scalars or strings is `unreachable`.
    pub fn asLoaderValue(self: GgufValue) zml.aio.Value {
        switch (self) {
            .array => |arr| return .{
                .array = .{
                    .item_type = switch (arr.child) {
                        .bool => .boolval,
                        .uint8 => .uint8,
                        .int8 => .int8,
                        .uint16 => .uint16,
                        .int16 => .int16,
                        .uint32 => .uint32,
                        .int32 => .int32,
                        .float32 => .float32,
                        .uint64 => .uint64,
                        .int64 => .int64,
                        .float64 => .float64,
                        .string => .string,
                        // TODO: .array => .array,
                        else => unreachable,
                    },
                    .data = arr.data,
                },
            },
            // Every other tag is forwarded to the field of the same name.
            inline else => |payload, tag| return @unionInit(zml.aio.Value, @tagName(tag), payload),
        }
    }
};

// Header
const GgufHeader = extern struct {
    // Magic number to announce that this is a GGUF file. Must be `GGUF`.
    magic: [4]u8,
    // The version of the format implemented.
    // Must be `3` for version described in this spec.
    version: u32,
    // The number of tensors in the file.
    // This is explicit, instead of being included in the metadata, to ensure
    // it is always present for loading the tensors.
    tensor_count: usize,
    // The number of metadata key-value pairs.
    metadata_kv_count: usize,

    /// Checks the magic bytes; logs and returns `error.InvalidGguf` when they
    /// are not `GGUF`. The version field is not inspected.
    pub fn validate(self: GgufHeader) !void {
        if (std.mem.eql(u8, &self.magic, "GGUF")) return;

        log.err("Invalid GGUF file: wrong header {s}", .{self.magic});
        return error.InvalidGguf;
    }
};

// Key representation in this library API.
pub const GgufMetadataKv = struct {
    name: []const u8,
    type_: GgufValueType,
    val: GgufValue,
};

// Tensor representation in this library API.
const GGUF_TENSOR_MAX_DIM: usize = 8; // Future-proof: actual limit is 4.

pub const GgufTensorInfo = struct {
    name: []const u8,
    t: TensorType, // Tensor type (enum TensorType).
    rank: usize, // Number of dimensions of the tensor.
    dims: [GGUF_TENSOR_MAX_DIM]i64, // Dimensions (Eg. [512, 1024, 1, 1]).
    start: usize, // Offset from start of data section.
    byte_len: usize, // Total size in bytes.
    num_weights: usize, // Total number of parameters.

    /// The first `rank` entries of `dims`.
    // NOTE(review): the slice points into the by-value `info` parameter;
    // presumably `inline` is what keeps it valid for the caller - confirm.
    pub inline fn shape(info: GgufTensorInfo) []const i64 {
        return info.dims[0..info.rank];
    }
};

// Return the value type name given the type ID.
fn getValueTypeName(t: u32) []const u8 {
    if (@as(usize, @intCast(t)) >= GGUF_VALUE_NAME.len) return "unknown";
    return GGUF_VALUE_NAME[@intCast(t)];
}

const GGUF_VALUE_NAME = [_][]const u8{
    "uint8",
    "int8",
    "uint16",
    "int16",
    "uint32",
    "int32",
    "float32",
    "bool",
    "string",
    "array",
    "uint64",
    "int64",
    "float64",
};

/// GGUF file API
/// A memory-mapped view of a .gguf file.
/// Format used by GGML models: https://github.com/ggerganov/ggml/
pub const GgufFile = struct {
    header: GgufHeader, // GGUF file header info.
    size: usize, // Total file size.
    file: zml.aio.MemoryMappedFile,
    left_kv: usize, // Number of key-value pairs yet to read.
    left_tensors: usize, // Number of tensors yet to read.
    off: usize, // Offset of the next item to parse.
    alignment: usize = 32, // File data alignment. Default: 32 bytes.

    /// Open and memmap the given file.
    /// Reads and validates the header; the returned context is positioned on
    /// the first metadata entry.
    pub fn open(path: []const u8) !GgufFile {
        const file = try asynk.File.open(path, .{});
        const header = try file.reader().readStruct(GgufHeader);
        try header.validate();
        return .{
            .header = header,
            .size = (try file.stat()).size,
            .file = try zml.aio.MemoryMappedFile.init(file),
            .off = @sizeOf(GgufHeader),
            .left_kv = header.metadata_kv_count,
            .left_tensors = header.tensor_count,
        };
    }

    /// Unmap the file memory and close the file handle.
    pub fn close(self: *GgufFile) void {
        self.file.deinit();
    }

    /// Set the context to read the first key-value entry in the GGUF
    /// file and then all the rest. Is used when creating a new context
    /// and also when you want to restart scanning the key-value
    /// items in the file.
    // NOTE(review): only the bookkeeping is reset here; the underlying file
    // position is not moved - confirm callers seek the file themselves.
    fn rewind(ctx: *GgufFile) void {
        ctx.off = @sizeOf(GgufHeader);
        ctx.left_kv = ctx.header.metadata_kv_count;
        ctx.left_tensors = ctx.header.tensor_count;
    }

    /// Sets the parse offset. `pos` must be inside the file.
    // NOTE(review): like `rewind`, this does not move the underlying file
    // position - confirm this is intended.
    pub fn seek(self: *GgufFile, pos: usize) void {
        assert(pos < self.size);
        self.off = pos;
    }

    /// Reads one little-endian integer of type `T` and advances `off`.
    /// Returns `error.InvalidGguf` when fewer than `@sizeOf(T)` bytes remain.
    fn readInt(self: *GgufFile, comptime T: type) !T {
        // A value ending exactly at the end of the file is still readable:
        // only reject reads that would run past it. The saturating
        // subtraction also covers an offset that is already out of bounds.
        if (@sizeOf(T) > self.size -| self.off) return error.InvalidGguf;
        // Only advance the offset once the read has actually succeeded.
        const res = try self.file.file.reader().readInt(T, .little);
        self.off += @sizeOf(T);
        return res;
    }

    /// Reads a tensor type id, rejecting ids above the largest known one.
    fn readTensorType(self: *GgufFile) !TensorType {
        const raw = try self.readInt(u32);
        if (raw > TensorType.MAX_KNOWN_ENUM) {
            log.err("Unsupported GGUF tensor type: {d}", .{raw});
            return error.UnsupportedGgufType;
        }
        return @enumFromInt(raw);
    }

    /// Reads a metadata value type id, rejecting ids that have no named tag.
    fn readValueType(self: *GgufFile) !GgufValueType {
        const raw = try self.readInt(u32);
        const t: GgufValueType = @enumFromInt(raw);
        switch (t) {
            .uint8,
            .int8,
            .uint16,
            .int16,
            .uint32,
            .int32,
            .float32,
            .bool,
            .string,
            .array,
            .uint64,
            .int64,
            .float64,
            .array_start,
            .array_end,
            => return t,
            else => {
                // `t` has no name in this branch, so `@tagName` must not be
                // used on it: report the raw id instead.
                log.err("Unsupported GGUF value type: {d}", .{raw});
                return error.UnsupportedGgufType;
            },
        }
    }

    /// Reads exactly `len` bytes into a freshly allocated buffer.
    /// Caller owns the returned slice. Returns `error.InvalidGguf` when the
    /// file does not hold `len` more bytes.
    pub fn readAlloc(self: *GgufFile, allocator: std.mem.Allocator, len: usize) ![]u8 {
        // `len` comes from the file: refuse to allocate more than what is
        // left to read.
        if (len > self.size -| self.off) return error.InvalidGguf;
        const data = try allocator.alloc(u8, len);
        errdefer allocator.free(data);
        const read = try self.file.file.reader().readAll(data);
        if (read != data.len) return error.InvalidGguf;
        self.off += len;
        return data;
    }

    /// Moves `len` bytes forward without reading them.
    /// Returns `error.InvalidGguf` when this would go past the end of the file.
    pub fn skipBytes(self: *GgufFile, len: usize) !void {
        // Bounding by the remaining size also keeps the cast below in range.
        if (len > self.size -| self.off) return error.InvalidGguf;
        try self.file.file.seekBy(@intCast(len));
        self.off += len;
    }

    /// Read the len then the actual bytes.
    /// Caller owns the returned slice.
    pub fn readString(self: *GgufFile, allocator: std.mem.Allocator) ![]u8 {
        const len: usize = try self.readInt(u64);
        return self.readAlloc(allocator, len);
    }

    /// Read the len then skip that many bytes.
    pub fn skipString(self: *GgufFile) !void {
        const len: usize = try self.readInt(u64);
        return self.skipBytes(len);
    }

    /// Reads an array value: item type, item count, then the items.
    /// For strings, `data` holds the bytes of a `[]u8` slice per item; for
    /// scalars it holds the raw item bytes. Nested arrays and marker types
    /// are reported as `error.UnsupportedGgufType`.
    fn readArrayHeader(self: *GgufFile, allocator: std.mem.Allocator) !GgufValue.Array {
        const child = try self.readValueType();
        const len: usize = try self.readInt(u64);
        const data: []u8 = switch (child) {
            // Since strings have variable lengths, we need to read them one by one.
            .string => str: {
                // Each string carries at least its 8-byte length prefix, which
                // bounds how many of them the rest of the file can hold.
                const min_bytes = std.math.mul(usize, len, @sizeOf(u64)) catch return error.InvalidGguf;
                if (min_bytes > self.size -| self.off) return error.InvalidGguf;

                const strings = try allocator.alloc([]u8, len);
                var loaded: usize = 0;
                errdefer {
                    for (strings[0..loaded]) |s| allocator.free(s);
                    allocator.free(strings);
                }
                while (loaded < len) : (loaded += 1) {
                    strings[loaded] = try self.readString(allocator);
                }
                break :str std.mem.sliceAsBytes(strings);
            },
            .uint8,
            .int8,
            .uint16,
            .int16,
            .uint32,
            .int32,
            .float32,
            .bool,
            .uint64,
            .int64,
            .float64,
            => scalars: {
                const byte_len = std.math.mul(usize, len, child.sizeOf()) catch return error.InvalidGguf;
                break :scalars try self.readAlloc(allocator, byte_len);
            },
            // `sizeOf` is not defined for the remaining types.
            else => return error.UnsupportedGgufType,
        };
        return .{
            .child = child,
            .len = len,
            .data = data,
        };
    }

    /// Reads one value of type `t`. Strings and arrays are allocated with
    /// `allocator`.
    fn readTypedValue(self: *GgufFile, allocator: std.mem.Allocator, t: GgufValueType) !GgufValue {
        return switch (t) {
            .uint8 => .{ .uint8 = try self.readInt(u8) },
            .int8 => .{ .int8 = try self.readInt(i8) },
            .uint16 => .{ .uint16 = try self.readInt(u16) },
            .int16 => .{ .int16 = try self.readInt(i16) },
            .uint32 => .{ .uint32 = try self.readInt(u32) },
            .int32 => .{ .int32 = try self.readInt(i32) },
            .float32 => .{ .float32 = @bitCast(try self.readInt(u32)) },
            .bool => .{ .boolval = try self.readInt(u8) != 0 },
            .string => .{ .string = try self.readString(allocator) },
            .array => .{ .array = try self.readArrayHeader(allocator) },
            .uint64 => .{ .uint64 = try self.readInt(u64) },
            .int64 => .{ .int64 = try self.readInt(i64) },
            .float64 => .{ .float64 = @bitCast(try self.readInt(u64)) },
            else => error.UnsupportedGgufType,
        };
    }

    /// Parses the next metadata entry.
    /// Returns error.EndOfMetadata if there are no longer metadata to process in this GGUF file.
    pub fn readMetadata(self: *GgufFile, allocator: std.mem.Allocator) !GgufMetadataKv {
        if (self.left_kv == 0) return error.EndOfMetadata;
        self.left_kv -= 1;
        const name = try self.readString(allocator);
        // Do not leak the key when its value turns out to be unreadable.
        errdefer allocator.free(name);
        const type_ = try self.readValueType();
        const val: GgufValue = try self.readTypedValue(allocator, type_);
        return .{ .name = name, .type_ = type_, .val = val };
    }

    // Set the data section offset. This function must be called exactly when
    // all the key-values are consumed, in the context of the first call of
    // ctx.getTensor(): this way we will be able to return tensor offsets
    // as absolute positions and pointers to the mmapped file.
    fn setDataOffset(self: *GgufFile) !void {
        const base_off = self.off;
        assert(self.left_kv == 0 and self.left_tensors == self.header.tensor_count);
        for (0..self.left_tensors) |_| try self.skipTensor();
        const padding: usize = getAlignmentPadding(self.alignment, self.off);
        self.file.data_offset = self.off + padding;
        // Go back to the first tensor entry.
        try self.file.file.seekTo(base_off);
        self.off = base_off;
    }

    /// Skips one tensor entry without decoding it.
    pub fn skipTensor(self: *GgufFile) !void {
        try self.skipString(); // Skip name
        // Widen before multiplying so a bogus count cannot overflow `u32`.
        const num_dim: usize = try self.readInt(u32);
        // dimensions, type, and offset.
        try self.skipBytes(8 * num_dim + 4 + 8);
    }

    /// Parses the next tensor entry.
    /// Returns error.EndOfMetadata if there are no longer tensor metadata to process in this GGUF file.
    /// Returns error.InvalidGguf when the entry is malformed (too many
    /// dimensions, sizes that do not fit) and error.UnsupportedGgufType for
    /// tensor types without a block layout.
    pub fn readTensorInfo(self: *GgufFile, allocator: std.mem.Allocator) !GgufTensorInfo {
        if (self.left_tensors == 0 or self.left_kv != 0) {
            return error.EndOfMetadata;
        }

        // We want to return tensor data with offsets relative to the start
        // of the file, so that the user of the API is able to access tensors
        // as it iterates over them. To do so, we need to perform a full
        // scan if this is the first tensor info we are reading.
        // TODO: explicitly set the data offset
        if (self.file.data_offset == 0) try self.setDataOffset();

        self.left_tensors -= 1;
        const name = try self.readString(allocator);
        errdefer allocator.free(name);

        const num_dim: usize = try self.readInt(u32);
        // The count comes from the file: it must be checked in every build
        // mode before it is used to index `dims`.
        if (num_dim > GGUF_TENSOR_MAX_DIM) {
            log.err("Invalid GGUF tensor: {d} dimensions", .{num_dim});
            return error.InvalidGguf;
        }

        // Read the dimensions; unused dimensions are left `undefined`.
        // Note: we reverse the order of the dimensions to match zml convention.
        var dims: [GGUF_TENSOR_MAX_DIM]i64 = undefined;
        var num_weights: usize = 1;
        for (0..num_dim) |j| {
            const d = try self.readInt(u64);
            dims[num_dim - 1 - j] = std.math.cast(i64, d) orelse return error.InvalidGguf;
            num_weights = std.math.mul(usize, num_weights, d) catch return error.InvalidGguf;
        }

        const t: TensorType = try self.readTensorType();
        const start = try self.readInt(u64);

        // To accurately calculate the bytes used by this tensor on the GGUF
        // file, we need to take into account that quantization methods store
        // tensors as block of N weights. So first of all we need to understand
        // the number of padding weights (since the last block may have just
        // fewer weights stored inside, but still requires to be stored to its full
        // length). Then we can do the math to see how many blocks we need, and
        // multiply by the block size to obtain the final total size.
        const tf = t.getFeatures();
        // The deprecated types have an all-zero entry in the features table.
        if (tf.items_per_block == 0) {
            log.err("Unsupported GGUF tensor type: {s}", .{@tagName(t)});
            return error.UnsupportedGgufType;
        }
        const num_blocks = std.math.divCeil(usize, num_weights, tf.items_per_block) catch return error.InvalidGguf;
        const byte_len = std.math.mul(usize, num_blocks, tf.bytes_per_block) catch return error.InvalidGguf;

        return .{
            .name = name,
            .t = t,
            .rank = num_dim,
            .dims = dims,
            .start = start,
            .byte_len = byte_len,
            .num_weights = num_weights,
        };
    }
};

/// Given an offset or a length, returns the padding needed to align it to alignment.
/// `alignment` must not be zero.
fn getAlignmentPadding(alignment: usize, offset: usize) usize {
    return @rem((alignment - @rem(offset, alignment)), alignment);
}