Radix/zml/aio/gguf/core.zig

530 lines
19 KiB
Zig

const asynk = @import("async");
const std = @import("std");
const zml = @import("../../zml.zig");
const assert = std.debug.assert;
const log = std.log.scoped(.zml_io);
/// Errors that the GGUF reader in this file can report.
pub const GgufErrors = error{
    /// No more metadata / tensor entries are left to read.
    EndOfMetadata,
    /// The file content does not look like valid GGUF (bad magic, truncated data, ...).
    InvalidGguf,
    /// An allocation failed.
    OutOfMemory,
    /// The file uses a tensor or value type ID this reader does not handle.
    UnsupportedGgufType,
    /// The Zig type requested does not match the stored value type.
    ValueTypeMismatch,
};
// Enums and structures
/// Tensor storage formats, identified by the raw u32 tag stored in the file.
///
/// NOTE(review): recent ggml revisions appear to assign 16..18 to IQ
/// quantization formats and to number i8/i16/i32 differently. Confirm these
/// tags against the GGUF revision this reader targets.
pub const TensorType = enum(u32) {
    f32 = 0,
    f16 = 1,
    q4_0 = 2,
    q4_1 = 3,
    deprecated_q4_2 = 4,
    deprecated_q4_3 = 5,
    q5_0 = 6,
    q5_1 = 7,
    q8_0 = 8,
    q8_1 = 9,
    // k-quantizations
    q2_k = 10,
    q3_k = 11,
    q4_k = 12,
    q5_k = 13,
    q6_k = 14,
    q8_k = 15,
    i8 = 16,
    i16 = 17,
    i32 = 18,

    /// Largest raw tag value declared above.
    const MAX_KNOWN_ENUM = 18;

    /// Returns true for the quantized formats listed here as convertible,
    /// false for every other tensor type.
    pub fn canConvertQuant(self: TensorType) bool {
        switch (self) {
            .q4_0, .q4_1, .q8_0, .q2_k, .q4_k, .q6_k => return true,
            else => return false,
        }
    }

    /// Maps the plain (non-quantized) formats to the matching `zml.DataType`.
    /// Returns null for every other format.
    pub fn toDtype(self: TensorType) ?zml.DataType {
        switch (self) {
            .f16 => return .f16,
            .f32 => return .f32,
            .i8 => return .i8,
            .i16 => return .i16,
            .i32 => return .i32,
            else => return null,
        }
    }

    /// Size reported by the matching `zml.DataType`.
    /// Unwraps `toDtype()` unconditionally, so it must only be called on
    /// formats for which `toDtype()` is non-null.
    pub fn sizeOf(self: TensorType) usize {
        const dtype = self.toDtype().?;
        return dtype.sizeOf();
    }

    /// Looks up this type's entry in `TENSOR_TYPE_FEATURES`.
    pub fn getFeatures(t: TensorType) TensorTypeFeatures {
        switch (t) {
            // The table has one field per tag, named after the tag.
            inline else => |tag| return @field(TENSOR_TYPE_FEATURES, @tagName(tag)),
        }
    }
};
/// Block layout of a GGUF tensor type: how many weights one block packs and
/// how many bytes that block takes.
pub const TensorTypeFeatures = struct {
    /// Number of weights stored in one block.
    items_per_block: u29,
    /// Size of one block, in bytes.
    bytes_per_block: u29,

    /// Returns floor(log2(bytes_per_block)).
    ///
    /// A zero block size (as used by the deprecated entries of
    /// `TENSOR_TYPE_FEATURES`) yields 0 instead of tripping the non-zero
    /// assertion inside `std.math.log2_int`.
    pub fn alignment(features: TensorTypeFeatures) u8 {
        if (features.bytes_per_block == 0) return 0;
        return std.math.log2_int(u29, features.bytes_per_block);
    }
};
/// Lookup table from tensor type to block layout: one field per `TensorType`
/// tag, read by `TensorType.getFeatures`.
///
/// NOTE(review): the quantized block sizes should be re-checked against the
/// ggml revision targeted. In particular ggml's `block_q2_K` looks like
/// 16 + 64 + 2 + 2 = 84 bytes, not 82 - please confirm.
pub const TENSOR_TYPE_FEATURES: std.enums.EnumFieldStruct(TensorType, TensorTypeFeatures, null) = .{
    // Plain element types: one item per "block".
    .f32 = .{ .bytes_per_block = @sizeOf(f32), .items_per_block = 1 },
    .f16 = .{ .bytes_per_block = @sizeOf(f16), .items_per_block = 1 },
    .i8 = .{ .bytes_per_block = @sizeOf(i8), .items_per_block = 1 },
    .i16 = .{ .bytes_per_block = @sizeOf(i16), .items_per_block = 1 },
    .i32 = .{ .bytes_per_block = @sizeOf(i32), .items_per_block = 1 },

    // Deprecated formats: no layout information.
    .deprecated_q4_2 = .{ .bytes_per_block = 0, .items_per_block = 0 },
    .deprecated_q4_3 = .{ .bytes_per_block = 0, .items_per_block = 0 },

    // Quantizations with 32 items per block.
    .q4_0 = .{ .bytes_per_block = 18, .items_per_block = 32 },
    .q4_1 = .{ .bytes_per_block = 20, .items_per_block = 32 },
    .q5_0 = .{ .bytes_per_block = 22, .items_per_block = 32 },
    .q5_1 = .{ .bytes_per_block = 24, .items_per_block = 32 },
    .q8_0 = .{ .bytes_per_block = 34, .items_per_block = 32 },
    .q8_1 = .{ .bytes_per_block = 40, .items_per_block = 32 },

    // k-quantizations: 256 items per block.
    .q2_k = .{ .bytes_per_block = 82, .items_per_block = 256 },
    .q3_k = .{ .bytes_per_block = 110, .items_per_block = 256 },
    .q4_k = .{ .bytes_per_block = 144, .items_per_block = 256 },
    .q5_k = .{ .bytes_per_block = 176, .items_per_block = 256 },
    .q6_k = .{ .bytes_per_block = 210, .items_per_block = 256 },
    .q8_k = .{ .bytes_per_block = 292, .items_per_block = 256 },
};
/// Type IDs of GGUF metadata values, as stored in the file (u32).
pub const GgufValueType = enum(u32) {
    /// 8-bit unsigned integer.
    uint8 = 0,
    /// 8-bit signed integer.
    int8 = 1,
    /// 16-bit unsigned little-endian integer.
    uint16 = 2,
    /// 16-bit signed little-endian integer.
    int16 = 3,
    /// 32-bit unsigned little-endian integer.
    uint32 = 4,
    /// 32-bit signed little-endian integer.
    int32 = 5,
    /// 32-bit IEEE754 floating point number.
    float32 = 6,
    /// Boolean: 1-byte value where 0 is false and 1 is true.
    /// Anything else is invalid, and should be treated as either the model
    /// being invalid or the reader being buggy.
    bool = 7,
    /// UTF-8 non-null-terminated string, with length prepended.
    string = 8,
    /// Array of other values, with the length and type prepended.
    /// Arrays can be nested, and the length of the array is the number of
    /// elements in the array, not the number of bytes.
    array = 9,
    /// 64-bit unsigned little-endian integer.
    uint64 = 10,
    /// 64-bit signed little-endian integer.
    int64 = 11,
    /// 64-bit IEEE754 floating point number.
    float64 = 12,
    /// Special values used by the callbacks of gguf_do_with_value().
    array_start = 100,
    array_end = 101,
    // Non-exhaustive: tolerate IDs GGUF may add without us noticing.
    _,

    /// In-memory size of one value of this type.
    /// For `.string` this is the size of a slice (`[]u8`), not of the text.
    /// Must not be called on `.array`, `.array_start`, `.array_end` or an
    /// unnamed ID: those reach `unreachable`.
    pub fn sizeOf(self: GgufValueType) usize {
        switch (self) {
            .uint8, .int8 => return @sizeOf(u8),
            .bool => return @sizeOf(bool),
            .uint16, .int16 => return @sizeOf(u16),
            .uint32, .int32, .float32 => return @sizeOf(u32),
            .uint64, .int64, .float64 => return @sizeOf(u64),
            .string => return @sizeOf([]u8),
            else => unreachable,
        }
    }

    /// Checks that an array whose elements have this value type can be viewed
    /// as elements of `T`. Returns error.ValueTypeMismatch when it cannot.
    /// Types without a scalar/string mapping (e.g. `.array`) always pass.
    pub fn arrayTypeCheck(self: GgufValueType, comptime T: type) !void {
        const compatible = switch (self) {
            .uint8 => T == u8,
            .int8 => T == i8,
            .uint16 => T == u16,
            .int16 => T == i16,
            .uint32 => T == u32,
            .int32 => T == i32,
            .uint64 => T == u64,
            .int64 => T == i64,
            .float32 => T == f32,
            .float64 => T == f64,
            .bool => T == bool,
            .string => T == []u8 or T == []const u8,
            else => true,
        };
        if (!compatible) return error.ValueTypeMismatch;
    }
};
// Tags of the `GgufValue` union: one per scalar kind, plus `string` and `array`.
// Note: the boolean tag is named `boolval` here, whereas `GgufValueType`
// calls the on-disk type `bool`.
pub const ValueType = enum {
uint8,
int8,
uint16,
int16,
uint32,
int32,
float32,
uint64,
int64,
float64,
boolval,
string,
array,
};
/// A decoded GGUF metadata value, tagged by `ValueType`.
pub const GgufValue = union(ValueType) {
    uint8: u8,
    int8: i8,
    uint16: u16,
    int16: i16,
    uint32: u32,
    int32: i32,
    float32: f32,
    uint64: u64,
    int64: i64,
    float64: f64,
    boolval: bool,
    string: []const u8,
    array: Array,

    /// Raw payload of an array value.
    pub const Array = struct {
        /// Element type. Any value type is valid, including arrays.
        child: GgufValueType,
        /// Number of elements, not bytes.
        len: usize,
        /// Element storage viewed as bytes.
        data: []u8,
    };

    /// Converts this value into the loader-level `zml.aio.Value`.
    ///
    /// Scalars and strings are forwarded under the same tag name. Arrays
    /// carry over their element type and raw data. Arrays whose element type
    /// is not a scalar or string (e.g. nested arrays) are not supported and
    /// reach `unreachable`.
    pub fn asLoaderValue(self: GgufValue) zml.aio.Value {
        switch (self) {
            .array => |arr| {
                return .{ .array = .{
                    .item_type = switch (arr.child) {
                        .uint8 => .uint8,
                        .uint16 => .uint16,
                        .uint32 => .uint32,
                        .uint64 => .uint64,
                        .int8 => .int8,
                        .int16 => .int16,
                        .int32 => .int32,
                        .int64 => .int64,
                        .float32 => .float32,
                        .float64 => .float64,
                        .bool => .boolval,
                        .string => .string,
                        // TODO: .array => .array,
                        else => unreachable,
                    },
                    .data = arr.data,
                } };
            },
            // Every other tag exists under the same name in zml.aio.Value.
            inline else => |payload, tag| return @unionInit(zml.aio.Value, @tagName(tag), payload),
        }
    }
};
/// Fixed-layout header read from the very start of a GGUF file.
const GgufHeader = extern struct {
    /// Magic number announcing a GGUF file. Must be `GGUF`.
    magic: [4]u8,
    /// The version of the format implemented.
    /// Must be `3` for the version described in the spec this was written against.
    version: u32,
    /// The number of tensors in the file.
    /// This is explicit, instead of being included in the metadata, to ensure
    /// it is always present for loading the tensors.
    tensor_count: usize,
    /// The number of metadata key-value pairs.
    metadata_kv_count: usize,

    /// Checks the magic bytes only; `version` is not inspected.
    /// Logs and returns error.InvalidGguf when the magic is not "GGUF".
    pub fn validate(self: GgufHeader) !void {
        const magic_matches = std.mem.eql(u8, &self.magic, "GGUF");
        if (magic_matches) return;

        log.err("Invalid GGUF file: wrong header {s}", .{self.magic});
        return error.InvalidGguf;
    }
};
// One metadata key-value entry, as returned by `GgufFile.readMetadata`.
pub const GgufMetadataKv = struct {
// Key of the entry.
name: []const u8,
// Value type ID exactly as read from the file.
type_: GgufValueType,
// Decoded value.
val: GgufValue,
};
// Tensor representation in this library API.
// Upper bound on the number of dimensions stored per tensor.
const GGUF_TENSOR_MAX_DIM: usize = 8; // Future-proof: actual limit is 4.
// Description of one tensor, as returned by `GgufFile.readTensorInfo`.
pub const GgufTensorInfo = struct {
name: []const u8,
t: TensorType, // Tensor type (enum TensorType).
rank: usize, // Number of dimensions of the tensor.
// Only the first `rank` entries are meaningful; `readTensorInfo` stores them
// in reverse of the file order and leaves the remaining entries undefined.
dims: [GGUF_TENSOR_MAX_DIM]i64, // Dimensions (Eg. [512, 1024, 1, 1]).
start: usize, // Offset from start of data section.
byte_len: usize, // Total size in bytes.
num_weights: usize, // Total number of parameters.
// Returns the meaningful dimensions, i.e. the first `rank` entries of `dims`.
// NOTE(review): the slice points into `info`, which is a by-value parameter;
// this presumably relies on `inline` so that it aliases the caller's copy -
// confirm before changing the signature or dropping `inline`.
pub inline fn shape(info: GgufTensorInfo) []const i64 {
return info.dims[0..info.rank];
}
};
/// Names of the GGUF value types, indexed by their raw type ID (0..12).
const GGUF_VALUE_NAME = [_][]const u8{
    "uint8",
    "int8",
    "uint16",
    "int16",
    "uint32",
    "int32",
    "float32",
    "bool",
    "string",
    "array",
    "uint64",
    "int64",
    "float64",
};

/// Returns the value type name for a raw type ID, or "unknown" when the ID
/// is past the end of `GGUF_VALUE_NAME`.
fn getValueTypeName(t: u32) []const u8 {
    const index: usize = @intCast(t);
    return if (index < GGUF_VALUE_NAME.len) GGUF_VALUE_NAME[index] else "unknown";
}
/// GGUF file API
/// A memory-mapped view of a .gguf file.
/// Format used by GGML models: https://github.com/ggerganov/ggml/
///
/// Parsing is sequential: all metadata key-values are read first
/// (`readMetadata`), then the tensor descriptions (`readTensorInfo`).
/// Lengths and counts come from the file and are treated as untrusted:
/// malformed values are reported as errors instead of being asserted.
pub const GgufFile = struct {
    header: GgufHeader, // GGUF file header info.
    size: usize, // Total file size.
    file: zml.aio.MemoryMappedFile,
    left_kv: usize, // Number of key-value pairs yet to read.
    left_tensors: usize, // Number of tensors yet to read.
    off: usize, // Offset of the next item to parse.
    alignment: usize = 32, // File data alignment. Default: 32 bytes.

    /// Open and memmap the given file.
    /// Reads and validates the header; parsing then starts right after it.
    pub fn open(path: []const u8) !GgufFile {
        const file = try asynk.File.open(path, .{});
        // NOTE(review): `file` is not closed when one of the steps below
        // fails; consider an `errdefer` once the close API of `asynk.File`
        // is confirmed.
        const header = try file.reader().readStruct(GgufHeader);
        try header.validate();
        return .{
            .header = header,
            .size = (try file.stat()).size,
            .file = try zml.aio.MemoryMappedFile.init(file),
            .off = @sizeOf(GgufHeader),
            .left_kv = header.metadata_kv_count,
            .left_tensors = header.tensor_count,
        };
    }

    /// Unmap the file memory and close the file handle.
    pub fn close(self: *GgufFile) void {
        self.file.deinit();
    }

    /// Set the context to read the first key-value entry in the GGUF
    /// file and then all the rest. Is used when creating a new context
    /// and also when you want to restart scanning the key-value
    /// items in the file.
    /// NOTE(review): only the bookkeeping fields are reset; the position of
    /// the underlying file handle is not changed here.
    fn rewind(ctx: *GgufFile) void {
        ctx.off = @sizeOf(GgufHeader);
        ctx.left_kv = ctx.header.metadata_kv_count;
        ctx.left_tensors = ctx.header.tensor_count;
    }

    /// Sets the parse offset to `pos`, which must be inside the file.
    /// NOTE(review): only `off` changes; the read helpers below read from the
    /// file handle's current position, which this function does not move -
    /// confirm callers account for that.
    pub fn seek(self: *GgufFile, pos: usize) void {
        assert(pos < self.size);
        self.off = pos;
    }

    /// Reads one little-endian integer of type `T` and advances `off`.
    /// Returns error.InvalidGguf when the value would extend past the end of
    /// the file. `off` only advances when the read succeeded.
    fn readInt(self: *GgufFile, comptime T: type) !T {
        // A value that ends exactly at end-of-file is still fully inside it.
        if (self.off + @sizeOf(T) > self.size) return error.InvalidGguf;
        const res = try self.file.file.reader().readInt(T, .little);
        self.off += @sizeOf(T);
        return res;
    }

    /// Reads a tensor type ID, rejecting IDs above the known range with
    /// error.UnsupportedGgufType.
    fn readTensorType(self: *GgufFile) !TensorType {
        const raw = try self.readInt(u32);
        if (raw > TensorType.MAX_KNOWN_ENUM) {
            log.err("Unsupported GGUF tensor type: {d}", .{raw});
            return error.UnsupportedGgufType;
        }
        return @enumFromInt(raw);
    }

    /// Reads a value type ID, rejecting IDs without a named tag with
    /// error.UnsupportedGgufType.
    fn readValueType(self: *GgufFile) !GgufValueType {
        const raw = try self.readInt(u32);
        const t: GgufValueType = @enumFromInt(raw);
        switch (t) {
            .uint8, .int8, .uint16, .int16, .uint32, .int32, .float32, .bool, .string, .array, .uint64, .int64, .float64, .array_start, .array_end => return t,
            else => {
                // `t` has no named tag in this branch, so `@tagName(t)` would
                // be illegal behavior; report the raw ID instead.
                log.err("Unsupported GGUF value type: {d}", .{raw});
                return error.UnsupportedGgufType;
            },
        }
    }

    /// Reads `len` bytes into a freshly allocated buffer and advances `off`.
    /// Caller owns the returned slice. Returns error.InvalidGguf when `len`
    /// exceeds the file size or fewer than `len` bytes could be read; the
    /// buffer is freed on every error path.
    pub fn readAlloc(self: *GgufFile, allocator: std.mem.Allocator, len: usize) ![]u8 {
        // No item can be larger than the file itself: reject corrupt lengths
        // before attempting a huge allocation.
        if (len > self.size) return error.InvalidGguf;
        const data = try allocator.alloc(u8, len);
        errdefer allocator.free(data);
        const read = try self.file.file.reader().readAll(data);
        if (read != data.len) return error.InvalidGguf;
        self.off += len;
        return data;
    }

    /// Moves the file position forward by `len` bytes and advances `off`.
    /// Returns error.InvalidGguf when `len` exceeds the file size.
    pub fn skipBytes(self: *GgufFile, len: usize) !void {
        // Also keeps the @intCast below in range for corrupt lengths.
        if (len > self.size) return error.InvalidGguf;
        try self.file.file.seekBy(@intCast(len));
        self.off += len;
    }

    /// Read the len then the actual bytes. Caller owns the returned slice.
    pub fn readString(self: *GgufFile, allocator: std.mem.Allocator) ![]u8 {
        const len: usize = try self.readInt(u64);
        return self.readAlloc(allocator, len);
    }

    /// Reads a string length and skips over the string bytes.
    pub fn skipString(self: *GgufFile) !void {
        const len: usize = try self.readInt(u64);
        return self.skipBytes(len);
    }

    /// Reads an array value: element type, element count, then the elements.
    ///
    /// Scalar elements are returned as raw bytes. String elements are read
    /// one by one and returned as the bytes of a `[][]u8`. Other element
    /// types (nested arrays, array markers) are rejected with
    /// error.UnsupportedGgufType. Everything allocated is released when an
    /// error is returned.
    fn readArrayHeader(self: *GgufFile, allocator: std.mem.Allocator) !GgufValue.Array {
        const child = try self.readValueType();
        const len: usize = try self.readInt(u64);
        // Every element takes at least one byte in the file.
        if (len > self.size) return error.InvalidGguf;

        const data: []u8 = switch (child) {
            // Since strings have variable lengths, we need to read them one by one.
            .string => str: {
                const strings = try allocator.alloc([]u8, len);
                var filled: usize = 0;
                errdefer {
                    for (strings[0..filled]) |s| allocator.free(s);
                    allocator.free(strings);
                }
                for (strings) |*slot| {
                    slot.* = try self.readString(allocator);
                    filled += 1;
                }
                break :str std.mem.sliceAsBytes(strings);
            },
            .uint8, .int8, .uint16, .int16, .uint32, .int32, .float32, .bool, .uint64, .int64, .float64 => scalars: {
                const byte_len = std.math.mul(usize, len, child.sizeOf()) catch return error.InvalidGguf;
                break :scalars try self.readAlloc(allocator, byte_len);
            },
            // `GgufValueType.sizeOf` is `unreachable` for these element types.
            else => {
                log.err("Unsupported GGUF array element type: {d}", .{@intFromEnum(child)});
                return error.UnsupportedGgufType;
            },
        };
        return .{
            .child = child,
            .len = len,
            .data = data,
        };
    }

    /// Reads one value of type `t`. Strings and arrays are allocated with
    /// `allocator`. Types without a value representation yield
    /// error.UnsupportedGgufType.
    fn readTypedValue(self: *GgufFile, allocator: std.mem.Allocator, t: GgufValueType) !GgufValue {
        return switch (t) {
            .uint8 => .{ .uint8 = try self.readInt(u8) },
            .int8 => .{ .int8 = try self.readInt(i8) },
            .uint16 => .{ .uint16 = try self.readInt(u16) },
            .int16 => .{ .int16 = try self.readInt(i16) },
            .uint32 => .{ .uint32 = try self.readInt(u32) },
            .int32 => .{ .int32 = try self.readInt(i32) },
            // Floats are read as same-sized integers and reinterpreted.
            .float32 => .{ .float32 = @bitCast(try self.readInt(u32)) },
            .bool => .{ .boolval = (try self.readInt(u8)) != 0 },
            .string => .{ .string = try self.readString(allocator) },
            .array => .{ .array = try self.readArrayHeader(allocator) },
            .uint64 => .{ .uint64 = try self.readInt(u64) },
            .int64 => .{ .int64 = try self.readInt(i64) },
            .float64 => .{ .float64 = @bitCast(try self.readInt(u64)) },
            else => error.UnsupportedGgufType,
        };
    }

    /// Parses the next metadata entry.
    /// Returns error.EndOfMetadata if there are no longer metadata to process in this GGUF file.
    /// The entry's name and value are allocated with `allocator`.
    pub fn readMetadata(self: *GgufFile, allocator: std.mem.Allocator) !GgufMetadataKv {
        if (self.left_kv == 0) return error.EndOfMetadata;
        self.left_kv -= 1;
        const name = try self.readString(allocator);
        // Do not leak the key when its type or value cannot be read.
        errdefer allocator.free(name);
        const type_ = try self.readValueType();
        const val: GgufValue = try self.readTypedValue(allocator, type_);
        return .{ .name = name, .type_ = type_, .val = val };
    }

    // Set the data section offset. This function must be called exactly when
    // all the key-values are consumed, in the context of the first call of
    // ctx.getTensor(): this way we will be able to return tensor offsets
    // as absolute positions and pointers to the mmapped file.
    fn setDataOffset(self: *GgufFile) !void {
        const base_off = self.off;
        assert(self.left_kv == 0 and self.left_tensors == self.header.tensor_count);
        // Walk over every tensor description to find where the data starts.
        for (0..self.left_tensors) |_| try self.skipTensor();
        const padding: usize = getAlignmentPadding(self.alignment, self.off);
        self.file.data_offset = self.off + padding;
        // Go back to the first tensor description.
        try self.file.file.seekTo(base_off);
        self.off = base_off;
    }

    /// Skips one tensor description: name, dimension count, dimensions,
    /// type and offset. Returns error.InvalidGguf when the dimension count
    /// exceeds `GGUF_TENSOR_MAX_DIM`.
    pub fn skipTensor(self: *GgufFile) !void {
        try self.skipString(); // Skip name
        // Widen before multiplying so a corrupt count cannot overflow u32.
        const num_dim: usize = try self.readInt(u32);
        if (num_dim > GGUF_TENSOR_MAX_DIM) return error.InvalidGguf;
        // dimensions, type, and offset.
        try self.skipBytes(@sizeOf(u64) * num_dim + @sizeOf(u32) + @sizeOf(u64));
    }

    /// Parses the next tensor entry.
    /// Returns error.EndOfMetadata if there are no longer tensor metadata to process in this GGUF file,
    /// or if metadata key-values are still left to read.
    /// Returns error.InvalidGguf for an out-of-range dimension count or sizes
    /// that overflow, and error.UnsupportedGgufType for tensor types without
    /// a block layout. The tensor name is allocated with `allocator`.
    pub fn readTensorInfo(self: *GgufFile, allocator: std.mem.Allocator) !GgufTensorInfo {
        if (self.left_tensors == 0 or self.left_kv != 0) {
            return error.EndOfMetadata;
        }
        // We want to return tensor data with offsets relative to the start
        // of the file, so that the user of the API is able to access tensors
        // as it iterates over them. To do so, we need to perform a full
        // scan if this is the first tensor info we are reading.
        // TODO (left unfinished in the original): explicitly set the data offset.
        if (self.file.data_offset == 0) try self.setDataOffset();
        self.left_tensors -= 1;

        const name = try self.readString(allocator);
        errdefer allocator.free(name);

        // The count comes from the file: report bad values, do not assert.
        const num_dim: usize = try self.readInt(u32);
        if (num_dim > GGUF_TENSOR_MAX_DIM) {
            log.err("Invalid GGUF file: tensor with {d} dimensions", .{num_dim});
            return error.InvalidGguf;
        }

        // Read the dimensions; unused dimensions are left `undefined`.
        // Note: we reverse the order of the dimensions to match zml convention.
        var dims: [GGUF_TENSOR_MAX_DIM]i64 = undefined;
        var num_weights: usize = 1;
        for (0..num_dim) |j| {
            const d = try self.readInt(u64);
            dims[num_dim - 1 - j] = std.math.cast(i64, d) orelse return error.InvalidGguf;
            const d_usize = std.math.cast(usize, d) orelse return error.InvalidGguf;
            num_weights = std.math.mul(usize, num_weights, d_usize) catch return error.InvalidGguf;
        }

        const t: TensorType = try self.readTensorType();
        const start = try self.readInt(u64);

        // To accurately calculate the bytes used by this tensor on the GGUF
        // file, we need to take into account that quantization methods store
        // tensors as block of N weights. So first of all we need to understand
        // the number of padding weights (since the last block may have just
        // fewer weights stored inside, but still requires to be stored to its full
        // length). Then we can do the math to see how many blocks we need, and
        // multiply by the block size to obtain the final total size.
        const tf = t.getFeatures();
        // Deprecated types have no block layout (0 items per block).
        if (tf.items_per_block == 0) {
            log.err("Unsupported GGUF tensor type: {s}", .{@tagName(t)});
            return error.UnsupportedGgufType;
        }
        // Division by zero was ruled out just above.
        const num_blocks = std.math.divCeil(usize, num_weights, tf.items_per_block) catch unreachable;
        const byte_len = std.math.mul(usize, num_blocks, tf.bytes_per_block) catch return error.InvalidGguf;

        return .{
            .name = name,
            .t = t,
            .rank = num_dim,
            .dims = dims,
            .start = start,
            .byte_len = byte_len,
            .num_weights = num_weights,
        };
    }
};
/// Returns how many bytes must be added to `offset` (an offset or a length)
/// to reach the next multiple of `alignment`; 0 when it is already aligned.
/// `alignment` must be non-zero.
fn getAlignmentPadding(alignment: usize, offset: usize) usize {
    const misalignment = offset % alignment;
    if (misalignment == 0) return 0;
    return alignment - misalignment;
}