/// Tools to load models from https://huggingface.co/karpathy/tinyllamas/
/// Originally made to be run with https://github.com/karpathy/llama2.c
const asynk = @import("async");
const std = @import("std");
const stdx = @import("stdx");

const zml = @import("../zml.zig");

const TinyLlamaConfig = extern struct {
    dim: u32,
    hidden_dim: u32,
    n_layers: u32,
    n_heads: u32,
    n_kv_heads: u32,
    // llama2.c uses the sign bit of the vocab size to store if we need
    // a second embedding layer.
    vocab: packed struct(u32) {
        size: u31,
        has_lm_head: bool,
    },
    seq_len: u32,
};

/// Open a tinyllama file.
/// For convenience we use the same layer names
/// as the ones used by the Llama-3.1 models.
pub fn open(allocator: std.mem.Allocator, model_path: []const u8) !zml.aio.BufferStore {
    var res: zml.aio.BufferStore = .init(allocator);
    errdefer res.arena.deinit();
    const arena = res.arena.allocator();

    const file = try asynk.File.open(model_path, .{});
    res.files = try arena.alloc(zml.aio.MemoryMappedFile, 1);
    res.files[0] = try zml.aio.MemoryMappedFile.init(file);

    const c = try file.reader().readStructEndian(TinyLlamaConfig, .little);
    zml.log.info("Read TinyLlamaConfig: {}", .{c});
    const dim = c.dim;
    const hidden_dim = c.hidden_dim;
    const n_layers = c.n_layers;
    const n_heads = c.n_heads;
    const n_kv_heads = c.n_kv_heads;
    const seq_len = c.seq_len;
    const head_size = c.dim / c.n_heads;

    var off: usize = @sizeOf(TinyLlamaConfig);
    try res.buffers.ensureUnusedCapacity(arena, 8 + 9 * n_layers);

    // Map tinyllama naming to Meta llama3 naming.
    // token_embedding_table
    off = newBuff(&res, "model.embed_tokens.weight", .{ .voc = c.vocab.size, .d = dim }, off);
    // rms_att_weight
    off = try splitBuff(&res, "model.layers.{d}.input_layernorm.weight", .{dim}, n_layers, off);
    // wq
    off = try splitBuff(&res, "model.layers.{d}.self_attn.q_proj.weight", .{ dim, n_heads * head_size }, n_layers, off);
    // wk
    off = try splitBuff(&res, "model.layers.{d}.self_attn.k_proj.weight", .{ dim, n_kv_heads * head_size }, n_layers, off);
    // wv
    off = try splitBuff(&res, "model.layers.{d}.self_attn.v_proj.weight", .{ dim, n_kv_heads * head_size }, n_layers, off);
    // wo
    off = try splitBuff(&res, "model.layers.{d}.self_attn.o_proj.weight", .{ n_heads * head_size, dim }, n_layers, off);
    // rms_ffn_weight
    off = try splitBuff(&res, "model.layers.{d}.post_attention_layernorm.weight", .{dim}, n_layers, off);
    // w1
    off = try splitBuff(&res, "model.layers.{d}.mlp.gate_proj.weight", .{ hidden_dim, dim }, n_layers, off);
    // w2
    off = try splitBuff(&res, "model.layers.{d}.mlp.down_proj.weight", .{ dim, hidden_dim }, n_layers, off);
    // w3
    off = try splitBuff(&res, "model.layers.{d}.mlp.up_proj.weight", .{ hidden_dim, dim }, n_layers, off);
    // rms_final_weight
    off = newBuff(&res, "model.norm.weight", .{dim}, off);
    // freq_cis_real (not used)
    off = newBuff(&res, "freq_cis_real", .{ seq_len, head_size / 2 }, off);
    off = newBuff(&res, "freq_cis_imag", .{ seq_len, head_size / 2 }, off);
    // wcls
    if (c.vocab.has_lm_head) {
        off = newBuff(&res, "lm_head.weight", .{ c.vocab.size, c.dim }, off);
    } else {
        res.buffers.putAssumeCapacityNoClobber("lm_head.weight", res.buffers.get("model.embed_tokens.weight").?);
    }

    const weights_size = off;
    std.log.info("Loaded a tinyllama file of {} bytes.\nThis is the parsed configuration of this llama model: {}", .{ weights_size, c });
    if (file.stat() catch null) |stat| {
        stdx.debug.assert(weights_size == stat.size, "Expected a tinyllama file of {} bytes but the file has {} bytes!\nThis is the parsed configuration of this llama model: {}", .{
            weights_size, stat.size, c,
        });
    }

    {
        try res._metadata.ensureUnusedCapacity(arena, 11);
        res._metadata.putAssumeCapacityNoClobber("dim", .{ .int = c.dim });
        res._metadata.putAssumeCapacityNoClobber("hidden_dim", .{ .int = c.hidden_dim });
        res._metadata.putAssumeCapacityNoClobber("n_layers", .{ .int = c.n_layers });
        res._metadata.putAssumeCapacityNoClobber("num_heads", .{ .int = c.n_heads });
        res._metadata.putAssumeCapacityNoClobber("num_kv_heads", .{ .int = c.n_kv_heads });
        res._metadata.putAssumeCapacityNoClobber("vocab_size", .{ .int = c.vocab.size });
        res._metadata.putAssumeCapacityNoClobber("has_lm_head", .{ .bool = c.vocab.has_lm_head });
        res._metadata.putAssumeCapacityNoClobber("max_seq_len", .{ .int = c.seq_len });
        res._metadata.putAssumeCapacityNoClobber("rope_impl", .{ .string = "interleaved" });
        res._metadata.putAssumeCapacityNoClobber("rope_freq_base", .{ .float = 10_000 });
        res._metadata.putAssumeCapacityNoClobber("rms_norm_eps", .{ .float = 1e-6 });
    }

    return res;
}

/// Register a single f32 buffer of the given shape, backed by the mmapped file
/// starting at `offset`, and return the offset just past it.
fn newBuff(store: *zml.aio.BufferStore, name: []const u8, sh: anytype, offset: usize) usize {
    var shape = zml.Shape.init(sh, .f32);
    const n = shape.byteSize();
    const buff = zml.HostBuffer.fromBytes(shape, store.files[0].data[offset..][0..n]);
    store.buffers.putAssumeCapacityNoClobber(name, buff);
    zml.log.debug("Found {s}: {}", .{ name, shape });
    return offset + n;
}

/// Register one f32 buffer per layer, all with the same shape, naming each one by
/// formatting `fmt` with the layer index, and return the offset just past the last one.
fn splitBuff(store: *zml.aio.BufferStore, comptime fmt: []const u8, sh: anytype, layers: usize, offset: usize) !usize {
    var shape = zml.Shape.init(sh, .f32);
    const n = shape.byteSize();
    var off = offset;
    for (0..layers) |i| {
        const name = try std.fmt.allocPrint(store.arena.allocator(), fmt, .{i});
        const buff = zml.HostBuffer.fromBytes(shape, store.files[0].data[off..][0..n]);
        store.buffers.putAssumeCapacityNoClobber(name, buff);
        off += n;
        if (i == 0) zml.log.debug("Found {s}: {}", .{ name, shape });
    }
    return off;
}
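
// Example usage (a minimal sketch, not part of the original file): load one of the
// karpathy/tinyllamas checkpoints and look up a weight by its Llama-3 style name.
// The "stories15M.bin" path is an arbitrary example, and `store.deinit()` /
// `embeddings.shape()` are assumed to behave like the rest of the zml.aio API.
//
//   var store = try open(allocator, "stories15M.bin");
//   defer store.deinit();
//   const embeddings = store.buffers.get("model.embed_tokens.weight").?;
//   zml.log.info("embed_tokens: {}", .{embeddings.shape()});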