2023-01-02 14:28:25 +00:00
/// Tools to load models from https://huggingface.co/karpathy/tinyllamas/
/// Originally made to be run with https://github.com/karpathy/llama2.c
const std = @import ( " std " ) ;
const asynk = @import ( " async " ) ;
const zml = @import ( " ../zml.zig " ) ;
const TinyLlamaConfig = extern struct {
dim : u32 ,
hidden_dim : u32 ,
n_layers : u32 ,
n_heads : u32 ,
n_kv_heads : u32 ,
// llama2.c uses the sign bit of the vocab size to store if we need
// a second embedding layer.
vocab : packed struct ( u32 ) {
size : u31 ,
has_lm_head : bool ,
} ,
seq_len : u32 ,
} ;
/// Open a tinyllama file.
/// For convenience we use the same layer names
/// than the one used by the Llama-3.1 models.
pub fn open ( allocator : std . mem . Allocator , model_path : [ ] const u8 ) ! zml . aio . BufferStore {
var res : zml . aio . BufferStore = . {
. arena = std . heap . ArenaAllocator . init ( allocator ) ,
} ;
errdefer res . arena . deinit ( ) ;
const arena = res . arena . allocator ( ) ;
const file = try asynk . File . open ( model_path , . { } ) ;
res . files = try arena . alloc ( zml . aio . MemoryMappedFile , 1 ) ;
res . files [ 0 ] = try zml . aio . MemoryMappedFile . init ( file ) ;
const c = try file . reader ( ) . readStructEndian ( TinyLlamaConfig , . little ) ;
zml . log . info ( " Read TinyLlamaConfig: {} " , . { c } ) ;
const dim = c . dim ;
const hidden_dim = c . hidden_dim ;
const n_layers = c . n_layers ;
const n_heads = c . n_heads ;
const n_kv_heads = c . n_kv_heads ;
const seq_len = c . seq_len ;
const head_size = c . dim / c . n_heads ;
var off : usize = @sizeOf ( TinyLlamaConfig ) ;
try res . buffers . ensureUnusedCapacity ( arena , 8 + 9 * n_layers ) ;
// Map tinyllama naming to Meta llama3 naming.
// token_embedding_table
off = newBuff ( & res , " model.embed_tokens.weight " , . { . voc = c . vocab . size , . d = dim } , off ) ;
// rms_att_weight
off = try splitBuff ( & res , " model.layers.{d}.input_layernorm.weight " , . { dim } , n_layers , off ) ;
// wq
off = try splitBuff ( & res , " model.layers.{d}.self_attn.q_proj.weight " , . { dim , n_heads * head_size } , n_layers , off ) ;
// wk
off = try splitBuff ( & res , " model.layers.{d}.self_attn.k_proj.weight " , . { dim , n_kv_heads * head_size } , n_layers , off ) ;
// wv
off = try splitBuff ( & res , " model.layers.{d}.self_attn.v_proj.weight " , . { dim , n_kv_heads * head_size } , n_layers , off ) ;
// wo
off = try splitBuff ( & res , " model.layers.{d}.self_attn.o_proj.weight " , . { n_heads * head_size , dim } , n_layers , off ) ;
// rms_ffn_weight
off = try splitBuff ( & res , " model.layers.{d}.post_attention_layernorm.weight " , . { dim } , n_layers , off ) ;
// w1
off = try splitBuff ( & res , " model.layers.{d}.mlp.gate_proj.weight " , . { hidden_dim , dim } , n_layers , off ) ;
// w2
off = try splitBuff ( & res , " model.layers.{d}.mlp.down_proj.weight " , . { dim , hidden_dim } , n_layers , off ) ;
// w3
off = try splitBuff ( & res , " model.layers.{d}.mlp.up_proj.weight " , . { hidden_dim , dim } , n_layers , off ) ;
// rms_final_weight
off = newBuff ( & res , " model.norm.weight " , . { dim } , off ) ;
// freq_cis_real (not used)
off = newBuff ( & res , " freq_cis_real " , . { seq_len , head_size / 2 } , off ) ;
off = newBuff ( & res , " freq_cis_imag " , . { seq_len , head_size / 2 } , off ) ;
// wcls
if ( c . vocab . has_lm_head ) {
off = newBuff ( & res , " lm_head.weight " , . { c . vocab . size , c . dim } , off ) ;
} else {
res . buffers . putAssumeCapacityNoClobber ( " lm_head.weight " , res . buffers . get ( " model.embed_tokens.weight " ) . ? ) ;
}
const weights_size = off ;
std . log . info ( " Loaded a tinyllama file of {} bytes. \n This is the parsed configuration of this llama model: {} " , . { weights_size , c } ) ;
if ( file . stat ( ) catch null ) | stat | {
zml . meta . assert ( weights_size = = stat . size , " Expected to have a tinyllama file of {} bytes but file only got {} ! \n This is the parsed configuration of this llama model: {} " , . { weights_size , stat . size , c } ) ;
}
{
try res . _metadata . ensureUnusedCapacity ( arena , 11 ) ;
2023-04-07 16:45:58 +00:00
res . _metadata . putAssumeCapacityNoClobber ( " dim " , . { . int = c . dim } ) ;
res . _metadata . putAssumeCapacityNoClobber ( " hidden_dim " , . { . int = c . hidden_dim } ) ;
res . _metadata . putAssumeCapacityNoClobber ( " n_layers " , . { . int = c . n_layers } ) ;
res . _metadata . putAssumeCapacityNoClobber ( " num_heads " , . { . int = c . n_heads } ) ;
res . _metadata . putAssumeCapacityNoClobber ( " num_kv_heads " , . { . int = c . n_kv_heads } ) ;
res . _metadata . putAssumeCapacityNoClobber ( " vocab_size " , . { . int = c . vocab . size } ) ;
res . _metadata . putAssumeCapacityNoClobber ( " has_lm_head " , . { . bool = c . vocab . has_lm_head } ) ;
res . _metadata . putAssumeCapacityNoClobber ( " max_seq_len " , . { . int = c . seq_len } ) ;
2023-01-02 14:28:25 +00:00
res . _metadata . putAssumeCapacityNoClobber ( " rope_impl " , . { . string = " interleaved " } ) ;
2023-04-07 16:45:58 +00:00
res . _metadata . putAssumeCapacityNoClobber ( " rope_freq_base " , . { . float = 10_000 } ) ;
res . _metadata . putAssumeCapacityNoClobber ( " rms_norm_eps " , . { . float = 1e-6 } ) ;
2023-01-02 14:28:25 +00:00
}
return res ;
}
fn newBuff ( store : * zml . aio . BufferStore , name : [ ] const u8 , sh : anytype , offset : usize ) usize {
var shape = zml . Shape . init ( sh , . f32 ) ;
const n = shape . byteSize ( ) ;
const buff = zml . HostBuffer . fromBytes ( shape , store . files [ 0 ] . data [ offset . . ] [ 0 . . n ] ) ;
store . buffers . putAssumeCapacityNoClobber ( name , buff ) ;
zml . log . info ( " Found {s}: {} " , . { name , shape } ) ;
return offset + n ;
}
fn splitBuff ( store : * zml . aio . BufferStore , comptime fmt : [ ] const u8 , sh : anytype , layers : usize , offset : usize ) ! usize {
var shape = zml . Shape . init ( sh , . f32 ) ;
const n = shape . byteSize ( ) ;
var off = offset ;
for ( 0 . . layers ) | i | {
const name = try std . fmt . allocPrint ( store . arena . allocator ( ) , fmt , . { i } ) ;
const buff = zml . HostBuffer . fromBytes ( shape , store . files [ 0 ] . data [ off . . ] [ 0 . . n ] ) ;
store . buffers . putAssumeCapacityNoClobber ( name , buff ) ;
off + = n ;
if ( i = = 0 ) zml . log . info ( " Found {s}: {} " , . { name , shape } ) ;
}
return off ;
}
pub fn loadTokenizer ( allocator : std . mem . Allocator , tokenizer_path : [ ] const u8 , vocab_size : u32 ) ! zml . tokenizer . Tokenizer {
const tokenizer_file = try std . fs . cwd ( ) . openFile ( tokenizer_path , . { } ) ;
defer tokenizer_file . close ( ) ;
var tok_reader = std . io . bufferedReader ( tokenizer_file . reader ( ) ) ;
const r = tok_reader . reader ( ) ;
const max_token_len = try r . readInt ( u32 , . little ) ;
const special_tokens : zml . tokenizer . Tokenizer . SpecialTokens = . {
. unk = 0 ,
. bos = 1 ,
. eos = 2 ,
} ;
var tokenizer = try zml . tokenizer . Tokenizer . init ( allocator , vocab_size , max_token_len , null , special_tokens , true ) ;
var i : u32 = 0 ;
while ( readToken ( & tokenizer , & r ) ) : ( i + = 1 ) {
// Pass
} else | _ | {
if ( i < vocab_size ) {
zml . log . info ( " Read {d} words out of {?d} " , . { i , vocab_size } ) ;
}
tokenizer . vocab_size = i ;
}
return tokenizer ;
}
fn readToken ( tokenizer : * zml . tokenizer . Tokenizer , tok_reader : anytype ) ! void {
const score : f32 = @bitCast ( try tok_reader . readInt ( u32 , . little ) ) ;
const len : usize = @intCast ( try tok_reader . readInt ( u32 , . little ) ) ;
try tokenizer . readTokenInto ( score , len , tok_reader ) ;
}