From 111afcdd95a22ef7da5970adf5fdc590bb44fe82 Mon Sep 17 00:00:00 2001
From: Foke Singh
Date: Wed, 28 May 2025 13:21:00 +0000
Subject: [PATCH] examples/llama: switch to `--hf-model-path` flag

Instead of taking separate `--config`, `--weights` and `--tokenizer`
paths, accept a single `--hf-model-path` directory laid out like a
Hugging Face checkout, e.g. as produced by `huggingface-cli download`.
---
 examples/llama/BUILD.bazel | 136 ++-----------------------------------
 examples/llama/main.zig    |  62 ++++++++++-------
 2 files changed, 42 insertions(+), 156 deletions(-)

diff --git a/examples/llama/BUILD.bazel b/examples/llama/BUILD.bazel
index d657f4a..839791b 100644
--- a/examples/llama/BUILD.bazel
+++ b/examples/llama/BUILD.bazel
@@ -21,106 +21,6 @@ zig_cc_binary(
     ],
 )
 
-cc_binary(
-    name = "Llama-3.1-8B-Instruct",
-    args = [
-        "--config=$(location @Meta-Llama-3.1-8B-Instruct//:config.json)",
-        "--weights=$(location @Meta-Llama-3.1-8B-Instruct//:model.safetensors.index.json)",
-        "--tokenizer=$(location @Meta-Llama-3.1-8B-Instruct//:tokenizer.json)",
-    ],
-    data = [
-        "@Meta-Llama-3.1-8B-Instruct",
-        "@Meta-Llama-3.1-8B-Instruct//:config.json",
-        "@Meta-Llama-3.1-8B-Instruct//:model.safetensors.index.json",
-        "@Meta-Llama-3.1-8B-Instruct//:tokenizer.json",
-    ],
-    tags = [
-        "manual",
-    ],
-    deps = [":llama_lib"],
-)
-
-cc_binary(
-    name = "Llama-3.1-70B-Instruct",
-    args = [
-        "--config=$(location @Meta-Llama-3.1-70B-Instruct//:config.json)",
-        "--weights=$(location @Meta-Llama-3.1-70B-Instruct//:model.safetensors.index.json)",
-        "--tokenizer=$(location @Meta-Llama-3.1-70B-Instruct//:tokenizer.json)",
-    ],
-    data = [
-        "@Meta-Llama-3.1-70B-Instruct",
-        "@Meta-Llama-3.1-70B-Instruct//:config.json",
-        "@Meta-Llama-3.1-70B-Instruct//:model.safetensors.index.json",
-        "@Meta-Llama-3.1-70B-Instruct//:tokenizer.json",
-    ],
-    tags = [
-        "manual",
-    ],
-    deps = [":llama_lib"],
-)
-
-cc_binary(
-    name = "Llama-3.2-1B-Instruct",
-    args = [
-        "--config=$(location @Meta-Llama-3.2-1B-Instruct//:config.json)",
-        "--weights=$(location @Meta-Llama-3.2-1B-Instruct//:model.safetensors)",
-        "--tokenizer=$(location @Meta-Llama-3.2-1B-Instruct//:tokenizer.json)",
-    ],
-    data = [
-        "@Meta-Llama-3.2-1B-Instruct",
-        "@Meta-Llama-3.2-1B-Instruct//:config.json",
-        "@Meta-Llama-3.2-1B-Instruct//:model.safetensors",
-        "@Meta-Llama-3.2-1B-Instruct//:tokenizer.json",
-    ],
-    tags = [
-        "manual",
-    ],
-    deps = [":llama_lib"],
-)
-
-cc_binary(
-    name = "Llama-3.2-3B-Instruct",
-    args = [
-        "--config=$(location @Meta-Llama-3.2-3B-Instruct//:config.json)",
-        "--weights=$(location @Meta-Llama-3.2-3B-Instruct//:model.safetensors.index.json)",
-        "--tokenizer=$(location @Meta-Llama-3.2-3B-Instruct//:tokenizer.json)",
-    ],
-    data = [
-        "@Meta-Llama-3.2-3B-Instruct",
-        "@Meta-Llama-3.2-3B-Instruct//:config.json",
-        "@Meta-Llama-3.2-3B-Instruct//:model.safetensors.index.json",
-        "@Meta-Llama-3.2-3B-Instruct//:tokenizer.json",
-    ],
-    tags = [
-        "manual",
-    ],
-    deps = [":llama_lib"],
-)
-
-cc_binary(
-    name = "TinyLlama-Stories-15M",
-    args = [
-        "--config=$(location :tinyllama_stories15M_json)",
-        "--weights=$(location @Karpathy-TinyLlama-Stories15M//file)",
-        "--tokenizer=$(location @Karpathy-TinyLlama-Tokenizer//file)",
-        "--prompt='Once upon a time, there was a little girl named Lily.'",
-        "--no-llama3=1", # don't do template prompt encoding, I'm a simple model
-        "--sharding=false", # don't shard me, I'm so small
-    ],
-    data = [
-        ":tinyllama_stories15M_json",
-        "@Karpathy-TinyLlama-Stories15M//file",
-        "@Karpathy-TinyLlama-Tokenizer//file",
-    ],
-    deps = [":llama_lib"],
-)
-
-write_file(
-    name = "tinyllama_stories15M_json",
-    out = "config.json",
"config.json", - content = ['{"bos_token_id":1,"eos_token_id":2,"hidden_act":"silu","hidden_size":288,"intermediate_size":768,"max_position_embeddings":256,"model_type":"llama","num_attention_heads":6,"num_hidden_layers":6,"num_key_value_heads":6,"rms_norm_eps":1e-05,"hf_rope_impl":false,"rope_scaling":null,"rope_theta":10000.0}'], -) - zig_cc_binary( name = "test-implementation", srcs = ["llama.zig"], @@ -169,7 +69,7 @@ Artificial Intelligence in Healthcare mtree_spec( name = "mtree", - srcs = [":Llama-3.2-1B-Instruct"], + srcs = [":llama"], tags = [ "manual", ], @@ -177,7 +77,7 @@ mtree_spec( tar( name = "archive", - srcs = [":Llama-3.2-1B-Instruct"], + srcs = [":llama"], args = [ "--options", "zstd:compression-level=9", @@ -189,36 +89,10 @@ tar( ], ) -expand_template( - name = "entrypoint", - data = [ - ":Llama-3.2-1B-Instruct", - "@Meta-Llama-3.2-1B-Instruct", - "@Meta-Llama-3.2-1B-Instruct//:config.json", - "@Meta-Llama-3.2-1B-Instruct//:model.safetensors", - "@Meta-Llama-3.2-1B-Instruct//:tokenizer.json", - ], - substitutions = { - ":config": "$(rlocationpath @Meta-Llama-3.2-1B-Instruct//:config.json)", - ":weights": "$(rlocationpath @Meta-Llama-3.2-1B-Instruct//:model.safetensors)", - ":tokenizer": "$(rlocationpath @Meta-Llama-3.2-1B-Instruct//:tokenizer.json)", - }, - tags = [ - "manual", - ], - template = [ - "./{}/Llama-3.2-1B-Instruct".format(package_name()), - "--config=./{}/Llama-3.2-1B-Instruct.runfiles/:config".format(package_name()), - "--weights=./{}/Llama-3.2-1B-Instruct.runfiles/:weights".format(package_name()), - "--tokenizer=./{}/Llama-3.2-1B-Instruct.runfiles/:tokenizer".format(package_name()), - ], -) - oci_image( name = "image_", base = "@distroless_cc_debian12_debug", - # entrypoint = ["./{}/Llama-3.2-1B-Instruct".format(package_name())], - entrypoint = ":entrypoint", + entrypoint = ["./{}/llama".format(package_name())], tags = [ "manual", ], @@ -241,7 +115,7 @@ oci_load( name = "load", image = ":image", repo_tags = [ - "distroless/llama-3.2-1b-instruct:latest", + "distroless/llama:latest", ], tags = [ "manual", @@ -252,7 +126,7 @@ oci_push( name = "push", image = ":image", remote_tags = ["latest"], - repository = "index.docker.io/steeve/llama-3.2-1b-instruct", + repository = "index.docker.io/steeve/llama", tags = [ "manual", ], diff --git a/examples/llama/main.zig b/examples/llama/main.zig index 0b121e6..0401444 100644 --- a/examples/llama/main.zig +++ b/examples/llama/main.zig @@ -151,9 +151,7 @@ pub fn generateText( const params = clap.parseParamsComptime( \\--help print this help \\--prompt the prompt - \\--config config.json path - \\--weights model weights path - \\--tokenizer tokenizer path + \\--hf-model-path path to the directory containing model weights, config and tokenizer \\--seed random seed (optional) \\--seq-len sequence length \\--create-options platform creation options JSON, defaults to {} @@ -199,18 +197,37 @@ pub fn asyncMain() !void { return; } - const config = blk: { - if (res.args.config) |config_json_path| { - var config_json_file = try asynk.File.open(config_json_path, .{ .mode = .read_only }); - defer config_json_file.close() catch unreachable; - var reader = std.json.reader(allocator, config_json_file.reader()); - defer reader.deinit(); - const config_obj = try std.json.parseFromTokenSourceLeaky(llama.LlamaLM.Config, allocator, &reader, .{ .ignore_unknown_fields = true }); - break :blk config_obj; - } else { - log.err("Missing --config", .{}); - return; + const hf_model_path = res.args.@"hf-model-path" orelse { + log.err("Missing 
--hf-model-path", .{}); + return; + }; + + const model_config_path = try std.fs.path.join(allocator, &.{ hf_model_path, "config.json" }); + defer allocator.free(model_config_path); + + const model_weights_path = b: { + const simple_path = try std.fs.path.join(allocator, &.{ hf_model_path, "model.safetensors" }); + if (asynk.File.access(simple_path, .{})) { + break :b simple_path; + } else |_| { + allocator.free(simple_path); } + + const sharded_path = try std.fs.path.join(allocator, &.{ hf_model_path, "model.safetensors.index.json" }); + break :b sharded_path; + }; + defer allocator.free(model_weights_path); + + const model_tokenizer_path = try std.fs.path.join(allocator, &.{ hf_model_path, "tokenizer.json" }); + defer allocator.free(model_tokenizer_path); + + const config = blk: { + var config_json_file = try asynk.File.open(model_config_path, .{ .mode = .read_only }); + defer config_json_file.close() catch unreachable; + var reader = std.json.reader(allocator, config_json_file.reader()); + defer reader.deinit(); + const config_obj = try std.json.parseFromTokenSourceLeaky(llama.LlamaLM.Config, allocator, &reader, .{ .ignore_unknown_fields = true }); + break :blk config_obj; }; var context = try zml.Context.init(); @@ -229,7 +246,7 @@ pub fn asyncMain() !void { create_opts.deinit(); context.printAvailablePlatforms(platform); - var ts = try zml.aio.detectFormatAndOpen(allocator, res.args.weights.?); + var ts = try zml.aio.detectFormatAndOpen(allocator, model_weights_path); defer ts.deinit(); var model_arena = std.heap.ArenaAllocator.init(allocator); @@ -281,7 +298,7 @@ pub fn asyncMain() !void { platform, }); - log.info("\tLoading Llama weights from {?s}...", .{res.args.weights}); + log.info("\tLoading Llama weights from {?s}...", .{model_weights_path}); var llama_weights = try zml.aio.loadBuffers(llama.LlamaLM, .{ config, llama_options }, ts, model_arena.allocator(), platform); defer zml.aio.unloadBuffers(&llama_weights); log.info("✅\tLoaded weights in {}", .{std.fmt.fmtDuration(start.read())}); @@ -296,16 +313,11 @@ pub fn asyncMain() !void { const kv_cache = try llama.KvCache.initBuffer(kv_shape, platform); var tokenizer = blk: { - if (res.args.tokenizer) |tok| { - log.info("Loading tokenizer from {s}", .{tok}); - var timer = try stdx.time.Timer.start(); - defer log.info("Loaded tokenizer from {s} [{}]", .{ tok, timer.read() }); + log.info("Loading tokenizer from {s}", .{model_tokenizer_path}); + var timer = try stdx.time.Timer.start(); + defer log.info("Loaded tokenizer from {s} [{}]", .{ model_tokenizer_path, timer.read() }); - break :blk try zml.tokenizer.Tokenizer.fromFile(model_arena.allocator(), tok); - } else { - log.err("Missing --tokenizer", .{}); - return; - } + break :blk try zml.tokenizer.Tokenizer.fromFile(model_arena.allocator(), model_tokenizer_path); }; errdefer tokenizer.deinit();