examples/llama: switch to --hf-model-path flag

Instead of passing separate config, model-weights, and tokenizer paths, rely on
a model directory downloaded with `huggingface-cli`.
This commit is contained in:
Foke Singh 2025-05-28 13:21:00 +00:00
parent 5a49a3e8ca
commit 111afcdd95
2 changed files with 42 additions and 156 deletions

View File

@ -21,106 +21,6 @@ zig_cc_binary(
],
)
# Runner target for Meta Llama 3.1 8B Instruct.
# Wires the llama binary to the sharded safetensors index, config, and
# tokenizer from the @Meta-Llama-3.1-8B-Instruct external repository.
# NOTE(review): this diff deletes these per-model targets in favor of a
# single --hf-model-path flag resolved at runtime.
cc_binary(
name = "Llama-3.1-8B-Instruct",
args = [
"--config=$(location @Meta-Llama-3.1-8B-Instruct//:config.json)",
"--weights=$(location @Meta-Llama-3.1-8B-Instruct//:model.safetensors.index.json)",
"--tokenizer=$(location @Meta-Llama-3.1-8B-Instruct//:tokenizer.json)",
],
# Runtime files: the whole model repo plus the three files referenced above.
data = [
"@Meta-Llama-3.1-8B-Instruct",
"@Meta-Llama-3.1-8B-Instruct//:config.json",
"@Meta-Llama-3.1-8B-Instruct//:model.safetensors.index.json",
"@Meta-Llama-3.1-8B-Instruct//:tokenizer.json",
],
# "manual" keeps this heavyweight target out of wildcard builds (//...).
tags = [
"manual",
],
deps = [":llama_lib"],
)
# Runner target for Meta Llama 3.1 70B Instruct; same shape as the 8B
# target, pointing at the sharded weights index of the 70B repository.
cc_binary(
name = "Llama-3.1-70B-Instruct",
args = [
"--config=$(location @Meta-Llama-3.1-70B-Instruct//:config.json)",
"--weights=$(location @Meta-Llama-3.1-70B-Instruct//:model.safetensors.index.json)",
"--tokenizer=$(location @Meta-Llama-3.1-70B-Instruct//:tokenizer.json)",
],
# Runtime files needed by the $(location ...) expansions above.
data = [
"@Meta-Llama-3.1-70B-Instruct",
"@Meta-Llama-3.1-70B-Instruct//:config.json",
"@Meta-Llama-3.1-70B-Instruct//:model.safetensors.index.json",
"@Meta-Llama-3.1-70B-Instruct//:tokenizer.json",
],
# Excluded from wildcard builds; must be requested explicitly.
tags = [
"manual",
],
deps = [":llama_lib"],
)
# Runner target for Meta Llama 3.2 1B Instruct. Unlike the larger models,
# this one ships a single model.safetensors file rather than a sharded
# index.json.
cc_binary(
name = "Llama-3.2-1B-Instruct",
args = [
"--config=$(location @Meta-Llama-3.2-1B-Instruct//:config.json)",
"--weights=$(location @Meta-Llama-3.2-1B-Instruct//:model.safetensors)",
"--tokenizer=$(location @Meta-Llama-3.2-1B-Instruct//:tokenizer.json)",
],
# Runtime files needed by the $(location ...) expansions above.
data = [
"@Meta-Llama-3.2-1B-Instruct",
"@Meta-Llama-3.2-1B-Instruct//:config.json",
"@Meta-Llama-3.2-1B-Instruct//:model.safetensors",
"@Meta-Llama-3.2-1B-Instruct//:tokenizer.json",
],
# Excluded from wildcard builds; must be requested explicitly.
tags = [
"manual",
],
deps = [":llama_lib"],
)
# Runner target for Meta Llama 3.2 3B Instruct; uses the sharded
# safetensors index like the 3.1 models.
cc_binary(
name = "Llama-3.2-3B-Instruct",
args = [
"--config=$(location @Meta-Llama-3.2-3B-Instruct//:config.json)",
"--weights=$(location @Meta-Llama-3.2-3B-Instruct//:model.safetensors.index.json)",
"--tokenizer=$(location @Meta-Llama-3.2-3B-Instruct//:tokenizer.json)",
],
# Runtime files needed by the $(location ...) expansions above.
data = [
"@Meta-Llama-3.2-3B-Instruct",
"@Meta-Llama-3.2-3B-Instruct//:config.json",
"@Meta-Llama-3.2-3B-Instruct//:model.safetensors.index.json",
"@Meta-Llama-3.2-3B-Instruct//:tokenizer.json",
],
# Excluded from wildcard builds; must be requested explicitly.
tags = [
"manual",
],
deps = [":llama_lib"],
)
# Tiny smoke-test model (Karpathy's 15M-parameter stories model). Uses a
# locally generated config.json (see :tinyllama_stories15M_json below) and
# single-file weights/tokenizer. Not tagged "manual", so it runs in normal
# builds as a quick end-to-end check.
cc_binary(
name = "TinyLlama-Stories-15M",
args = [
"--config=$(location :tinyllama_stories15M_json)",
"--weights=$(location @Karpathy-TinyLlama-Stories15M//file)",
"--tokenizer=$(location @Karpathy-TinyLlama-Tokenizer//file)",
"--prompt='Once upon a time, there was a little girl named Lily.'",
"--no-llama3=1", # don't do template prompt encoding, I'm a simple model
"--sharding=false", # don't shard me, I'm so small
],
data = [
":tinyllama_stories15M_json",
"@Karpathy-TinyLlama-Stories15M//file",
"@Karpathy-TinyLlama-Tokenizer//file",
],
deps = [":llama_lib"],
)
# Generates the config.json consumed by :TinyLlama-Stories-15M, since the
# upstream checkpoint ships without a HuggingFace-style config. The JSON
# mirrors the llama config schema (hidden sizes, head counts, RoPE params)
# that the binary's --config flag expects.
write_file(
name = "tinyllama_stories15M_json",
out = "config.json",
content = ['{"bos_token_id":1,"eos_token_id":2,"hidden_act":"silu","hidden_size":288,"intermediate_size":768,"max_position_embeddings":256,"model_type":"llama","num_attention_heads":6,"num_hidden_layers":6,"num_key_value_heads":6,"rms_norm_eps":1e-05,"hf_rope_impl":false,"rope_scaling":null,"rope_theta":10000.0}'],
)
zig_cc_binary(
name = "test-implementation",
srcs = ["llama.zig"],
@ -169,7 +69,7 @@ Artificial Intelligence in Healthcare
mtree_spec(
name = "mtree",
srcs = [":Llama-3.2-1B-Instruct"],
srcs = [":llama"],
tags = [
"manual",
],
@ -177,7 +77,7 @@ mtree_spec(
tar(
name = "archive",
srcs = [":Llama-3.2-1B-Instruct"],
srcs = [":llama"],
args = [
"--options",
"zstd:compression-level=9",
@ -189,36 +89,10 @@ tar(
],
)
expand_template(
name = "entrypoint",
data = [
":Llama-3.2-1B-Instruct",
"@Meta-Llama-3.2-1B-Instruct",
"@Meta-Llama-3.2-1B-Instruct//:config.json",
"@Meta-Llama-3.2-1B-Instruct//:model.safetensors",
"@Meta-Llama-3.2-1B-Instruct//:tokenizer.json",
],
substitutions = {
":config": "$(rlocationpath @Meta-Llama-3.2-1B-Instruct//:config.json)",
":weights": "$(rlocationpath @Meta-Llama-3.2-1B-Instruct//:model.safetensors)",
":tokenizer": "$(rlocationpath @Meta-Llama-3.2-1B-Instruct//:tokenizer.json)",
},
tags = [
"manual",
],
template = [
"./{}/Llama-3.2-1B-Instruct".format(package_name()),
"--config=./{}/Llama-3.2-1B-Instruct.runfiles/:config".format(package_name()),
"--weights=./{}/Llama-3.2-1B-Instruct.runfiles/:weights".format(package_name()),
"--tokenizer=./{}/Llama-3.2-1B-Instruct.runfiles/:tokenizer".format(package_name()),
],
)
oci_image(
name = "image_",
base = "@distroless_cc_debian12_debug",
# entrypoint = ["./{}/Llama-3.2-1B-Instruct".format(package_name())],
entrypoint = ":entrypoint",
entrypoint = ["./{}/llama".format(package_name())],
tags = [
"manual",
],
@ -241,7 +115,7 @@ oci_load(
name = "load",
image = ":image",
repo_tags = [
"distroless/llama-3.2-1b-instruct:latest",
"distroless/llama:latest",
],
tags = [
"manual",
@ -252,7 +126,7 @@ oci_push(
name = "push",
image = ":image",
remote_tags = ["latest"],
repository = "index.docker.io/steeve/llama-3.2-1b-instruct",
repository = "index.docker.io/steeve/llama",
tags = [
"manual",
],

View File

@ -151,9 +151,7 @@ pub fn generateText(
const params = clap.parseParamsComptime(
\\--help print this help
\\--prompt <STRING> the prompt
\\--config <PATH> config.json path
\\--weights <PATH> model weights path
\\--tokenizer <PATH> tokenizer path
\\--hf-model-path <STRING> path to the directory containing model weights, config and tokenizer
\\--seed <UINT> random seed (optional)
\\--seq-len <UINT> sequence length
\\--create-options <STRING> platform creation options JSON, defaults to {}
@ -199,18 +197,37 @@ pub fn asyncMain() !void {
return;
}
const config = blk: {
if (res.args.config) |config_json_path| {
var config_json_file = try asynk.File.open(config_json_path, .{ .mode = .read_only });
defer config_json_file.close() catch unreachable;
var reader = std.json.reader(allocator, config_json_file.reader());
defer reader.deinit();
const config_obj = try std.json.parseFromTokenSourceLeaky(llama.LlamaLM.Config, allocator, &reader, .{ .ignore_unknown_fields = true });
break :blk config_obj;
} else {
log.err("Missing --config", .{});
return;
// Resolve the model directory from the new --hf-model-path flag; it is
// required, so bail out with an error log when absent.
const hf_model_path = res.args.@"hf-model-path" orelse {
log.err("Missing --hf-model-path", .{});
return;
};
// config.json is expected directly inside the model directory.
const model_config_path = try std.fs.path.join(allocator, &.{ hf_model_path, "config.json" });
defer allocator.free(model_config_path);
// Prefer single-file weights (model.safetensors); if that file is not
// accessible, fall back to the sharded index (model.safetensors.index.json).
// NOTE(review): the sharded path is not access-checked — a missing file
// presumably surfaces later when the weights are opened; confirm intended.
const model_weights_path = b: {
const simple_path = try std.fs.path.join(allocator, &.{ hf_model_path, "model.safetensors" });
if (asynk.File.access(simple_path, .{})) {
break :b simple_path;
} else |_| {
// Single-file weights absent: free this candidate before falling back.
allocator.free(simple_path);
}
const sharded_path = try std.fs.path.join(allocator, &.{ hf_model_path, "model.safetensors.index.json" });
break :b sharded_path;
};
defer allocator.free(model_weights_path);
// tokenizer.json also lives in the model directory.
const model_tokenizer_path = try std.fs.path.join(allocator, &.{ hf_model_path, "tokenizer.json" });
defer allocator.free(model_tokenizer_path);
// Parse config.json into the model config; unknown fields are ignored so
// extra HuggingFace metadata does not break parsing. The parse is "leaky":
// the result borrows from `allocator` and is not individually freed here.
const config = blk: {
var config_json_file = try asynk.File.open(model_config_path, .{ .mode = .read_only });
defer config_json_file.close() catch unreachable;
var reader = std.json.reader(allocator, config_json_file.reader());
defer reader.deinit();
const config_obj = try std.json.parseFromTokenSourceLeaky(llama.LlamaLM.Config, allocator, &reader, .{ .ignore_unknown_fields = true });
break :blk config_obj;
};
var context = try zml.Context.init();
@ -229,7 +246,7 @@ pub fn asyncMain() !void {
create_opts.deinit();
context.printAvailablePlatforms(platform);
var ts = try zml.aio.detectFormatAndOpen(allocator, res.args.weights.?);
var ts = try zml.aio.detectFormatAndOpen(allocator, model_weights_path);
defer ts.deinit();
var model_arena = std.heap.ArenaAllocator.init(allocator);
@ -281,7 +298,7 @@ pub fn asyncMain() !void {
platform,
});
log.info("\tLoading Llama weights from {?s}...", .{res.args.weights});
log.info("\tLoading Llama weights from {?s}...", .{model_weights_path});
var llama_weights = try zml.aio.loadBuffers(llama.LlamaLM, .{ config, llama_options }, ts, model_arena.allocator(), platform);
defer zml.aio.unloadBuffers(&llama_weights);
log.info("\tLoaded weights in {}", .{std.fmt.fmtDuration(start.read())});
@ -296,16 +313,11 @@ pub fn asyncMain() !void {
const kv_cache = try llama.KvCache.initBuffer(kv_shape, platform);
var tokenizer = blk: {
if (res.args.tokenizer) |tok| {
log.info("Loading tokenizer from {s}", .{tok});
var timer = try stdx.time.Timer.start();
defer log.info("Loaded tokenizer from {s} [{}]", .{ tok, timer.read() });
log.info("Loading tokenizer from {s}", .{model_tokenizer_path});
var timer = try stdx.time.Timer.start();
defer log.info("Loaded tokenizer from {s} [{}]", .{ model_tokenizer_path, timer.read() });
break :blk try zml.tokenizer.Tokenizer.fromFile(model_arena.allocator(), tok);
} else {
log.err("Missing --tokenizer", .{});
return;
}
break :blk try zml.tokenizer.Tokenizer.fromFile(model_arena.allocator(), model_tokenizer_path);
};
errdefer tokenizer.deinit();