examples/llama: switch to --hf-model-path flag
Instead of passing separate config, model weights, and tokenizer paths, pass a single model directory (for example, one fetched with `huggingface-cli download`) that contains all three.
This commit is contained in:
parent
5a49a3e8ca
commit
111afcdd95
@ -21,106 +21,6 @@ zig_cc_binary(
|
||||
],
|
||||
)
|
||||
|
||||
cc_binary(
|
||||
name = "Llama-3.1-8B-Instruct",
|
||||
args = [
|
||||
"--config=$(location @Meta-Llama-3.1-8B-Instruct//:config.json)",
|
||||
"--weights=$(location @Meta-Llama-3.1-8B-Instruct//:model.safetensors.index.json)",
|
||||
"--tokenizer=$(location @Meta-Llama-3.1-8B-Instruct//:tokenizer.json)",
|
||||
],
|
||||
data = [
|
||||
"@Meta-Llama-3.1-8B-Instruct",
|
||||
"@Meta-Llama-3.1-8B-Instruct//:config.json",
|
||||
"@Meta-Llama-3.1-8B-Instruct//:model.safetensors.index.json",
|
||||
"@Meta-Llama-3.1-8B-Instruct//:tokenizer.json",
|
||||
],
|
||||
tags = [
|
||||
"manual",
|
||||
],
|
||||
deps = [":llama_lib"],
|
||||
)
|
||||
|
||||
cc_binary(
|
||||
name = "Llama-3.1-70B-Instruct",
|
||||
args = [
|
||||
"--config=$(location @Meta-Llama-3.1-70B-Instruct//:config.json)",
|
||||
"--weights=$(location @Meta-Llama-3.1-70B-Instruct//:model.safetensors.index.json)",
|
||||
"--tokenizer=$(location @Meta-Llama-3.1-70B-Instruct//:tokenizer.json)",
|
||||
],
|
||||
data = [
|
||||
"@Meta-Llama-3.1-70B-Instruct",
|
||||
"@Meta-Llama-3.1-70B-Instruct//:config.json",
|
||||
"@Meta-Llama-3.1-70B-Instruct//:model.safetensors.index.json",
|
||||
"@Meta-Llama-3.1-70B-Instruct//:tokenizer.json",
|
||||
],
|
||||
tags = [
|
||||
"manual",
|
||||
],
|
||||
deps = [":llama_lib"],
|
||||
)
|
||||
|
||||
cc_binary(
|
||||
name = "Llama-3.2-1B-Instruct",
|
||||
args = [
|
||||
"--config=$(location @Meta-Llama-3.2-1B-Instruct//:config.json)",
|
||||
"--weights=$(location @Meta-Llama-3.2-1B-Instruct//:model.safetensors)",
|
||||
"--tokenizer=$(location @Meta-Llama-3.2-1B-Instruct//:tokenizer.json)",
|
||||
],
|
||||
data = [
|
||||
"@Meta-Llama-3.2-1B-Instruct",
|
||||
"@Meta-Llama-3.2-1B-Instruct//:config.json",
|
||||
"@Meta-Llama-3.2-1B-Instruct//:model.safetensors",
|
||||
"@Meta-Llama-3.2-1B-Instruct//:tokenizer.json",
|
||||
],
|
||||
tags = [
|
||||
"manual",
|
||||
],
|
||||
deps = [":llama_lib"],
|
||||
)
|
||||
|
||||
cc_binary(
|
||||
name = "Llama-3.2-3B-Instruct",
|
||||
args = [
|
||||
"--config=$(location @Meta-Llama-3.2-3B-Instruct//:config.json)",
|
||||
"--weights=$(location @Meta-Llama-3.2-3B-Instruct//:model.safetensors.index.json)",
|
||||
"--tokenizer=$(location @Meta-Llama-3.2-3B-Instruct//:tokenizer.json)",
|
||||
],
|
||||
data = [
|
||||
"@Meta-Llama-3.2-3B-Instruct",
|
||||
"@Meta-Llama-3.2-3B-Instruct//:config.json",
|
||||
"@Meta-Llama-3.2-3B-Instruct//:model.safetensors.index.json",
|
||||
"@Meta-Llama-3.2-3B-Instruct//:tokenizer.json",
|
||||
],
|
||||
tags = [
|
||||
"manual",
|
||||
],
|
||||
deps = [":llama_lib"],
|
||||
)
|
||||
|
||||
cc_binary(
|
||||
name = "TinyLlama-Stories-15M",
|
||||
args = [
|
||||
"--config=$(location :tinyllama_stories15M_json)",
|
||||
"--weights=$(location @Karpathy-TinyLlama-Stories15M//file)",
|
||||
"--tokenizer=$(location @Karpathy-TinyLlama-Tokenizer//file)",
|
||||
"--prompt='Once upon a time, there was a little girl named Lily.'",
|
||||
"--no-llama3=1", # don't do template prompt encoding, I'm a simple model
|
||||
"--sharding=false", # don't shard me, I'm so small
|
||||
],
|
||||
data = [
|
||||
":tinyllama_stories15M_json",
|
||||
"@Karpathy-TinyLlama-Stories15M//file",
|
||||
"@Karpathy-TinyLlama-Tokenizer//file",
|
||||
],
|
||||
deps = [":llama_lib"],
|
||||
)
|
||||
|
||||
write_file(
|
||||
name = "tinyllama_stories15M_json",
|
||||
out = "config.json",
|
||||
content = ['{"bos_token_id":1,"eos_token_id":2,"hidden_act":"silu","hidden_size":288,"intermediate_size":768,"max_position_embeddings":256,"model_type":"llama","num_attention_heads":6,"num_hidden_layers":6,"num_key_value_heads":6,"rms_norm_eps":1e-05,"hf_rope_impl":false,"rope_scaling":null,"rope_theta":10000.0}'],
|
||||
)
|
||||
|
||||
zig_cc_binary(
|
||||
name = "test-implementation",
|
||||
srcs = ["llama.zig"],
|
||||
@ -169,7 +69,7 @@ Artificial Intelligence in Healthcare
|
||||
|
||||
mtree_spec(
|
||||
name = "mtree",
|
||||
srcs = [":Llama-3.2-1B-Instruct"],
|
||||
srcs = [":llama"],
|
||||
tags = [
|
||||
"manual",
|
||||
],
|
||||
@ -177,7 +77,7 @@ mtree_spec(
|
||||
|
||||
tar(
|
||||
name = "archive",
|
||||
srcs = [":Llama-3.2-1B-Instruct"],
|
||||
srcs = [":llama"],
|
||||
args = [
|
||||
"--options",
|
||||
"zstd:compression-level=9",
|
||||
@ -189,36 +89,10 @@ tar(
|
||||
],
|
||||
)
|
||||
|
||||
expand_template(
|
||||
name = "entrypoint",
|
||||
data = [
|
||||
":Llama-3.2-1B-Instruct",
|
||||
"@Meta-Llama-3.2-1B-Instruct",
|
||||
"@Meta-Llama-3.2-1B-Instruct//:config.json",
|
||||
"@Meta-Llama-3.2-1B-Instruct//:model.safetensors",
|
||||
"@Meta-Llama-3.2-1B-Instruct//:tokenizer.json",
|
||||
],
|
||||
substitutions = {
|
||||
":config": "$(rlocationpath @Meta-Llama-3.2-1B-Instruct//:config.json)",
|
||||
":weights": "$(rlocationpath @Meta-Llama-3.2-1B-Instruct//:model.safetensors)",
|
||||
":tokenizer": "$(rlocationpath @Meta-Llama-3.2-1B-Instruct//:tokenizer.json)",
|
||||
},
|
||||
tags = [
|
||||
"manual",
|
||||
],
|
||||
template = [
|
||||
"./{}/Llama-3.2-1B-Instruct".format(package_name()),
|
||||
"--config=./{}/Llama-3.2-1B-Instruct.runfiles/:config".format(package_name()),
|
||||
"--weights=./{}/Llama-3.2-1B-Instruct.runfiles/:weights".format(package_name()),
|
||||
"--tokenizer=./{}/Llama-3.2-1B-Instruct.runfiles/:tokenizer".format(package_name()),
|
||||
],
|
||||
)
|
||||
|
||||
oci_image(
|
||||
name = "image_",
|
||||
base = "@distroless_cc_debian12_debug",
|
||||
# entrypoint = ["./{}/Llama-3.2-1B-Instruct".format(package_name())],
|
||||
entrypoint = ":entrypoint",
|
||||
entrypoint = ["./{}/llama".format(package_name())],
|
||||
tags = [
|
||||
"manual",
|
||||
],
|
||||
@ -241,7 +115,7 @@ oci_load(
|
||||
name = "load",
|
||||
image = ":image",
|
||||
repo_tags = [
|
||||
"distroless/llama-3.2-1b-instruct:latest",
|
||||
"distroless/llama:latest",
|
||||
],
|
||||
tags = [
|
||||
"manual",
|
||||
@ -252,7 +126,7 @@ oci_push(
|
||||
name = "push",
|
||||
image = ":image",
|
||||
remote_tags = ["latest"],
|
||||
repository = "index.docker.io/steeve/llama-3.2-1b-instruct",
|
||||
repository = "index.docker.io/steeve/llama",
|
||||
tags = [
|
||||
"manual",
|
||||
],
|
||||
|
||||
@ -151,9 +151,7 @@ pub fn generateText(
|
||||
const params = clap.parseParamsComptime(
|
||||
\\--help print this help
|
||||
\\--prompt <STRING> the prompt
|
||||
\\--config <PATH> config.json path
|
||||
\\--weights <PATH> model weights path
|
||||
\\--tokenizer <PATH> tokenizer path
|
||||
\\--hf-model-path <STRING> path to the directory containing model weights, config and tokenizer
|
||||
\\--seed <UINT> random seed (optional)
|
||||
\\--seq-len <UINT> sequence length
|
||||
\\--create-options <STRING> platform creation options JSON, defaults to {}
|
||||
@ -199,18 +197,37 @@ pub fn asyncMain() !void {
|
||||
return;
|
||||
}
|
||||
|
||||
const hf_model_path = res.args.@"hf-model-path" orelse {
|
||||
log.err("Missing --hf-model-path", .{});
|
||||
return;
|
||||
};
|
||||
|
||||
const model_config_path = try std.fs.path.join(allocator, &.{ hf_model_path, "config.json" });
|
||||
defer allocator.free(model_config_path);
|
||||
|
||||
const model_weights_path = b: {
|
||||
const simple_path = try std.fs.path.join(allocator, &.{ hf_model_path, "model.safetensors" });
|
||||
if (asynk.File.access(simple_path, .{})) {
|
||||
break :b simple_path;
|
||||
} else |_| {
|
||||
allocator.free(simple_path);
|
||||
}
|
||||
|
||||
const sharded_path = try std.fs.path.join(allocator, &.{ hf_model_path, "model.safetensors.index.json" });
|
||||
break :b sharded_path;
|
||||
};
|
||||
defer allocator.free(model_weights_path);
|
||||
|
||||
const model_tokenizer_path = try std.fs.path.join(allocator, &.{ hf_model_path, "tokenizer.json" });
|
||||
defer allocator.free(model_tokenizer_path);
|
||||
|
||||
const config = blk: {
|
||||
if (res.args.config) |config_json_path| {
|
||||
var config_json_file = try asynk.File.open(config_json_path, .{ .mode = .read_only });
|
||||
var config_json_file = try asynk.File.open(model_config_path, .{ .mode = .read_only });
|
||||
defer config_json_file.close() catch unreachable;
|
||||
var reader = std.json.reader(allocator, config_json_file.reader());
|
||||
defer reader.deinit();
|
||||
const config_obj = try std.json.parseFromTokenSourceLeaky(llama.LlamaLM.Config, allocator, &reader, .{ .ignore_unknown_fields = true });
|
||||
break :blk config_obj;
|
||||
} else {
|
||||
log.err("Missing --config", .{});
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
var context = try zml.Context.init();
|
||||
@ -229,7 +246,7 @@ pub fn asyncMain() !void {
|
||||
create_opts.deinit();
|
||||
context.printAvailablePlatforms(platform);
|
||||
|
||||
var ts = try zml.aio.detectFormatAndOpen(allocator, res.args.weights.?);
|
||||
var ts = try zml.aio.detectFormatAndOpen(allocator, model_weights_path);
|
||||
defer ts.deinit();
|
||||
|
||||
var model_arena = std.heap.ArenaAllocator.init(allocator);
|
||||
@ -281,7 +298,7 @@ pub fn asyncMain() !void {
|
||||
platform,
|
||||
});
|
||||
|
||||
log.info("\tLoading Llama weights from {?s}...", .{res.args.weights});
|
||||
log.info("\tLoading Llama weights from {?s}...", .{model_weights_path});
|
||||
var llama_weights = try zml.aio.loadBuffers(llama.LlamaLM, .{ config, llama_options }, ts, model_arena.allocator(), platform);
|
||||
defer zml.aio.unloadBuffers(&llama_weights);
|
||||
log.info("✅\tLoaded weights in {}", .{std.fmt.fmtDuration(start.read())});
|
||||
@ -296,16 +313,11 @@ pub fn asyncMain() !void {
|
||||
const kv_cache = try llama.KvCache.initBuffer(kv_shape, platform);
|
||||
|
||||
var tokenizer = blk: {
|
||||
if (res.args.tokenizer) |tok| {
|
||||
log.info("Loading tokenizer from {s}", .{tok});
|
||||
log.info("Loading tokenizer from {s}", .{model_tokenizer_path});
|
||||
var timer = try stdx.time.Timer.start();
|
||||
defer log.info("Loaded tokenizer from {s} [{}]", .{ tok, timer.read() });
|
||||
defer log.info("Loaded tokenizer from {s} [{}]", .{ model_tokenizer_path, timer.read() });
|
||||
|
||||
break :blk try zml.tokenizer.Tokenizer.fromFile(model_arena.allocator(), tok);
|
||||
} else {
|
||||
log.err("Missing --tokenizer", .{});
|
||||
return;
|
||||
}
|
||||
break :blk try zml.tokenizer.Tokenizer.fromFile(model_arena.allocator(), model_tokenizer_path);
|
||||
};
|
||||
errdefer tokenizer.deinit();
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user