examples/llama: switch to --hf-model-path flag
Instead of passing separate config, model weights, and tokenizer paths, point the example at a single model directory fetched with `huggingface-cli download`.
This commit is contained in:
parent
5a49a3e8ca
commit
111afcdd95
@ -21,106 +21,6 @@ zig_cc_binary(
|
|||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
cc_binary(
|
|
||||||
name = "Llama-3.1-8B-Instruct",
|
|
||||||
args = [
|
|
||||||
"--config=$(location @Meta-Llama-3.1-8B-Instruct//:config.json)",
|
|
||||||
"--weights=$(location @Meta-Llama-3.1-8B-Instruct//:model.safetensors.index.json)",
|
|
||||||
"--tokenizer=$(location @Meta-Llama-3.1-8B-Instruct//:tokenizer.json)",
|
|
||||||
],
|
|
||||||
data = [
|
|
||||||
"@Meta-Llama-3.1-8B-Instruct",
|
|
||||||
"@Meta-Llama-3.1-8B-Instruct//:config.json",
|
|
||||||
"@Meta-Llama-3.1-8B-Instruct//:model.safetensors.index.json",
|
|
||||||
"@Meta-Llama-3.1-8B-Instruct//:tokenizer.json",
|
|
||||||
],
|
|
||||||
tags = [
|
|
||||||
"manual",
|
|
||||||
],
|
|
||||||
deps = [":llama_lib"],
|
|
||||||
)
|
|
||||||
|
|
||||||
cc_binary(
|
|
||||||
name = "Llama-3.1-70B-Instruct",
|
|
||||||
args = [
|
|
||||||
"--config=$(location @Meta-Llama-3.1-70B-Instruct//:config.json)",
|
|
||||||
"--weights=$(location @Meta-Llama-3.1-70B-Instruct//:model.safetensors.index.json)",
|
|
||||||
"--tokenizer=$(location @Meta-Llama-3.1-70B-Instruct//:tokenizer.json)",
|
|
||||||
],
|
|
||||||
data = [
|
|
||||||
"@Meta-Llama-3.1-70B-Instruct",
|
|
||||||
"@Meta-Llama-3.1-70B-Instruct//:config.json",
|
|
||||||
"@Meta-Llama-3.1-70B-Instruct//:model.safetensors.index.json",
|
|
||||||
"@Meta-Llama-3.1-70B-Instruct//:tokenizer.json",
|
|
||||||
],
|
|
||||||
tags = [
|
|
||||||
"manual",
|
|
||||||
],
|
|
||||||
deps = [":llama_lib"],
|
|
||||||
)
|
|
||||||
|
|
||||||
cc_binary(
|
|
||||||
name = "Llama-3.2-1B-Instruct",
|
|
||||||
args = [
|
|
||||||
"--config=$(location @Meta-Llama-3.2-1B-Instruct//:config.json)",
|
|
||||||
"--weights=$(location @Meta-Llama-3.2-1B-Instruct//:model.safetensors)",
|
|
||||||
"--tokenizer=$(location @Meta-Llama-3.2-1B-Instruct//:tokenizer.json)",
|
|
||||||
],
|
|
||||||
data = [
|
|
||||||
"@Meta-Llama-3.2-1B-Instruct",
|
|
||||||
"@Meta-Llama-3.2-1B-Instruct//:config.json",
|
|
||||||
"@Meta-Llama-3.2-1B-Instruct//:model.safetensors",
|
|
||||||
"@Meta-Llama-3.2-1B-Instruct//:tokenizer.json",
|
|
||||||
],
|
|
||||||
tags = [
|
|
||||||
"manual",
|
|
||||||
],
|
|
||||||
deps = [":llama_lib"],
|
|
||||||
)
|
|
||||||
|
|
||||||
cc_binary(
|
|
||||||
name = "Llama-3.2-3B-Instruct",
|
|
||||||
args = [
|
|
||||||
"--config=$(location @Meta-Llama-3.2-3B-Instruct//:config.json)",
|
|
||||||
"--weights=$(location @Meta-Llama-3.2-3B-Instruct//:model.safetensors.index.json)",
|
|
||||||
"--tokenizer=$(location @Meta-Llama-3.2-3B-Instruct//:tokenizer.json)",
|
|
||||||
],
|
|
||||||
data = [
|
|
||||||
"@Meta-Llama-3.2-3B-Instruct",
|
|
||||||
"@Meta-Llama-3.2-3B-Instruct//:config.json",
|
|
||||||
"@Meta-Llama-3.2-3B-Instruct//:model.safetensors.index.json",
|
|
||||||
"@Meta-Llama-3.2-3B-Instruct//:tokenizer.json",
|
|
||||||
],
|
|
||||||
tags = [
|
|
||||||
"manual",
|
|
||||||
],
|
|
||||||
deps = [":llama_lib"],
|
|
||||||
)
|
|
||||||
|
|
||||||
cc_binary(
|
|
||||||
name = "TinyLlama-Stories-15M",
|
|
||||||
args = [
|
|
||||||
"--config=$(location :tinyllama_stories15M_json)",
|
|
||||||
"--weights=$(location @Karpathy-TinyLlama-Stories15M//file)",
|
|
||||||
"--tokenizer=$(location @Karpathy-TinyLlama-Tokenizer//file)",
|
|
||||||
"--prompt='Once upon a time, there was a little girl named Lily.'",
|
|
||||||
"--no-llama3=1", # don't do template prompt encoding, I'm a simple model
|
|
||||||
"--sharding=false", # don't shard me, I'm so small
|
|
||||||
],
|
|
||||||
data = [
|
|
||||||
":tinyllama_stories15M_json",
|
|
||||||
"@Karpathy-TinyLlama-Stories15M//file",
|
|
||||||
"@Karpathy-TinyLlama-Tokenizer//file",
|
|
||||||
],
|
|
||||||
deps = [":llama_lib"],
|
|
||||||
)
|
|
||||||
|
|
||||||
write_file(
|
|
||||||
name = "tinyllama_stories15M_json",
|
|
||||||
out = "config.json",
|
|
||||||
content = ['{"bos_token_id":1,"eos_token_id":2,"hidden_act":"silu","hidden_size":288,"intermediate_size":768,"max_position_embeddings":256,"model_type":"llama","num_attention_heads":6,"num_hidden_layers":6,"num_key_value_heads":6,"rms_norm_eps":1e-05,"hf_rope_impl":false,"rope_scaling":null,"rope_theta":10000.0}'],
|
|
||||||
)
|
|
||||||
|
|
||||||
zig_cc_binary(
|
zig_cc_binary(
|
||||||
name = "test-implementation",
|
name = "test-implementation",
|
||||||
srcs = ["llama.zig"],
|
srcs = ["llama.zig"],
|
||||||
@ -169,7 +69,7 @@ Artificial Intelligence in Healthcare
|
|||||||
|
|
||||||
mtree_spec(
|
mtree_spec(
|
||||||
name = "mtree",
|
name = "mtree",
|
||||||
srcs = [":Llama-3.2-1B-Instruct"],
|
srcs = [":llama"],
|
||||||
tags = [
|
tags = [
|
||||||
"manual",
|
"manual",
|
||||||
],
|
],
|
||||||
@ -177,7 +77,7 @@ mtree_spec(
|
|||||||
|
|
||||||
tar(
|
tar(
|
||||||
name = "archive",
|
name = "archive",
|
||||||
srcs = [":Llama-3.2-1B-Instruct"],
|
srcs = [":llama"],
|
||||||
args = [
|
args = [
|
||||||
"--options",
|
"--options",
|
||||||
"zstd:compression-level=9",
|
"zstd:compression-level=9",
|
||||||
@ -189,36 +89,10 @@ tar(
|
|||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
expand_template(
|
|
||||||
name = "entrypoint",
|
|
||||||
data = [
|
|
||||||
":Llama-3.2-1B-Instruct",
|
|
||||||
"@Meta-Llama-3.2-1B-Instruct",
|
|
||||||
"@Meta-Llama-3.2-1B-Instruct//:config.json",
|
|
||||||
"@Meta-Llama-3.2-1B-Instruct//:model.safetensors",
|
|
||||||
"@Meta-Llama-3.2-1B-Instruct//:tokenizer.json",
|
|
||||||
],
|
|
||||||
substitutions = {
|
|
||||||
":config": "$(rlocationpath @Meta-Llama-3.2-1B-Instruct//:config.json)",
|
|
||||||
":weights": "$(rlocationpath @Meta-Llama-3.2-1B-Instruct//:model.safetensors)",
|
|
||||||
":tokenizer": "$(rlocationpath @Meta-Llama-3.2-1B-Instruct//:tokenizer.json)",
|
|
||||||
},
|
|
||||||
tags = [
|
|
||||||
"manual",
|
|
||||||
],
|
|
||||||
template = [
|
|
||||||
"./{}/Llama-3.2-1B-Instruct".format(package_name()),
|
|
||||||
"--config=./{}/Llama-3.2-1B-Instruct.runfiles/:config".format(package_name()),
|
|
||||||
"--weights=./{}/Llama-3.2-1B-Instruct.runfiles/:weights".format(package_name()),
|
|
||||||
"--tokenizer=./{}/Llama-3.2-1B-Instruct.runfiles/:tokenizer".format(package_name()),
|
|
||||||
],
|
|
||||||
)
|
|
||||||
|
|
||||||
oci_image(
|
oci_image(
|
||||||
name = "image_",
|
name = "image_",
|
||||||
base = "@distroless_cc_debian12_debug",
|
base = "@distroless_cc_debian12_debug",
|
||||||
# entrypoint = ["./{}/Llama-3.2-1B-Instruct".format(package_name())],
|
entrypoint = ["./{}/llama".format(package_name())],
|
||||||
entrypoint = ":entrypoint",
|
|
||||||
tags = [
|
tags = [
|
||||||
"manual",
|
"manual",
|
||||||
],
|
],
|
||||||
@ -241,7 +115,7 @@ oci_load(
|
|||||||
name = "load",
|
name = "load",
|
||||||
image = ":image",
|
image = ":image",
|
||||||
repo_tags = [
|
repo_tags = [
|
||||||
"distroless/llama-3.2-1b-instruct:latest",
|
"distroless/llama:latest",
|
||||||
],
|
],
|
||||||
tags = [
|
tags = [
|
||||||
"manual",
|
"manual",
|
||||||
@ -252,7 +126,7 @@ oci_push(
|
|||||||
name = "push",
|
name = "push",
|
||||||
image = ":image",
|
image = ":image",
|
||||||
remote_tags = ["latest"],
|
remote_tags = ["latest"],
|
||||||
repository = "index.docker.io/steeve/llama-3.2-1b-instruct",
|
repository = "index.docker.io/steeve/llama",
|
||||||
tags = [
|
tags = [
|
||||||
"manual",
|
"manual",
|
||||||
],
|
],
|
||||||
|
|||||||
@ -151,9 +151,7 @@ pub fn generateText(
|
|||||||
const params = clap.parseParamsComptime(
|
const params = clap.parseParamsComptime(
|
||||||
\\--help print this help
|
\\--help print this help
|
||||||
\\--prompt <STRING> the prompt
|
\\--prompt <STRING> the prompt
|
||||||
\\--config <PATH> config.json path
|
\\--hf-model-path <STRING> path to the directory containing model weights, config and tokenizer
|
||||||
\\--weights <PATH> model weights path
|
|
||||||
\\--tokenizer <PATH> tokenizer path
|
|
||||||
\\--seed <UINT> random seed (optional)
|
\\--seed <UINT> random seed (optional)
|
||||||
\\--seq-len <UINT> sequence length
|
\\--seq-len <UINT> sequence length
|
||||||
\\--create-options <STRING> platform creation options JSON, defaults to {}
|
\\--create-options <STRING> platform creation options JSON, defaults to {}
|
||||||
@ -199,18 +197,37 @@ pub fn asyncMain() !void {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const config = blk: {
|
const hf_model_path = res.args.@"hf-model-path" orelse {
|
||||||
if (res.args.config) |config_json_path| {
|
log.err("Missing --hf-model-path", .{});
|
||||||
var config_json_file = try asynk.File.open(config_json_path, .{ .mode = .read_only });
|
return;
|
||||||
defer config_json_file.close() catch unreachable;
|
};
|
||||||
var reader = std.json.reader(allocator, config_json_file.reader());
|
|
||||||
defer reader.deinit();
|
const model_config_path = try std.fs.path.join(allocator, &.{ hf_model_path, "config.json" });
|
||||||
const config_obj = try std.json.parseFromTokenSourceLeaky(llama.LlamaLM.Config, allocator, &reader, .{ .ignore_unknown_fields = true });
|
defer allocator.free(model_config_path);
|
||||||
break :blk config_obj;
|
|
||||||
} else {
|
const model_weights_path = b: {
|
||||||
log.err("Missing --config", .{});
|
const simple_path = try std.fs.path.join(allocator, &.{ hf_model_path, "model.safetensors" });
|
||||||
return;
|
if (asynk.File.access(simple_path, .{})) {
|
||||||
|
break :b simple_path;
|
||||||
|
} else |_| {
|
||||||
|
allocator.free(simple_path);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const sharded_path = try std.fs.path.join(allocator, &.{ hf_model_path, "model.safetensors.index.json" });
|
||||||
|
break :b sharded_path;
|
||||||
|
};
|
||||||
|
defer allocator.free(model_weights_path);
|
||||||
|
|
||||||
|
const model_tokenizer_path = try std.fs.path.join(allocator, &.{ hf_model_path, "tokenizer.json" });
|
||||||
|
defer allocator.free(model_tokenizer_path);
|
||||||
|
|
||||||
|
const config = blk: {
|
||||||
|
var config_json_file = try asynk.File.open(model_config_path, .{ .mode = .read_only });
|
||||||
|
defer config_json_file.close() catch unreachable;
|
||||||
|
var reader = std.json.reader(allocator, config_json_file.reader());
|
||||||
|
defer reader.deinit();
|
||||||
|
const config_obj = try std.json.parseFromTokenSourceLeaky(llama.LlamaLM.Config, allocator, &reader, .{ .ignore_unknown_fields = true });
|
||||||
|
break :blk config_obj;
|
||||||
};
|
};
|
||||||
|
|
||||||
var context = try zml.Context.init();
|
var context = try zml.Context.init();
|
||||||
@ -229,7 +246,7 @@ pub fn asyncMain() !void {
|
|||||||
create_opts.deinit();
|
create_opts.deinit();
|
||||||
context.printAvailablePlatforms(platform);
|
context.printAvailablePlatforms(platform);
|
||||||
|
|
||||||
var ts = try zml.aio.detectFormatAndOpen(allocator, res.args.weights.?);
|
var ts = try zml.aio.detectFormatAndOpen(allocator, model_weights_path);
|
||||||
defer ts.deinit();
|
defer ts.deinit();
|
||||||
|
|
||||||
var model_arena = std.heap.ArenaAllocator.init(allocator);
|
var model_arena = std.heap.ArenaAllocator.init(allocator);
|
||||||
@ -281,7 +298,7 @@ pub fn asyncMain() !void {
|
|||||||
platform,
|
platform,
|
||||||
});
|
});
|
||||||
|
|
||||||
log.info("\tLoading Llama weights from {?s}...", .{res.args.weights});
|
log.info("\tLoading Llama weights from {?s}...", .{model_weights_path});
|
||||||
var llama_weights = try zml.aio.loadBuffers(llama.LlamaLM, .{ config, llama_options }, ts, model_arena.allocator(), platform);
|
var llama_weights = try zml.aio.loadBuffers(llama.LlamaLM, .{ config, llama_options }, ts, model_arena.allocator(), platform);
|
||||||
defer zml.aio.unloadBuffers(&llama_weights);
|
defer zml.aio.unloadBuffers(&llama_weights);
|
||||||
log.info("✅\tLoaded weights in {}", .{std.fmt.fmtDuration(start.read())});
|
log.info("✅\tLoaded weights in {}", .{std.fmt.fmtDuration(start.read())});
|
||||||
@ -296,16 +313,11 @@ pub fn asyncMain() !void {
|
|||||||
const kv_cache = try llama.KvCache.initBuffer(kv_shape, platform);
|
const kv_cache = try llama.KvCache.initBuffer(kv_shape, platform);
|
||||||
|
|
||||||
var tokenizer = blk: {
|
var tokenizer = blk: {
|
||||||
if (res.args.tokenizer) |tok| {
|
log.info("Loading tokenizer from {s}", .{model_tokenizer_path});
|
||||||
log.info("Loading tokenizer from {s}", .{tok});
|
var timer = try stdx.time.Timer.start();
|
||||||
var timer = try stdx.time.Timer.start();
|
defer log.info("Loaded tokenizer from {s} [{}]", .{ model_tokenizer_path, timer.read() });
|
||||||
defer log.info("Loaded tokenizer from {s} [{}]", .{ tok, timer.read() });
|
|
||||||
|
|
||||||
break :blk try zml.tokenizer.Tokenizer.fromFile(model_arena.allocator(), tok);
|
break :blk try zml.tokenizer.Tokenizer.fromFile(model_arena.allocator(), model_tokenizer_path);
|
||||||
} else {
|
|
||||||
log.err("Missing --tokenizer", .{});
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
errdefer tokenizer.deinit();
|
errdefer tokenizer.deinit();
|
||||||
|
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user