From e1b7fc578193affcb05a9c09636be4f288a5868f Mon Sep 17 00:00:00 2001 From: Foke Singh Date: Tue, 30 Sep 2025 16:17:00 +0000 Subject: [PATCH] Add example implementation and Bazel build for OpenAI gpt-oss models (GptOss.zig, main.zig, and BUILD.bazel). --- examples/gpt_oss/BUILD.bazel | 16 + examples/gpt_oss/GptOss.zig | 741 +++++++++++++++++++++++++++++++++++ examples/gpt_oss/main.zig | 376 ++++++++++++++++++ 3 files changed, 1133 insertions(+) create mode 100644 examples/gpt_oss/BUILD.bazel create mode 100644 examples/gpt_oss/GptOss.zig create mode 100644 examples/gpt_oss/main.zig diff --git a/examples/gpt_oss/BUILD.bazel b/examples/gpt_oss/BUILD.bazel new file mode 100644 index 0000000..7efba06 --- /dev/null +++ b/examples/gpt_oss/BUILD.bazel @@ -0,0 +1,16 @@ +load("@rules_zig//zig:defs.bzl", "zig_binary", "zig_test") + +zig_binary( + name = "gpt_oss", + srcs = [ + "GptOss.zig", + ], + main = "main.zig", + deps = [ + "@com_github_hejsil_clap//:clap", + "@zml//async", + "@zml//stdx", + "@zml//zml", + ], + visibility = ["//visibility:public"], +) diff --git a/examples/gpt_oss/GptOss.zig b/examples/gpt_oss/GptOss.zig new file mode 100644 index 0000000..bce16da --- /dev/null +++ b/examples/gpt_oss/GptOss.zig @@ -0,0 +1,741 @@ +///! GptOss architecture, using huggingface transformers naming. +///! 
Dimensions of activations: {.b, .s, .d} +const std = @import("std"); + +const stdx = @import("stdx"); +const zml = @import("zml"); + +const GptOss = @This(); +const log = std.log.scoped(.GptOss); + +pub const Config = struct { + bos_token_id: u32 = 199998, + eos_token_id: stdx.json.Union(union(enum) { + int: u32, + ints: []const u32, + }), + head_dim: u32, + num_hidden_layers: u32, + num_attention_heads: u32, + num_key_value_heads: u32, + experts_per_token: u32, + rope_theta: f32, + max_position_embeddings: u32, + rms_norm_eps: f32, + sliding_window: u32, + hf_rope_impl: bool = true, + rope_scaling: zml.nn.RopeOpts.Scaling = .{ .default = {} }, +}; + +pub const Options = struct { + sampling_strategy: zml.nn.SamplingStrategy, + max_seq_len: u32, + max_prompt_len: u32, + tokens_per_expert_ratio: f32, +}; + +pub const Mode = union(enum) { + /// In prefill mode we pass the actual len of the prompt + prefill: zml.Tensor, + /// In gen mode we pass the position of the next token + gen: zml.Tensor, +}; + +lm_head: ?zml.nn.Linear, +model: Model, + +config: Config, +options: Options, + +pub fn init(allocator: std.mem.Allocator, store: zml.aio.BufferStore, config: Config, options: Options) !GptOss { + var self: GptOss = .{ + .config = config, + .options = options, + .model = .{ + .max_seq_len = @intCast(options.max_seq_len), + .num_heads = @intCast(config.num_attention_heads), + .num_kv_heads = @intCast(config.num_key_value_heads), + .rope_opts = .{ + .layout = if (config.hf_rope_impl) .sequential else .interleaved, + .freq_base = config.rope_theta, + .scaling = config.rope_scaling, + }, + + .embed_tokens = .{ + .weight = store.getTensor("model.embed_tokens.weight").withSharding(.{1}), + }, + .layers = try allocator.alloc(TransformerLayer, config.num_hidden_layers), + .norm = .{ + .weight = store.getTensor("model.norm.weight"), + .eps = config.rms_norm_eps, + }, + }, + .lm_head = .{ .weight = store.getTensor("lm_head.weight").withSharding(.{0}) }, + }; + + var prefix: 
zml.aio.PrefixBuilder = try .initCapacity(allocator, 1024); + try prefix.push(stdx.noalloc, "model.layers"); + for (self.model.layers, 0..) |*layer, i| { + try prefix.pushDigit(stdx.noalloc, i); + defer prefix.pop(); + + var self_attn: SelfAttn = .{ + .sinks = store.getTensor(prefix.concat("self_attn.sinks")), + .q_proj = try zml.aio.populateModelWithPrefix(zml.nn.Linear, allocator, store, prefix.concat("self_attn.q_proj")), + .k_proj = try zml.aio.populateModelWithPrefix(zml.nn.Linear, allocator, store, prefix.concat("self_attn.k_proj")), + .v_proj = try zml.aio.populateModelWithPrefix(zml.nn.Linear, allocator, store, prefix.concat("self_attn.v_proj")), + .o_proj = try zml.aio.populateModelWithPrefix(zml.nn.Linear, allocator, store, prefix.concat("self_attn.o_proj")), + + .sliding_window = if (i % 2 == 0) config.sliding_window else null, + .num_heads = self.model.num_heads, + .num_kv_heads = self.model.num_kv_heads, + .rope_opts = self.model.rope_opts, + }; + + self_attn.q_proj.weight = self_attn.q_proj.weight.withSharding(.{0}); + self_attn.k_proj.weight = self_attn.k_proj.weight.withSharding(.{0}); + self_attn.v_proj.weight = self_attn.v_proj.weight.withSharding(.{0}); + self_attn.o_proj.weight = self_attn.o_proj.weight.withSharding(.{1}); + + const on_disk_moe = try zml.aio.populateModelWithPrefix(MoE.OnDisk, allocator, store, prefix.concat("mlp")); + var moe = on_disk_moe.rewrite(config.experts_per_token, options); + { + moe.experts.gate_up_proj.blocks = moe.experts.gate_up_proj.blocks.withSharding(.{.expert}); + moe.experts.down_proj.blocks = moe.experts.down_proj.blocks.withSharding(.{.expert}); + } + + layer.* = .{ + .input_layernorm = .{ + .weight = store.getTensor(prefix.concat("input_layernorm.weight")), + .eps = config.rms_norm_eps, + }, + .post_attention_layernorm = .{ + .weight = store.getTensor(prefix.concat("post_attention_layernorm.weight")), + .eps = config.rms_norm_eps, + }, + .self_attn = self_attn, + .mlp = moe, + }; + } + + // TODO(Corentin): 
Fix lm_head sharding when top-k sampling is enabled. + // It currently crashes/compilation fails + if (self.options.sampling_strategy.topk == 1 and self.lm_head != null) { + self.lm_head.?.weight = self.lm_head.?.weight.withSharding(.{0}); + } + + return self; +} + +/// Predicts the token at `token_index` position. +/// Returns: +/// - updated `tokens`, +/// - updated KV cache +/// - a Rng state to allow for probabilistic generation +pub fn forward( + self: GptOss, + tokens_: zml.Tensor, + mode: Mode, + kv_cache: KvCache, + rng: zml.Tensor.Rng, +) struct { zml.Tensor, KvCache, zml.Tensor.Rng } { + const tokens = tokens_.withPartialTags(.{.s}); + + // token index is the position in the kv cache where to write results. + const token_index: zml.Tensor = switch (mode) { + .gen => |token_index| token_index, + .prefill => .scalar(0, .u32), + }; + var out, const updated_kv_cache = zml.call(self.model, .forward, .{ tokens, token_index, kv_cache }); + + switch (mode) { + // In prefill we only pass the last token to the lm head. 
+ .prefill => |prompt_len| out = out.gather(.{ .s = prompt_len.convert(.i32).addConstant(-1) }, .{ .indices_are_sorted = true }), + .gen => {}, + } + + var new_token, const new_rng = self.sampleTokens(self.lm_head, out, rng, self.options.sampling_strategy); + new_token = new_token.convert(.u32); + new_token = switch (mode) { + .gen => new_token.reuseBuffer(tokens), + .prefill => new_token.appendAxes(.{.s}), + }; + return .{ new_token, updated_kv_cache, new_rng }; +} + +fn sampleTokens( + self: GptOss, + lm_head_: ?zml.nn.Linear, + out_: zml.Tensor, + rng: zml.Tensor.Rng, + opts: zml.nn.SamplingStrategy, +) struct { zml.Tensor, zml.Tensor.Rng } { + const out = out_.withPartialTags(.{.d}); + + var logits = blk: { + if (lm_head_) |lm_head| { + break :blk zml.call(lm_head, .forward, .{out}); + } else { + break :blk self.model.embed_tokens.weight.withTags(.{ .voc, .d }).dot(out, .{.d}); + } + }; + + if (logits.shape().hasTag(.voc) == null) + logits = logits.rename(.{ .d = .voc }); + + const next_tokens, const new_rng = zml.nn.sampleTokens(logits, opts, rng); + return .{ next_tokens, new_rng }; +} + +pub fn loadBuffers(self: GptOss, allocator: std.mem.Allocator, store: zml.aio.BufferStore, platform: zml.Platform) !zml.Bufferized(GptOss) { + var prefix: zml.aio.PrefixBuilder = try .initCapacity(allocator, 256); + defer prefix.deinit(allocator); + + const noalloc = stdx.noalloc; + const loaded: zml.Bufferized(GptOss) = .{ + .model = .{ + .embed_tokens = try store.loadModelById(zml.nn.TokenEmbedding, noalloc, self.model.embed_tokens, platform), + .layers = try allocator.alloc(zml.Bufferized(TransformerLayer), self.model.layers.len), + .norm = try store.loadModelById(RmsNorm, noalloc, self.model.norm, platform), + }, + .lm_head = try store.loadModelById(?zml.nn.Linear, noalloc, self.lm_head, platform), + }; + + prefix.push(noalloc, "model.layers") catch unreachable; + for (loaded.model.layers, self.model.layers, 0..) 
|*d_layer, layer, layer_id| { + const ckpt = prefix.checkpoint(); + defer prefix.restore(ckpt); + + prefix.pushDigit(noalloc, layer_id) catch unreachable; + d_layer.* = .{ + .input_layernorm = try store.loadModelById(RmsNorm, noalloc, layer.input_layernorm, platform), + .self_attn = try store.loadModelById(SelfAttn, noalloc, layer.self_attn, platform), + .post_attention_layernorm = try store.loadModelById(RmsNorm, noalloc, layer.post_attention_layernorm, platform), + .mlp = try store.loadModelById(MoE, noalloc, layer.mlp, platform), + }; + } + + return loaded; +} + +pub const Model = struct { + embed_tokens: zml.nn.TokenEmbedding, + norm: RmsNorm, + layers: []TransformerLayer, + + max_seq_len: u32 = 0, + num_heads: i64 = 32, + num_kv_heads: i64 = 32, + rope_opts: zml.nn.RopeOpts = .{ + .layout = .interleaved, + .freq_base = 10_000, + }, + + /// Forward one token, using KV cache for previous tokens. + /// Returns result and updated KV cache. + pub fn forward(self: Model, tokens: zml.Tensor, token_index: zml.Tensor, kv_cache: KvCache) struct { zml.Tensor, KvCache } { + const embeds = embed(self.embed_tokens, tokens); + var hidden = embeds; + + var updated_kv_cache = kv_cache; + for (self.layers, 0..) 
|layer, i| { + hidden, updated_kv_cache = zml.call(layer, .forward, .{ hidden, token_index, updated_kv_cache.atLayer(i) }); + } + const output = zml.call(self.norm, .forward, .{hidden}); + + return .{ output, updated_kv_cache.reuseBuffer(kv_cache) }; + } + + pub fn embed(embed_tokens_: zml.nn.TokenEmbedding, tokens_: zml.Tensor) zml.Tensor { + return zml.call(embed_tokens_, .forward, .{tokens_}).withPartialTags(.{.d}); + } +}; + +pub const TransformerLayer = struct { + input_layernorm: RmsNorm, + self_attn: SelfAttn, + post_attention_layernorm: RmsNorm, + mlp: MoE, + + pub fn forward( + self: TransformerLayer, + x0: zml.Tensor, + token_index: zml.Tensor, + kv_cache: KvCache, + ) struct { zml.Tensor, KvCache } { + // Self Attention + //log.debug("TransformerLayer({}) -> {}", .{ x0, self.input_layernorm.forward(x0) }); + stdx.debug.assert(x0.rank() >= 2 and x0.shape().hasTags(.{ .s, .d }), "TransformerLayer expected input shape: {{..., .s, .d}}, received: {f}", .{x0}); + + const x0_normalized = zml.call(self.input_layernorm, .forward, .{x0}); + const delta0, const updated_kv_cache = zml.call(self.self_attn, .forward, .{ x0_normalized, token_index, kv_cache }); + const x1 = x0.add(delta0); + + // Fully Connected + const x1_normalized = zml.call(self.post_attention_layernorm, .forward, .{x1}); + const x2 = zml.call(self.mlp, .forward, .{x1_normalized}).add(x1); + + return .{ x2.reuseBuffer(x0), updated_kv_cache }; + } +}; + +const RmsNorm = struct { + weight: zml.Tensor, + eps: f32 = 1e-6, + + /// L2 normalization of input tensor along `.d` axis. + pub fn forward(self: RmsNorm, input: zml.Tensor) zml.Tensor { + const x = if (input.shape().isFullyTagged()) input else input.withPartialTags(.{.d}); + // Note: contrary to Llama here the full layer is done in .f32, not just the variance computation. 
+ const normalized = zml.nn.rmsNorm(x.convert(.f32), .d, self.eps); + return normalized.mul(self.weight.convert(.f32).withTags(.{.d}).broad(x.shape())).convert(input.dtype()); + } +}; + +const MoE = struct { + experts: Mlp, + router: zml.nn.Linear, + moe_opts: MoeOpts, + + pub fn forward(self: MoE, input: zml.Tensor) zml.Tensor { + log.warn("compiling moe with {f}", .{input}); + // Note: GptOss applies softmax on the routing score. + // We delay the softmax to mixtureOfExperts where the actual routing is done. + // This allow to do re-routing without introducing nans. + const gating = self.router.forward(input); + return mixtureOfExperts(Mlp, self.experts, input, gating, self.moe_opts); + } + + pub const OnDisk = struct { + router: zml.nn.Linear, + experts: struct { + down_proj_bias: zml.Tensor, + down_proj_blocks: zml.Tensor, + down_proj_scales: zml.Tensor, + gate_up_proj_bias: zml.Tensor, + gate_up_proj_blocks: zml.Tensor, + gate_up_proj_scales: zml.Tensor, + }, + + pub fn rewrite(on_disk: OnDisk, experts_per_token: u32, options: Options) MoE { + const e = on_disk.experts; + return .{ + .experts = .{ + .gate_up_proj = .{ + // We need to bitcast the scale cause safetensors doesn't encode f8 types correctly + .scale = e.gate_up_proj_scales.withTags(.{ .expert, .out, .d }), + // We don't bitcast here because PJRT doesn't handle packed host buffers + .blocks = e.gate_up_proj_blocks.withTags(.{ .expert, .out, .d, .d_block }), + .blocks_dtype = .f4e2m1, + .bias = e.gate_up_proj_bias.withTags(.{ .expert, .d }), + }, + .down_proj = .{ + .blocks = e.down_proj_blocks.withTags(.{ .expert, .out, .d, .d_block }), + .blocks_dtype = .f4e2m1, + .scale = e.down_proj_scales.withTags(.{ .expert, .out, .d }), + .bias = e.down_proj_bias.withTags(.{ .expert, .d }), + }, + }, + + .router = .{ + .weight = on_disk.router.weight.withTags(.{ .expert, .d }), + .bias = on_disk.router.bias.?.withTags(.{.expert}), + }, + + .moe_opts = .{ + .experts_per_token = experts_per_token, + 
.tokens_per_expert_ratio = options.tokens_per_expert_ratio, + .normalization = .softmax, + }, + }; + } + }; +}; + +pub const Mlp = struct { + gate_up_proj: BlockScaledLinear, // {.out = intermediate_size * 2, .d = hidden_size / block_size, .d_block = block_size } + down_proj: BlockScaledLinear, // {.out = hidden_size * 2, .d = intermediate_size / block_size, .d_block = block_size } + + pub fn forward(self: Mlp, x: zml.Tensor) zml.Tensor { + const dt = x.dtype(); + var gate, var up = zml.nn.splitRealImg(self.gate_up_proj.forward(x), .interleaved); + gate = .minimum(gate, .scalar(7, dt)); + up = .clamp(up, .scalar(-7, dt), .scalar(7, dt)); + + const out = gate.quickGelu().mul(up.addConstant(1)); + return zml.call(self.down_proj, .forward, .{out}); + } + + pub fn format(self: Mlp, writer: *std.Io.Writer) std.Io.Writer.Error!void { + try writer.print("Mlp(gate_up_proj=.{f}, down_proj=.{f})", .{ self.gate_up_proj, self.down_proj }); + } +}; + +pub const SelfAttn = struct { + q_proj: zml.nn.Linear, + k_proj: zml.nn.Linear, + v_proj: zml.nn.Linear, + sinks: zml.Tensor, + + o_proj: zml.nn.Linear, + + sliding_window: ?u32, + num_heads: i64, + num_kv_heads: i64, + rope_opts: zml.nn.RopeOpts, + + /// Self Attention. + /// - If token_index is set, x is assumed to be the representation of one new token, + /// and kv_cache will be read for the previous tokens. + /// - If token_index is not set, x is assumed to be the representation of all tokens + /// since the beginning of the sequence, and kv_cache won't be read. + /// In both case, kv_cache will be updated with the computed key and value. 
+ /// x: {.b, .s, .d } -> .{.b, .s, .d} + pub fn forward( + self: SelfAttn, + x: zml.Tensor, + token_index: zml.Tensor, + kv_cache: KvCache, + ) struct { zml.Tensor, KvCache } { + const num_kv_heads = self.num_kv_heads; + var q = zml.call(self.q_proj, .forward, .{x}).splitAxis(-1, .{ .h = self.num_heads, .hd = .auto }).withSharding(.{.h}); + var k = zml.call(self.k_proj, .forward, .{x}).splitAxis(-1, .{ .h = num_kv_heads, .hd = .auto }).withSharding(.{.h}); + var v = zml.call(self.v_proj, .forward, .{x}).splitAxis(-1, .{ .h = num_kv_heads, .hd = .auto }).withSharding(.{.h}); + + // Generate the attention mask. + const seq_len = kv_cache.k.dim(.k); + var attn_mask = zml.nn.causalAttnMask(.{ .q = seq_len, .k = seq_len }, x.dtype(), self.sliding_window); + + // Note: in Pytorch it would be very inefficient to generate the full attn_mask, + // then slice into it, but XLA is able to optimize this correctly. + attn_mask = attn_mask.gatherSlices(zml.Shape.init(.{ .q = x.dim(.s) }, attn_mask.dtype()), token_index.reshape(.{ .coord = 1 }), .{}); + + // In self-attention, .s axis is used both for keys and queries. + const pos_index = b: { + const temp = zml.Tensor.arange(.{ .end = x.dim(.s) }, token_index.dtype()).withTags(.{.s}).broad(zml.Shape.init(.{ .s = x.dim(.s) }, token_index.dtype())); + break :b temp.add(token_index.broad(temp.shape())); + }; + + q = zml.nn.rope(q, pos_index, self.rope_opts); + k = zml.nn.rope(k, pos_index, self.rope_opts); + q = q.rename(.{ .s = .q }); + k = k.rename(.{ .s = .k }); + v = v.rename(.{ .s = .k }); + + const dtype = q.dtype(); + const new_kv_cache = kv_cache.update(k, v, token_index); + k = new_kv_cache.keys().convert(dtype); + v = new_kv_cache.values().convert(dtype); + + // TODO ringbuffer kv cache. 
+ + const softmax_bias = self.sinks.withTags(.{.h}); + const attn_output = zml.nn.sdpa(q, k, v, .{ .attn_mask = attn_mask, .softmax_bias = softmax_bias }); + const attn = attn_output.merge(.{ .d = .{ .h, .hd } }).rename(.{ .q = .s }); + return .{ zml.call(self.o_proj, .forward, .{attn}), new_kv_cache }; + } +}; + +pub const KvCache = struct { + k: zml.Tensor, + v: zml.Tensor, + layer_index: zml.Tensor, + + pub fn init(kv_shape: zml.Shape) KvCache { + // The KV-cache is initialized with ones to detect reads of uninitialized memory. + return .{ + .k = .constant(kv_shape, kv_shape.dtype().one()).withSharding(.{.h}), + .v = .constant(kv_shape, kv_shape.dtype().one()).withSharding(.{.h}), + .layer_index = .scalar(-1, .u32), + }; + } + + pub fn initShape(kv_shape: zml.Shape) zml.ShapeOf(KvCache) { + return .{ + .k = kv_shape, + .v = kv_shape, + .layer_index = zml.Shape.init(.{}, .u32), + }; + } + + pub fn initBuffer(kv_shape: zml.Shape, platform: zml.Platform) !zml.Bufferized(KvCache) { + return .{ + .k = try zml.Buffer.uninitialized(platform, kv_shape, .{}), + .v = try zml.Buffer.uninitialized(platform, kv_shape, .{}), + .layer_index = try zml.Buffer.uninitialized(platform, .scalar(.u32), .{}), + }; + } + + pub fn keys(self: KvCache) zml.Tensor { + return self.k.dynamicSlice(.{ .layer = zml.Tensor.DynSlice{ .start = self.layer_index, .len = 1 } }).squeeze(.layer); + } + + pub fn values(self: KvCache) zml.Tensor { + return self.v.dynamicSlice(.{ .layer = zml.Tensor.DynSlice{ .start = self.layer_index, .len = 1 } }).squeeze(.layer); + } + + pub fn update(self: KvCache, new_k: zml.Tensor, new_v: zml.Tensor, token_index: ?zml.Tensor) KvCache { + const idx = if (token_index) |idx| idx else zml.Tensor.arange(.{ .end = new_k.dim(.k) }, .u32); + + return .{ + .k = self.k.scatterSlices( + .{ .k = idx, .layer = self.layer_index }, + new_k.convert(self.k.dtype()), + .{ .indices_are_sorted = true, .update_fn = zml.Tensor.ScatterOpts.override }, + ).reuseBuffer(self.k), + .v = 
self.v.scatterSlices( + .{ .k = idx, .layer = self.layer_index }, + new_v.convert(self.v.dtype()), + .{ .indices_are_sorted = true, .update_fn = zml.Tensor.ScatterOpts.override }, + ).reuseBuffer(self.v), + .layer_index = self.layer_index, + }; + } + + pub fn atLayer(self: KvCache, layer_index: usize) KvCache { + return .{ + .k = self.k, + .v = self.v, + .layer_index = .scalar(layer_index, .u32), + }; + } + + pub fn reuseBuffer(self: KvCache, other: KvCache) KvCache { + return .{ + .k = self.k.reuseBuffer(other.k), + .v = self.v.reuseBuffer(other.v), + .layer_index = self.layer_index.reuseBuffer(other.layer_index), + }; + } +}; + +pub const BlockScaledLinear = struct { + blocks: zml.Tensor, + scale: zml.Tensor, + bias: ?zml.Tensor = null, + blocks_dtype: zml.DataType, + + pub fn forward(self: BlockScaledLinear, x: zml.Tensor) zml.Tensor { + const ctx = x.getContext(); + const res_shape = x.shape().setDim(-1, self.blocks.dim(-3)); + + // Bitcast to our actual type. This allows to load weights in a packed layout. 
+ const blocks_0 = self.blocks.bitCast(self.blocks_dtype); + const blocks = blocks_0.merge(.{ .d_block = .{ .d_block, .bitcast } }); + + const scale = self.scale.bitCast(.f8e8m0); + + // log.warn("BlockScaledLinear({}): {f} -> {f}", .{ self, x, res_shape }); + const y = switch (ctx._platform.target) { + else => y: { + var dequantized_weight: zml.Tensor = .mul( + blocks.convert(x.dtype()), + scale.convert(x.dtype()).appendAxes(.{.d_block}), + ); + var y = x.dot(dequantized_weight.merge(.{ .d = .{ .d, .d_block } }), .{.d}); + // std.log.warn("output shape: {f}", .{y}); + std.debug.assert(y.shape().eql(res_shape)); + y._shape = res_shape; + break :y y; + }, + }; + return if (self.bias) |bias| y.add(bias.broad(y.shape())) else y; + } + + pub fn format(self: BlockScaledLinear, writer: *std.Io.Writer) !void { + try writer.print("BlockScaledLinear(blocks={f}, scale={f}, bias={?f}, dt={t})", .{ self.blocks, self.scale, self.bias, self.blocks_dtype }); + } +}; + +const MoeOpts = struct { + experts_per_token: u32, + tokens_per_expert_ratio: ?f32 = 0.0, + normalization: Normalization, + + pub const Normalization = enum { linear, softmax }; +}; + +/// We have three algorithms, +/// * one for single-stream inference (naive), +/// * one for small batch sized with exact precision that sends all tokens to all experts. +/// this isn't too costly as long as the batch size is small and the experts are IO bound. +/// * one for big batch size that assign a fixed compute budget per expert and +/// experts chose the tokens they want to handle. This introduces noise since it's possible +/// a token doesn't get their requested expert. +/// The parameter `tokens_per_expert_ratio` control how much compute budget is granted: +/// expert_budget = ratio * (num_tokens * experts_per_token / num_experts). +/// Bigger values of ratio will ensure it's rare a token doesn't get it's top 2 tokens. 
+/// +/// The preferred algorithm is the batched one, +/// it is selected as soon there is enough tokens to guarantee that experts will be active most of the time. +/// +/// - input: .{ .s, .d } per-entry vector +/// - gating: .{ .s, .expert } per-entry expert-affinity +/// - experts: .{ .expert, .d_out, .d } expert layer (need to have a .forward method). +/// -> output: .{ .s, .d_out } +pub fn mixtureOfExperts(Expert: type, experts: Expert, input: zml.Tensor, gating: zml.Tensor, opts: MoeOpts) zml.Tensor { + log.warn("mixtureOfExperts({s}, {f}, {f}, {})", .{ @typeName(Expert), input, gating, opts }); + const num_tokens: u32 = @intCast(input.dim(.s)); + const num_experts = gating.dim(.expert); + stdx.debug.assert(opts.experts_per_token > 0, "mixtureOfExperts expects opts.experts_per_token > 0, got {}", .{opts}); + + if (num_tokens == 1) { + return moePerTokenRouting(Expert, experts, input, gating, opts); + } + + const tokens_per_expert: u32 = if (opts.tokens_per_expert_ratio) |ratio| tpe: { + const compute_budget = ratio * @as(f32, @floatFromInt(num_tokens * opts.experts_per_token)); + var tpe: u32 = @intFromFloat(stdx.math.divFloat(f32, compute_budget, num_experts)); + // Round to next multiple of 8 to avoid weird shapes. 
+ if (tpe % 8 != 0) tpe += 8 - (tpe % 8); + break :tpe tpe; + } else num_tokens; + + if (3 * tokens_per_expert <= 2 * num_tokens) { + const routing, const tokens_ids_per_expert = dispatchTokens(gating, .{ + .tokens_per_expert = tokens_per_expert, + .experts_per_token = opts.experts_per_token, + .normalization = opts.normalization, + }); + const scores_per_expert = routing.transpose(.{ .expert, .s }).gather(.{ .s = tokens_ids_per_expert }, .{}); + const input_per_expert = input.gather(.{ .s = tokens_ids_per_expert }, .{}); + var output_per_expert = experts.forward(input_per_expert); + output_per_expert = output_per_expert.mul(scores_per_expert.convert(output_per_expert.dtype()).broad(output_per_expert.shape())); + + // Reverse engineer the normal output shape that one expert would have produced for all tokens. + // If this fall short, we could use the "sliced_expert" strategy and call forward ourselves. + const output_shape = output_per_expert.shape().drop(.expert).rename(.{ .top_token = .s }).setDim(.s, num_tokens); + const output = zml.Tensor.scatterSlices( + .constant(output_shape, output_shape.dtype().zero()), + .{ .s = tokens_ids_per_expert }, + output_per_expert, + .{ .update_fn = zml.Tensor.ScatterOpts.increment }, + ); + + log.warn("mixtureOfExperts({s}, {f}, {f}) -> fixed budget impl tpe: {d}, tokens: {d}", .{ @typeName(Expert), input, gating, tokens_per_expert, num_tokens }); + return output; + } else { + return mixtureOfExpertsAllToAll(Expert, experts, input, gating, opts); + } +} + +/// Few tokens: most experts are unused, experts have at most one token. +/// Select active experts and compute with that. 
+pub fn moePerTokenRouting(Expert: type, experts: Expert, input: zml.Tensor, gating: zml.Tensor, opts: MoeOpts) zml.Tensor { + const num_tokens: u32 = @intCast(input.dim(.s)); + stdx.debug.assert(num_tokens < 32, "Trying to unroll a lot of tokens !", .{}); + const per_token_outputs = input.getContext().allocator().alloc(zml.Tensor, num_tokens) catch @panic("OOM"); + + const routing = gating.topK(.{ .top_expert = .expert }, opts.experts_per_token, .{}); + const per_token_score = switch (opts.normalization) { + .linear => routing.values.div(routing.values.sum(.top_expert)), + .softmax => routing.values.softmax(.top_expert), + }; + + for (per_token_outputs, 0..num_tokens) |*output, tok_id| { + for (0..opts.experts_per_token) |expert_rank| { + const expert_id = routing.indices.choose(.{ .s = tok_id, .top_expert = expert_rank }).asScalar(); + const expert_score = per_token_score.choose(.{ .s = tok_id, .top_expert = expert_rank }).asScalar(); + + var sliced_expert: Expert = undefined; + zml.meta.mapAlloc(struct { + pub fn cb(expert_id_: zml.Tensor, expert_weight: zml.Tensor) zml.Tensor { + return expert_weight.gather(.{ .expert = expert_id_ }, .{}); + } + }.cb, stdx.noalloc, expert_id, experts, &sliced_expert) catch unreachable; + + // TODO how does this work when the two experts are on different gpus? + // does the compute overlap ? + var expert_output = sliced_expert.forward(input.choose(.{ .s = tok_id })); + expert_output = .mul( + expert_output, + expert_score.convert(input.dtype()).broad(expert_output.shape()), + ); + output.* = if (expert_rank > 0) output.add(expert_output) else expert_output; + } + } + + log.warn("mixtureOfExperts({s}, {f}, {f}) -> single-stream impl", .{ @typeName(Expert), input, gating }); + return .stack(per_token_outputs, 0, .s); +} + +/// Send all tokens to all experts, and apply gating. 
+pub fn mixtureOfExpertsAllToAll(Expert: type, experts: Expert, input: zml.Tensor, gating: zml.Tensor, opts: MoeOpts) zml.Tensor { + log.warn("mixtureOfExperts({s}, {f}, {f}) -> all to all impl", .{ @typeName(Expert), input, gating }); + const num_experts = gating.dim(.expert); + const hard_gating = hardGating(gating, opts).print(); + // TODO: `input.insertAxes(0, .{.expert}).repeat1d(.expert, num_experts)` is too verbose for just broadcasting along a new axis` + const output_per_expert = experts.forward(input.insertAxes(0, .{.expert}).repeat1d(.expert, @intCast(num_experts))); + return output_per_expert.dot(hard_gating.convert(input.dtype()), .expert); +} + +/// Given `(token, expert) -> scores`, +/// keeps only the top-k expert per token, and normalize the scores accordingly. +/// Non selected experts will have a 0 score. +pub fn hardGating(gating: zml.Tensor, opts: MoeOpts) zml.Tensor { + const routing = gating.topK(.{ .top_expert = .expert }, opts.experts_per_token, .{}); + + const per_token_score = switch (opts.normalization) { + .linear => routing.values.div(routing.values.sum(.top_expert)), + .softmax => routing.values.softmax(.top_expert), + }; + + return zml.Tensor.scatterSlices( + .zeroes(gating.shape()), + .{ .expert = routing.indices }, + per_token_score, + .{ .indices_are_unique = true }, + ); +} + +/// Lot of tokens, each experts chose their tokens. +/// It means that some tokens may have only one expert assigned. +/// Each token will get assigned to at least one expert IIF the input gating is sums up to 1 (typically softmax output). +/// Returns the actual `(token, expert) -> scores` used. 
+pub fn dispatchTokens( + gating: zml.Tensor, + opts: struct { + tokens_per_expert: u32, + experts_per_token: u32, + normalization: MoeOpts.Normalization, + }, +) [2]zml.Tensor { + const num_experts = gating.dim(.expert); + + const token_pref = gating.argsort(.expert, .{ .descending = true }); + var expert_rank: zml.Tensor = .scatterSlices( + .zeroes(gating.shape().withDtype(.i32)), + .{ .expert = token_pref }, + .addConstant(.iota(gating.shape(), .expert), 1), + .{ .indices_are_unique = true }, + ); + // The pow(expert_rank) here means that we strongly favor top 1 over top 2 and top 2 over top 3. + // expert_routing: (expert, top_token) -> token + const expert_routing = gating.pow(expert_rank.convert(gating.dtype())).topK(.{ .top_token = .s }, opts.tokens_per_expert, .{}); + const scores_per_expert = gating.gather(.{ .s = expert_routing.indices }, .{}); + + // Update the gating coefficient to account for the expert routing. + // Each (token, expert) which can't be computed within the given budget is left to 0. + const gating_v2: zml.Tensor = .scatterSlices( + .zeroes(gating.shape()), + .{ .s = expert_routing.indices }, + scores_per_expert, + .{ .indices_are_unique = true, .update_fn = zml.Tensor.ScatterOpts.override }, + ); + // Now set to zero the scores (token, expert) for tokens that have been assigned more than experts_per_token. + const lowest_experts = gating_v2.topK(.{ .top_expert = .expert }, @intCast(num_experts - opts.experts_per_token), .{ .descending = false }); + var gating_v3: zml.Tensor = .scatterSlices( + gating_v2, + .{ .expert = lowest_experts.indices }, + .zeroes(lowest_experts.values.shape()), + .{ .indices_are_unique = true, .update_fn = zml.Tensor.ScatterOpts.override }, + ); + // Then normalize so the sum of experts scores for one token sums up to 1. 
+ gating_v3 = switch (opts.normalization) { + .linear => gating_v3.div(gating_v3.sum(.expert)), + .softmax => gating_v3.softmax(.expert), + }; + const tokens_ids_per_expert = expert_routing.indices.transpose(.{ .expert, .top_token }); + + return .{ gating_v3, tokens_ids_per_expert }; +} diff --git a/examples/gpt_oss/main.zig b/examples/gpt_oss/main.zig new file mode 100644 index 0000000..72aead2 --- /dev/null +++ b/examples/gpt_oss/main.zig @@ -0,0 +1,376 @@ +const std = @import("std"); +const builtin = @import("builtin"); + +const async = @import("async"); +const clap = @import("clap"); +const stdx = @import("stdx"); +const zml = @import("zml"); +const Buffer = zml.Buffer; +const Tensor = zml.Tensor; +const ShapeOf = zml.ShapeOf; + +const GptOss = @import("GptOss.zig"); + +const log = std.log.scoped(.GptOss); + +pub const std_options: std.Options = .{ + .log_level = .info, + .logFn = async.logFn(std.log.defaultLog), +}; + +const cli_params = clap.parseParamsComptime( + \\--help print this help + \\--prompt the prompt + \\--hf-model-path path to the directory containing model weights, config and tokenizer + \\--seed random seed (optional) + \\--seq-len max sequence length + \\--prompt-len max prompt length + \\--temperature temperature (default 1.0) + \\--topk topk (default 10) + \\--expert-budget token budget per expert + \\--platform-options platform options, using Zon syntax, eg '.{.cuda=.{.allocator=.{.async=.{.memory_fraction=0.95}}}}' + \\--nochat skip prompt template + \\--sharding default: true: sharding on or off +); + +pub fn tokenizePrompt(tokenizer: zml.tokenizer.Tokenizer, prompt: []const u8, no_chat: bool, out: []u32) ![]u32 { + var encoder = try tokenizer.encoder(); + defer encoder.deinit(); + + if (no_chat) { + const tokens = try encoder.encode(prompt); + if (tokens.len > out.len) return error.PromptTooLong; + @memcpy(out[0..tokens.len], tokens); + return out[0..tokens.len]; + } + + const start_header = tokenizer.tokenToId("<|start|>") orelse return 
error.NoSuchToken; + const end_header_start_message = tokenizer.tokenToId("<|message|>") orelse return error.NoSuchToken; + const end_message = tokenizer.tokenToId("<|end|>") orelse return error.NoSuchToken; + + var tokens: std.ArrayList(u32) = .initBuffer(out); + + const system_prompt = try encoder.encode("You are ChatGPT, a large language model trained by OpenAI.\n"); + if (system_prompt.len + 4 > tokens.unusedCapacitySlice().len) return error.PromptTooLong; + tokens.appendSliceAssumeCapacity(&.{ start_header, tokenizer.tokenToId("system").?, end_header_start_message }); + tokens.appendSliceAssumeCapacity(system_prompt); + tokens.appendAssumeCapacity(end_message); + + const user_prompt = try encoder.encode(prompt); + if (user_prompt.len + 9 > tokens.unusedCapacitySlice().len) return error.PromptTooLong; + tokens.appendSliceAssumeCapacity(&.{ start_header, tokenizer.tokenToId("user").?, end_header_start_message }); + tokens.appendSliceAssumeCapacity(user_prompt); + tokens.appendSliceAssumeCapacity(&.{ + end_message, + start_header, + tokenizer.tokenToId("assistant").?, + tokenizer.tokenToId("<|channel|>") orelse return error.NoSuchToken, + tokenizer.tokenToId("analysis") orelse return error.NoSuchToken, + end_header_start_message, + }); + + return tokens.items; +} + +pub fn generateText( + config: GptOss.Config, + options: GptOss.Options, + mod_prefill: zml.ModuleExe(GptOss.forward), + mod_generate: zml.ModuleExe(GptOss.forward), + kv_cache_: zml.Bufferized(GptOss.KvCache), + tokenizer: zml.tokenizer.Tokenizer, + allocator: std.mem.Allocator, + seed: u128, + prompt_tok: []const u32, + output: *std.Io.Writer, +) !void { + var tokenizer_decoder = try tokenizer.decoder(); + defer tokenizer_decoder.deinit(); + + const platform = mod_generate.platform(); + + // init RNG and buffers + var rng = try zml.Tensor.Rng.init(platform, seed); + var generated_token_buffer = [_]u32{undefined}; + + var current_token, var kv_cache = prefill: { + // prepare device buffers for the 
prefill tokens and their positions
+ const prefill_buffer = try allocator.alloc(u32, options.max_prompt_len);
+ // NOTE(review): this staging buffer was leaked before; it is only needed
+ // until the device upload below completes.
+ defer allocator.free(prefill_buffer);
+ @memcpy(prefill_buffer[0..prompt_tok.len], prompt_tok);
+ // Zero the tail so we never upload uninitialized host memory; positions
+ // >= prompt_tok.len are presumably ignored via the prefill position passed
+ // below -- TODO confirm in GptOss.forward.
+ @memset(prefill_buffer[prompt_tok.len..], 0);
+
+ var prefill_tokens = try zml.Buffer.fromSlice(platform, .{options.max_prompt_len}, prefill_buffer);
+ defer prefill_tokens.deinit();
+ var prefill_token_pos = try zml.Buffer.scalar(platform, prompt_tok.len, .u32);
+ defer prefill_token_pos.deinit();
+
+ // Single prefill step: consumes the whole padded prompt, yields the first
+ // sampled token and the populated KV cache.
+ const first_token, const kv_cache, rng = mod_prefill.call(.{ prefill_tokens, .{ .prefill = prefill_token_pos }, kv_cache_, rng });
+
+ // extract the first generated token
+ _ = try first_token.toHost(std.mem.sliceAsBytes(&generated_token_buffer));
+ log.warn("first_token: {d}", .{generated_token_buffer[0]});
+ break :prefill .{ first_token, kv_cache };
+ };
+ defer zml.aio.unloadBuffers(&kv_cache);
+ defer current_token.deinit();
+
+ // NOTE(review): assumes prompt_tok.len < max_seq_len; otherwise this u32
+ // subtraction underflows -- callers cap the prompt at max_prompt_len only.
+ const output_tokens_len = options.max_seq_len - prompt_tok.len - 1;
+ const start = std.time.microTimestamp();
+
+ // Every generated token -- including the one produced by the prefill -- is
+ // counted when it is printed at the top of the loop, so start at zero.
+ // (Initializing to 1 double-counted the prefill token.)
+ var num_tokens_generated: usize = 0;
+
+ generation: for (0..output_tokens_len + 1) |i| {
+ // collect and print generated sequence
+ num_tokens_generated += 1;
+ const generated_token = generated_token_buffer[0];
+ if (try tokenizer_decoder.next(generated_token)) |chunk| {
+ try output.writeAll(chunk);
+ }
+
+ // check for eos
+ if (i == output_tokens_len) break :generation;
+ switch (config.eos_token_id.value) {
+ .int => |eos| if (generated_token == @as(u32, @intCast(eos))) break :generation,
+ .ints => |eos_list| {
+ for (eos_list) |eos| {
+ if (generated_token == @as(u32, @intCast(eos))) break :generation;
+ }
+ },
+ }
+
+ // current token pos needs to go into a zml.Buffer
+ const token_pos_buffer = &[_]u32{@intCast(prompt_tok.len + i)};
+ const token_pos = try zml.Buffer.fromSlice(platform, .{}, token_pos_buffer);
+ defer token_pos.deinit();
+
+ // call to generate the next token
+ current_token, kv_cache, rng = mod_generate.call(.{ current_token, .{ .gen = token_pos }, kv_cache, rng });
+
+ // extract the generated token from the buffer
+ _ = try current_token.toHost(std.mem.sliceAsBytes(&generated_token_buffer));
+ }
+ const end = std.time.microTimestamp();
+ const duration = stdx.math.divFloat(f64, end - start, std.time.us_per_s);
+ const speed = @as(f64, @floatFromInt(num_tokens_generated)) / duration;
+
+ log.info("✅ Generated {d} tokens in {:.3}s: {d:.3}tok/s", .{ num_tokens_generated, duration, speed });
+}
+
+pub fn main() !void {
+ try async.AsyncThread.main(std.heap.smp_allocator, asyncMain);
+}
+
+pub fn asyncMain() !void {
+ log.info(" GptOss was compiled with {}", .{@import("builtin").mode});
+
+ // Pick the process-wide allocator. NOTE(review): the debug-allocator state
+ // previously lived on this stack frame inside the labeled block and escaped
+ // through the returned std.mem.Allocator (dangling pointer). A container-level
+ // var gives it static lifetime.
+ const allocator: std.mem.Allocator = alloc: {
+ if (builtin.mode == .Debug) {
+ const DebugState = struct {
+ var dbg_alloc: std.heap.DebugAllocator(.{
+ .never_unmap = true,
+ .retain_metadata = true,
+ }) = .init;
+ };
+ break :alloc DebugState.dbg_alloc.allocator();
+ }
+ break :alloc std.heap.smp_allocator;
+ };
+
+ const cli = ClapBoilerplate.parseCli(allocator);
+ defer cli.deinit();
+
+ const
hf_model_path = cli.args.@"hf-model-path" orelse {
+ log.err("Missing --hf-model-path", .{});
+ return;
+ };
+
+ // Parse <hf-model-path>/config.json into a GptOss.Config.
+ // NOTE(review): parseFromTokenSourceLeaky allocates into the arena, which is
+ // freed when this block exits -- safe only while every retained Config field
+ // is a scalar (eos_token_id is overwritten with a static literal below).
+ // Revisit if Config ever gains slice/string fields.
+ const config = config: {
+ var arena: std.heap.ArenaAllocator = .init(allocator);
+ defer arena.deinit();
+
+ const model_config_path = try std.fs.path.join(arena.allocator(), &.{ hf_model_path, "config.json" });
+
+ var config_json_file = try async.File.open(model_config_path, .{ .mode = .read_only });
+ defer config_json_file.close() catch unreachable;
+
+ var config_reader = config_json_file.reader(try arena.allocator().alloc(u8, 256));
+ var reader = std.json.Reader.init(allocator, &config_reader.interface);
+ defer reader.deinit();
+ var config = try std.json.parseFromTokenSourceLeaky(GptOss.Config, arena.allocator(), &reader, .{ .ignore_unknown_fields = true });
+
+ // From generation_config.json
+ config.eos_token_id = .{ .value = .{ .ints = &.{ 200002, 199999, 200012 } } };
+ break :config config;
+ };
+
+ var context = try zml.Context.init();
+ defer context.deinit();
+
+ // initialize ZML platform
+ const platform: zml.Platform = platform: {
+ const arena: std.heap.ArenaAllocator = .init(allocator);
+ defer arena.deinit();
+
+ // eg: --platform-options='.{.cuda=.{.allocator=.{.bfc=.{.memory_fraction=0.99}}}}'
+ // eg: --platform-options='.{.cpu=.{.device_count=8}}'
+ // NOTE(review): the @ptrCast presumes the argv slice is NUL-terminated
+ // ([:0]const u8) as std.zon.parse.fromSlice requires -- confirm clap
+ // guarantees this for string options.
+ const platform_opts = std.zon.parse.fromSlice(zml.Platform.CreateOptions, allocator, @ptrCast(cli.args.@"platform-options" orelse ".{}"), null, .{ .free_on_error = false }) catch |err| {
+ // The option is ZON, not JSON; the old message said "as json".
+ log.err("Failed to parse --platform-options as zon ({}): {s}", .{ err, cli.args.@"platform-options".?
});
+ return err;
+ };
+
+ const compilation_options = zml.CompilationOptions{
+ .xla_dump_to = "/tmp/zml/gpt_oss",
+ .sharding_enabled = cli.args.sharding orelse true,
+ };
+
+ const platform = context
+ .autoPlatform(platform_opts)
+ .withCompilationOptions(compilation_options);
+ context.printAvailablePlatforms(platform);
+
+ break :platform platform;
+ };
+
+ const options: GptOss.Options = .{
+ .max_seq_len = cli.args.@"seq-len" orelse 8192,
+ .max_prompt_len = cli.args.@"prompt-len" orelse 256,
+ .tokens_per_expert_ratio = cli.args.@"expert-budget" orelse 4.0,
+ .sampling_strategy = .{
+ .topk = cli.args.topk orelse 10,
+ // NOTE(review): --temperature is declared in cli_params (FLOAT parser)
+ // but was hard-coded to 1.0 here; honor the flag, default unchanged.
+ .temperature = cli.args.temperature orelse 1.0,
+ },
+ };
+
+ var compiler_arena = std.heap.ArenaAllocator.init(allocator);
+ defer compiler_arena.deinit();
+
+ const model_weights_path = try std.fs.path.join(allocator, &.{ hf_model_path, "model.safetensors.index.json" });
+ defer allocator.free(model_weights_path);
+
+ var store = try zml.aio.detectFormatAndOpen(allocator, model_weights_path);
+ defer store.deinit();
+
+ const model: GptOss = try GptOss.init(compiler_arena.allocator(), store, config, options);
+
+ // Prefill consumes a fixed-size padded prompt; generation consumes 1 token.
+ const tokens_shape_prefill = zml.Shape.init(.{ .s = options.max_prompt_len }, .u32);
+ const tokens_shape = zml.Shape.init(.{ .s = 1 }, .u32);
+
+ const dtype = model.model.embed_tokens.weight.dtype();
+
+ const kv_shape = zml.Shape.init(.{
+ .layer = model.model.layers.len,
+ .k = options.max_seq_len,
+ .h = config.num_key_value_heads,
+ .hd = config.head_dim,
+ }, dtype).withSharding(.{.h});
+
+ const kv_cache_shape: zml.ShapeOf(GptOss.KvCache) = GptOss.KvCache.initShape(kv_shape);
+ const rng_shape = zml.Tensor.Rng.shape();
+
+ // Kick off both compilations concurrently; weights load while they run.
+ var start = try std.time.Timer.start();
+ var fut_mod_prefill = try async.async(zml.compileModel, .{
+ allocator, GptOss.forward, model,
+ .{
+ tokens_shape_prefill,
+ zml.ShapeOf(GptOss.Mode){ .prefill = .scalar(.u32) },
+ kv_cache_shape,
+ rng_shape,
+ },
+ platform,
+ });
+
+ var fut_mod = try async.async(zml.compileModel, .{
allocator, GptOss.forward, model,
+ .{
+ tokens_shape,
+ zml.ShapeOf(GptOss.Mode){ .gen = .scalar(.u32) },
+ kv_cache_shape,
+ rng_shape,
+ },
+ platform,
+ });
+
+ log.info("\tLoading GptOss weights from {s}...", .{model_weights_path});
+ var gpt_oss_weights = try model.loadBuffers(compiler_arena.allocator(), store, platform);
+ defer zml.aio.unloadBuffers(&gpt_oss_weights);
+ log.info("✅\tLoaded weights in {D}", .{start.read()});
+
+ var module_prefill = (try fut_mod_prefill.await()).prepare(gpt_oss_weights);
+ defer module_prefill.deinit();
+ var module_gen = (try fut_mod.await()).prepare(gpt_oss_weights);
+ defer module_gen.deinit();
+ log.info("✅\tCompiled model in {D}", .{start.read()});
+
+ log.info("Creating KvCache", .{});
+ const kv_cache = try GptOss.KvCache.initBuffer(kv_shape, platform);
+
+ var tokenizer = blk: {
+ const model_tokenizer_path = try std.fs.path.join(allocator, &.{ hf_model_path, "tokenizer.json" });
+ defer allocator.free(model_tokenizer_path);
+
+ log.info("Loading tokenizer from {s}", .{model_tokenizer_path});
+ var timer = try stdx.time.Timer.start();
+ defer log.info("Loaded tokenizer from {s} [{f}]", .{ model_tokenizer_path, timer.read() });
+
+ break :blk try zml.tokenizer.Tokenizer.fromFile(allocator, model_tokenizer_path);
+ };
+ // NOTE(review): was `errdefer`, which leaked the tokenizer on the success
+ // path; it is only used within this scope, so release it unconditionally.
+ defer tokenizer.deinit();
+
+ const prompt = cli.args.prompt orelse "What are some fun facts about animals?";
+ log.info("✅\tPrompt: {s}", .{prompt});
+
+ const no_chat = cli.args.nochat orelse false;
+ const prompt_tok_buf = try allocator.alloc(u32, options.max_prompt_len);
+ defer allocator.free(prompt_tok_buf);
+
+ const prompt_tok = tokenizePrompt(tokenizer, prompt, no_chat, prompt_tok_buf) catch |err| switch (err) {
+ error.PromptTooLong => std.debug.panic("Prompt too long, expected at most {d} tokens.
Consider increasing --prompt-len", .{prompt_tok_buf.len}),
+ else => |e| return e,
+ };
+ log.info("\t Tokenized prompt: {any} ({d} tokens)", .{ prompt_tok, prompt_tok.len });
+
+ const seed = cli.args.seed orelse @as(u128, @bitCast(std.time.nanoTimestamp()));
+
+ // Unbuffered writing of the tokens to stdout.
+ // generated text will be printed token by token.
+ var output = std.fs.File.stdout().writer(&.{});
+
+ try generateText(config, options, module_prefill, module_gen, kv_cache, tokenizer, allocator, seed, prompt_tok, &output.interface);
+}
+
+/// CLI parsing helpers: value parsers, the generated result type, and a
+/// parseCli wrapper that reports usage errors to stderr and exits.
+const ClapBoilerplate = struct {
+ pub const Cli = clap.Result(clap.Help, &cli_params, parsers);
+
+ // Accepts t/T/y/Y/1 as true; anything else (including "") is false.
+ fn bool_parser(in: []const u8) error{}!bool {
+ // Guard the empty string: `in[0]` would otherwise panic out-of-bounds,
+ // contradicting the error{} (never-fails) signature.
+ return in.len > 0 and std.mem.indexOfScalar(u8, "tTyY1", in[0]) != null;
+ }
+
+ const parsers = .{
+ .BOOL = bool_parser,
+ .UINT = clap.parsers.int(u32, 0),
+ .FLOAT = clap.parsers.float(f32),
+ .STRING = clap.parsers.string,
+ .PATH = clap.parsers.string,
+ };
+
+ pub fn parseCli(allocator: std.mem.Allocator) Cli {
+ var diag: clap.Diagnostic = .{};
+ var stderr_buffer: [1024]u8 = undefined;
+ var stderr = std.fs.File.stderr().writer(&stderr_buffer);
+ const cli = clap.parse(clap.Help, &cli_params, parsers, .{
+ .diagnostic = &diag,
+ .allocator = allocator,
+ }) catch |err| {
+ diag.report(&stderr.interface, err) catch {};
+ stderr.interface.print("usage: ", .{}) catch {};
+ clap.usage(&stderr.interface, clap.Help, &cli_params) catch {};
+ stderr.interface.print("\n", .{}) catch {};
+ stderr.interface.flush() catch {};
+ std.process.exit(1);
+ };
+ if (cli.args.help != 0) {
+ clap.help(&stderr.interface, clap.Help, &cli_params, .{}) catch {};
+ stderr.interface.flush() catch {};
+ std.process.exit(0);
+ }
+ return cli;
+ }
+};