pjrt,zml: remove profiler
Closes Progress towards
This commit is contained in:
parent
7fb02e1888
commit
cba3be4859
@ -1,37 +0,0 @@
|
|||||||
# Running Gated Huggingface Models with Token Authentication
|
|
||||||
|
|
||||||
Some models have restrictions and may require some sort of approval or agreement
|
|
||||||
process, which, by consequence, **requires token-authentication with Huggingface**.
|
|
||||||
|
|
||||||
The easiest way might be to use the `huggingface-cli login` command.
|
|
||||||
|
|
||||||
Alternatively, here is how you can generate a **"read-only public repositories"**
|
|
||||||
access token to log into your account on Huggingface, directly from `bazel`, in order to download models.
|
|
||||||
|
|
||||||
* log in at [https://huggingface.co/settings/tokens](https://huggingface.co/settings/tokens).
|
|
||||||
* click on "Create new token"
|
|
||||||
* give the token a name, eg `zml_public_repos`,
|
|
||||||
* under _Repositories_, grant the following permission: "Read access to contents of all public gated repos you can access".
|
|
||||||
* at the bottom click on "Create token".
|
|
||||||
* copy the token by clicking `Copy`. **You won't be able to see it again.**
|
|
||||||
* the token looks something like `hf_abCdEfGhijKlM`.
|
|
||||||
* store the token on your machine (replace the placeholder with your actual token):
|
|
||||||
|
|
||||||
You can use the `HUGGINGFACE_TOKEN` environment variable to store the token or use
|
|
||||||
its standard location:
|
|
||||||
```
|
|
||||||
mkdir -p $HOME/.cache/huggingface/; echo <hf_my_token> > "$HOME/.cache/huggingface/token"
|
|
||||||
```
|
|
||||||
|
|
||||||
Now you're ready to download a gated model like `Meta-Llama-3-8b`!
|
|
||||||
|
|
||||||
**Example:**
|
|
||||||
|
|
||||||
```
|
|
||||||
# requires token in $HOME/.cache/huggingface/token, as created by the
|
|
||||||
# `huggingface-cli login` command, or the `HUGGINGFACE_TOKEN` environment variable.
|
|
||||||
cd examples
|
|
||||||
bazel run --config=release //llama:Meta-Llama-3-8b
|
|
||||||
bazel run --config=release //llama:Meta-Llama-3-8b -- --promt="Once upon a time,"
|
|
||||||
```
|
|
||||||
|
|
||||||
@ -1,28 +1,17 @@
|
|||||||
load("@rules_zig//zig:defs.bzl", "zig_library")
|
load("@rules_zig//zig:defs.bzl", "zig_library")
|
||||||
load("@zml//bazel:zig.bzl", "zig_cc_binary")
|
|
||||||
load("@zml//bazel:zig_srcs.bzl", "zig_srcs")
|
load("@zml//bazel:zig_srcs.bzl", "zig_srcs")
|
||||||
load("@zml//bazel:zig_proto_library.bzl", "zig_proto_library")
|
|
||||||
|
|
||||||
zig_library(
|
zig_library(
|
||||||
name = "pjrt",
|
name = "pjrt",
|
||||||
srcs = [
|
srcs = ["ffi.zig"],
|
||||||
"convert/trace_container.zig",
|
|
||||||
"convert/xplane_schema.zig",
|
|
||||||
"ffi.zig",
|
|
||||||
"profiler.zig",
|
|
||||||
],
|
|
||||||
main = "pjrt.zig",
|
main = "pjrt.zig",
|
||||||
visibility = ["//visibility:public"],
|
visibility = ["//visibility:public"],
|
||||||
deps = [
|
deps = [
|
||||||
":profiler_options_proto",
|
|
||||||
":trace_events_proto",
|
|
||||||
":xplane_proto",
|
|
||||||
"//stdx",
|
"//stdx",
|
||||||
"@xla//xla/ffi/api:c_api",
|
"@xla//xla/ffi/api:c_api",
|
||||||
"@xla//xla/pjrt/c:pjrt_c_api_ffi_extension_hdrs",
|
"@xla//xla/pjrt/c:pjrt_c_api_ffi_extension_hdrs",
|
||||||
"@xla//xla/pjrt/c:pjrt_c_api_gpu_extension_hdrs",
|
"@xla//xla/pjrt/c:pjrt_c_api_gpu_extension_hdrs",
|
||||||
"@xla//xla/pjrt/c:pjrt_c_api_hdrs",
|
"@xla//xla/pjrt/c:pjrt_c_api_hdrs",
|
||||||
"@xla//xla/pjrt/c:pjrt_c_api_profiler_extension_hdrs",
|
|
||||||
"@xla//xla/pjrt/c:pjrt_c_api_triton_extension_hdrs",
|
"@xla//xla/pjrt/c:pjrt_c_api_triton_extension_hdrs",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
@ -31,36 +20,3 @@ zig_srcs(
|
|||||||
name = "sources",
|
name = "sources",
|
||||||
zig_lib = ":pjrt",
|
zig_lib = ":pjrt",
|
||||||
)
|
)
|
||||||
|
|
||||||
zig_proto_library(
|
|
||||||
name = "profiler_options_proto",
|
|
||||||
import_name = "//tsl:profiler_options_proto",
|
|
||||||
deps = ["@xla//third_party/tsl/tsl/profiler/protobuf:profiler_options_proto"],
|
|
||||||
)
|
|
||||||
|
|
||||||
zig_proto_library(
|
|
||||||
name = "xplane_proto",
|
|
||||||
import_name = "//tsl:xplane_proto",
|
|
||||||
deps = ["@xla//third_party/tsl/tsl/profiler/protobuf:xplane_proto"],
|
|
||||||
)
|
|
||||||
|
|
||||||
zig_proto_library(
|
|
||||||
name = "trace_events_proto",
|
|
||||||
import_name = "//tsl:trace_events_proto",
|
|
||||||
deps = ["@xla//third_party/tsl/tsl/profiler/protobuf:trace_events_proto"],
|
|
||||||
)
|
|
||||||
|
|
||||||
zig_cc_binary(
|
|
||||||
name = "xspace_to_json",
|
|
||||||
srcs = [
|
|
||||||
"convert/trace_container.zig",
|
|
||||||
"convert/xplane_schema.zig",
|
|
||||||
],
|
|
||||||
main = "xspace_to_json.zig",
|
|
||||||
visibility = ["//visibility:public"],
|
|
||||||
deps = [
|
|
||||||
":trace_events_proto",
|
|
||||||
":xplane_proto",
|
|
||||||
"//stdx",
|
|
||||||
],
|
|
||||||
)
|
|
||||||
|
|||||||
@ -1,366 +0,0 @@
|
|||||||
const std = @import("std");
|
|
||||||
|
|
||||||
const trace_events_proto = @import("//tsl:trace_events_proto");
|
|
||||||
const xplane_proto = @import("//tsl:xplane_proto");
|
|
||||||
|
|
||||||
const xplane_schema = @import("xplane_schema.zig");
|
|
||||||
|
|
||||||
// Constants used as trace_viewer PID (device_id in trace_events.proto).
|
|
||||||
// PID 0 is unused.
|
|
||||||
// Support up to 500 accelerator devices.
|
|
||||||
const first_device_id = 1;
|
|
||||||
const last_device_id = 500;
|
|
||||||
// Support Upto 200 custom planes as fake devices (i.e., planes with a
|
|
||||||
// "/custom:" prefix). See `<project_name>::custom_plane_prefix` for more
|
|
||||||
// information
|
|
||||||
const first_custom_plane_device_id = last_device_id + 1;
|
|
||||||
const max_custom_plane_devices_per_host = 200;
|
|
||||||
const last_custom_plane_device_id = first_custom_plane_device_id + max_custom_plane_devices_per_host - 1;
|
|
||||||
|
|
||||||
// Host threads are shown as a single fake device.
|
|
||||||
pub const host_threads_device_id = last_custom_plane_device_id + 1;
|
|
||||||
|
|
||||||
pub const xla_async_op_line_name = "Async XLA Ops";
|
|
||||||
|
|
||||||
pub const host_threads_plane_name = "/host:CPU";
|
|
||||||
pub const gpu_plane_prefix = "/device:GPU:";
|
|
||||||
pub const tpu_plane_prefix = "/device:TPU:";
|
|
||||||
pub const custom_plane_prefix = "/device:CUSTOM:";
|
|
||||||
|
|
||||||
pub const TraceContainer = struct {
|
|
||||||
arena: std.heap.ArenaAllocator,
|
|
||||||
events: std.ArrayListUnmanaged(TraceEvent) = .{},
|
|
||||||
devices: std.AutoArrayHashMapUnmanaged(u32, Device) = .{},
|
|
||||||
|
|
||||||
pub const Device = struct {
|
|
||||||
name: []const u8,
|
|
||||||
device_id: u32,
|
|
||||||
resources: std.AutoArrayHashMapUnmanaged(i64, Resource) = .{},
|
|
||||||
};
|
|
||||||
|
|
||||||
pub const Resource = struct {
|
|
||||||
name: []const u8,
|
|
||||||
sort_index: i64,
|
|
||||||
};
|
|
||||||
|
|
||||||
pub const TraceEvent = struct {
|
|
||||||
device_id: u32 = 0,
|
|
||||||
resource_id: i64 = 0,
|
|
||||||
name: []const u8 = &[_]u8{},
|
|
||||||
timestamp_ps: u128 = 0,
|
|
||||||
duration_ps: u64 = 0,
|
|
||||||
args: std.StringArrayHashMapUnmanaged([]const u8) = .{},
|
|
||||||
};
|
|
||||||
|
|
||||||
pub fn init(allocator: std.mem.Allocator) TraceContainer {
|
|
||||||
return .{
|
|
||||||
.arena = std.heap.ArenaAllocator.init(allocator),
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn deinit(self: *TraceContainer) void {
|
|
||||||
self.arena.deinit();
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn parseXSpaceBytes(self: *TraceContainer, pb_buffer: []const u8, max_events: ?usize) !void {
|
|
||||||
const arena = self.arena.allocator();
|
|
||||||
|
|
||||||
const xspace = try xplane_proto.XSpace.decode(pb_buffer, arena);
|
|
||||||
try self.fromXSpace(arena, xspace, max_events);
|
|
||||||
}
|
|
||||||
|
|
||||||
fn findPlaneWithName(space: xplane_proto.XSpace, name: []const u8) ?*xplane_proto.XPlane {
|
|
||||||
for (space.planes.items) |*v| {
|
|
||||||
if (std.mem.eql(u8, v.name.getSlice(), name)) return v;
|
|
||||||
}
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
fn findPlanesWithPrefix(
|
|
||||||
out: *std.ArrayList(*const xplane_proto.XPlane),
|
|
||||||
space: xplane_proto.XSpace,
|
|
||||||
prefix: []const u8,
|
|
||||||
) !void {
|
|
||||||
for (space.planes.items) |*p| {
|
|
||||||
if (std.mem.startsWith(u8, p.name.getSlice(), prefix)) {
|
|
||||||
try out.append(p);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn xplaneToTraceEvents(self: *TraceContainer, allocator: std.mem.Allocator, device_id: u32, xplane: *const XPlaneHashed) !void {
|
|
||||||
// Convert devices and resources.
|
|
||||||
const device_entry = try self.devices.getOrPutValue(allocator, device_id, .{ .name = xplane.name(), .device_id = device_id });
|
|
||||||
var device = device_entry.value_ptr.*;
|
|
||||||
defer device_entry.value_ptr.* = device;
|
|
||||||
|
|
||||||
try device.resources.ensureUnusedCapacity(allocator, xplane.plane.lines.items.len);
|
|
||||||
const sort_by_ordinal = (device_id == host_threads_device_id);
|
|
||||||
|
|
||||||
// Convert events.
|
|
||||||
for (xplane.plane.lines.items, 0..) |*xline, ordinal| {
|
|
||||||
const resource_id = if (xline.display_id != 0) xline.display_id else xline.id;
|
|
||||||
const resource_name = if (xline.display_name.isEmpty()) xline.name.getSlice() else xline.display_name.getSlice();
|
|
||||||
device.resources.putAssumeCapacity(resource_id, .{
|
|
||||||
.name = resource_name,
|
|
||||||
.sort_index = if (sort_by_ordinal) @intCast(ordinal) else resource_id,
|
|
||||||
});
|
|
||||||
|
|
||||||
if (std.mem.eql(u8, resource_name, xla_async_op_line_name)) continue;
|
|
||||||
|
|
||||||
for (xline.events.items) |*xevent| {
|
|
||||||
const event_type = xplane.getEventType(xevent.metadata_id);
|
|
||||||
if (event_type.isInternalEvent()) continue;
|
|
||||||
var event = try self.createEvent(allocator);
|
|
||||||
event.device_id = device_id;
|
|
||||||
event.resource_id = resource_id;
|
|
||||||
|
|
||||||
if (xplane.event_metadata_by_id.get(xevent.metadata_id)) |metadata| {
|
|
||||||
try event.args.ensureUnusedCapacity(allocator, 1 + metadata.stats.items.len);
|
|
||||||
|
|
||||||
if (metadata.display_name != .Empty) {
|
|
||||||
event.name = metadata.display_name.getSlice();
|
|
||||||
event.args.putAssumeCapacity("long_name", metadata.name.getSlice());
|
|
||||||
} else {
|
|
||||||
event.name = metadata.name.getSlice();
|
|
||||||
}
|
|
||||||
|
|
||||||
event.timestamp_ps = (@as(u128, @intCast(xline.timestamp_ns)) * 1000) + @as(u128, @intCast(xevent.data.?.offset_ps));
|
|
||||||
event.duration_ps = @intCast(xevent.duration_ps);
|
|
||||||
|
|
||||||
for (metadata.stats.items) |xstat| {
|
|
||||||
if (xstat.value == null) continue;
|
|
||||||
var stat_buffer = std.ArrayList(u8).init(allocator);
|
|
||||||
try xplane.xstatToString(xstat, stat_buffer.writer().any());
|
|
||||||
const stat_str = try stat_buffer.toOwnedSlice();
|
|
||||||
const stat_type = xplane.getStatType(xstat.metadata_id);
|
|
||||||
if (stat_type.isInternalStat()) continue;
|
|
||||||
if (stat_type == .step_name) event.name = stat_str;
|
|
||||||
event.args.putAssumeCapacity(xplane.getStatMetadataName(xstat.metadata_id), stat_str);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
try event.args.ensureUnusedCapacity(allocator, xevent.stats.items.len);
|
|
||||||
for (xevent.stats.items) |xstat| {
|
|
||||||
if (xstat.value == null) continue;
|
|
||||||
var stat_buffer = std.ArrayList(u8).init(allocator);
|
|
||||||
try xplane.xstatToString(xstat, stat_buffer.writer().any());
|
|
||||||
const stat_str = try stat_buffer.toOwnedSlice();
|
|
||||||
const stat_type = xplane.getStatType(xstat.metadata_id);
|
|
||||||
if (stat_type.isInternalStat()) continue;
|
|
||||||
if (stat_type == .step_name) event.name = stat_str;
|
|
||||||
event.args.putAssumeCapacity(xplane.getStatMetadataName(xstat.metadata_id), stat_str);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn fromXSpace(self: *TraceContainer, allocator: std.mem.Allocator, xspace: xplane_proto.XSpace, max_events: ?usize) !void {
|
|
||||||
if (findPlaneWithName(xspace, host_threads_plane_name)) |hp| {
|
|
||||||
const xplane = try XPlaneHashed.init(allocator, hp);
|
|
||||||
try self.xplaneToTraceEvents(allocator, host_threads_device_id, &xplane);
|
|
||||||
}
|
|
||||||
|
|
||||||
var device_planes = std.ArrayList(*const xplane_proto.XPlane).init(allocator);
|
|
||||||
defer device_planes.deinit();
|
|
||||||
|
|
||||||
try findPlanesWithPrefix(&device_planes, xspace, gpu_plane_prefix);
|
|
||||||
// We don't expect GPU and TPU planes and custom devices to be present in the same XSpace.
|
|
||||||
if (device_planes.items.len == 0) {
|
|
||||||
try findPlanesWithPrefix(&device_planes, xspace, tpu_plane_prefix);
|
|
||||||
}
|
|
||||||
if (device_planes.items.len == 0) {
|
|
||||||
try findPlanesWithPrefix(&device_planes, xspace, custom_plane_prefix);
|
|
||||||
}
|
|
||||||
|
|
||||||
for (device_planes.items) |dp| {
|
|
||||||
var xplane = try XPlaneHashed.init(allocator, dp);
|
|
||||||
defer xplane.deinit(allocator);
|
|
||||||
const device_id: u32 = first_device_id + @as(u32, @intCast(xplane.plane.id));
|
|
||||||
try self.xplaneToTraceEvents(allocator, device_id, &xplane);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Trace viewer (non-streaming) has scalability issues, we need to drop
|
|
||||||
// events to avoid loading failure for trace viewer.
|
|
||||||
if (max_events) |limit| self.capEvents(limit);
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn createEvent(self: *TraceContainer, allocator: std.mem.Allocator) !*TraceEvent {
|
|
||||||
try self.events.append(allocator, .{});
|
|
||||||
return &self.events.items[self.events.items.len - 1];
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn capEvents(self: *TraceContainer, max_count: u64) void {
|
|
||||||
const total_count = self.events.items.len;
|
|
||||||
if (total_count <= max_count) {
|
|
||||||
// Nothing to do. Events are not known sorted after return.
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
// sort the events according to start time.
|
|
||||||
// TODO: partial sort would improve performance.
|
|
||||||
std.mem.sort(TraceEvent, self.events.items, {}, struct {
|
|
||||||
pub fn call(_: void, lhs: TraceEvent, rhs: TraceEvent) bool {
|
|
||||||
return lhs.timestamp_ps < rhs.timestamp_ps;
|
|
||||||
}
|
|
||||||
}.call);
|
|
||||||
self.events.shrinkRetainingCapacity(max_count);
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn toJson(self: *TraceContainer, writer: anytype) !void {
|
|
||||||
try writer.writeAll(
|
|
||||||
\\{"displayTimeUnit":"ns","metadata":{"highres-ticks":true},"traceEvents":[
|
|
||||||
);
|
|
||||||
|
|
||||||
self.devices.sort(struct {
|
|
||||||
keys: []const u32,
|
|
||||||
pub fn lessThan(ctx: @This(), lhs: usize, rhs: usize) bool {
|
|
||||||
return ctx.keys[lhs] < ctx.keys[rhs];
|
|
||||||
}
|
|
||||||
}{ .keys = self.devices.keys() });
|
|
||||||
|
|
||||||
for (self.devices.keys(), self.devices.values()) |device_id, *device| {
|
|
||||||
if (device.name.len != 0) {
|
|
||||||
try writer.print(
|
|
||||||
\\{{"ph":"M","pid":{d},"name":"process_name","args":{{"name":"{s}"}}}},
|
|
||||||
, .{ device_id, device.name });
|
|
||||||
}
|
|
||||||
try writer.print(
|
|
||||||
\\{{"ph":"M","pid":{d},"name":"process_sort_index","args":{{"sort_index":{d}}}}},
|
|
||||||
, .{
|
|
||||||
device_id,
|
|
||||||
device_id,
|
|
||||||
});
|
|
||||||
|
|
||||||
device.resources.sort(struct {
|
|
||||||
keys: []const i64,
|
|
||||||
pub fn lessThan(ctx: @This(), lhs: usize, rhs: usize) bool {
|
|
||||||
return ctx.keys[lhs] < ctx.keys[rhs];
|
|
||||||
}
|
|
||||||
}{ .keys = device.resources.keys() });
|
|
||||||
|
|
||||||
for (device.resources.keys(), device.resources.values()) |resource_id, resource| {
|
|
||||||
if (resource.name.len != 0) {
|
|
||||||
try writer.print(
|
|
||||||
\\{{"ph":"M","pid":{d},"tid":{d},"name":"thread_name","args":{{"name":"{s}"}}}},
|
|
||||||
, .{
|
|
||||||
device_id,
|
|
||||||
resource_id,
|
|
||||||
resource.name,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
try writer.print(
|
|
||||||
\\{{"ph":"M","pid":{d},"tid":{d},"name":"thread_sort_index","args":{{"sort_index":{d}}}}},
|
|
||||||
, .{ device_id, resource_id, resource.sort_index });
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (self.events.items) |*event| {
|
|
||||||
const duration_ps = @max(event.duration_ps, 1);
|
|
||||||
try writer.print(
|
|
||||||
\\{{"ph":"X","pid":{d},"tid":{d},"ts":{d:.17},"dur":{d:.17},"name":"{s}"
|
|
||||||
, .{
|
|
||||||
event.device_id,
|
|
||||||
event.resource_id,
|
|
||||||
picoToMicro(event.timestamp_ps),
|
|
||||||
picoToMicro(duration_ps),
|
|
||||||
event.name,
|
|
||||||
});
|
|
||||||
if (event.args.count() != 0) {
|
|
||||||
try writer.writeAll(
|
|
||||||
\\,"args":{
|
|
||||||
);
|
|
||||||
event.args.sort(struct {
|
|
||||||
keys: []const []const u8,
|
|
||||||
|
|
||||||
pub fn lessThan(ctx: @This(), lhs: usize, rhs: usize) bool {
|
|
||||||
return std.mem.order(u8, ctx.keys[lhs], ctx.keys[rhs]).compare(std.math.CompareOperator.lt);
|
|
||||||
}
|
|
||||||
}{ .keys = event.args.keys() });
|
|
||||||
|
|
||||||
for (event.args.keys(), event.args.values(), 0..) |key, value, i| {
|
|
||||||
if (i < event.args.count() - 1) {
|
|
||||||
try writer.print(
|
|
||||||
\\"{s}":"{s}",
|
|
||||||
, .{ key, value });
|
|
||||||
} else {
|
|
||||||
// Last item has closing bracket rather than trailing comma.
|
|
||||||
try writer.print(
|
|
||||||
\\"{s}":"{s}"}}
|
|
||||||
, .{ key, value });
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
try writer.writeAll("},");
|
|
||||||
}
|
|
||||||
try writer.writeAll("{}]}");
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
fn picoToMicro(p: anytype) f64 {
|
|
||||||
return @as(f64, @floatFromInt(p)) / 1E6;
|
|
||||||
}
|
|
||||||
|
|
||||||
pub const XPlaneHashed = struct {
|
|
||||||
plane: *const xplane_proto.XPlane,
|
|
||||||
event_metadata_by_id: std.AutoHashMapUnmanaged(i64, *const xplane_proto.XEventMetadata) = .{},
|
|
||||||
stat_metadata_by_id: std.AutoHashMapUnmanaged(i64, *const xplane_proto.XStatMetadata) = .{},
|
|
||||||
|
|
||||||
pub fn init(
|
|
||||||
allocator: std.mem.Allocator,
|
|
||||||
plane: *const xplane_proto.XPlane,
|
|
||||||
) !XPlaneHashed {
|
|
||||||
var res: XPlaneHashed = .{ .plane = plane };
|
|
||||||
|
|
||||||
try res.event_metadata_by_id.ensureUnusedCapacity(allocator, @intCast(plane.event_metadata.items.len));
|
|
||||||
// build event metadata map
|
|
||||||
for (plane.event_metadata.items) |*event_metadata| {
|
|
||||||
res.event_metadata_by_id.putAssumeCapacity(event_metadata.key, &event_metadata.value.?);
|
|
||||||
}
|
|
||||||
|
|
||||||
// build stat metadata map
|
|
||||||
try res.stat_metadata_by_id.ensureUnusedCapacity(allocator, @intCast(plane.stat_metadata.items.len));
|
|
||||||
for (plane.stat_metadata.items) |*stat_metadata| {
|
|
||||||
res.stat_metadata_by_id.putAssumeCapacity(stat_metadata.key, &stat_metadata.value.?);
|
|
||||||
}
|
|
||||||
|
|
||||||
return res;
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn deinit(self: *XPlaneHashed, allocator: std.mem.Allocator) void {
|
|
||||||
self.stat_metadata_by_id.deinit(allocator);
|
|
||||||
self.event_metadata_by_id.deinit(allocator);
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn name(self: XPlaneHashed) []const u8 {
|
|
||||||
return self.plane.name.getSlice();
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn getEventType(self: XPlaneHashed, event_metadata_id: i64) xplane_schema.HostEventType {
|
|
||||||
if (self.event_metadata_by_id.get(event_metadata_id)) |v| {
|
|
||||||
return xplane_schema.HostEventType.fromString(v.name.getSlice());
|
|
||||||
} else return .unknown;
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn getStatMetadataName(self: XPlaneHashed, stat_metadata_id: i64) []const u8 {
|
|
||||||
if (self.stat_metadata_by_id.get(stat_metadata_id)) |v| {
|
|
||||||
return v.name.getSlice();
|
|
||||||
} else return &[_]u8{};
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn getStatType(self: XPlaneHashed, stat_metadata_id: i64) xplane_schema.StatType {
|
|
||||||
if (self.stat_metadata_by_id.get(stat_metadata_id)) |v| {
|
|
||||||
return xplane_schema.StatType.fromString(v.name.getSlice());
|
|
||||||
} else return .unknown;
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn xstatToString(self: XPlaneHashed, stat: xplane_proto.XStat, writer: anytype) !void {
|
|
||||||
if (stat.value == null) return;
|
|
||||||
|
|
||||||
switch (stat.value.?) {
|
|
||||||
inline .int64_value, .uint64_value, .double_value => |v| try writer.print("{d}", .{v}),
|
|
||||||
.str_value => |*v| try writer.writeAll(v.getSlice()),
|
|
||||||
.bytes_value => try writer.writeAll("<opaque bytes>"),
|
|
||||||
.ref_value => |v| try writer.writeAll(self.getStatMetadataName(@intCast(v))),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
@ -1,297 +0,0 @@
|
|||||||
const std = @import("std");
|
|
||||||
|
|
||||||
// `HostEventType` uses the unconventional casing/formatting
|
|
||||||
// so that the string representation of the enum used in the
|
|
||||||
// protobuf encoding directly maps to the zig enum tag name.
|
|
||||||
pub const HostEventType = enum(u16) {
|
|
||||||
unknown = 0,
|
|
||||||
TraceContext,
|
|
||||||
SessionRun,
|
|
||||||
FunctionRun,
|
|
||||||
RunGraph,
|
|
||||||
RunGraphDone,
|
|
||||||
TfOpRun,
|
|
||||||
EagerExecute,
|
|
||||||
@"ExecutorState::Process",
|
|
||||||
ExecutorDoneCallback,
|
|
||||||
MemoryAllocation,
|
|
||||||
MemoryDeallocation,
|
|
||||||
// Performance counter related.
|
|
||||||
RemotePerfCounter,
|
|
||||||
// tf.data captured function events.
|
|
||||||
@"InstantiatedCapturedFunction::Run",
|
|
||||||
@"InstantiatedCapturedFunction::RunWithBorrowedArgs",
|
|
||||||
@"InstantiatedCapturedFunction::RunInstantiated",
|
|
||||||
@"InstantiatedCapturedFunction::RunAsync",
|
|
||||||
// Loop ops.
|
|
||||||
ParallelForOp,
|
|
||||||
ForeverOp,
|
|
||||||
@"WhileOp-EvalCond",
|
|
||||||
@"WhileOp-StartBody",
|
|
||||||
ForOp,
|
|
||||||
// tf.data related.
|
|
||||||
@"IteratorGetNextOp::DoCompute",
|
|
||||||
@"IteratorGetNextAsOptionalOp::DoCompute",
|
|
||||||
Iterator,
|
|
||||||
@"Iterator::Prefetch::Generator",
|
|
||||||
PrefetchProduce,
|
|
||||||
PrefetchConsume,
|
|
||||||
ParallelInterleaveProduce,
|
|
||||||
ParallelInterleaveConsume,
|
|
||||||
ParallelInterleaveInitializeInput,
|
|
||||||
ParallelMapProduce,
|
|
||||||
ParallelMapConsume,
|
|
||||||
MapAndBatchProduce,
|
|
||||||
MapAndBatchConsume,
|
|
||||||
ParseExampleProduce,
|
|
||||||
ParseExampleConsume,
|
|
||||||
ParallelBatchProduce,
|
|
||||||
ParallelBatchConsume,
|
|
||||||
// Batching related.
|
|
||||||
BatchingSessionRun,
|
|
||||||
ProcessBatch,
|
|
||||||
BrainSessionRun,
|
|
||||||
ConcatInputTensors,
|
|
||||||
MergeInputTensors,
|
|
||||||
ScheduleWithoutSplit,
|
|
||||||
ScheduleWithSplit,
|
|
||||||
ScheduleWithEagerSplit,
|
|
||||||
@"ASBSQueue::Schedule",
|
|
||||||
// TFRT related.
|
|
||||||
TfrtModelRun,
|
|
||||||
// Serving related.
|
|
||||||
ServingModelRun,
|
|
||||||
// GPU related.
|
|
||||||
KernelLaunch,
|
|
||||||
KernelExecute,
|
|
||||||
// TPU related
|
|
||||||
EnqueueRequestLocked,
|
|
||||||
RunProgramRequest,
|
|
||||||
HostCallbackRequest,
|
|
||||||
TransferH2DRequest,
|
|
||||||
TransferPreprocessedH2DRequest,
|
|
||||||
TransferD2HRequest,
|
|
||||||
OnDeviceSendRequest,
|
|
||||||
OnDeviceRecvRequest,
|
|
||||||
OnDeviceSendRecvLocalRequest,
|
|
||||||
CustomWait,
|
|
||||||
OnDeviceSendRequestMulti,
|
|
||||||
OnDeviceRecvRequestMulti,
|
|
||||||
PjrtAsyncWait,
|
|
||||||
DoEnqueueProgram,
|
|
||||||
DoEnqueueContinuationProgram,
|
|
||||||
WriteHbm,
|
|
||||||
ReadHbm,
|
|
||||||
TpuExecuteOp,
|
|
||||||
CompleteCallbacks,
|
|
||||||
@"tpu::System::TransferToDevice=>IssueEvent",
|
|
||||||
@"tpu::System::TransferToDevice=>IssueEvent=>Done",
|
|
||||||
@"tpu::System::TransferFromDevice=>IssueEvent",
|
|
||||||
@"tpu::System::TransferFromDevice=>IssueEvent=>Done",
|
|
||||||
@"tpu::System::Execute",
|
|
||||||
@"TPUPartitionedCallOp-InitializeVarOnTPU",
|
|
||||||
@"TPUPartitionedCallOp-ExecuteRemote",
|
|
||||||
@"TPUPartitionedCallOp-ExecuteLocal",
|
|
||||||
Linearize,
|
|
||||||
Delinearize,
|
|
||||||
@"TransferBufferFromDevice-FastPath",
|
|
||||||
|
|
||||||
pub fn fromString(event_name: []const u8) HostEventType {
|
|
||||||
return std.meta.stringToEnum(HostEventType, event_name) orelse .unknown;
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn isInternalEvent(event_type: HostEventType) bool {
|
|
||||||
// TODO(b/162102421): Introduce a prefix for internal event names.
|
|
||||||
return switch (event_type) {
|
|
||||||
.MemoryAllocation,
|
|
||||||
.MemoryDeallocation,
|
|
||||||
.PrefetchProduce,
|
|
||||||
.PrefetchConsume,
|
|
||||||
.ParallelInterleaveProduce,
|
|
||||||
.ParallelInterleaveConsume,
|
|
||||||
.ParallelInterleaveInitializeInput,
|
|
||||||
.ParallelMapProduce,
|
|
||||||
.ParallelMapConsume,
|
|
||||||
.MapAndBatchProduce,
|
|
||||||
.MapAndBatchConsume,
|
|
||||||
.ParseExampleProduce,
|
|
||||||
.ParseExampleConsume,
|
|
||||||
=> true,
|
|
||||||
else => false,
|
|
||||||
};
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
// `StatType` uses the unconventional casing/formatting
|
|
||||||
// so that the string representation of the enum used in the
|
|
||||||
// protobuf encoding directly maps to the zig enum tag name.
|
|
||||||
pub const StatType = enum(u16) {
|
|
||||||
unknown = 0,
|
|
||||||
// TraceMe arguments.
|
|
||||||
id,
|
|
||||||
device_ordinal,
|
|
||||||
chip_ordinal,
|
|
||||||
node_ordinal,
|
|
||||||
model_id,
|
|
||||||
queue_id,
|
|
||||||
queue_addr,
|
|
||||||
request_id,
|
|
||||||
run_id,
|
|
||||||
replica_id,
|
|
||||||
graph_type,
|
|
||||||
step_num,
|
|
||||||
iter_num,
|
|
||||||
index_on_host,
|
|
||||||
allocator_name,
|
|
||||||
bytes_reserved,
|
|
||||||
bytes_allocated,
|
|
||||||
bytes_available,
|
|
||||||
fragmentation,
|
|
||||||
peak_bytes_in_use,
|
|
||||||
requested_bytes,
|
|
||||||
allocation_bytes,
|
|
||||||
addr,
|
|
||||||
region_type,
|
|
||||||
data_type,
|
|
||||||
shape,
|
|
||||||
layout,
|
|
||||||
kpi_name,
|
|
||||||
kpi_value,
|
|
||||||
element_id,
|
|
||||||
parent_id,
|
|
||||||
core_type,
|
|
||||||
// XPlane semantics related.
|
|
||||||
_pt,
|
|
||||||
_ct,
|
|
||||||
_p,
|
|
||||||
_c,
|
|
||||||
_r,
|
|
||||||
_a,
|
|
||||||
// Device trace arguments.
|
|
||||||
device_id,
|
|
||||||
device_type_string,
|
|
||||||
context_id,
|
|
||||||
correlation_id,
|
|
||||||
// TODO(b/176137043): These "details" should differentiate between activity
|
|
||||||
// and API event sources.
|
|
||||||
memcpy_details,
|
|
||||||
memalloc_details,
|
|
||||||
MemFree_details,
|
|
||||||
Memset_details,
|
|
||||||
MemoryResidency_details,
|
|
||||||
nvtx_range,
|
|
||||||
kernel_details,
|
|
||||||
stream,
|
|
||||||
// Stats added when processing traces.
|
|
||||||
group_id,
|
|
||||||
flow,
|
|
||||||
step_name,
|
|
||||||
tf_op,
|
|
||||||
hlo_op,
|
|
||||||
deduplicated_name,
|
|
||||||
hlo_category,
|
|
||||||
hlo_module,
|
|
||||||
program_id,
|
|
||||||
equation,
|
|
||||||
is_eager,
|
|
||||||
is_func,
|
|
||||||
tf_function_call,
|
|
||||||
tracing_count,
|
|
||||||
flops,
|
|
||||||
model_flops,
|
|
||||||
bytes_accessed,
|
|
||||||
memory_access_breakdown,
|
|
||||||
source,
|
|
||||||
model_name,
|
|
||||||
model_version,
|
|
||||||
bytes_transferred,
|
|
||||||
queue,
|
|
||||||
dcn_collective_info,
|
|
||||||
// Performance counter related.
|
|
||||||
@"Raw Value",
|
|
||||||
@"Scaled Value",
|
|
||||||
@"Thread Id",
|
|
||||||
matrix_unit_utilization_percent,
|
|
||||||
// XLA metadata map related.
|
|
||||||
@"Hlo Proto",
|
|
||||||
// Device capability related.
|
|
||||||
clock_rate,
|
|
||||||
// For GPU, this is the number of SMs.
|
|
||||||
core_count,
|
|
||||||
memory_bandwidth,
|
|
||||||
memory_size,
|
|
||||||
compute_cap_major,
|
|
||||||
compute_cap_minor,
|
|
||||||
peak_teraflops_per_second,
|
|
||||||
peak_hbm_bw_gigabytes_per_second,
|
|
||||||
peak_sram_rd_bw_gigabytes_per_second,
|
|
||||||
peak_sram_wr_bw_gigabytes_per_second,
|
|
||||||
device_vendor,
|
|
||||||
// Batching related.
|
|
||||||
batch_size_after_padding,
|
|
||||||
padding_amount,
|
|
||||||
batching_input_task_size,
|
|
||||||
// GPU occupancy metrics
|
|
||||||
theoretical_occupancy_pct,
|
|
||||||
occupancy_min_grid_size,
|
|
||||||
occupancy_suggested_block_size,
|
|
||||||
// Aggregated Stats
|
|
||||||
self_duration_ps,
|
|
||||||
min_duration_ps,
|
|
||||||
total_profile_duration_ps,
|
|
||||||
max_iteration_num,
|
|
||||||
device_type,
|
|
||||||
uses_megacore,
|
|
||||||
symbol_id,
|
|
||||||
tf_op_name,
|
|
||||||
dma_stall_duration_ps,
|
|
||||||
key,
|
|
||||||
payload_size_bytes,
|
|
||||||
duration_us,
|
|
||||||
buffer_size,
|
|
||||||
transfers,
|
|
||||||
// Dcn message Stats
|
|
||||||
dcn_label,
|
|
||||||
dcn_source_slice_id,
|
|
||||||
dcn_source_per_slice_device_id,
|
|
||||||
dcn_destination_slice_id,
|
|
||||||
dcn_destination_per_slice_device_id,
|
|
||||||
dcn_chunk,
|
|
||||||
dcn_loop_index,
|
|
||||||
@"EdgeTPU Model information",
|
|
||||||
@"EdgeTPU Model Profile information",
|
|
||||||
@"EdgeTPU MLIR",
|
|
||||||
dropped_traces,
|
|
||||||
cuda_graph_id,
|
|
||||||
// Many events have `.cuda_graph_id`, such as graph sub events when tracing is in
|
|
||||||
// node level. Yet `.cuda_graph_exec_id` is used only for CudaGraphExecution events
|
|
||||||
// on the GPU device when tracing is in graph level.
|
|
||||||
cuda_graph_exec_id,
|
|
||||||
cuda_graph_orig_id,
|
|
||||||
step_idle_time_ps,
|
|
||||||
gpu_device_name,
|
|
||||||
source_stack,
|
|
||||||
device_offset_ps,
|
|
||||||
device_duration_ps,
|
|
||||||
|
|
||||||
pub fn fromString(stat_name: []const u8) StatType {
|
|
||||||
return std.meta.stringToEnum(StatType, stat_name) orelse .unknown;
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn isInternalStat(stat_type: StatType) bool {
|
|
||||||
return switch (stat_type) {
|
|
||||||
.kernel_details,
|
|
||||||
._pt,
|
|
||||||
._p,
|
|
||||||
._ct,
|
|
||||||
._c,
|
|
||||||
._r,
|
|
||||||
.flops,
|
|
||||||
.bytes_accessed,
|
|
||||||
.program_id,
|
|
||||||
.symbol_id,
|
|
||||||
=> true,
|
|
||||||
else => false,
|
|
||||||
};
|
|
||||||
}
|
|
||||||
};
|
|
||||||
@ -5,7 +5,6 @@ const c = @import("c");
|
|||||||
const stdx = @import("stdx");
|
const stdx = @import("stdx");
|
||||||
|
|
||||||
pub const ffi = @import("ffi.zig");
|
pub const ffi = @import("ffi.zig");
|
||||||
pub const Profiler = @import("profiler.zig").Profiler;
|
|
||||||
|
|
||||||
const log = std.log.scoped(.pjrt);
|
const log = std.log.scoped(.pjrt);
|
||||||
|
|
||||||
@ -389,19 +388,6 @@ pub const Client = opaque {
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the Profiler for this API.
|
|
||||||
/// Not all platform have a profiling api, for those the profiler object will do nothing.
|
|
||||||
/// Platforms with known profiler extensions: cuda, xpu
|
|
||||||
pub fn getProfiler(self: *const Client, api: *const Api, options: Profiler.Options) Profiler {
|
|
||||||
if (api.version().minor >= 45) {
|
|
||||||
if (api.lookupExtension(c.PJRT_Profiler_Extension, c.PJRT_Extension_Type_Profiler)) |ext| {
|
|
||||||
return Profiler.init(ext.profiler_api.*, options);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
log.warn("No profiler found for platform: {}", .{self});
|
|
||||||
return Profiler.init(null, null);
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn deserializeAndLoad(self: *const Client, api: *const Api, bytes: []const u8) ApiError!*LoadedExecutable {
|
pub fn deserializeAndLoad(self: *const Client, api: *const Api, bytes: []const u8) ApiError!*LoadedExecutable {
|
||||||
const ret = try api.call(.PJRT_Executable_DeserializeAndLoad, .{
|
const ret = try api.call(.PJRT_Executable_DeserializeAndLoad, .{
|
||||||
.client = self.inner(),
|
.client = self.inner(),
|
||||||
|
|||||||
@ -1,194 +0,0 @@
|
|||||||
const std = @import("std");
|
|
||||||
const c = @import("c");
|
|
||||||
const tsl_proto = @import("//tsl:profiler_options_proto");
|
|
||||||
|
|
||||||
const log = std.log.scoped(.@"pjrt/profiler");
|
|
||||||
const TraceContainer = @import("convert/trace_container.zig").TraceContainer;
|
|
||||||
|
|
||||||
/// Pjrt Profiler extension
|
|
||||||
pub const Profiler = struct {
|
|
||||||
api: ?c.PLUGIN_Profiler_Api,
|
|
||||||
inner: *c.PLUGIN_Profiler,
|
|
||||||
last_error: ?*Error = null,
|
|
||||||
status: Status = .ready,
|
|
||||||
|
|
||||||
pub const Status = enum { ready, started, stopped, done };
|
|
||||||
pub const Error = c.PLUGIN_Profiler_Error;
|
|
||||||
pub const Options = tsl_proto.ProfileOptions;
|
|
||||||
|
|
||||||
pub const default_options: Options = .{
|
|
||||||
.version = 1,
|
|
||||||
.device_type = .UNSPECIFIED, // profile all devices
|
|
||||||
.include_dataset_ops = false, // tensorflow specific
|
|
||||||
.host_tracer_level = 2,
|
|
||||||
.device_tracer_level = 1,
|
|
||||||
.python_tracer_level = 0,
|
|
||||||
.enable_hlo_proto = true,
|
|
||||||
.start_timestamp_ns = 0,
|
|
||||||
.duration_ms = 0,
|
|
||||||
.repository_path = .Empty,
|
|
||||||
};
|
|
||||||
|
|
||||||
pub fn init(api: ?c.PLUGIN_Profiler_Api, options: ?Options) Profiler {
|
|
||||||
if (api == null) {
|
|
||||||
return .{ .api = null, .inner = undefined };
|
|
||||||
}
|
|
||||||
var options_with_timestamp = options orelse default_options;
|
|
||||||
options_with_timestamp.start_timestamp_ns = @truncate(@max(0, std.time.nanoTimestamp()));
|
|
||||||
|
|
||||||
var buffer: [std.fs.max_path_bytes + @sizeOf(Options) * 4]u8 = undefined;
|
|
||||||
var fba = std.heap.FixedBufferAllocator.init(&buffer);
|
|
||||||
const byte_options = options_with_timestamp.encode(fba.allocator()) catch unreachable;
|
|
||||||
var args: c.PLUGIN_Profiler_Create_Args = .{
|
|
||||||
.options = byte_options.ptr,
|
|
||||||
.options_size = byte_options.len,
|
|
||||||
.profiler = undefined, // out
|
|
||||||
};
|
|
||||||
var res: Profiler = .{ .api = api, .inner = undefined };
|
|
||||||
res.check(api.?.create.?(&args)) catch unreachable;
|
|
||||||
|
|
||||||
res.inner = args.profiler.?;
|
|
||||||
return res;
|
|
||||||
}
|
|
||||||
|
|
||||||
fn transition(self: *Profiler, fn_name: []const u8, expected: Status, next: Status) void {
|
|
||||||
if (self.status == expected) {
|
|
||||||
self.status = next;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
std.debug.panic("Profiler can't `{s}()`. Current status: {}, expected: {}", .{ fn_name, self.status, expected });
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn start(self: *Profiler) void {
|
|
||||||
self.transition("start", .ready, .started);
|
|
||||||
if (self.api == null) return;
|
|
||||||
var args: c.PLUGIN_Profiler_Start_Args = .{ .profiler = self.inner };
|
|
||||||
self.check(self.api.?.start.?(&args)) catch unreachable;
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn stop(self: *Profiler) void {
|
|
||||||
self.transition("stop", .started, .stopped);
|
|
||||||
if (self.api == null) return;
|
|
||||||
|
|
||||||
var args: c.PLUGIN_Profiler_Stop_Args = .{ .profiler = self.inner };
|
|
||||||
self.check(self.api.?.stop.?(&args)) catch unreachable;
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn collectData(self: *Profiler, allocator: std.mem.Allocator) !ProfilingData {
|
|
||||||
self.transition("collect_data", .stopped, .done);
|
|
||||||
if (self.api == null) return .{ .external = &.{} };
|
|
||||||
|
|
||||||
var args: c.PLUGIN_Profiler_CollectData_Args = .{
|
|
||||||
.struct_size = c.PLUGIN_Profiler_CollectData_Args_STRUCT_SIZE,
|
|
||||||
.profiler = self.inner,
|
|
||||||
.buffer = null,
|
|
||||||
.buffer_size_in_bytes = 0,
|
|
||||||
};
|
|
||||||
try self.check(self.api.?.collect_data.?(&args));
|
|
||||||
std.debug.assert(args.buffer_size_in_bytes > 0);
|
|
||||||
return if (args.buffer == null) blk: {
|
|
||||||
log.debug("Plugin profiler wants us to allocate {d} bytes for profile data", .{args.buffer_size_in_bytes});
|
|
||||||
// The plugin want us to allocate memory for it:
|
|
||||||
const buffer = try allocator.alloc(u8, args.buffer_size_in_bytes);
|
|
||||||
args.buffer = buffer.ptr;
|
|
||||||
try self.check(self.api.?.collect_data.?(&args));
|
|
||||||
break :blk .{ .owned = buffer };
|
|
||||||
} else blk: {
|
|
||||||
log.debug("Plugin profiler has {d} bytes of profile data", .{args.buffer_size_in_bytes});
|
|
||||||
// Drop sentinel. The profiler plugin returns a null terminated string.
|
|
||||||
// But this is creating issues if we save the sentinel on disk,
|
|
||||||
// because it will trip up protobuf readers.
|
|
||||||
var data = args.buffer[0..args.buffer_size_in_bytes];
|
|
||||||
data = if (data.len > 0 and data[data.len - 1] == 0) data[0 .. data.len - 1] else data;
|
|
||||||
break :blk .{ .external = data };
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn dumpDataTo(
|
|
||||||
self: *Profiler,
|
|
||||||
allocator: std.mem.Allocator,
|
|
||||||
dir: std.fs.Dir,
|
|
||||||
file_name: []const u8,
|
|
||||||
) !void {
|
|
||||||
const profile_data = try self.collectData(allocator);
|
|
||||||
defer profile_data.free(allocator);
|
|
||||||
|
|
||||||
if (profile_data.items().len == 0) return;
|
|
||||||
|
|
||||||
const file = try dir.createFile(file_name, .{ .truncate = true });
|
|
||||||
defer file.close();
|
|
||||||
log.info("Writing profiling data to {s} ({} bytes)", .{ file_name, profile_data.items().len });
|
|
||||||
return try file.writeAll(profile_data.items());
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn dumpAsJsonTo(
|
|
||||||
self: *Profiler,
|
|
||||||
allocator: std.mem.Allocator,
|
|
||||||
dir: std.fs.Dir,
|
|
||||||
file_name: []const u8,
|
|
||||||
) !void {
|
|
||||||
log.info("Writing profiling data to {s}", .{file_name});
|
|
||||||
var output_file = try dir.createFile(file_name, .{});
|
|
||||||
defer output_file.close();
|
|
||||||
var buffered_writer = std.io.bufferedWriter(output_file.writer());
|
|
||||||
try self.dumpAsJsonToWriter(allocator, buffered_writer.writer());
|
|
||||||
try buffered_writer.flush();
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn dumpAsJsonToWriter(
|
|
||||||
self: *Profiler,
|
|
||||||
allocator: std.mem.Allocator,
|
|
||||||
writer: anytype,
|
|
||||||
) !void {
|
|
||||||
const profile_data = try self.collectData(allocator);
|
|
||||||
defer profile_data.free(allocator);
|
|
||||||
|
|
||||||
if (profile_data.items().len == 0) {
|
|
||||||
log.warn("No profile data was collected: {}", .{self});
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
var converter = TraceContainer.init(allocator);
|
|
||||||
defer converter.deinit();
|
|
||||||
try converter.parseXSpaceBytes(profile_data.items(), 1_000_000);
|
|
||||||
|
|
||||||
try converter.toJson(writer);
|
|
||||||
}
|
|
||||||
|
|
||||||
fn check(self: *Profiler, c_error: ?*Error) !void {
|
|
||||||
if (c_error) |err| {
|
|
||||||
self.last_error = err;
|
|
||||||
return error.PjrtProfilerError;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn deinit(self: Profiler) void {
|
|
||||||
switch (self.status) {
|
|
||||||
.started => log.warn("Profiler was never stopped", .{}),
|
|
||||||
.stopped => log.warn("Profiler data was never collected", .{}),
|
|
||||||
else => {},
|
|
||||||
}
|
|
||||||
if (self.api == null) return;
|
|
||||||
|
|
||||||
var args: c.PLUGIN_Profiler_Destroy_Args = .{ .profiler = self.inner };
|
|
||||||
_ = self.api.?.destroy.?(&args);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
const ProfilingData = union(enum) {
|
|
||||||
owned: []const u8,
|
|
||||||
external: []const u8,
|
|
||||||
|
|
||||||
pub fn items(self: ProfilingData) []const u8 {
|
|
||||||
return switch (self) {
|
|
||||||
inline else => |x| x,
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn free(self: ProfilingData, allocator: std.mem.Allocator) void {
|
|
||||||
switch (self) {
|
|
||||||
.owned => |data| allocator.free(data),
|
|
||||||
.external => {},
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
@ -1,47 +0,0 @@
|
|||||||
const std = @import("std");
|
|
||||||
const stdx = @import("stdx");
|
|
||||||
const flags = stdx.flags;
|
|
||||||
|
|
||||||
const TraceContainer = @import("convert/trace_container.zig").TraceContainer;
|
|
||||||
|
|
||||||
const CliArgs = struct {
|
|
||||||
pub const help =
|
|
||||||
\\ llama --path=path_to_profiling_data
|
|
||||||
;
|
|
||||||
path: []const u8,
|
|
||||||
max_events: ?usize = null,
|
|
||||||
};
|
|
||||||
|
|
||||||
pub fn main() !void {
|
|
||||||
var gpa = std.heap.GeneralPurposeAllocator(.{ .thread_safe = true }){};
|
|
||||||
defer _ = gpa.deinit();
|
|
||||||
const allocator = gpa.allocator();
|
|
||||||
|
|
||||||
var args = std.process.args();
|
|
||||||
const cli_args = flags.parse(&args, CliArgs);
|
|
||||||
|
|
||||||
var fd = try std.fs.openFileAbsolute(cli_args.path, .{});
|
|
||||||
defer fd.close();
|
|
||||||
|
|
||||||
const pb_buffer = try fd.readToEndAlloc(allocator, (try fd.stat()).size);
|
|
||||||
defer allocator.free(pb_buffer);
|
|
||||||
if (pb_buffer.len == 0) return error.EmptyBuffer;
|
|
||||||
|
|
||||||
var converter = TraceContainer.init(allocator);
|
|
||||||
defer converter.deinit();
|
|
||||||
try converter.parseXSpaceBytes(pb_buffer, cli_args.max_events);
|
|
||||||
|
|
||||||
var path_buffer: [1028]u8 = undefined;
|
|
||||||
|
|
||||||
const output_path = try std.fmt.bufPrint(&path_buffer, "{s}/{s}.json", .{
|
|
||||||
std.fs.path.dirname(cli_args.path) orelse "",
|
|
||||||
std.fs.path.stem(cli_args.path),
|
|
||||||
});
|
|
||||||
|
|
||||||
var output_file = try std.fs.createFileAbsolute(output_path, .{});
|
|
||||||
defer output_file.close();
|
|
||||||
|
|
||||||
try converter.toJson(output_file.writer().any());
|
|
||||||
|
|
||||||
std.debug.print("Wrote JSON to {s}\n", .{output_path});
|
|
||||||
}
|
|
||||||
@ -5,7 +5,6 @@ const dialects = @import("mlir/dialects");
|
|||||||
const mlir = @import("mlir");
|
const mlir = @import("mlir");
|
||||||
const pjrt = @import("pjrt");
|
const pjrt = @import("pjrt");
|
||||||
pub const ffi = pjrt.ffi;
|
pub const ffi = pjrt.ffi;
|
||||||
pub const Profiler = pjrt.Profiler;
|
|
||||||
pub const ApiError = pjrt.ApiError;
|
pub const ApiError = pjrt.ApiError;
|
||||||
pub const ErrorCode = pjrt.ErrorCode;
|
pub const ErrorCode = pjrt.ErrorCode;
|
||||||
pub const ExecuteContext = pjrt.ExecuteContext;
|
pub const ExecuteContext = pjrt.ExecuteContext;
|
||||||
@ -109,13 +108,6 @@ pub const Client = opaque {
|
|||||||
return try asynk.callBlocking(compileSync, .{ self, api, allocator, module, compile_options_pb });
|
return try asynk.callBlocking(compileSync, .{ self, api, allocator, module, compile_options_pb });
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the Profiler for this API.
|
|
||||||
/// Not all platform have a profiling api, for those the profiler object will do nothing.
|
|
||||||
/// Platforms with known profiler extensions: cuda, xpu
|
|
||||||
pub fn getProfiler(self: *const Client, api: *const Api, options: pjrt.Profiler.Options) pjrt.Profiler {
|
|
||||||
return self.inner().getProfiler(api, options);
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn addressableMemories(self: *const Client, api: *const Api) []*const Memory {
|
pub fn addressableMemories(self: *const Client, api: *const Api) []*const Memory {
|
||||||
return self.inner().addressableMemories(api);
|
return self.inner().addressableMemories(api);
|
||||||
}
|
}
|
||||||
|
|||||||
@ -85,13 +85,6 @@ pub const Platform = struct {
|
|||||||
pub fn deinit(self: *Platform) void {
|
pub fn deinit(self: *Platform) void {
|
||||||
self.pjrt_client.deinit(self.pjrt_api);
|
self.pjrt_client.deinit(self.pjrt_api);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the Profiler for this API.
|
|
||||||
/// Not all platform have a profiling api, for those the profiler object will do nothing.
|
|
||||||
/// Platforms with known profiler extensions: cuda, xpu
|
|
||||||
pub fn getProfiler(self: Platform, options: ?pjrt.Profiler.Options) pjrt.Profiler {
|
|
||||||
return self.pjrt_client.getProfiler(self.pjrt_api, options orelse pjrt.Profiler.default_options);
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
const _CreateOptions = struct {
|
const _CreateOptions = struct {
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user