PJRT: add conversion of profiling protobuf output to JSON format.
This commit is contained in:
parent
f7bac1af10
commit
4681ce2f24
@ -1,6 +1,8 @@
|
|||||||
load("@rules_zig//zig:defs.bzl", "zig_library")
|
load("@rules_zig//zig:defs.bzl", "zig_library")
|
||||||
|
load("@zml//bazel:zig.bzl", "zig_cc_binary")
|
||||||
load("//bazel:zig_proto_library.bzl", "zig_proto_library")
|
load("//bazel:zig_proto_library.bzl", "zig_proto_library")
|
||||||
|
|
||||||
|
|
||||||
cc_library(
|
cc_library(
|
||||||
name = "dlfcn",
|
name = "dlfcn",
|
||||||
hdrs = ["dlfcn.h"],
|
hdrs = ["dlfcn.h"],
|
||||||
@ -27,3 +29,27 @@ zig_proto_library(
|
|||||||
import_name = "//tsl:profiler_options_proto",
|
import_name = "//tsl:profiler_options_proto",
|
||||||
deps = ["@tsl//tsl/profiler/protobuf:profiler_options_proto"],
|
deps = ["@tsl//tsl/profiler/protobuf:profiler_options_proto"],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
zig_proto_library(
|
||||||
|
name = "xplane_proto",
|
||||||
|
import_name = "//tsl:xplane_proto",
|
||||||
|
deps = ["@tsl//tsl/profiler/protobuf:xplane_proto"],
|
||||||
|
)
|
||||||
|
|
||||||
|
zig_proto_library(
|
||||||
|
name = "trace_events_proto",
|
||||||
|
import_name = "//tsl:trace_events_proto",
|
||||||
|
deps = ["@tsl//tsl/profiler/protobuf:trace_events_proto"],
|
||||||
|
)
|
||||||
|
|
||||||
|
zig_cc_binary(
|
||||||
|
name = "xspace_to_json",
|
||||||
|
srcs = glob(["convert/*.zig"]),
|
||||||
|
main = "xspace_to_json.zig",
|
||||||
|
visibility = ["//visibility:public"],
|
||||||
|
deps = [
|
||||||
|
":trace_events_proto",
|
||||||
|
":xplane_proto",
|
||||||
|
"//examples/third_party/tigerbeetle:flags",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|||||||
323
pjrt/convert/trace_container.zig
Normal file
323
pjrt/convert/trace_container.zig
Normal file
@ -0,0 +1,323 @@
|
|||||||
|
const std = @import("std");
|
||||||
|
const trace_events_proto = @import("//tsl:trace_events_proto");
|
||||||
|
const xplane_proto = @import("//tsl:xplane_proto");
|
||||||
|
const xplane_schema = @import("xplane_schema.zig");
|
||||||
|
const xplane_visitor = @import("xplane_visitor.zig");
|
||||||
|
|
||||||
|
// Trace-viewer process ids (`device_id` in trace_events.proto).
// PID 0 is never used; ids 1 through 500 are reserved for accelerator devices.
const first_device_id = 1;
const last_device_id = 500;

// Up to 200 custom planes (planes whose name starts with
// `custom_plane_prefix`) can be shown as fake devices; their id range
// follows directly after the accelerator range.
const first_custom_plane_device_id = last_device_id + 1;
const max_custom_plane_devices_per_host = 200;
const last_custom_plane_device_id = first_custom_plane_device_id + max_custom_plane_devices_per_host - 1;

// All host threads are grouped under one fake device placed after the
// custom-plane range.
pub const host_threads_device_id = last_custom_plane_device_id + 1;

// Display name of the line that carries asynchronous XLA ops.
pub const xla_async_op_line_name = "Async XLA Ops";

// Plane name (host) and plane-name prefixes (devices) looked up in an XSpace.
pub const host_threads_plane_name = "/host:CPU";
pub const gpu_plane_prefix = "/device:GPU:";
pub const tpu_plane_prefix = "/device:TPU:";
pub const custom_plane_prefix = "/device:CUSTOM:";
|
||||||
|
|
||||||
|
pub const TraceContainer = struct {
|
||||||
|
arena: std.heap.ArenaAllocator,
|
||||||
|
events: std.ArrayListUnmanaged(TraceEvent) = .{},
|
||||||
|
devices: std.AutoArrayHashMapUnmanaged(u32, Device) = .{},
|
||||||
|
|
||||||
|
/// One process row of the trace: a device together with the resources
/// (one per line of the source plane) that belong to it.
pub const Device = struct {
    /// Display name; empty until a plane has been converted into this device.
    name: []const u8 = "",
    /// Trace-viewer process id of the device.
    device_id: u32 = 0,
    /// Resources of the device, keyed by resource id.
    resources: std.AutoArrayHashMapUnmanaged(u32, trace_events_proto.Resource) = .{},
};
|
||||||
|
|
||||||
|
/// A single complete ("X" phase) event of the trace.
pub const TraceEvent = struct {
    /// Process id (device) the event belongs to.
    device_id: u32 = 0,
    /// Thread id (resource) the event belongs to.
    resource_id: u32 = 0,
    /// Display name of the event.
    name: []const u8 = "",
    /// Absolute start time in picoseconds.
    timestamp_ps: u128 = 0,
    /// Duration in picoseconds.
    duration_ps: u64 = 0,
    /// Extra key/value pairs emitted under the event's "args" object.
    args: std.StringArrayHashMapUnmanaged([]const u8) = .{},
};
|
||||||
|
|
||||||
|
/// Decodes the serialized `XSpace` protobuf in `pb_buffer` and converts it
/// into the container's devices and events.
///
/// Every allocation (decoded protobuf, events, devices) comes from an arena
/// backed by `allocator`; call `deinit` to release it. When `max_events` is
/// non-null the event list is capped via `capEvents`.
///
/// On error the arena is released before returning, so nothing is leaked.
pub fn init(allocator: std.mem.Allocator, pb_buffer: []const u8, max_events: ?usize) !TraceContainer {
    var self: TraceContainer = .{
        .arena = std.heap.ArenaAllocator.init(allocator),
    };
    // The caller never receives a container to `deinit` when one of the
    // `try`s below fails, so the arena has to be torn down here.
    errdefer self.arena.deinit();
    const arena = self.arena.allocator();

    const xspace = try xplane_proto.XSpace.decode(pb_buffer, arena);
    try self.fromXSpace(arena, &xspace, max_events);
    return self;
}
|
||||||
|
|
||||||
|
/// Releases the arena and, with it, everything the container allocated.
/// The container must not be used afterwards.
pub fn deinit(self: *TraceContainer) void {
    std.heap.ArenaAllocator.deinit(self.arena);
}
|
||||||
|
|
||||||
|
/// Converts an integer picosecond count to microseconds as `f64`.
fn picoToMicro(p: anytype) f64 {
    const picoseconds: f64 = @floatFromInt(p);
    const picoseconds_per_microsecond: f64 = 1E6;
    return picoseconds / picoseconds_per_microsecond;
}
|
||||||
|
|
||||||
|
/// Returns the id a line is displayed under: its `display_id` when that is
/// non-zero, otherwise its own `id`.
fn xLineDisplayId(xline: *const xplane_proto.XLine) i64 {
    if (xline.display_id == 0) return xline.id;
    return xline.display_id;
}
|
||||||
|
|
||||||
|
/// Returns the name a line is displayed under: its `display_name` when that
/// is non-empty, otherwise its `name`.
///
/// The check is on the string length rather than on the `.Empty` tag so that
/// a `display_name` that is present but zero-length also falls back to `name`
/// instead of producing an empty label.
fn xLineDisplayName(xline: *const xplane_proto.XLine) []const u8 {
    const display_name = xline.display_name.getSlice();
    return if (display_name.len != 0) display_name else xline.name.getSlice();
}
|
||||||
|
|
||||||
|
/// Writes the textual form of `stat`'s value to `writer`.
///
/// Numbers are printed in decimal, strings verbatim, byte blobs as the
/// placeholder `<opaque bytes>`, and references as the name of the stat
/// metadata they point to in `plane`. Nothing is written when the stat has
/// no value.
fn xstatValueToString(stat: *const xplane_proto.XStat, plane: *const xplane_visitor.XPlaneVisitor, writer: std.io.AnyWriter) !void {
    const value = stat.value orelse return;
    switch (value) {
        .ref_value => |ref| try writer.writeAll(plane.getStatMetadataName(@intCast(ref))),
        .bytes_value => try writer.writeAll("<opaque bytes>"),
        .str_value => |*text| try writer.writeAll(text.getSlice()),
        inline .int64_value, .uint64_value, .double_value => |number| try writer.print("{d}", .{number}),
    }
}
|
||||||
|
|
||||||
|
/// Returns the first plane of `space` whose name equals `name` exactly, or
/// `null` when there is none.
fn findPlaneWithName(space: *const xplane_proto.XSpace, name: []const u8) ?*xplane_proto.XPlane {
    const planes = space.planes.items;
    var index: usize = 0;
    while (index < planes.len) : (index += 1) {
        const candidate = &planes[index];
        if (std.mem.eql(u8, candidate.name.getSlice(), name)) return candidate;
    }
    return null;
}
|
||||||
|
|
||||||
|
/// Collects pointers to every plane of `space` whose name starts with
/// `prefix`, in plane order. The returned slice is allocated with
/// `allocator` and owned by the caller.
fn findPlanesWithPrefix(
    allocator: std.mem.Allocator,
    space: *const xplane_proto.XSpace,
    prefix: []const u8,
) ![]*const xplane_proto.XPlane {
    var matches = std.ArrayList(*const xplane_proto.XPlane).init(allocator);
    for (space.planes.items) |*plane| {
        const plane_name = plane.name.getSlice();
        if (!std.mem.startsWith(u8, plane_name, prefix)) continue;
        try matches.append(plane);
    }
    return matches.toOwnedSlice();
}
|
||||||
|
|
||||||
|
/// Fills `device` from `plane`: sets its name and id and registers one
/// resource per line, keyed by the line's display id.
///
/// For the host-threads device, resources additionally get a 1-based
/// `sort_index` in line order; for all other devices `sort_index` is left
/// at its default.
fn buildDeviceAndResources(allocator: std.mem.Allocator, device_id: u32, plane: *const xplane_visitor.XPlaneVisitor, device: *Device) !void {
    device.name = plane.name();
    device.device_id = device_id;

    const sort_by_ordinal = (device_id == host_threads_device_id);
    var ordinal: u32 = 0;
    for (plane.plane.lines.items) |*xline| {
        // Line ids come straight from the profile file. Keep the low 32 bits
        // instead of `@intCast`, which would panic (or be undefined behavior
        // in ReleaseFast) on a negative or oversized id.
        const resource_id: u32 = @truncate(@as(u64, @bitCast(xLineDisplayId(xline))));
        var resource: trace_events_proto.Resource = .{
            .resource_id = resource_id,
            .name = .{ .Const = xLineDisplayName(xline) },
        };

        if (sort_by_ordinal) {
            // Start at 1 so the index is never the default value 0.
            ordinal += 1;
            resource.sort_index = ordinal;
        }
        try device.resources.put(allocator, resource_id, resource);
    }
}
|
||||||
|
|
||||||
|
/// Renders every stat in `stats` that has a value and is not internal into
/// `event.args`, keyed by the stat's metadata name. A `step_name` stat also
/// replaces the event's name. Strings are allocated with `allocator`.
fn addStatArgs(allocator: std.mem.Allocator, event: *TraceEvent, xplane: *const xplane_visitor.XPlaneVisitor, stats: []const xplane_proto.XStat) !void {
    for (stats) |*xstat| {
        if (xstat.value == null) continue;
        const stat_type = xplane.getStatType(xstat.metadata_id);
        // Filter before rendering so that skipped stats never allocate.
        if (stat_type.isInternalStat()) continue;

        var stat_buffer = std.ArrayList(u8).init(allocator);
        try xstatValueToString(xstat, xplane, stat_buffer.writer().any());
        const stat_str = try stat_buffer.toOwnedSlice();

        if (stat_type == .step_name) event.name = stat_str;
        try event.args.put(allocator, xplane.getStatMetadataName(xstat.metadata_id), stat_str);
    }
}

/// Converts one plane into the device entry for `device_id` plus one
/// `TraceEvent` per non-internal event on its lines.
///
/// Lines displayed as `xla_async_op_line_name` are skipped. Stats of the
/// event's metadata are applied first and the event's own stats second, so
/// the latter win when both carry the same stat name.
///
/// Values read from the profile are treated defensively: negative times are
/// clamped to 0, a missing offset counts as 0, and line ids are truncated to
/// 32 bits rather than trapping.
fn xplaneToTraceEvents(self: *TraceContainer, allocator: std.mem.Allocator, device_id: u32, xplane: *const xplane_visitor.XPlaneVisitor) !void {
    // Convert devices and resources.
    const device_entry = try self.devices.getOrPut(allocator, device_id);
    if (!device_entry.found_existing) device_entry.value_ptr.* = .{};
    try buildDeviceAndResources(allocator, device_id, xplane, device_entry.value_ptr);

    // Convert events.
    for (xplane.plane.lines.items) |*xline| {
        if (std.mem.eql(u8, xLineDisplayName(xline), xla_async_op_line_name)) continue;

        // Keep the low 32 bits of the id instead of trapping on out-of-range values.
        const resource_id: u32 = @truncate(@as(u64, @bitCast(xLineDisplayId(xline))));
        const line_start_ps: u128 = @as(u128, @intCast(@max(xline.timestamp_ns, 0))) * 1000;

        for (xline.events.items) |*xevent| {
            if (xplane.getEventType(xevent.metadata_id).isInternalEvent()) continue;

            // The pointer stays valid for this iteration only: nothing below
            // appends to `self.events`.
            const event = try self.createEvent(allocator);
            event.device_id = device_id;
            event.resource_id = resource_id;

            // `data` is optional and may hold something other than an offset;
            // both cases are treated as offset 0 instead of panicking.
            const offset_ps: i64 = if (xevent.data) |data| switch (data) {
                .offset_ps => |ps| ps,
                else => 0,
            } else 0;
            // Timing does not depend on metadata, so set it unconditionally.
            event.timestamp_ps = line_start_ps + @as(u128, @intCast(@max(offset_ps, 0)));
            event.duration_ps = @intCast(@max(xevent.duration_ps, 0));

            if (xplane.event_metadata_by_id.get(xevent.metadata_id)) |metadata| {
                if (metadata.display_name != .Empty) {
                    event.name = metadata.display_name.getSlice();
                    try event.args.put(allocator, "long_name", metadata.name.getSlice());
                } else {
                    event.name = metadata.name.getSlice();
                }
                try addStatArgs(allocator, event, xplane, metadata.stats.items);
            }

            try addStatArgs(allocator, event, xplane, xevent.stats.items);
        }
    }
}
|
||||||
|
|
||||||
|
/// Populates the container from a decoded `XSpace`.
///
/// The host-threads plane, when present, is converted first under
/// `host_threads_device_id`. Device planes follow, each under
/// `first_device_id + plane.id`. Finally the event list is capped when
/// `max_events` is set.
fn fromXSpace(self: *TraceContainer, allocator: std.mem.Allocator, xspace: *const xplane_proto.XSpace, max_events: ?usize) !void {
    if (findPlaneWithName(xspace, host_threads_plane_name)) |host_plane| {
        const host_visitor = try xplane_visitor.XPlaneVisitor.init(allocator, host_plane);
        try self.xplaneToTraceEvents(allocator, host_threads_device_id, &host_visitor);
    }

    // GPU, TPU and custom planes are not expected to be present in the same
    // XSpace, so the first prefix that matches any plane is the one used.
    const prefixes = [_][]const u8{ gpu_plane_prefix, tpu_plane_prefix, custom_plane_prefix };
    var device_planes: []*const xplane_proto.XPlane = &.{};
    for (prefixes) |prefix| {
        device_planes = try findPlanesWithPrefix(allocator, xspace, prefix);
        if (device_planes.len != 0) break;
    }

    for (device_planes) |device_plane| {
        const device_visitor = try xplane_visitor.XPlaneVisitor.init(allocator, device_plane);
        const plane_id: u32 = @intCast(device_visitor.plane.id);
        try self.xplaneToTraceEvents(allocator, first_device_id + plane_id, &device_visitor);
    }

    // The (non-streaming) trace viewer has scalability issues; dropping
    // events keeps large traces loadable.
    if (max_events) |limit| self.capEvents(limit);
}
|
||||||
|
|
||||||
|
/// Appends a default-initialized event and returns a pointer to it.
/// The pointer is invalidated by the next append to the event list.
pub fn createEvent(self: *TraceContainer, allocator: std.mem.Allocator) !*TraceEvent {
    const new_index = self.events.items.len;
    try self.events.append(allocator, .{});
    return &self.events.items[new_index];
}
|
||||||
|
|
||||||
|
/// Keeps at most `max_count` events, preferring the ones that start earliest.
///
/// When the list already fits it is left untouched, which also means it is
/// not sorted. Otherwise the events are sorted by start time and the tail is
/// dropped.
pub fn capEvents(self: *TraceContainer, max_count: u64) void {
    if (self.events.items.len <= max_count) return;

    const by_start_time = struct {
        fn lessThan(_: void, a: TraceEvent, b: TraceEvent) bool {
            return a.timestamp_ps < b.timestamp_ps;
        }
    }.lessThan;

    // TODO: partial sort would improve performance.
    std.mem.sort(TraceEvent, self.events.items, {}, by_start_time);
    self.events.shrinkRetainingCapacity(max_count);
}
|
||||||
|
|
||||||
|
/// Writes the container to `writer` as a trace-viewer JSON document.
///
/// Output order: per device (ascending id) its process metadata followed by
/// per-resource (ascending id) thread metadata, then all events in list
/// order, then a trailing empty object that absorbs the last comma.
///
/// Devices, resources and each event's args are sorted in place before
/// being written, which is why `self` is taken by mutable pointer.
///
/// Every name and arg key/value is written with `std.json.encodeJsonString`
/// so that quotes, backslashes and control characters in profile data cannot
/// produce invalid JSON.
pub fn toJson(self: *TraceContainer, writer: std.io.AnyWriter) !void {
    // Sort contexts for array hash maps: compare entries by key.
    const ByNumericKey = struct {
        keys: []const u32,
        pub fn lessThan(ctx: @This(), lhs: usize, rhs: usize) bool {
            return ctx.keys[lhs] < ctx.keys[rhs];
        }
    };
    const ByStringKey = struct {
        keys: []const []const u8,
        pub fn lessThan(ctx: @This(), lhs: usize, rhs: usize) bool {
            return std.mem.lessThan(u8, ctx.keys[lhs], ctx.keys[rhs]);
        }
    };

    try writer.writeAll(
        \\{"displayTimeUnit":"ns","metadata":{"highres-ticks":true},"traceEvents":[
    );

    self.devices.sort(ByNumericKey{ .keys = self.devices.keys() });

    for (self.devices.keys(), self.devices.values()) |device_id, *device| {
        if (device.name.len != 0) {
            try writer.print(
                \\{{"ph":"M","pid":{d},"name":"process_name","args":{{"name":
            , .{device_id});
            try std.json.encodeJsonString(device.name, .{}, writer);
            try writer.writeAll("}},");
        }
        try writer.print(
            \\{{"ph":"M","pid":{d},"name":"process_sort_index","args":{{"sort_index":{d}}}}},
        , .{ device_id, device_id });

        device.resources.sort(ByNumericKey{ .keys = device.resources.keys() });

        for (device.resources.keys(), device.resources.values()) |resource_id, resource| {
            const resource_name = resource.name.getSlice();
            if (resource_name.len != 0) {
                try writer.print(
                    \\{{"ph":"M","pid":{d},"tid":{d},"name":"thread_name","args":{{"name":
                , .{ device_id, resource_id });
                try std.json.encodeJsonString(resource_name, .{}, writer);
                try writer.writeAll("}},");
            }
            // A sort_index of 0 means "unset"; fall back to the resource id.
            const sort_index = if (resource.sort_index != 0) resource.sort_index else resource_id;
            try writer.print(
                \\{{"ph":"M","pid":{d},"tid":{d},"name":"thread_sort_index","args":{{"sort_index":{d}}}}},
            , .{ device_id, resource_id, sort_index });
        }
    }

    for (self.events.items) |*event| {
        // Zero-length events are written with a duration of one picosecond.
        const duration_ps = @max(event.duration_ps, 1);
        try writer.print(
            \\{{"ph":"X","pid":{d},"tid":{d},"ts":{d:.17},"dur":{d:.17},"name":
        , .{
            event.device_id,
            event.resource_id,
            picoToMicro(event.timestamp_ps),
            picoToMicro(duration_ps),
        });
        try std.json.encodeJsonString(event.name, .{}, writer);

        if (event.args.count() != 0) {
            try writer.writeAll(
                \\,"args":{
            );
            event.args.sort(ByStringKey{ .keys = event.args.keys() });

            for (event.args.keys(), event.args.values(), 0..) |key, value, i| {
                if (i != 0) try writer.writeAll(",");
                try std.json.encodeJsonString(key, .{}, writer);
                try writer.writeAll(":");
                try std.json.encodeJsonString(value, .{}, writer);
            }
            try writer.writeAll("}");
        }
        try writer.writeAll("},");
    }
    // Trailing empty object so the comma after the last element stays valid.
    try writer.writeAll("{}]}");
}
|
||||||
|
};
|
||||||
297
pjrt/convert/xplane_schema.zig
Normal file
297
pjrt/convert/xplane_schema.zig
Normal file
@ -0,0 +1,297 @@
|
|||||||
|
const std = @import("std");
|
||||||
|
|
||||||
|
// `HostEventType` uses the unconventional casing/formatting
|
||||||
|
// so that the string representation of the enum used in the
|
||||||
|
// protobuf encoding directly maps to the zig enum tag name.
|
||||||
|
pub const HostEventType = enum(u16) {
|
||||||
|
unknown = 0,
|
||||||
|
TraceContext,
|
||||||
|
SessionRun,
|
||||||
|
FunctionRun,
|
||||||
|
RunGraph,
|
||||||
|
RunGraphDone,
|
||||||
|
TfOpRun,
|
||||||
|
EagerExecute,
|
||||||
|
@"ExecutorState::Process",
|
||||||
|
ExecutorDoneCallback,
|
||||||
|
MemoryAllocation,
|
||||||
|
MemoryDeallocation,
|
||||||
|
// Performance counter related.
|
||||||
|
RemotePerfCounter,
|
||||||
|
// tf.data captured function events.
|
||||||
|
@"InstantiatedCapturedFunction::Run",
|
||||||
|
@"InstantiatedCapturedFunction::RunWithBorrowedArgs",
|
||||||
|
@"InstantiatedCapturedFunction::RunInstantiated",
|
||||||
|
@"InstantiatedCapturedFunction::RunAsync",
|
||||||
|
// Loop ops.
|
||||||
|
ParallelForOp,
|
||||||
|
ForeverOp,
|
||||||
|
@"WhileOp-EvalCond",
|
||||||
|
@"WhileOp-StartBody",
|
||||||
|
ForOp,
|
||||||
|
// tf.data related.
|
||||||
|
@"IteratorGetNextOp::DoCompute",
|
||||||
|
@"IteratorGetNextAsOptionalOp::DoCompute",
|
||||||
|
Iterator,
|
||||||
|
@"Iterator::Prefetch::Generator",
|
||||||
|
PrefetchProduce,
|
||||||
|
PrefetchConsume,
|
||||||
|
ParallelInterleaveProduce,
|
||||||
|
ParallelInterleaveConsume,
|
||||||
|
ParallelInterleaveInitializeInput,
|
||||||
|
ParallelMapProduce,
|
||||||
|
ParallelMapConsume,
|
||||||
|
MapAndBatchProduce,
|
||||||
|
MapAndBatchConsume,
|
||||||
|
ParseExampleProduce,
|
||||||
|
ParseExampleConsume,
|
||||||
|
ParallelBatchProduce,
|
||||||
|
ParallelBatchConsume,
|
||||||
|
// Batching related.
|
||||||
|
BatchingSessionRun,
|
||||||
|
ProcessBatch,
|
||||||
|
BrainSessionRun,
|
||||||
|
ConcatInputTensors,
|
||||||
|
MergeInputTensors,
|
||||||
|
ScheduleWithoutSplit,
|
||||||
|
ScheduleWithSplit,
|
||||||
|
ScheduleWithEagerSplit,
|
||||||
|
@"ASBSQueue::Schedule",
|
||||||
|
// TFRT related.
|
||||||
|
TfrtModelRun,
|
||||||
|
// Serving related.
|
||||||
|
ServingModelRun,
|
||||||
|
// GPU related.
|
||||||
|
KernelLaunch,
|
||||||
|
KernelExecute,
|
||||||
|
// TPU related
|
||||||
|
EnqueueRequestLocked,
|
||||||
|
RunProgramRequest,
|
||||||
|
HostCallbackRequest,
|
||||||
|
TransferH2DRequest,
|
||||||
|
TransferPreprocessedH2DRequest,
|
||||||
|
TransferD2HRequest,
|
||||||
|
OnDeviceSendRequest,
|
||||||
|
OnDeviceRecvRequest,
|
||||||
|
OnDeviceSendRecvLocalRequest,
|
||||||
|
CustomWait,
|
||||||
|
OnDeviceSendRequestMulti,
|
||||||
|
OnDeviceRecvRequestMulti,
|
||||||
|
PjrtAsyncWait,
|
||||||
|
DoEnqueueProgram,
|
||||||
|
DoEnqueueContinuationProgram,
|
||||||
|
WriteHbm,
|
||||||
|
ReadHbm,
|
||||||
|
TpuExecuteOp,
|
||||||
|
CompleteCallbacks,
|
||||||
|
@"tpu::System::TransferToDevice=>IssueEvent",
|
||||||
|
@"tpu::System::TransferToDevice=>IssueEvent=>Done",
|
||||||
|
@"tpu::System::TransferFromDevice=>IssueEvent",
|
||||||
|
@"tpu::System::TransferFromDevice=>IssueEvent=>Done",
|
||||||
|
@"tpu::System::Execute",
|
||||||
|
@"TPUPartitionedCallOp-InitializeVarOnTPU",
|
||||||
|
@"TPUPartitionedCallOp-ExecuteRemote",
|
||||||
|
@"TPUPartitionedCallOp-ExecuteLocal",
|
||||||
|
Linearize,
|
||||||
|
Delinearize,
|
||||||
|
@"TransferBufferFromDevice-FastPath",
|
||||||
|
|
||||||
|
/// Maps an event name to the tag with exactly that name, or `.unknown` when
/// no tag matches.
pub fn fromString(event_name: []const u8) HostEventType {
    if (std.meta.stringToEnum(HostEventType, event_name)) |known| return known;
    return .unknown;
}
|
||||||
|
|
||||||
|
/// Reports whether `event_type` is one of the memory allocation or
/// produce/consume event types that are classified as internal.
pub fn isInternalEvent(event_type: HostEventType) bool {
    // TODO(b/162102421): Introduce a prefix for internal event names.
    switch (event_type) {
        .MemoryAllocation, .MemoryDeallocation => return true,
        .PrefetchProduce, .PrefetchConsume => return true,
        .ParallelInterleaveProduce,
        .ParallelInterleaveConsume,
        .ParallelInterleaveInitializeInput,
        => return true,
        .ParallelMapProduce, .ParallelMapConsume => return true,
        .MapAndBatchProduce, .MapAndBatchConsume => return true,
        .ParseExampleProduce, .ParseExampleConsume => return true,
        else => return false,
    }
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// `StatType` uses the unconventional casing/formatting
|
||||||
|
// so that the string representation of the enum used in the
|
||||||
|
// protobuf encoding directly maps to the zig enum tag name.
|
||||||
|
pub const StatType = enum(u16) {
|
||||||
|
unknown = 0,
|
||||||
|
// TraceMe arguments.
|
||||||
|
id,
|
||||||
|
device_ordinal,
|
||||||
|
chip_ordinal,
|
||||||
|
node_ordinal,
|
||||||
|
model_id,
|
||||||
|
queue_id,
|
||||||
|
queue_addr,
|
||||||
|
request_id,
|
||||||
|
run_id,
|
||||||
|
replica_id,
|
||||||
|
graph_type,
|
||||||
|
step_num,
|
||||||
|
iter_num,
|
||||||
|
index_on_host,
|
||||||
|
allocator_name,
|
||||||
|
bytes_reserved,
|
||||||
|
bytes_allocated,
|
||||||
|
bytes_available,
|
||||||
|
fragmentation,
|
||||||
|
peak_bytes_in_use,
|
||||||
|
requested_bytes,
|
||||||
|
allocation_bytes,
|
||||||
|
addr,
|
||||||
|
region_type,
|
||||||
|
data_type,
|
||||||
|
shape,
|
||||||
|
layout,
|
||||||
|
kpi_name,
|
||||||
|
kpi_value,
|
||||||
|
element_id,
|
||||||
|
parent_id,
|
||||||
|
core_type,
|
||||||
|
// XPlane semantics related.
|
||||||
|
_pt,
|
||||||
|
_ct,
|
||||||
|
_p,
|
||||||
|
_c,
|
||||||
|
_r,
|
||||||
|
_a,
|
||||||
|
// Device trace arguments.
|
||||||
|
device_id,
|
||||||
|
device_type_string,
|
||||||
|
context_id,
|
||||||
|
correlation_id,
|
||||||
|
// TODO(b/176137043): These "details" should differentiate between activity
|
||||||
|
// and API event sources.
|
||||||
|
memcpy_details,
|
||||||
|
memalloc_details,
|
||||||
|
MemFree_details,
|
||||||
|
Memset_details,
|
||||||
|
MemoryResidency_details,
|
||||||
|
nvtx_range,
|
||||||
|
kernel_details,
|
||||||
|
stream,
|
||||||
|
// Stats added when processing traces.
|
||||||
|
group_id,
|
||||||
|
flow,
|
||||||
|
step_name,
|
||||||
|
tf_op,
|
||||||
|
hlo_op,
|
||||||
|
deduplicated_name,
|
||||||
|
hlo_category,
|
||||||
|
hlo_module,
|
||||||
|
program_id,
|
||||||
|
equation,
|
||||||
|
is_eager,
|
||||||
|
is_func,
|
||||||
|
tf_function_call,
|
||||||
|
tracing_count,
|
||||||
|
flops,
|
||||||
|
model_flops,
|
||||||
|
bytes_accessed,
|
||||||
|
memory_access_breakdown,
|
||||||
|
source,
|
||||||
|
model_name,
|
||||||
|
model_version,
|
||||||
|
bytes_transferred,
|
||||||
|
queue,
|
||||||
|
dcn_collective_info,
|
||||||
|
// Performance counter related.
|
||||||
|
@"Raw Value",
|
||||||
|
@"Scaled Value",
|
||||||
|
@"Thread Id",
|
||||||
|
matrix_unit_utilization_percent,
|
||||||
|
// XLA metadata map related.
|
||||||
|
@"Hlo Proto",
|
||||||
|
// Device capability related.
|
||||||
|
clock_rate,
|
||||||
|
// For GPU, this is the number of SMs.
|
||||||
|
core_count,
|
||||||
|
memory_bandwidth,
|
||||||
|
memory_size,
|
||||||
|
compute_cap_major,
|
||||||
|
compute_cap_minor,
|
||||||
|
peak_teraflops_per_second,
|
||||||
|
peak_hbm_bw_gigabytes_per_second,
|
||||||
|
peak_sram_rd_bw_gigabytes_per_second,
|
||||||
|
peak_sram_wr_bw_gigabytes_per_second,
|
||||||
|
device_vendor,
|
||||||
|
// Batching related.
|
||||||
|
batch_size_after_padding,
|
||||||
|
padding_amount,
|
||||||
|
batching_input_task_size,
|
||||||
|
// GPU occupancy metrics
|
||||||
|
theoretical_occupancy_pct,
|
||||||
|
occupancy_min_grid_size,
|
||||||
|
occupancy_suggested_block_size,
|
||||||
|
// Aggregated Stats
|
||||||
|
self_duration_ps,
|
||||||
|
min_duration_ps,
|
||||||
|
total_profile_duration_ps,
|
||||||
|
max_iteration_num,
|
||||||
|
device_type,
|
||||||
|
uses_megacore,
|
||||||
|
symbol_id,
|
||||||
|
tf_op_name,
|
||||||
|
dma_stall_duration_ps,
|
||||||
|
key,
|
||||||
|
payload_size_bytes,
|
||||||
|
duration_us,
|
||||||
|
buffer_size,
|
||||||
|
transfers,
|
||||||
|
// Dcn message Stats
|
||||||
|
dcn_label,
|
||||||
|
dcn_source_slice_id,
|
||||||
|
dcn_source_per_slice_device_id,
|
||||||
|
dcn_destination_slice_id,
|
||||||
|
dcn_destination_per_slice_device_id,
|
||||||
|
dcn_chunk,
|
||||||
|
dcn_loop_index,
|
||||||
|
@"EdgeTPU Model information",
|
||||||
|
@"EdgeTPU Model Profile information",
|
||||||
|
@"EdgeTPU MLIR",
|
||||||
|
dropped_traces,
|
||||||
|
cuda_graph_id,
|
||||||
|
// Many events have `.cuda_graph_id`, such as graph sub events when tracing is in
|
||||||
|
// node level. Yet `.cuda_graph_exec_id` is used only for CudaGraphExecution events
|
||||||
|
// on the GPU device when tracing is in graph level.
|
||||||
|
cuda_graph_exec_id,
|
||||||
|
cuda_graph_orig_id,
|
||||||
|
step_idle_time_ps,
|
||||||
|
gpu_device_name,
|
||||||
|
source_stack,
|
||||||
|
device_offset_ps,
|
||||||
|
device_duration_ps,
|
||||||
|
|
||||||
|
/// Maps a stat name to the tag with exactly that name, or `.unknown` when no
/// tag matches.
pub fn fromString(stat_name: []const u8) StatType {
    if (std.meta.stringToEnum(StatType, stat_name)) |known| return known;
    return .unknown;
}
|
||||||
|
|
||||||
|
/// Reports whether `stat_type` is one of the stat types classified as
/// internal.
pub fn isInternalStat(stat_type: StatType) bool {
    switch (stat_type) {
        // XPlane semantics stats.
        ._pt, ._p, ._ct, ._c, ._r => return true,
        .kernel_details => return true,
        .flops, .bytes_accessed => return true,
        .program_id, .symbol_id => return true,
        else => return false,
    }
}
|
||||||
|
};
|
||||||
50
pjrt/convert/xplane_visitor.zig
Normal file
50
pjrt/convert/xplane_visitor.zig
Normal file
@ -0,0 +1,50 @@
|
|||||||
|
const std = @import("std");
|
||||||
|
const xplane_proto = @import("//tsl:xplane_proto");
|
||||||
|
const xplane_schema = @import("xplane_schema.zig");
|
||||||
|
|
||||||
|
/// Read-only view over an `XPlane` with id-indexed lookup tables for its
/// event and stat metadata.
///
/// The visitor borrows `plane`; the plane must outlive it.
/// NOTE(review): there is no `deinit` for the two maps — presumably callers
/// pass an arena allocator to `init`; confirm.
pub const XPlaneVisitor = struct {
    plane: *const xplane_proto.XPlane,
    /// Event metadata of `plane`, keyed by metadata id.
    event_metadata_by_id: std.AutoHashMapUnmanaged(i64, *const xplane_proto.XEventMetadata) = .{},
    /// Stat metadata of `plane`, keyed by metadata id.
    stat_metadata_by_id: std.AutoHashMapUnmanaged(i64, *const xplane_proto.XStatMetadata) = .{},

    /// Builds the lookup tables for `plane` using `allocator`.
    ///
    /// Map entries that carry no value are skipped instead of being
    /// unwrapped, so an incomplete profile does not panic here; lookups for
    /// such ids then behave like lookups for unknown ids.
    pub fn init(
        allocator: std.mem.Allocator,
        plane: *const xplane_proto.XPlane,
    ) !XPlaneVisitor {
        var res: XPlaneVisitor = .{ .plane = plane };

        // build event metadata map
        for (plane.event_metadata.items) |*entry| {
            if (entry.value) |*metadata| {
                try res.event_metadata_by_id.put(allocator, entry.key, metadata);
            }
        }

        // build stat metadata map
        for (plane.stat_metadata.items) |*entry| {
            if (entry.value) |*metadata| {
                try res.stat_metadata_by_id.put(allocator, entry.key, metadata);
            }
        }

        return res;
    }

    /// Returns the event type derived from the name of the event metadata
    /// with id `event_metadata_id`, or `.unknown` when the id is not known.
    pub fn getEventType(self: *const XPlaneVisitor, event_metadata_id: i64) xplane_schema.HostEventType {
        const metadata = self.event_metadata_by_id.get(event_metadata_id) orelse return .unknown;
        return xplane_schema.HostEventType.fromString(metadata.name.getSlice());
    }

    /// Returns the name of the underlying plane.
    pub fn name(self: *const XPlaneVisitor) []const u8 {
        return self.plane.name.getSlice();
    }

    /// Returns the name of the stat metadata with id `stat_metadata_id`, or
    /// an empty slice when the id is not known.
    pub fn getStatMetadataName(self: *const XPlaneVisitor, stat_metadata_id: i64) []const u8 {
        const metadata = self.stat_metadata_by_id.get(stat_metadata_id) orelse return &[_]u8{};
        return metadata.name.getSlice();
    }

    /// Returns the stat type derived from the name of the stat metadata with
    /// id `stat_metadata_id`, or `.unknown` when the id is not known.
    pub fn getStatType(self: *const XPlaneVisitor, stat_metadata_id: i64) xplane_schema.StatType {
        const metadata = self.stat_metadata_by_id.get(stat_metadata_id) orelse return .unknown;
        return xplane_schema.StatType.fromString(metadata.name.getSlice());
    }
};
|
||||||
45
pjrt/xspace_to_json.zig
Normal file
45
pjrt/xspace_to_json.zig
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
const flags = @import("tigerbeetle/flags");
|
||||||
|
const std = @import("std");
|
||||||
|
|
||||||
|
const TraceContainer = @import("convert/trace_container.zig").TraceContainer;
|
||||||
|
|
||||||
|
/// Command-line arguments of the converter, parsed by `flags.parse`.
const CliArgs = struct {
    // The usage line names this tool; it previously said "llama", left over
    // from another example binary.
    pub const help =
        \\ xspace_to_json --path=path_to_profiling_data
    ;
    /// Path of the serialized profiling protobuf to convert.
    path: []const u8,
    /// Optional upper bound on the number of events written to the JSON.
    max_events: ?usize = null,
};
|
||||||
|
|
||||||
|
/// Converts the profiling protobuf given by `--path` to trace-viewer JSON.
///
/// The output is written next to the input, with the input's extension
/// replaced by `.json`. Both absolute and relative input paths are accepted;
/// relative paths are resolved against the current working directory.
///
/// Returns `error.EmptyBuffer` when the input file is empty, and propagates
/// any file-system, allocation or decoding error.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{ .thread_safe = true }){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    var args = std.process.args();
    const cli_args = flags.parse(&args, CliArgs);

    // `cwd().openFile` handles absolute and relative paths alike, whereas
    // `openFileAbsolute` asserts that the path is absolute.
    const cwd = std.fs.cwd();
    const input_file = try cwd.openFile(cli_args.path, .{});
    defer input_file.close();

    const pb_buffer = try input_file.readToEndAlloc(allocator, (try input_file.stat()).size);
    defer allocator.free(pb_buffer);
    if (pb_buffer.len == 0) return error.EmptyBuffer;

    var converter = try TraceContainer.init(allocator, pb_buffer, cli_args.max_events);
    defer converter.deinit();

    // Build "<dir>/<stem>.json" on the heap so there is no fixed path-length
    // limit. A path without a directory part maps to the current directory,
    // not to the file-system root.
    const output_name = try std.fmt.allocPrint(allocator, "{s}.json", .{std.fs.path.stem(cli_args.path)});
    defer allocator.free(output_name);
    const output_dir = std.fs.path.dirname(cli_args.path) orelse ".";
    const output_path = try std.fs.path.join(allocator, &[_][]const u8{ output_dir, output_name });
    defer allocator.free(output_path);

    const output_file = try cwd.createFile(output_path, .{});
    defer output_file.close();

    // `toJson` issues many small writes; buffer them so each one is not a
    // separate write to the file.
    var buffered = std.io.bufferedWriter(output_file.writer());
    const json_writer = buffered.writer();
    try converter.toJson(json_writer.any());
    try buffered.flush();

    std.debug.print("Wrote JSON to {s}\n", .{output_path});
}
|
||||||
Loading…
Reference in New Issue
Block a user