2023-09-21 11:15:50 +00:00
|
|
|
//! Conversion utilities between different floating-point formats.
|
2023-01-02 14:28:25 +00:00
|
|
|
const std = @import("std");
|
|
|
|
|
|
2023-01-23 16:28:19 +00:00
|
|
|
test {
    // Touch every public declaration of this file so the tests nested in them get compiled.
    const This = @This();
    std.testing.refAllDecls(This);
}
|
|
|
|
|
|
2024-07-23 17:43:43 +00:00
|
|
|
/// Builds the conversion helpers shared by the packed-struct float formats of this file.
///
/// `Float` must be a packed struct whose fields are named, in order, `mantissa`, `exponent`
/// and `sign`; anything else is rejected at compile time.
/// The exponent bias is derived from the exponent width only: `2^(exponent_bits - 1) - 1`.
///
/// `fromF32` / `toF32` look at the following optional declarations of `Float`:
/// * `inf` and `minus_inf`: when present, the all-ones exponent is treated as reserved
///   (mantissa 0 is infinity, anything else is NaN).
/// * `nan` and `isNan`: used for formats that do not declare `inf`.
fn FloatHelpers(Float: type) type {
    const info = @typeInfo(Float);
    const err_msg = "FloatHelpers expect a packed struct { mantissa: uXX, exponent: uXX, sign: u1}";
    if (info != .@"struct" or info.@"struct".backing_integer == null) {
        @compileError(err_msg);
    }
    comptime {
        for (info.@"struct".fields, &.{ "mantissa", "exponent", "sign" }) |field, expected_name| {
            if (!std.mem.eql(u8, field.name, expected_name))
                @compileError(err_msg);
        }
    }

    return struct {
        const Mantissa = @FieldType(Float, "mantissa");
        const Exponent = @FieldType(Float, "exponent");
        const F32Mantissa = @FieldType(Float32, "mantissa");
        const Bits = std.meta.Int(.unsigned, @bitSizeOf(Float));

        const mantissa_bits: u8 = @typeInfo(Mantissa).int.bits;
        const exponent_bits: u8 = @typeInfo(Exponent).int.bits;
        const f32_mantissa_bits: u8 = @typeInfo(F32Mantissa).int.bits;
        const exp_bias: i16 = std.math.maxInt(std.meta.Int(.unsigned, exponent_bits - 1));
        /// Difference between the f32 bias and the bias of `Float`.
        const exp_off: u8 = FloatHelpers(Float32).exp_bias - exp_bias;

        /// Number of low f32 mantissa bits that do not fit in `Float`.
        const dropped_bits: u8 = f32_mantissa_bits - mantissa_bits;
        /// Half of the last kept bit, added before dropping bits (ties round away from zero).
        const round_half: u32 = if (dropped_bits == 0) 0 else @as(u32, 1) << (dropped_bits - 1);
        const has_inf = @hasDecl(Float, "inf");
        /// Largest biased exponent that still encodes a finite value.
        const max_finite_exponent: u32 = if (has_inf)
            std.math.maxInt(Exponent) - 1
        else
            std.math.maxInt(Exponent);
        /// True for formats whose `nan` is the "negative zero" bit pattern.
        const negative_zero_is_nan: bool = if (!has_inf and @hasDecl(Float, "nan"))
            @as(Bits, @bitCast(Float.nan)) == @as(Bits, @bitCast(Float{ .sign = 1, .exponent = 0, .mantissa = 0 }))
        else
            false;

        pub const zero: Float = .{ .sign = 0, .exponent = 0, .mantissa = 0 };

        /// Returns `x` with its sign bit flipped.
        pub fn neg(x: Float) Float {
            return .{
                .sign = x.sign ^ 1,
                .exponent = x.exponent,
                .mantissa = x.mantissa,
            };
        }

        /// Lossy conversion from f32, similar to @floatCast.
        ///
        /// Values too large for `Float` become `inf`/`minus_inf` when `Float` declares them,
        /// `nan` otherwise. NaN inputs stay NaN. Values too small become subnormals or zero.
        pub fn fromF32(f: f32) Float {
            @setRuntimeSafety(false);

            const vf32: Float32 = @bitCast(f);

            if (vf32.exponent == std.math.maxInt(u8) and vf32.mantissa != 0) {
                @branchHint(.unlikely);
                if (has_inf) {
                    // Reserved exponent with a non-zero mantissa is a NaN.
                    return .{
                        .sign = vf32.sign,
                        .exponent = std.math.maxInt(Exponent),
                        .mantissa = std.math.maxInt(Mantissa),
                    };
                } else {
                    return Float.nan;
                }
            }

            const exponent: i16 = @as(i16, vf32.exponent) - exp_off;

            // Exponent stacked on top of the 23-bit mantissa, so that a rounding carry out of
            // the mantissa naturally increments the exponent.
            var stacked: u32 = 0;
            if (exponent > 0) {
                stacked = (@as(u32, @intCast(exponent)) << f32_mantissa_bits) | vf32.mantissa;
            } else {
                // Subnormal in `Float`: make the leading one explicit and divide the mantissa
                // proportionally to the exponent underflow.
                const shift: u32 = @intCast(1 - @as(i32, exponent));
                if (shift <= f32_mantissa_bits + 1) {
                    const implicit_bit: u32 = if (vf32.exponent == 0) 0 else @as(u32, 1) << f32_mantissa_bits;
                    stacked = (implicit_bit | vf32.mantissa) >> @as(u5, @intCast(shift));
                }
                // Otherwise every bit is shifted out and the value flushes to zero.
            }

            const rounded: u32 = (stacked + round_half) >> dropped_bits;
            const new_exponent: u32 = rounded >> mantissa_bits;
            const new_mantissa: u32 = rounded & std.math.maxInt(Mantissa);

            if (new_exponent > max_finite_exponent) {
                @branchHint(.unlikely);
                if (has_inf) {
                    return if (vf32.sign == 0) Float.inf else Float.minus_inf;
                } else {
                    return Float.nan;
                }
            }

            // Don't let a negative value that rounds to zero alias the NaN encoding.
            if (negative_zero_is_nan and rounded == 0) return zero;

            return .{
                .sign = vf32.sign,
                .exponent = @intCast(new_exponent),
                .mantissa = @intCast(new_mantissa),
            };
        }

        /// Lossless conversion to f32.
        pub fn toF32(x: Float) f32 {
            @setRuntimeSafety(false);

            if (isInf(x)) {
                @branchHint(.unlikely);
                return if (x.sign == 0) std.math.inf(f32) else -std.math.inf(f32);
            }
            if (isNanPattern(x)) {
                @branchHint(.unlikely);
                return std.math.nan(f32);
            }

            if (x.exponent > 0) {
                const normal: Float32 = .{
                    .sign = x.sign,
                    .exponent = @as(u8, x.exponent) + exp_off,
                    .mantissa = @as(F32Mantissa, x.mantissa) << dropped_bits,
                };
                return @bitCast(normal);
            }

            if (x.mantissa == 0) {
                const signed_zero: Float32 = .{ .sign = x.sign, .exponent = 0, .mantissa = 0 };
                return @bitCast(signed_zero);
            }

            // Subnormal: normalize by moving the leading one just above the f32 mantissa
            // (where it is dropped by the truncation) and lower the exponent accordingly.
            const lz: u8 = @clz(x.mantissa);
            const widened: u32 = @as(u32, x.mantissa) << @as(u5, @intCast(dropped_bits + lz + 1));
            const subnormal: Float32 = .{
                .sign = x.sign,
                .exponent = exp_off - lz,
                .mantissa = @truncate(widened),
            };
            return @bitCast(subnormal);
        }

        /// NaN test that never touches `Float.nan` for formats declaring `inf`.
        fn isNanPattern(x: Float) bool {
            return if (has_inf)
                x.exponent == std.math.maxInt(Exponent) and x.mantissa != 0
            else if (@hasDecl(Float, "isNan"))
                x.isNan()
            else
                false;
        }

        /// Prints the raw fields for binary/octal/hex modes, the f32 value otherwise.
        pub fn formatNumber(x: Float, writer: *std.io.Writer, n: std.fmt.Number) std.io.Writer.Error!void {
            switch (n.mode) {
                .binary, .octal, .hex => try writer.print("{{ .sign={}, .exp={}, .mantissa={} }}", .{ x.sign, x.exponent, x.mantissa }),
                else => try writer.printFloat(x.toF32(), n),
            }
        }
    };
}
|
|
|
|
|
|
2024-07-23 17:43:43 +00:00
|
|
|
/// Bit-level view of an `f32`: 23 mantissa bits, 8 exponent bits, 1 sign bit.
pub const Float32 = packed struct(u32) {
    mantissa: u23,
    exponent: u8,
    sign: u1,

    const Self = @This();
    const Helpers = FloatHelpers(Self);

    pub const zero = Helpers.zero;
    pub const neg = Helpers.neg;
    pub const fromF32 = Helpers.fromF32;
    pub const toF32 = Helpers.toF32;
    pub const formatNumber = Helpers.formatNumber;

    /// Positive infinity: all-ones exponent, empty mantissa.
    pub const inf: Self = .{
        .sign = 0,
        .exponent = std.math.maxInt(u8),
        .mantissa = 0,
    };
    pub const minus_inf = neg(inf);
};
|
|
|
|
|
|
|
|
|
|
/// Exponent bias of `f32`, as computed by `FloatHelpers`.
const f32_exp_bias = FloatHelpers(Float32).exp_bias;
|
|
|
|
|
|
|
|
|
|
/// Bit-level view of an `f64`: 52 mantissa bits, 11 exponent bits, 1 sign bit.
///
/// NOTE(review): `FloatHelpers` stores the bias difference with f32 in a `u8` and assumes the
/// mantissa is not wider than the f32 one; `fromF32`/`toF32` presumably cannot be instantiated
/// for this type — confirm before relying on them.
pub const Float64 = packed struct(u64) {
    mantissa: u52,
    exponent: u11,
    sign: u1,

    const Self = @This();
    const Helpers = FloatHelpers(Self);

    pub const formatNumber = Helpers.formatNumber;
    pub const toF32 = Helpers.toF32;
    pub const fromF32 = Helpers.fromF32;
    pub const neg = Helpers.neg;
    pub const zero = Helpers.zero;
};
|
|
|
|
|
|
|
|
|
|
/// 8-bit float with 4 exponent bits and 3 mantissa bits, whose only NaN is the bit pattern
/// with the sign bit set and every other bit cleared.
///
/// NOTE(review): `FloatHelpers` derives the exponent bias from the exponent width alone, so
/// the `B11` of the name is presumably not reflected by `fromF32`/`toF32` — confirm.
pub const Float8E4M3B11FNUZ = packed struct(u8) {
    mantissa: u3,
    exponent: u4,
    sign: u1,

    const Self = @This();
    const Helpers = FloatHelpers(Self);

    pub const zero = Helpers.zero;
    pub const neg = Helpers.neg;
    pub const fromF32 = Helpers.fromF32;
    pub const toF32 = Helpers.toF32;
    pub const formatNumber = Helpers.formatNumber;

    pub const nan: Self = .{ .sign = 1, .exponent = 0, .mantissa = 0 };

    /// True only for the single NaN encoding of this format.
    pub fn isNan(self: Self) bool {
        const raw: u8 = @bitCast(self);
        return raw == @as(u8, @bitCast(nan));
    }
};
|
|
|
|
|
|
|
|
|
|
/// 8-bit float with 4 exponent bits and 3 mantissa bits and no infinity: NaN is the pattern
/// with all exponent and mantissa bits set, whatever the sign.
pub const Float8E4M3FN = packed struct(u8) {
    mantissa: u3,
    exponent: u4,
    sign: u1,

    const Self = @This();
    const Helpers = FloatHelpers(Self);

    pub const zero = Helpers.zero;
    pub const neg = Helpers.neg;
    pub const fromF32 = Helpers.fromF32;
    pub const toF32 = Helpers.toF32;
    pub const formatNumber = Helpers.formatNumber;

    pub const nan: Self = .{
        .sign = 0,
        .exponent = std.math.maxInt(u4),
        .mantissa = std.math.maxInt(u3),
    };

    /// Compares everything but the sign bit against `nan`.
    pub fn isNan(self: Self) bool {
        const sign_mask: u8 = 0x80;
        const magnitude = @as(u8, @bitCast(self)) & ~sign_mask;
        return magnitude == @as(u8, @bitCast(nan));
    }
};
|
|
|
|
|
|
|
|
|
|
/// 8-bit float with 4 exponent bits and 3 mantissa bits, whose only NaN is the bit pattern
/// with the sign bit set and every other bit cleared.
pub const Float8E4M3FNUZ = packed struct(u8) {
    mantissa: u3,
    exponent: u4,
    sign: u1,

    const Self = @This();
    const Helpers = FloatHelpers(Self);

    pub const zero = Helpers.zero;
    pub const neg = Helpers.neg;
    pub const fromF32 = Helpers.fromF32;
    pub const toF32 = Helpers.toF32;
    pub const formatNumber = Helpers.formatNumber;

    pub const nan: Self = .{ .sign = 1, .exponent = 0, .mantissa = 0 };

    /// True only for the single NaN encoding of this format.
    pub fn isNan(self: Self) bool {
        const raw: u8 = @bitCast(self);
        return raw == @as(u8, @bitCast(nan));
    }
};
|
2023-01-02 14:28:25 +00:00
|
|
|
|
2023-09-21 11:15:50 +00:00
|
|
|
test "Float8E4" {
    // Values expected to round-trip exactly through the 4-exponent-bit formats,
    // and values only expected to come back within the default tolerance.
    const exact_values = [_]f32{ 0, 1.0, -2, 1.0 / 64.0, -128, -1.125 / 64.0 };
    const approx_values = [_]f32{ 3.02344107628, 1.0 / 128.0, 1.0 / 512.0 };
    const test_case_e4: TestCase = .{
        .lossless = &exact_values,
        .lossy = &approx_values,
    };

    const float8_e4_types = .{ Float8E4M3B11FNUZ, Float8E4M3FN, Float8E4M3FNUZ };
    inline for (float8_e4_types) |Float8T| {
        try testCustomFloat(Float8T, test_case_e4);

        // Too small to be represented: flushes to zero.
        try std.testing.expectEqual(0.0, Float8T.fromF32(1.0 / 2048.0).toF32());

        if (@hasDecl(Float8T, "inf")) {
            try std.testing.expectEqual(Float8T.inf, Float8T.fromF32(128.0));
            try std.testing.expectEqual(Float8T.inf.neg(), Float8T.fromF32(-128.0));
        }
    }
}
|
|
|
|
|
|
2024-07-23 17:43:43 +00:00
|
|
|
/// 8-bit float with 5 exponent bits and 2 mantissa bits. The all-ones exponent is reserved:
/// an empty mantissa is an infinity, anything else is a NaN.
pub const Float8E5M2 = packed struct(u8) {
    mantissa: u2,
    exponent: u5,
    sign: u1,

    const Self = @This();
    const Helpers = FloatHelpers(Self);

    pub const zero = Helpers.zero;
    pub const neg = Helpers.neg;
    pub const fromF32 = Helpers.fromF32;
    pub const toF32 = Helpers.toF32;
    pub const formatNumber = Helpers.formatNumber;

    pub const inf: Self = .{ .sign = 0, .exponent = std.math.maxInt(u5), .mantissa = 0 };
    pub const minus_inf: Self = .neg(inf);
    pub const nan: Self = .{ .sign = 0, .exponent = std.math.maxInt(u5), .mantissa = 1 };

    /// True when the exponent is the reserved one and the mantissa is not empty.
    pub fn isNan(self: Self) bool {
        if (self.mantissa == 0) return false;
        return self.exponent == nan.exponent;
    }
};
|
|
|
|
|
|
|
|
|
|
/// 8-bit float with 5 exponent bits and 2 mantissa bits, whose only NaN is the bit pattern
/// with the sign bit set and every other bit cleared.
pub const Float8E5M2FNUZ = packed struct(u8) {
    mantissa: u2,
    exponent: u5,
    sign: u1,

    const Self = @This();
    const Helpers = FloatHelpers(Self);

    pub const zero = Helpers.zero;
    pub const neg = Helpers.neg;
    pub const fromF32 = Helpers.fromF32;
    pub const toF32 = Helpers.toF32;
    pub const formatNumber = Helpers.formatNumber;

    pub const nan: Self = .{ .sign = 1, .exponent = 0, .mantissa = 0 };

    /// True only for the single NaN encoding of this format.
    pub fn isNan(self: Self) bool {
        const raw: u8 = @bitCast(self);
        return raw == @as(u8, @bitCast(nan));
    }
};
|
2023-01-02 14:28:25 +00:00
|
|
|
|
2023-09-21 11:15:50 +00:00
|
|
|
test "Float8E5" {
    // Same round-trip checks as for the E4 formats, with values suited to 5 exponent bits.
    const exact_values = [_]f32{ 0, 1.0, -2, 1.0 / 128.0, -128 };
    const approx_values = [_]f32{3.02344107628};
    const test_case_e5: TestCase = .{
        .lossless = &exact_values,
        .lossy = &approx_values,
    };

    const float8_e5_types = .{ Float8E5M2, Float8E5M2FNUZ };
    inline for (float8_e5_types) |Float8T| {
        try testCustomFloat(Float8T, test_case_e5);
    }
}
|
|
|
|
|
|
2024-07-23 17:43:43 +00:00
|
|
|
/// 16-bit "brain float": same sign and exponent layout as `f32`, with only the 7 upper
/// mantissa bits. A BFloat16 is therefore exactly the upper half of the `f32` bit pattern.
pub const BFloat16 = packed struct(u16) {
    mantissa: u7,
    exponent: u8,
    sign: u1,

    pub const nan: BFloat16 = .{ .sign = 0, .exponent = std.math.maxInt(u8), .mantissa = 1 };

    /// True when the exponent is all ones and the mantissa is not empty.
    pub fn isNan(self: BFloat16) bool {
        return allBitsOne(self.exponent) and self.mantissa != 0;
    }

    pub const inf: BFloat16 = .{
        .sign = 0,
        .exponent = std.math.maxInt(u8),
        .mantissa = 0,
    };

    pub const minus_inf: BFloat16 = .neg(inf);

    // Specialized versions of to/from F32. Since BFloat16 has the same exponent range as F32,
    // no overflow/underflow of the exponent can happen, simplifying the conversion logic.

    /// Lossless conversion to f32: the 16 bits become the upper half of the f32,
    /// the lower half is zero. Written with shifts so it does not depend on endianness.
    pub fn toF32(self: BFloat16) f32 {
        const upper: u16 = @bitCast(self);
        return @bitCast(@as(u32, upper) << 16);
    }

    /// Lossy conversion from f32: keeps the upper 16 bits, rounding the dropped half
    /// (ties go away from zero). NaN inputs stay NaN and keep their sign.
    pub fn fromF32(float32: f32) BFloat16 {
        const bits: u32 = @bitCast(float32);

        if (std.math.isNan(float32)) {
            // Rounding a NaN payload could carry into the exponent/sign (or overflow the u32).
            // Truncate instead, and set the top mantissa bit so the result cannot become inf.
            const nan_upper: u16 = @truncate(bits >> 16);
            return @bitCast(nan_upper | 0x0040);
        }

        // Round up if needed. The largest non-NaN pattern is 0xFF80_0000, so this cannot overflow.
        const rounded_upper: u16 = @truncate((bits + 0x8000) >> 16);
        return @bitCast(rounded_upper);
    }

    const Helpers = FloatHelpers(@This());
    pub const zero = Helpers.zero;
    pub const neg = Helpers.neg;
    pub const formatNumber = Helpers.formatNumber;
};
|
2023-01-02 14:28:25 +00:00
|
|
|
|
|
|
|
|
test BFloat16 {
    // From https://en.wikipedia.org/wiki/Bfloat16_floating-point_format#Examples
    // `expectEqual` takes the expected value first, then the actual one.
    try std.testing.expectEqual(BFloat16{ .sign = 0, .exponent = 0, .mantissa = 0 }, BFloat16.fromF32(0));
    try std.testing.expectEqual(BFloat16{ .sign = 1, .exponent = 127 + 1, .mantissa = 0 }, BFloat16.fromF32(-2));
    try std.testing.expectEqual(BFloat16{ .sign = 0, .exponent = 127 + 1, .mantissa = 66 }, BFloat16.fromF32(3.02344107628));
    try std.testing.expectEqual(BFloat16{ .sign = 0, .exponent = 127 - 7, .mantissa = 0 }, BFloat16.fromF32(1.0 / 128.0));
    try std.testing.expectEqual([_]u8{ 0x80, 0xff }, std.mem.toBytes(BFloat16.inf.neg()));
    try std.testing.expectEqual(BFloat16.inf, BFloat16.fromF32(std.math.inf(f32)));

    try testCustomFloat(BFloat16, .{
        .lossless = &[_]f32{ 0, -2, 1.0 / 128.0, -1e64, std.math.inf(f32) },
        .lossy = &[_]f32{3.02344107628},
    });
}
|
|
|
|
|
|
2025-09-19 12:13:32 +00:00
|
|
|
/// 8-bit float with 4 exponent bits and 3 mantissa bits. The all-ones exponent is reserved:
/// an empty mantissa is an infinity (see `inf`), anything else is a NaN.
pub const Float8E4M3 = packed struct(u8) {
    mantissa: u3,
    exponent: u4,
    sign: u1,

    // The operand of @bitCast needs a sized integer type, hence the explicit `u8`.
    pub const nan: Float8E4M3 = @bitCast(@as(u8, 0xFF));

    /// True for every NaN encoding: reserved exponent with a non-empty mantissa, any sign.
    pub fn isNan(self: Float8E4M3) bool {
        return self.exponent == nan.exponent and self.mantissa != 0;
    }

    pub const inf: Float8E4M3 = .{
        .sign = 0,
        .exponent = std.math.maxInt(u4),
        .mantissa = 0,
    };

    pub const minus_inf = neg(inf);

    const Helpers = FloatHelpers(@This());
    pub const zero = Helpers.zero;
    pub const neg = Helpers.neg;
    pub const fromF32 = Helpers.fromF32;
    pub const toF32 = Helpers.toF32;
    pub const formatNumber = Helpers.formatNumber;
};
|
|
|
|
|
|
|
|
|
|
/// 8-bit float with 3 exponent bits and 4 mantissa bits. The all-ones exponent is reserved:
/// an empty mantissa is an infinity (see `inf`), anything else is a NaN.
pub const Float8E3M4 = packed struct(u8) {
    mantissa: u4,
    exponent: u3,
    sign: u1,

    // The operand of @bitCast needs a sized integer type, hence the explicit `u8`.
    pub const nan: Float8E3M4 = @bitCast(@as(u8, 0xFF));

    /// True for every NaN encoding: reserved exponent with a non-empty mantissa, any sign.
    pub fn isNan(self: Float8E3M4) bool {
        return self.exponent == nan.exponent and self.mantissa != 0;
    }

    pub const inf: Float8E3M4 = .{
        .sign = 0,
        .exponent = std.math.maxInt(u3),
        .mantissa = 0,
    };

    pub const minus_inf = neg(inf);

    const Helpers = FloatHelpers(@This());
    pub const zero = Helpers.zero;
    pub const neg = Helpers.neg;
    pub const fromF32 = Helpers.fromF32;
    pub const toF32 = Helpers.toF32;
    pub const formatNumber = Helpers.formatNumber;
};
|
|
|
|
|
|
|
|
|
|
/// Exponent-only 8-bit format: the value is just the 8 exponent bits of an `f32`.
/// The mantissa and sign fields are zero-width and only exist to satisfy `FloatHelpers`.
pub const Float8E8M0 = packed struct(u8) {
    mantissa: u0 = 0,
    exponent: u8,
    sign: u0 = 0,

    const Helpers = FloatHelpers(@This());
    pub const formatNumber = Helpers.formatNumber;

    /// f32 value returned by `toF32` for a zero exponent:
    /// zero f32 exponent with only the top mantissa bit set.
    pub const min_scale: f32 = @bitCast(Float32{ .sign = 0, .exponent = 0, .mantissa = 0b1 << 22 });

    /// Lossy conversion from f32, similar to @floatCast.
    /// Only the exponent bits of `f` are kept; its sign and mantissa are ignored.
    pub fn fromF32(f: f32) Float8E8M0 {
        const parts: Float32 = @bitCast(f);
        return .{ .exponent = parts.exponent };
    }

    /// Lossless conversion to f32.
    pub fn toF32(x: Float8E8M0) f32 {
        if (x.exponent != 0) {
            const scale: Float32 = .{ .sign = 0, .exponent = x.exponent, .mantissa = 0 };
            return @bitCast(scale);
        }
        return min_scale;
    }
};
|
|
|
|
|
|
|
|
|
|
test Float8E8M0 {
    const one: Float8E8M0 = .{ .exponent = 127 };
    try std.testing.expectEqual(one, Float8E8M0.fromF32(1.0));
    // try std.testing.expectEqual(5.877472e-39, Float8E8M0.toF32(.{ .exponent = 0}));

    const exact_values = [_]f32{ Float8E8M0.min_scale, 1.0, 64.0, 1.0 / 128.0, std.math.pow(f32, 2.0, 127) };
    const approx_values = [_]f32{1.00001};
    try testCustomFloat(Float8E8M0, .{
        .lossless = &exact_values,
        .lossy = &approx_values,
    });
}
|
|
|
|
|
|
|
|
|
|
/// 4-bit float with 2 exponent bits and 1 mantissa bit.
pub const Float4E2M1 = packed struct(u4) {
    mantissa: u1,
    exponent: u2,
    sign: u1,

    const Self = @This();
    const Helpers = FloatHelpers(Self);

    pub const zero = Helpers.zero;
    pub const neg = Helpers.neg;
    pub const fromF32 = Helpers.fromF32;
    pub const formatNumber = Helpers.formatNumber;

    pub const nan: Self = @bitCast(@as(u4, 0xF));

    /// f32 value of each of the 16 bit patterns, indexed by the raw `u4`.
    pub const values = [_]f32{ 0.0, 0.5, 1, 1.5, 2, 3, 4, 6, -0.0, -0.5, -1, -1.5, -2, -3, -4, -6 };

    /// Table lookup, faster than the generic helper.
    pub fn toF32(x: Self) f32 {
        const raw: u4 = @bitCast(x);
        return values[raw];
    }

    test toF32 {
        // Decode every bit pattern and compare against the table.
        var decoded: [values.len]f32 = undefined;
        for (&decoded, 0..) |*slot, index| {
            const raw: u4 = @intCast(index);
            const as_f4: Self = @bitCast(raw);
            slot.* = as_f4.toF32();
        }
        try std.testing.expectEqualSlices(f32, &values, &decoded);
    }

    test fromF32 {
        // Encoding the i-th table value must give back the bit pattern i.
        var encoded: [16]Self = undefined;
        for (values, &encoded) |value, *slot| {
            slot.* = fromF32(value);
        }
        try std.testing.expectEqualSlices(u4, &.{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, @ptrCast(&encoded));
    }
};
|
|
|
|
|
|
2023-01-02 14:28:25 +00:00
|
|
|
/// Converts `x` to the float type `T`.
///
/// Both `T` and the type of `x` can be a builtin float (`f16`, `f32`, `f64`) or any type
/// exposing `fromF32` / `toF32`. Conversions involving a custom type go through `f32`.
pub fn floatCast(T: type, x: anytype) T {
    return switch (T) {
        f64, f32, f16 => switch (@TypeOf(x)) {
            f64, f32, f16 => @floatCast(x),
            else => @floatCast(x.toF32()),
        },
        else => switch (@TypeOf(x)) {
            // `fromF32` takes an f32: an f64 input must be narrowed explicitly.
            f64, f32, f16 => .fromF32(@as(f32, @floatCast(x))),
            else => .fromF32(x.toF32()),
        },
    };
}
|
2023-09-21 11:15:50 +00:00
|
|
|
|
2025-09-19 12:13:32 +00:00
|
|
|
/// Tells whether `x` is an infinity, positive or negative.
///
/// Builtin floats are forwarded to `std.math.isInf`. Other types are compared, sign bit
/// cleared, against their `inf` declaration; types without one are never infinite.
pub fn isInf(x: anytype) bool {
    const Float = @TypeOf(x);
    return switch (Float) {
        f64, f32, f16 => std.math.isInf(x),
        else => if (@hasDecl(Float, "inf")) custom: {
            const FBits = std.meta.Int(.unsigned, @bitSizeOf(Float));
            // Every bit but the topmost one, which holds the sign.
            const magnitude_mask: FBits = std.math.maxInt(FBits) >> 1;
            const raw: FBits = @bitCast(x);
            const inf_raw: FBits = @bitCast(Float.inf);
            break :custom (raw & magnitude_mask) == inf_raw;
        } else false,
    };
}
|
|
|
|
|
|
|
|
|
|
/// Tells whether `v` holds the largest value of its integer type.
fn allBitsOne(v: anytype) bool {
    const Int = @TypeOf(v);
    const largest: Int = std.math.maxInt(Int);
    return largest == v;
}
|
|
|
|
|
|
2023-09-21 11:15:50 +00:00
|
|
|
/// Inputs of `testCustomFloat`.
const TestCase = struct {
    /// Values that must survive a `fromF32` / `toF32` round trip unchanged.
    lossless: []const f32,
    /// Values that only need to come back within `tolerance` (relative).
    lossy: []const f32,
    /// Relative tolerance applied to the `lossy` values.
    tolerance: f32 = 0.01,
};
|
|
|
|
|
|
|
|
|
|
/// Round-trips every value of `test_case` through `FloatT`: the `lossless` ones must come
/// back identical, the `lossy` ones within the relative tolerance of the test case.
fn testCustomFloat(FloatT: type, test_case: TestCase) !void {
    for (test_case.lossless) |exact| {
        const round_trip = FloatT.fromF32(exact).toF32();
        try std.testing.expectEqual(exact, round_trip);
    }

    for (test_case.lossy) |approx| {
        const round_trip = FloatT.fromF32(approx).toF32();
        try expectApproxEqRel(f32, approx, round_trip, test_case.tolerance);
    }
}
|
|
|
|
|
|
|
|
|
|
/// Fails with `error.TestUnexpectedResult` (after logging both values) unless `x` and `y`
/// are equal within the relative `tolerance`. The comparison is done in `FloatT`.
fn expectApproxEqRel(FloatT: type, x: FloatT, y: FloatT, tolerance: FloatT) !void {
    if (std.math.approxEqRel(FloatT, x, y, tolerance)) return;

    std.log.err("expected ~{d}, got {d}", .{ x, y });
    return error.TestUnexpectedResult;
}
|