2023-01-02 14:28:25 +00:00
const std = @import ( " std " ) ;
2023-04-07 16:45:58 +00:00
const log = std . log . scoped ( . zml_aio ) ;
/// All possible pickle operators.
/// Reference: https://github.com/python/cpython/blob/3.13/Lib/pickletools.py
///
/// Each member's value is the single byte that identifies the opcode in a
/// pickle stream. The enum is non-exhaustive so that any byte can be
/// converted to an `OpCode`, including bytes that are not known opcodes.
pub const OpCode = enum ( u8 ) {
/// Push an integer or bool.
///
/// The argument is a newline-terminated decimal literal string.
///
/// The intent may have been that this always fit in a short Python int,
/// but INT can be generated in pickles written on a 64-bit box that
/// require a Python long on a 32-bit box. The difference between this
/// and LONG then is that INT skips a trailing 'L', and produces a short
/// int whenever possible.
///
/// Another difference is due to that, when bool was introduced as a
/// distinct type in 2.3, builtin names True and False were also added to
/// 2.2.2, mapping to ints 1 and 0. For compatibility in both directions,
/// True gets pickled as INT + "I01\n", and False as INT + "I00\n".
/// Leading zeroes are never produced for a genuine integer. The 2.3
/// (and later) unpicklers special-case these and return bool instead;
/// earlier unpicklers ignore the leading "0" and return the int.
int = 'I' ,
/// Push a four-byte signed integer.
/// Introduced in protocol 1.
///
/// This handles the full range of Python (short) integers on a 32-bit
/// box, directly as binary bytes (1 for the opcode and 4 for the integer).
/// If the integer is non-negative and fits in 1 or 2 bytes, pickling via
/// BININT1 or BININT2 saves space.
binint = 'J' ,
/// Push a one-byte unsigned integer.
/// Introduced in protocol 1.
///
/// This is a space optimization for pickling very small non-negative ints,
/// in range(256).
binint1 = 'K' ,
/// Push a two-byte unsigned integer.
/// Introduced in protocol 1.
///
/// This is a space optimization for pickling small positive ints, in
/// range(256, 2**16). Integers in range(256) can also be pickled via
/// BININT2, but BININT1 instead saves a byte.
binint2 = 'M' ,
/// Push a long integer.
///
/// The same as INT, except that the literal ends with 'L', and always
/// unpickles to a Python long. There doesn't seem a real purpose to the
/// trailing 'L'.
///
/// Note that LONG takes time quadratic in the number of digits when
/// unpickling (this is simply due to the nature of decimal->binary
/// conversion). Proto 2 added linear-time (in C; still quadratic-time
/// in Python) LONG1 and LONG4 opcodes.
long = 'L' ,
/// Long integer using one-byte length.
/// Introduced in protocol 2.
///
/// A more efficient encoding of a Python long: a 1-byte unsigned size,
/// followed by that many bytes interpreted as a little-endian
/// two's-complement integer. A size of 0 is a shortcut for the value 0.
long1 = 0x8a ,
/// Long integer using four-byte length.
/// Introduced in protocol 2.
///
/// A more efficient encoding of a Python long: a 4-byte little-endian size
/// (which must be >= 0), followed by that many bytes interpreted as a
/// little-endian two's-complement integer.
long4 = 0x8b ,
/// Push a Python string object.
///
/// The argument is a repr-style string, with bracketing quote characters,
/// and perhaps embedded escapes. The argument extends until the next
/// newline character. These are usually decoded into a str instance
/// using the encoding given to the Unpickler constructor, or the default,
/// 'ASCII'. If the encoding given was 'bytes' however, they will be
/// decoded as bytes object instead.
string = 'S' ,
/// Push a Python string object.
/// Introduced in protocol 1.
///
/// There are two arguments: the first is a 4-byte little-endian
/// signed int giving the number of bytes in the string, and the
/// second is that many bytes, which are taken literally as the string
/// content. These are usually decoded into a str instance using the
/// encoding given to the Unpickler constructor, or the default,
/// 'ASCII'. If the encoding given was 'bytes' however, they will be
/// decoded as bytes object instead.
binstring = 'T' ,
/// Push a Python string object.
/// Introduced in protocol 1.
///
/// There are two arguments: the first is a 1-byte unsigned int giving
/// the number of bytes in the string, and the second is that many
/// bytes, which are taken literally as the string content. These are
/// usually decoded into a str instance using the encoding given to
/// the Unpickler constructor, or the default, 'ASCII'. If the
/// encoding given was 'bytes' however, they will be decoded as bytes
/// object instead.
short_binstring = 'U' ,
/// Push a Python bytes object.
/// Introduced in protocol 3.
///
/// There are two arguments: the first is a 4-byte little-endian unsigned int
/// giving the number of bytes, and the second is that many bytes, which are
/// taken literally as the bytes content.
binbytes = 'B' ,
/// Push a Python bytes object.
/// Introduced in protocol 3.
///
/// There are two arguments: the first is a 1-byte unsigned int giving
/// the number of bytes, and the second is that many bytes, which are taken
/// literally as the string content.
short_binbytes = 'C' ,
/// Push a Python bytes object.
/// Introduced in protocol 4.
///
/// There are two arguments: the first is an 8-byte unsigned int giving
/// the number of bytes in the string, and the second is that many bytes,
/// which are taken literally as the string content.
binbytes8 = 0x8e ,
/// Push a Python bytearray object.
/// Introduced in protocol 5.
///
/// There are two arguments: the first is an 8-byte unsigned int giving
/// the number of bytes in the bytearray, and the second is that many bytes,
/// which are taken literally as the bytearray content.
bytearray8 = 0x96 ,
/// Push an out-of-band buffer object.
/// Introduced in protocol 5.
next_buffer = 0x97 ,
/// Make an out-of-band buffer object read-only.
/// Introduced in protocol 5.
readonly_buffer = 0x98 ,
/// Push None on the stack.
none = 'N' ,
/// Push True onto the stack.
/// Introduced in protocol 2.
newtrue = 0x88 ,
/// Push False onto the stack.
/// Introduced in protocol 2.
newfalse = 0x89 ,
/// Push a Python Unicode string object.
///
/// The argument is a raw-unicode-escape encoding of a Unicode string,
/// and so may contain embedded escape sequences. The argument extends
/// until the next newline character.
unicode = 'V' ,
/// Push a Python Unicode string object.
/// Introduced in protocol 4.
///
/// There are two arguments: the first is a 1-byte little-endian signed int
/// giving the number of bytes in the string. The second is that many
/// bytes, and is the UTF-8 encoding of the Unicode string.
short_binunicode = 0x8c ,
/// Push a Python Unicode string object.
/// Introduced in protocol 1.
///
/// There are two arguments: the first is a 4-byte little-endian unsigned int
/// giving the number of bytes in the string. The second is that many
/// bytes, and is the UTF-8 encoding of the Unicode string.
binunicode = 'X' ,
/// Push a Python Unicode string object.
/// Introduced in protocol 4.
///
/// There are two arguments: the first is an 8-byte little-endian signed int
/// giving the number of bytes in the string. The second is that many
/// bytes, and is the UTF-8 encoding of the Unicode string.
binunicode8 = 0x8d ,
/// Newline-terminated decimal float literal.
///
/// The argument is repr(a_float), and in general requires 17 significant
/// digits for roundtrip conversion to be an identity (this is so for
/// IEEE-754 double precision values, which is what Python float maps to
/// on most boxes).
///
/// In general, FLOAT cannot be used to transport infinities, NaNs, or
/// minus zero across boxes (or even on a single box, if the platform C
/// library can't read the strings it produces for such things -- Windows
/// is like that), but may do less damage than BINFLOAT on boxes with
/// greater precision or dynamic range than IEEE-754 double.
float = 'F' ,
/// Float stored in binary form, with 8 bytes of data.
/// Introduced in protocol 1.
///
/// This generally requires less than half the space of FLOAT encoding.
/// In general, BINFLOAT cannot be used to transport infinities, NaNs, or
/// minus zero, raises an exception if the exponent exceeds the range of
/// an IEEE-754 double, and retains no more than 53 bits of precision (if
/// there are more than that, "add a half and chop" rounding is used to
/// cut it back to 53 significant bits).
binfloat = 'G' ,
/// Push an empty list.
/// Introduced in protocol 1.
empty_list = ']' ,
/// Append an object to a list.
///
/// Stack before: ... pylist anyobject
/// Stack after: ... pylist+[anyobject]
///
/// although pylist is really extended in-place.
append = 'a' ,
/// Extend a list by a slice of stack objects.
/// Introduced in protocol 1.
///
/// Stack before: ... pylist markobject stackslice
/// Stack after: ... pylist+stackslice
///
/// although pylist is really extended in-place.
appends = 'e' ,
/// Build a list out of the topmost stack slice, after markobject.
///
/// All the stack entries following the topmost markobject are placed into
/// a single Python list, which single list object replaces all of the
/// stack from the topmost markobject onward. For example,
///
/// Stack before: ... markobject 1 2 3 'abc'
/// Stack after: ... [1, 2, 3, 'abc']
list = 'l' ,
/// Push an empty tuple.
/// Introduced in protocol 1.
empty_tuple = ')' ,
/// Build a tuple out of the topmost stack slice, after markobject.
///
/// All the stack entries following the topmost markobject are placed into
/// a single Python tuple, which single tuple object replaces all of the
/// stack from the topmost markobject onward. For example,
///
/// Stack before: ... markobject 1 2 3 'abc'
/// Stack after: ... (1, 2, 3, 'abc')
tuple = 't' ,
/// Build a one-tuple out of the topmost item on the stack.
/// Introduced in protocol 2.
///
/// This code pops one value off the stack and pushes a tuple of
/// length 1 whose one item is that value back onto it. In other
/// words:
///
/// stack[-1] = tuple(stack[-1:])
tuple1 = 0x85 ,
/// Build a two-tuple out of the top two items on the stack.
/// Introduced in protocol 2.
///
/// This code pops two values off the stack and pushes a tuple of
/// length 2 whose items are those values back onto it. In other
/// words:
///
/// stack[-2:] = [tuple(stack[-2:])]
tuple2 = 0x86 ,
/// Build a three-tuple out of the top three items on the stack.
/// Introduced in protocol 2.
///
/// This code pops three values off the stack and pushes a tuple of
/// length 3 whose items are those values back onto it. In other
/// words:
///
/// stack[-3:] = [tuple(stack[-3:])]
tuple3 = 0x87 ,
/// Push an empty dict.
/// Introduced in protocol 1.
empty_dict = '}' ,
/// Build a dict out of the topmost stack slice, after markobject.
///
/// All the stack entries following the topmost markobject are placed into
/// a single Python dict, which single dict object replaces all of the
/// stack from the topmost markobject onward. The stack slice alternates
/// key, value, key, value, .... For example,
///
/// Stack before: ... markobject 1 2 3 'abc'
/// Stack after: ... {1: 2, 3: 'abc'}
dict = 'd' ,
/// Add a key+value pair to an existing dict.
///
/// Stack before: ... pydict key value
/// Stack after: ... pydict
///
/// where pydict has been modified via pydict[key] = value.
setitem = 's' ,
/// Add an arbitrary number of key+value pairs to an existing dict.
/// Introduced in protocol 1.
///
/// The slice of the stack following the topmost markobject is taken as
/// an alternating sequence of keys and values, added to the dict
/// immediately under the topmost markobject. Everything at and after the
/// topmost markobject is popped, leaving the mutated dict at the top
/// of the stack.
///
/// Stack before: ... pydict markobject key_1 value_1 ... key_n value_n
/// Stack after: ... pydict
///
/// where pydict has been modified via pydict[key_i] = value_i for i in
/// 1, 2, ..., n, and in that order.
setitems = 'u' ,
/// Push an empty set.
/// Introduced in protocol 4.
empty_set = 0x8f ,
/// Add an arbitrary number of items to an existing set.
/// Introduced in protocol 4.
///
/// The slice of the stack following the topmost markobject is taken as
/// a sequence of items, added to the set immediately under the topmost
/// markobject. Everything at and after the topmost markobject is popped,
/// leaving the mutated set at the top of the stack.
///
/// Stack before: ... pyset markobject item_1 ... item_n
/// Stack after: ... pyset
///
/// where pyset has been modified via pyset.add(item_i) for i in
/// 1, 2, ..., n, and in that order.
additems = 0x90 ,
/// Build a frozenset out of the topmost slice, after markobject.
/// Introduced in protocol 4.
///
/// All the stack entries following the topmost markobject are placed into
/// a single Python frozenset, which single frozenset object replaces all
/// of the stack from the topmost markobject onward. For example,
///
/// Stack before: ... markobject 1 2 3
/// Stack after: ... frozenset({1, 2, 3})
frozenset = 0x91 ,
/// Discard the top stack item, shrinking the stack by one item.
pop = '0' ,
/// Push the top stack item onto the stack again, duplicating it.
dup = '2' ,
/// Push markobject onto the stack.
///
/// markobject is a unique object, used by other opcodes to identify a
/// region of the stack containing a variable number of objects for them
/// to work on. See markobject.doc for more detail.
mark = '(' ,
/// Pop all the stack objects at and above the topmost markobject.
/// Introduced in protocol 1.
///
/// When an opcode using a variable number of stack objects is done,
/// POP_MARK is used to remove those objects, and to remove the markobject
/// that delimited their starting position on the stack.
pop_mark = '1' ,
/// Read an object from the memo and push it on the stack.
///
/// The index of the memo object to push is given by the newline-terminated
/// decimal string following. BINGET and LONG_BINGET are space-optimized
/// versions.
get = 'g' ,
/// Read an object from the memo and push it on the stack.
/// Introduced in protocol 1.
///
/// The index of the memo object to push is given by the 1-byte unsigned
/// integer following.
binget = 'h' ,
/// Read an object from the memo and push it on the stack.
/// Introduced in protocol 1.
///
/// The index of the memo object to push is given by the 4-byte unsigned
/// little-endian integer following.
long_binget = 'j' ,
/// Store the stack top into the memo. The stack is not popped.
///
/// The index of the memo location to write into is given by the newline-
/// terminated decimal string following. BINPUT and LONG_BINPUT are
/// space-optimized versions.
put = 'p' ,
/// Store the stack top into the memo. The stack is not popped.
/// Introduced in protocol 1.
///
/// The index of the memo location to write into is given by the 1-byte
/// unsigned integer following.
binput = 'q' ,
/// Store the stack top into the memo. The stack is not popped.
/// Introduced in protocol 1.
///
/// The index of the memo location to write into is given by the 4-byte
/// unsigned little-endian integer following.
long_binput = 'r' ,
/// Store the stack top into the memo. The stack is not popped.
/// Introduced in protocol 4.
///
/// The index of the memo location to write is the number of
/// elements currently present in the memo.
memoize = 0x94 ,
/// Extension code.
/// Introduced in protocol 2.
///
/// This code and the similar EXT2 and EXT4 allow using a registry
/// of popular objects that are pickled by name, typically classes.
/// It is envisioned that through a global negotiation and
/// registration process, third parties can set up a mapping between
/// ints and object names.
///
/// In order to guarantee pickle interchangeability, the extension
/// code registry ought to be global, although a range of codes may
/// be reserved for private use.
///
/// EXT1 has a 1-byte integer argument. This is used to index into the
/// extension registry, and the object at that index is pushed on the stack.
ext1 = 0x82 ,
/// Extension code.
/// Introduced in protocol 2.
///
/// See EXT1. EXT2 has a two-byte integer argument.
ext2 = 0x83 ,
/// Extension code.
/// Introduced in protocol 2.
///
/// See EXT1. EXT4 has a four-byte integer argument.
ext4 = 0x84 ,
/// Push a global object (module.attr) on the stack.
///
/// Two newline-terminated strings follow the GLOBAL opcode. The first is
/// taken as a module name, and the second as a class name. The class
/// object module.class is pushed on the stack. More accurately, the
/// object returned by self.find_class(module, class) is pushed on the
/// stack, so unpickling subclasses can override this form of lookup.
global = 'c' ,
/// Push a global object (module.attr) on the stack.
/// Introduced in protocol 4.
///
/// Unlike GLOBAL, this opcode has no inline argument: the module and
/// attribute names are two strings taken from the stack.
stack_global = 0x93 ,
/// Push an object built from a callable and an argument tuple.
///
/// The opcode is named to remind of the __reduce__() method.
///
/// Stack before: ... callable pytuple
/// Stack after: ... callable(*pytuple)
///
/// The callable and the argument tuple are the first two items returned
/// by a __reduce__ method. Applying the callable to the argtuple is
/// supposed to reproduce the original object, or at least get it started.
/// If the __reduce__ method returns a 3-tuple, the last component is an
/// argument to be passed to the object's __setstate__, and then the REDUCE
/// opcode is followed by code to create setstate's argument, and then a
/// BUILD opcode to apply __setstate__ to that argument.
///
/// If not isinstance(callable, type), REDUCE complains unless the
/// callable has been registered with the copyreg module's
/// safe_constructors dict, or the callable has a magic
/// '__safe_for_unpickling__' attribute with a true value. I'm not sure
/// why it does this, but I've sure seen this complaint often enough when
/// I didn't want to <wink>.
reduce = 'R' ,
/// Finish building an object, via __setstate__ or dict update.
///
/// Stack before: ... anyobject argument
/// Stack after: ... anyobject
///
/// where anyobject may have been mutated, as follows:
///
/// If the object has a __setstate__ method,
///
/// anyobject.__setstate__(argument)
///
/// is called.
///
/// Else the argument must be a dict, the object must have a __dict__, and
/// the object is updated via
///
/// anyobject.__dict__.update(argument)
build = 'b' ,
/// Build a class instance.
///
/// This is the protocol 0 version of protocol 1's OBJ opcode.
/// INST is followed by two newline-terminated strings, giving a
/// module and class name, just as for the GLOBAL opcode (and see
/// GLOBAL for more details about that). self.find_class(module, name)
/// is used to get a class object.
///
/// In addition, all the objects on the stack following the topmost
/// markobject are gathered into a tuple and popped (along with the
/// topmost markobject), just as for the TUPLE opcode.
///
/// Now it gets complicated. If all of these are true:
///
/// + The argtuple is empty (markobject was at the top of the stack
/// at the start).
///
/// + The class object does not have a __getinitargs__ attribute.
///
/// then we want to create an old-style class instance without invoking
/// its __init__() method (pickle has waffled on this over the years; not
/// calling __init__() is current wisdom). In this case, an instance of
/// an old-style dummy class is created, and then we try to rebind its
/// __class__ attribute to the desired class object. If this succeeds,
/// the new instance object is pushed on the stack, and we're done.
///
/// Else (the argtuple is not empty, it's not an old-style class object,
/// or the class object does have a __getinitargs__ attribute), the code
/// first insists that the class object have a __safe_for_unpickling__
/// attribute. Unlike as for the __safe_for_unpickling__ check in REDUCE,
/// it doesn't matter whether this attribute has a true or false value, it
/// only matters whether it exists (XXX this is a bug). If
/// __safe_for_unpickling__ doesn't exist, UnpicklingError is raised.
///
/// Else (the class object does have a __safe_for_unpickling__ attr),
/// the class object obtained from INST's arguments is applied to the
/// argtuple obtained from the stack, and the resulting instance object
/// is pushed on the stack.
///
/// NOTE: checks for __safe_for_unpickling__ went away in Python 2.3.
/// NOTE: the distinction between old-style and new-style classes does
/// not make sense in Python 3.
inst = 'i' ,
/// Build a class instance.
/// Introduced in protocol 1.
///
/// This is the protocol 1 version of protocol 0's INST opcode, and is
/// very much like it. The major difference is that the class object
/// is taken off the stack, allowing it to be retrieved from the memo
/// repeatedly if several instances of the same class are created. This
/// can be much more efficient (in both time and space) than repeatedly
/// embedding the module and class names in INST opcodes.
///
/// Unlike INST, OBJ takes no arguments from the opcode stream. Instead
/// the class object is taken off the stack, immediately above the
/// topmost markobject:
///
/// Stack before: ... markobject classobject stackslice
/// Stack after: ... new_instance_object
///
/// As for INST, the remainder of the stack above the markobject is
/// gathered into an argument tuple, and then the logic seems identical,
/// except that no __safe_for_unpickling__ check is done (XXX this is
/// a bug). See INST for the gory details.
///
/// NOTE: In Python 2.3, INST and OBJ are identical except for how they
/// get the class object. That was always the intent; the implementations
/// had diverged for accidental reasons.
obj = 'o' ,
/// Build an object instance.
/// Introduced in protocol 2.
///
/// The stack before should be thought of as containing a class
/// object followed by an argument tuple (the tuple being the stack
/// top). Call these cls and args. They are popped off the stack,
/// and the value returned by cls.__new__(cls, *args) is pushed back
/// onto the stack.
newobj = 0x81 ,
/// Build an object instance.
/// Introduced in protocol 4.
///
/// The stack before should be thought of as containing a class
/// object followed by an argument tuple and by a keyword argument dict
/// (the dict being the stack top). Call these cls, args and kwargs. They
/// are popped off the stack, and the value returned by
/// cls.__new__(cls, *args, **kwargs) is pushed back onto the stack.
newobj_ex = 0x92 ,
/// Protocol version indicator.
/// Introduced in protocol 2.
///
/// For protocol 2 and above, a pickle must start with this opcode.
/// The argument is the protocol version, an int in range(2, 256).
proto = 0x80 ,
/// Stop the unpickling machine.
///
/// Every pickle ends with this opcode. The object at the top of the stack
/// is popped, and that's the result of unpickling. The stack should be
/// empty then.
stop = '.' ,
/// Indicate the beginning of a new frame.
/// Introduced in protocol 4.
///
/// The unpickler may use this opcode to safely prefetch data from its
/// underlying stream.
frame = 0x95 ,
/// Push an object identified by a persistent ID.
///
/// The pickle module doesn't define what a persistent ID means. PERSID's
/// argument is a newline-terminated str-style (no embedded escapes, no
/// bracketing quote characters) string, which *is* "the persistent ID".
/// The unpickler passes this string to self.persistent_load(). Whatever
/// object that returns is pushed on the stack. There is no implementation
/// of persistent_load() in Python's unpickler: it must be supplied by an
/// unpickler subclass.
persid = 'P' ,
/// Push an object identified by a persistent ID.
/// Introduced in protocol 1.
///
/// Like PERSID, except the persistent ID is popped off the stack (instead
/// of being a string embedded in the opcode bytestream). The persistent
/// ID is passed to self.persistent_load(), and whatever object that
/// returns is pushed on the stack. See PERSID for more detail.
binpersid = 'Q' ,
// Non-exhaustive marker: byte values not listed above are still valid
// `OpCode` values and must be handled with a `_` switch prong.
_ ,
} ;
// The above enum was generated with the following Python code,
// run inside pickletools.py
//
// def generate_zig():
//     print("""/// All possible pickle operators.
// /// Reference: https://github.com/python/cpython/blob/3.13/Lib/pickletools.py
// pub const OpCode = enum(u8) {
// """)
//     for op in opcodes:
//         lines = [_cleanup(l) for l in op.doc.split("\n")[:-1]]
//         if op.proto > 0:
//             lines.insert(1, _cleanup(f"Introduced in protocol {op.proto}."))
//         doc = "\n".join(lines)
//         op_code = op.code.__repr__()
//         if op_code.startswith("'\\x"):
//             op_code = "0x" + op_code[3:-1]
//         print(f"""{doc}
//     {op.name.lower()} = {op_code},
// """)
//
//     print("    _,")
//     print("};")
//
// def _cleanup(line: str) -> str:
//     indent = "      "
//     if (line.startswith(indent)):
//         line = line[len(indent):]
//     line = line.replace(".  ", ". ")
//     if line:
//         line = " " + line
//     line = "    ///" + line
//     return line
2023-01-02 14:28:25 +00:00
/// A decoded Pickle operation in its natural state.
///
/// This is a bit different from the `OpCode` enum,
/// because operators having the same semantics but different encodings have been merged.
/// ex: string, binstring, short_binstring -> string.
///
/// Ownership: every slice payload (and both slices of a `PyType`) is expected
/// to be heap-allocated and owned by the `Op`; release it with `deinit`,
/// using the allocator that produced it.
pub const Op = union(enum) {
    // Initially numbers were represented by strings...
    int: []const u8,
    binint: i32,
    long: []const u8,
    string: []const u8,
    bytes: []const u8,
    bytearray: []u8,
    next_buffer,
    readonly_buffer,
    none,
    bool: bool,
    unicode: []const u8,
    float: []const u8,
    binfloat: f64,
    empty_list,
    append,
    appends,
    list,
    empty_tuple,
    tuple,
    tuple1,
    tuple2,
    tuple3,
    empty_dict,
    dict,
    setitem,
    setitems,
    empty_set,
    additems,
    frozenset,
    pop,
    dup,
    mark,
    pop_mark,
    // Memo index to read from / write to.
    get: u32,
    put: u32,
    memoize,
    ext1: u8,
    // NOTE(review): pickletools appears to describe EXT2's argument as an
    // unsigned 2-byte integer; the signed type is kept here so the public
    // payload type does not change — confirm before relying on the sign.
    ext2: i16,
    ext4: i32,
    global: PyType,
    stack_global,
    reduce,
    build,
    inst: PyType,
    obj,
    newobj,
    newobj_ex,
    proto: u8,
    stop,
    frame: u64, // new frame and its size
    persid: []const u8,
    binpersid,

    /// Fully qualified name of a Python object: `module.class`.
    pub const PyType = struct { module: []const u8, class: []const u8 };

    /// Release the memory owned by this operation's payload, if any.
    /// `allocator` must be the allocator the payload was allocated with.
    pub fn deinit(self: Op, allocator: std.mem.Allocator) void {
        switch (self) {
            // Use a switch on the type of the stored data,
            // this is easier than listing every opcode.
            inline else => |v| switch (@TypeOf(v)) {
                void, bool, u8, u16, u32, u64, i16, i32, f64 => {},
                []const u8, []u8 => allocator.free(v),
                PyType => {
                    allocator.free(v.module);
                    allocator.free(v.class);
                },
                else => @compileError("please explicit how to free this new opcode: " ++ @typeName(@TypeOf(v))),
            },
        }
    }

    /// Return a deep copy of this operation whose payload is allocated with `allocator`.
    /// The caller owns the result and must call `deinit` on it.
    /// Fails only when an allocation fails; nothing is leaked in that case.
    pub fn clone(self: Op, allocator: std.mem.Allocator) !Op {
        return switch (self) {
            // Use a switch on the type of the stored data,
            // this is easier than listing every opcode.
            inline else => |v, tag| switch (@TypeOf(v)) {
                void, bool, u8, u16, u32, u64, i16, i32, f64 => self,
                []const u8, []u8 => @unionInit(Op, @tagName(tag), try allocator.dupe(u8, v)),
                PyType => py_type: {
                    const module = try allocator.dupe(u8, v.module);
                    // Without this, a failure while copying `class` would leak `module`.
                    errdefer allocator.free(module);
                    const class = try allocator.dupe(u8, v.class);
                    break :py_type @unionInit(Op, @tagName(tag), .{
                        .module = module,
                        .class = class,
                    });
                },
                else => @compileError("please explicit how to clone this new opcode: " ++ @typeName(@TypeOf(v))),
            },
        };
    }
};
2023-04-07 16:45:58 +00:00
/// Read a stream of bytes, and interpret it as a stream of Pickle operators.
///
/// Reading stops right after the first `stop` operator, which is included in
/// the result.
///
/// - `allocator`: used for the returned slice and for every slice payload.
/// - `reader`: any reader exposing `readByte`, `readInt`, `readNoEof`-style
///   std reader methods and `readUntilDelimiterAlloc`.
/// - `max_line_len`: maximum accepted length for newline-terminated arguments.
///
/// The caller owns the result: call `Op.deinit` on each element, then free
/// the slice. On error, everything allocated so far is released.
///
/// Returns `error.NotSupported` on an unknown opcode byte, and propagates
/// reader and allocation errors.
pub fn parse(allocator: std.mem.Allocator, reader: anytype, max_line_len: usize) ![]const Op {
    var results = std.ArrayList(Op).init(allocator);
    errdefer {
        // The ops already parsed own their payloads: free them, not only the list.
        for (results.items) |parsed| parsed.deinit(allocator);
        results.deinit();
    }
    const len = max_line_len;

    while (true) {
        const b = try reader.readByte();
        const code: OpCode = @enumFromInt(b);
        const op: Op = switch (code) {
            .int => blk: {
                const buf = try reader.readUntilDelimiterAlloc(allocator, '\n', len);
                // Legacy hack, see OpCode.int documentation
                // We do this parsing right away to simplify downstream code.
                const as_bool: ?bool = if (std.mem.eql(u8, "00", buf))
                    false
                else if (std.mem.eql(u8, "01", buf))
                    true
                else
                    null;
                if (as_bool) |value| {
                    // The text is not stored in the op, so it must be released here.
                    allocator.free(buf);
                    break :blk .{ .bool = value };
                }
                break :blk .{ .int = buf };
            },
            .binint => .{ .binint = try reader.readInt(i32, .little) },
            .binint1 => .{ .binint = try reader.readByte() },
            .binint2 => .{ .binint = try reader.readInt(u16, .little) },
            // TODO: long should handle the trailing 'L' -> add a test.
            .long => .{ .long = try reader.readUntilDelimiterAlloc(allocator, '\n', len) },
            .long1 => .{ .long = try _readSlice(reader, allocator, 1) },
            .long4 => .{ .long = try _readSlice(reader, allocator, 4) },
            .string => .{ .string = try reader.readUntilDelimiterAlloc(allocator, '\n', len) },
            .binstring => .{ .string = try _readSlice(reader, allocator, 4) },
            .short_binstring => .{ .string = try _readSlice(reader, allocator, 1) },
            .binbytes => .{ .bytes = try _readSlice(reader, allocator, 4) },
            .binbytes8 => .{ .bytes = try _readSlice(reader, allocator, 8) },
            .short_binbytes => .{ .bytes = try _readSlice(reader, allocator, 1) },
            .bytearray8 => .{ .bytearray = try _readSlice(reader, allocator, 8) },
            .next_buffer => .next_buffer,
            .readonly_buffer => .readonly_buffer,
            .none => .none,
            .newtrue => .{ .bool = true },
            .newfalse => .{ .bool = false },
            .unicode => .{ .unicode = try reader.readUntilDelimiterAlloc(allocator, '\n', len) },
            .short_binunicode => .{ .unicode = try _readSlice(reader, allocator, 1) },
            .binunicode => .{ .unicode = try _readSlice(reader, allocator, 4) },
            .binunicode8 => .{ .unicode = try _readSlice(reader, allocator, 8) },
            .float => .{ .float = try reader.readUntilDelimiterAlloc(allocator, '\n', len) },
            // Unlike the other binary arguments, the float payload is read big-endian.
            .binfloat => .{ .binfloat = @bitCast(try reader.readInt(u64, .big)) },
            .empty_list => .empty_list,
            .append => .append,
            .appends => .appends,
            .list => .list,
            .empty_tuple => .empty_tuple,
            .tuple => .tuple,
            .tuple1 => .tuple1,
            .tuple2 => .tuple2,
            .tuple3 => .tuple3,
            .empty_dict => .empty_dict,
            .dict => .dict,
            .setitem => .setitem,
            .setitems => .setitems,
            .empty_set => .empty_set,
            .additems => .additems,
            .frozenset => .frozenset,
            .pop => .pop,
            .dup => .dup,
            .mark => .mark,
            .pop_mark => .pop_mark,
            .get => blk: {
                const buf = try reader.readUntilDelimiterAlloc(allocator, '\n', len);
                defer allocator.free(buf);
                // If we fail to parse delay the error to the evaluation.
                const n = std.fmt.parseInt(u32, buf, 10) catch std.math.maxInt(u32);
                break :blk .{ .get = n };
            },
            .binget => .{ .get = try reader.readByte() },
            .long_binget => .{ .get = try reader.readInt(u32, .little) },
            .put => blk: {
                const buf = try reader.readUntilDelimiterAlloc(allocator, '\n', len);
                defer allocator.free(buf);
                // Same policy as `.get`: an unparsable index becomes maxInt(u32).
                const n = std.fmt.parseInt(u32, buf, 10) catch std.math.maxInt(u32);
                break :blk .{ .put = n };
            },
            .binput => .{ .put = try reader.readByte() },
            .long_binput => .{ .put = try reader.readInt(u32, .little) },
            .memoize => .memoize,
            .ext1 => .{ .ext1 = try reader.readByte() },
            .ext2 => .{ .ext2 = try reader.readInt(i16, .little) },
            .ext4 => .{ .ext4 = try reader.readInt(i32, .little) },
            .global => .{ .global = try _readPyType(reader, allocator, len) },
            .stack_global => .stack_global,
            .reduce => .reduce,
            .build => .build,
            .inst => .{ .inst = try _readPyType(reader, allocator, len) },
            .obj => .obj,
            .newobj => .newobj,
            .newobj_ex => .newobj_ex,
            .proto => blk: {
                const version = try reader.readByte();
                if (version > 5) log.warn("zml.aio.torch.pickle.parse expects a Python pickle object of version <=5, got version {}. Will try to interpret anyway, but this may lead to more errors.", .{version});
                break :blk .{ .proto = version };
            },
            .stop => .stop,
            // This is not documented in pickletools but in https://peps.python.org/pep-3154/
            // The frame size is stored right after the frame header.
            // The loader is allowed to prefetch framesize from the underlying reader,
            // and ops are not allowed to cross a frame boundary.
            // We don't prefetch because we assume the reader is going to use some kind of buffered reader.
            // We could try to enforce frame boundaries, but we would need to track
            // how many bytes we are reading from the stream.
            .frame => .{ .frame = try reader.readInt(u64, .little) },
            .persid => .{ .persid = try reader.readUntilDelimiterAlloc(allocator, '\n', len) },
            .binpersid => .binpersid,
            _ => |unk_tag| {
                log.err("Unknow pickle operator {}, note we are only supporting pickle protocol up to version 5.", .{unk_tag});
                return error.NotSupported;
            },
        };

        results.append(op) catch |err| {
            // `op` is not in `results` yet, so the function-level errdefer won't free it.
            op.deinit(allocator);
            return err;
        };
        if (op == .stop) break;
    }
    return results.toOwnedSlice();
}

/// Read the two newline-terminated names (module, then class) that follow
/// the GLOBAL and INST opcodes. The caller owns both returned slices.
fn _readPyType(reader: anytype, allocator: std.mem.Allocator, max_line_len: usize) !Op.PyType {
    const module = try reader.readUntilDelimiterAlloc(allocator, '\n', max_line_len);
    // Don't leak the module name if the class name can't be read.
    errdefer allocator.free(module);
    const class = try reader.readUntilDelimiterAlloc(allocator, '\n', max_line_len);
    return .{ .module = module, .class = class };
}
// Parses a pickle fixture from disk and compares the decoded operators with
// the expected listing. Running under `std.testing.allocator` also checks
// that `Op.deinit` releases every allocation made by `parse`.
test parse {
    const allocator = std.testing.allocator;
    const file = try std.fs.cwd().openFile("zml/aio/torch/simple_test.pickle", .{ .mode = .read_only });
    // Release the file handle once the test is done.
    defer file.close();
    var buffered_reader = std.io.bufferedReader(file.reader());
    const ops = try parse(allocator, buffered_reader.reader(), 4096);
    defer {
        // Test we are correctly freeing every allocation.
        for (ops) |op| op.deinit(allocator);
        allocator.free(ops);
    }

    // expectEqual reports both values on failure, unlike a bare boolean check.
    try std.testing.expectEqual(@as(usize, 35), ops.len);
    // this can be obtained by running: `python -m pickletools simple_test.pickle`
    const expected = [_]Op{
        .{ .proto = 4 },
        .{ .frame = 83 },
        .empty_dict,
        .memoize,
        .mark,
        .{ .unicode = "hello" },
        .memoize,
        .{ .unicode = "world" },
        .memoize,
        .{ .unicode = "int" },
        .memoize,
        .{ .binint = 1 },
        .{ .unicode = "float" },
        .memoize,
        .{ .binfloat = 3.141592 },
        .{ .unicode = "list" },
        .memoize,
        .empty_list,
        .memoize,
        .mark,
        .{ .binint = 0 },
        .{ .binint = 1 },
        .{ .binint = 2 },
        .{ .binint = 3 },
        .{ .binint = 4 },
        .appends,
        .{ .unicode = "tuple" },
        .memoize,
        .{ .unicode = "a" },
        .memoize,
        .{ .binint = 10 },
        .tuple2,
        .memoize,
        .setitems,
        .stop,
    };
    try std.testing.expectEqualDeep(&expected, ops);
}
/// Read a length-prefixed byte string from `reader`.
///
/// The prefix is an unsigned little-endian integer of `len_bytes` bytes; it is
/// followed by exactly that many payload bytes.
///
/// The caller owns the returned slice and must free it with `allocator`.
///
/// Errors: `error.EndOfStream` if the stream ends before the prefix or the
/// whole payload has been read, `error.OutOfMemory` if the payload can't be
/// allocated (including a length that doesn't fit in `usize`), plus any
/// error of the underlying reader.
fn _readSlice(reader: anytype, allocator: std.mem.Allocator, comptime len_bytes: u8) ![]u8 {
    const T = std.meta.Int(.unsigned, 8 * len_bytes);
    const raw_len = try reader.readInt(T, .little);
    // An 8-byte length may not be addressable on targets where usize is narrower.
    const str_len = std.math.cast(usize, raw_len) orelse return error.OutOfMemory;
    const buf = try allocator.alloc(u8, str_len);
    errdefer allocator.free(buf);
    // `read` may return fewer bytes than requested; `readNoEof` fills the
    // whole buffer or fails, so we never hand back uninitialized bytes.
    try reader.readNoEof(buf);
    return buf;
}