This recipe provides a higher level wrapper around the struct module. It provides a more convenient syntax for defining and using structs, and adds additional features such as: - Allows embedding structures within other structures - Allows defining arrays of items (or other structures) - Class based syntax, allowing access and updates by field name, not position - Extension of structures by inheritance
import struct
class Format(object):
    """Prefix characters selecting byte order and sizing for struct formats.

    Each constant is the single-character prefix that the ``struct`` module
    accepts at the start of a format string.
    """

    Native = "@"          # Native format, native size
    StandardNative = "="  # Native format, standard size
    LittleEndian = "<"    # Little-endian, standard size
    BigEndian = ">"       # Big-endian, standard size
class Element(object):
    """A single element in a struct.

    An element wraps one ``struct`` typecode.  Each instance is stamped with
    an increasing ``id`` when it is created, so that a set of elements can
    later be sorted back into the order in which they were created.

    Attributes:
        id: creation sequence number of this element.
        typecode: the ``struct`` format code for this element.
        size: ``struct.calcsize(typecode)``, computed without any byte-order
            prefix.
    """

    id = 0  # class-wide creation counter; each instance gets the next value

    def __init__(self, typecode):
        Element.id += 1  # Note: not thread safe
        self.id = Element.id
        self.typecode = typecode
        self.size = struct.calcsize(typecode)

    def __len__(self):
        """Return the size in bytes computed for this element's typecode."""
        return self.size

    def decode(self, format, s):
        """Additional decode steps once converted via struct.unpack.

        The base implementation returns ``s`` unchanged.
        """
        return s

    def encode(self, format, val):
        """Additional encode steps to allow packing with struct.pack.

        The base implementation returns ``val`` unchanged.
        """
        return val

    def __str__(self):
        return self.typecode

    def __call__(self, num):
        """Define this as an array of ``num`` elements.

        String-like typecodes are a special case: ``struct`` already treats
        a counted ``s`` / ``p`` code as one blob, so the result is a single
        counted element rather than an array.  The original typecode is kept,
        so a Pascal string stays a Pascal string (``'p'`` -> ``'8p'``)
        instead of being turned into a plain ``'8s'`` string.
        """
        # Exact membership, not a substring test: "x in 'sp'" would also
        # accept '' and 'sp'.
        if self.typecode in ('s', 'p'):
            return Element('%d%s' % (num, self.typecode))
        return ArrayElement(self, num)

    def __getitem__(self, num):
        """Indexing syntax (``elem[n]``) is an alias for ``elem(n)``."""
        return self(num)
class ArrayElement(Element):
    """A fixed-length array of ``num`` copies of another element.

    In the containing struct the array is represented as a single string
    blob (``'<N>s'``); ``decode`` / ``encode`` convert between that blob and
    a list of items.

    NOTE: the blob size is ``len(basic_element) * num``, and an element's
    length is computed by ``struct.calcsize`` on the bare typecode (no
    byte-order prefix), whereas ``decode`` / ``encode`` prepend the struct's
    own format.  For typecodes whose size depends on the prefix the two can
    disagree.
    """

    def __init__(self, basic_element, num):
        Element.__init__(self, '%ds' % (len(basic_element) * num))
        self.num = num
        self.basic_element = basic_element

    def _item_format(self, format):
        """Return the struct format string describing all array items."""
        # NB. We use typecode * num, not '%s%s' % (num, typecode),
        # so we deal with typecodes that already have numbers,
        # ie 2*'4s' != '24s'
        return format + self.basic_element.typecode * self.num

    def decode(self, format, s):
        """Unpack the blob ``s`` into a list of decoded items."""
        item = self.basic_element
        raw_items = struct.unpack(self._item_format(format), s)
        return [item.decode(format, raw) for raw in raw_items]

    def encode(self, format, vals):
        """Pack the items of ``vals`` into a single blob.

        Raises:
            struct.error: if ``vals`` does not contain exactly ``num`` items
                (same exception type struct.pack itself would raise, but
                with a message that identifies the array), or if an item
                cannot be packed.
        """
        item = self.basic_element
        packed_items = [item.encode(format, v) for v in vals]
        if len(packed_items) != self.num:
            raise struct.error(
                "array of %d %r items was given %d values"
                % (self.num, item.typecode, len(packed_items)))
        return struct.pack(self._item_format(format), *packed_items)
class EmbeddedStructElement(Element):
    """Element holding another struct class inline as a fixed-size blob.

    The blob is sized from the embedded class's ``_struct_size``.
    """

    def __init__(self, structure):
        blob_code = '%ds' % structure._struct_size
        Element.__init__(self, blob_code)
        self.struct = structure

    # Note: Structs use their own endianness format, not their parent's,
    # which is why ``format`` is not consulted below.
    def decode(self, format, s):
        """Instantiate the embedded struct class from the raw blob ``s``."""
        struct_cls = self.struct
        return struct_cls(s)

    def encode(self, format, s):
        """Pack ``s`` through the embedded struct class's ``_pack``."""
        pack = self.struct._pack
        return pack(s)
# Maps the readable type names exposed through ``Type.<Name>`` to the
# corresponding ``struct`` format characters.
name_to_code = dict(
    Char='c',
    Byte='b',
    UnsignedByte='B',
    Int='i',
    UnsignedInt='I',
    Short='h',
    UnsignedShort='H',
    Long='l',
    UnsignedLong='L',
    String='s',
    PascalString='p',
    Pointer='P',
    Float='f',
    Double='d',
    LongLong='q',
    UnsignedLongLong='Q',
)
class Type(object):
    """Syntax sugar for building elements: ``Type.Int``, ``Type.Struct(S)``.

    Every attribute access for a name listed in ``name_to_code`` creates and
    returns a *new* ``Element``.
    """

    def __getattr__(self, name):
        """Return a fresh Element for the type called ``name``.

        Raises:
            AttributeError: if ``name`` is not a key of ``name_to_code``.
                (A raw KeyError must not escape from ``__getattr__``:
                ``hasattr`` and ``getattr`` with a default only treat
                AttributeError as "attribute missing".)
        """
        try:
            typecode = name_to_code[name]
        except KeyError:
            raise AttributeError("%r is not a known element type name"
                                 % (name,))
        return Element(typecode)

    def Struct(self, struct):
        """Return an element embedding the struct class ``struct``."""
        return EmbeddedStructElement(struct)


# The class name is rebound to a single shared instance, so users write
# ``Type.Int`` rather than ``Type().Int``.
Type = Type()
class MetaStruct(type):
    """Metaclass that turns Element class attributes into a struct layout.

    For every class it creates, it maintains three class attributes:

    * ``_struct_data`` - the concatenated typecodes of all fields,
    * ``_struct_info`` - the list of (name, element) pairs, in field order,
    * ``_struct_size`` - ``struct.calcsize`` of the class's ``_format``
      prefix plus ``_struct_data``.
    """

    def __init__(cls, name, bases, d):
        type.__init__(cls, name, bases, d)

        if not hasattr(cls, '_struct_data'):
            # First class in the hierarchy: start with an empty layout.
            cls._struct_data = ''
            cls._struct_info = []  # name / element pairs
        else:
            # Allow extending by inheritance: work on a copy so that the
            # list owned by the parent class is not mutated.
            cls._struct_info = list(cls._struct_info)

        # Collect the Element fields declared in this class body and put
        # them in creation order, using the id stamped on each element.
        fields = [(attr, value) for (attr, value) in d.items()
                  if isinstance(value, Element)]
        fields.sort(key=lambda pair: pair[1].id)

        cls._struct_data += ''.join([str(elem) for (_, elem) in fields])
        cls._struct_info += fields
        # The format prefix is included so the size matches what
        # pack / unpack will use with that prefix.
        cls._struct_size = struct.calcsize(cls._format + cls._struct_data)
class Struct(object):
    """Represent a binary structure.

    Subclasses declare fields as class attributes holding elements.
    Instantiating with packed data decodes it into attributes named after
    the fields; ``str(instance)`` returns the packed representation.
    """

    __metaclass__ = MetaStruct
    _format = Format.Native  # Default to native format, native size

    def __init__(self, _data=None, **kwargs):
        """Decode ``_data`` (all zero bytes if omitted), then apply kwargs.

        Each keyword argument is set as an attribute after decoding, so it
        overrides the decoded value of the field with that name.
        """
        if _data is None:
            _data = '\0' * self._struct_size
        layout = self._format + self._struct_data
        raw_values = struct.unpack(layout, _data)
        for (name, elem), raw in zip(self._struct_info, raw_values):
            setattr(self, name, elem.decode(self._format, raw))
        for key in kwargs:
            setattr(self, key, kwargs[key])

    def _pack(self):
        """Return the packed representation of the current field values."""
        encoded = []
        for name, elem in self._struct_info:
            encoded.append(elem.encode(self._format, getattr(self, name)))
        return struct.pack(self._format + self._struct_data, *encoded)

    def __str__(self):
        return self._pack()

    def __repr__(self):
        cls_name = self.__class__.__name__
        return "%s(%r)" % (cls_name, self._pack())
###################################################################
# End of implementation - usage examples follow:
###################################################################
###################################################################
#
# Usage
#
# Using the above code, we can now define structures in a
# more readable class based syntax. For example:
###################################################################
class Point(Struct):
    """Example struct: two Short fields, packed little-endian."""
    _format = Format.LittleEndian
    x = Type.Short
    y = Type.Short

# Decode a Point from its 4-byte packed form.
p = Point('\x01\x00\x02\x00')
print p.x, p.y # Prints "1 2"
# Fields are plain attributes and can be reassigned before re-packing.
p.x, p.y = 100,200
print repr(p) # Prints "Point('d\x00\xc8\x00')"
# str() of a struct is its packed representation.
assert(struct.pack('<hh',100,200) == str(p))
###################################################################
#
# Arrays and Embedded structures
#
# You can also embed arrays, (and arrays of arrays), as well
# as other structures within your struct definition.
###################################################################
class Shape(Struct):
    """Example struct mixing a string, an int and an array of structs."""
    _format = Format.BigEndian
    name = Type.String[8]
    numpoints = Type.Int
    points = Type.Struct(Point)[4] # Array of 4 points.

# Decode a Shape from packed data: 8-byte name, numpoints, then 4 Points.
s=Shape('Triangle\x00\x00\x00\x03\x00\x00\x00\x00\x05\x00\x05\x00\n\x00'
        '\x00\x00\x00\x00\x00\x00')
# This will print "Triangle [(0, 0), (5, 5), (10, 0)]"
print s.name, [(p.x, p.y) for p in s.points[:s.numpoints]]
# The same structure could be created as:
s2=Shape(name='Triangle', numpoints=3, points=[
    Point(x=0,y=0),
    Point(x=5,y=5),
    Point(x=10,y=0),
    Point(x=0,y=0)])
assert str(s2) == str(s)
# Note that even though Shape is in BigEndian format, the Points
# keep their LittleEndian setting, so mixing formats is possible,
# and the same struct will always have the same representation
# regardless of its context. Hence the following is true:
assert str(s.points[1]) == str( Point(x=5, y=5))
# It is also possible to define multi-dimensional arrays,
# which will be unpacked as lists of lists.
# In addition, it is possible to add methods and non-struct
# instance variables without interfering with the structure
# (Unless you overwrite structure field names of course)
class TicTacToe(Struct):
    """Example struct with a two-dimensional array and an extra method."""
    board = Type.Char[3][3] # 3x3 array of chars
    # Not an Element, so it is left out of the packed layout.
    ignored = 'This is not packed / unpacked by the structure'

    def display(self):
        """Print the board, one row per line."""
        print '\n'.join(''.join(row) for row in self.board)

# Nine chars are decoded into three rows of three.
game = TicTacToe('X.O.X...O')
print game.board # [['X', '.', 'O'], ['.', 'X', '.'], ['.', '.', 'O']]
game.display()
# Prints: X.O
# .X.
# ..O
# The decoded board is a list of lists, so cells can be updated in place.
game.board[0][1] = 'X'
game.display()
# Prints: XXO
# .X.
# ..O
print str(game) # prints 'XXO.X...O'
###################################################################
#
# Inheritance
#
# Structures may also be inherited from, in which case, additional
# fields will occur after the existing ones.
#
###################################################################
class Point3D(Point):
    """Example of extension by inheritance: adds z after Point's x and y."""
    z = Type.Short

# Fields can be given as keyword arguments instead of packed data.
p = Point3D(x=1, y=2, z=3)
print repr(p) # prints Point3D('\x01\x00\x02\x00\x03\x00')
|
The standard struct module is useful when dealing with C structs, and various file and network formats, but is rather awkward to work with. It is fairly low-level, using strings of character codes to describe the structure, and unpacks to a tuple, rather than an object with appropriately named fields. This means that access to items is always by position, rather than the field name, as would be used in the C code.
It also fares badly when dealing with arrays, and embedded structures. Dealing with these within the struct module means each element becomes effectively the same as a top-level field, flattening their structure and losing the organisation of elements. This all results in code that is much messier and hard to maintain than the equivalent C code.
This recipe builds on top of the struct module and provides syntax for declaration of structures that is closer to the corresponding C code, and allows for more complex structures to be defined.
Structures are read by instantiating the appropriate class with the binary struct data, and packed by calling str(aStruct). I use the above code as a module named "structure".
Implementation:
Structs are defined by lists of Element objects. Each element contains the struct code for the type, and a unique id, incremented after each instantiation, which is used so that the elements can be sorted into the same order that they were defined within a struct. They also define __getitem__, returning another Element subclass, ArrayElement, dealing with arrays of simple elements. Another Element subclass, EmbeddedStructElement is used to represent substructures.
The Type object provides some syntax sugar for constructing these elements, allowing "x = Type.Int", instead of "x = Element("i")"
The MetaStruct metaclass checks for all fields which are subclasses of Element, sorts them into the definition order, and uses them to create the corresponding struct format string, and the corresponding list of Elements
Finally, the Struct class defines appropriate __init__ and __str__ methods to use the information generated by the metaclass to encode and decode the structure. Arrays and substructures are treated as strings of the appropriate size, and implement an encode() and decode() method which will transform from the string to the appropriate data, or vice-versa.
Warnings and Caveats:
There are a few flaws in the above code currently. The main one is that there is no validation when setting struct fields. For instance, given a struct like:
>>> class MyStruct(Struct):
... name = Type.String[8]
... values = Type.Int[4]
>>> a=MyStruct() # Initialises with everything zeroed.
There is nothing preventing you from doing:
>>> a.values = [1,2,3,4,5] # One too many items.
This will now fail when trying to pack the structure with a struct error which doesn't give you any clue as to which field is corrupt. Probably the main thing that should be done is either some kind of pre-validation of items, or at least error handling that mentions what field is invalid. Worse is that doing:
>>> a.name = "thisnameistoobig"
will not raise an error even when packing the structure, but will instead silently truncate the name in the packed representation.
Another thing to be careful of is that duplicating names will cause the last definition to be used, rather than resulting in multiple elements, or giving an error. For instance:
>>> class MyStruct(Struct):
... item1=Type.Char[5]
... pad = Type.Char[3]
... item2=Type.Char[1]
... pad = Type.Char[3]
... item3=Type.Char[5]
Here the first pad bytes will not be used, as the second definition of pad will override the name.
Also, note that you must create element objects (ie Type.Int) within the struct, rather than reusing them. For instance:
>>> myArrayType = Type.Int[8] # define an 8 element array
>>> class MyStruct(Struct):
... name = Type.String
... items = myArrayType # Don't do this
Here items will actually be packed before name, as it was created earlier, so the definition order will not match the packed order.
Finally, note that padding and alignment will be handled the same as the struct module.
Very interesting code :-). Class MetaStruct is interesting. I didn't know that it's possible to get attributes in the order that they are defined. I have to play with metaclasses :-) You have to see the pyConstruct project: http://pyconstruct.wikispaces.com
I'm working on the same subject but with different syntax (different approach): http://hachoir.org/
Hachoir is a lazy-parser and fault tolerant. It allows to edit data and have tree organization with nice Python API.
Haypo
Dynamic arrays? Would it be possible to allow the arrays to be dynamic? I know some binary formats that use that, like
I haven't found any struct-replacement that does that.
Dynamic arrays. I think this would be fairly hard to do. Currently I'm relying on knowing the sizes of various substructures etc. in advance, in order to know how to represent them in containing structures. I think it should be possible (given some restrictions like the count appearing before the array in the struct), but would probably require a different approach.
Definition order. Getting the definition order requires a bit of a cheat. The idea is to create an object, and track the order they were created in, and then later sort based on this order. It does limit the syntax you can use to something that can create and return a new object though: "x=Foo.attr", "x=Foo()" and "x=Foo[1]" would all work given an appropriate Foo , but you can't use just "x = Foo".
pyConstruct looks pretty neat. I'll check it out.
Both Hachoir and pyConstruct allow dynamic structure. pyConstruct uses Python eval() function to access to other fields:
Hachoir's approach is different: you directly access the structure using the [] operator (self["name"]):
Where self["size"] is a Field object and has many attributes: value, address, absolute_address, parent, display (unicode string), etc. I don't have enough place here to explain all Hachoir internals :-)
Haypo
Just to follow up. In fact, pyConstruct, linked to in the post above by Victor Stinner looks like it would handle this. I think the equivalent of your example would be something like:
Cool, thanks for the tips, I'll look into them!
Very nice recipe. This is a very nice recipe, worth the inclusion in the standard library!
I have a remark though. Is it possible to split the construction by keywords from the construction by decoding.
I mean:
s1=Shape(name='Triangle', numpoints=3, points=[ Point(x=0,y=0), Point(x=5,y=5), Point(x=10,y=0), Point(x=0,y=0)])
and
s2=Shape.decode('Triangle\x00\x00\x00\x03\x00\x00\x00\x00\x00\x05\x00\x05\x00\x0A' '\x00\x00\x00\x00\x00\x00')
This would be neater as you don't know in advance the contents of the string buffer and you might have a higher-level decode resting on several lower-level decode This is a very nice recipe, worth the inclusion in the standard library!
I have a remark though. Is it possible to split the construction by keywords from the construction by decoding.
I mean:
s1=Shape(name='Triangle', numpoints=3, points=[ Point(x=0,y=0), Point(x=5,y=5), Point(x=10,y=0), Point(x=0,y=0)])
and
s2=Shape.decode('Triangle\x00\x00\x00\x03\x00\x00\x00\x00\x00\x05\x00\x05\x00\x0A' '\x00\x00\x00\x00\x00\x00')
This would be neater as you don't know in advance the contents of the string buffer and you might have a higher-level decode resting on several lower-level decode
inheritance bug.
I'm not seeing it. Could you give some example code that fails. I've tried both
after the same code as above, and both seem to work. I did have a similar bug when developing it, but fixed it by taking a copy of _struct_data for the subclass (previously it was mutating the parent class's list). Is this still happening somewhere?
decode method. It should be pretty simple to do - just move the _data parameter and all but the "for k,v in kwargs.iteritems():" loop out of __init__ and into a new decode classmethod. I'm not sure what you mean by your use case though - wouldn't that also be possible through __init__ too?
The main reason I went with __init__ and __str__ overloads rather than pack / unpack methods was that I didn't want to add anything into the public namespace of the class, as it would prevent defining struct fields with the same name. If there's a good reason though, perhaps this isn't that important.
Sorry. Stupid me, I was experimenting with the code and i broke it myself ! Sorry !
Strange behaviour of class Point. Can you please explain me some strange behaviour.
I try to play with the Point class.
The code is:
class Point(Struct):
p = Point()
p.x, p.y = 100,200
print repr(p) # Prints "Point('d\x00\xc8\x00')
I have error (of struct.unpack(...)) in __init__ of Struct.
If I change the _format = Format.Native error is gone.
If I put the a = Type.Char after the y = Type.Short the error is gone too.
Strange behaviour of class Point. Can you please explain me some strange behaviour.
I try to play with the Point class.
The code is:
class Point(Struct):
p = Point()
p.x, p.y = 100,200
print repr(p) # Prints "Point('d\x00\xc8\x00')
I have error (of struct.unpack(...)) in __init__ of Struct.
If I change the _format = Format.Native error is gone.
If I put the a = Type.Char after the y = Type.Short the error is gone too.
Strange behaviour of class Point. Can you please explain me some strange behaviour.
I try to play with the Point class.
The code is:
class Point(Struct):
p = Point()
p.x, p.y = 100,200
print repr(p) # Prints "Point('d\x00\xc8\x00')
I have error (of struct.unpack(...)) in __init__ of Struct.
If I change the _format = Format.Native error is gone.
If I put the a = Type.Char after the y = Type.Short the error is gone too.
Strange behaviour of class Point. Can you please explain me some strange behaviour.
I try to play with the Point class.
The code is:
class Point(Struct):
p = Point()
p.x, p.y = 100,200
print repr(p) # Prints "Point('d\x00\xc8\x00')
I have error (of struct.unpack(...)) in __init__ of Struct.
If I change the _format = Format.Native error is gone.
If I put the a = Type.Char after the y = Type.Short the error is gone too.
That's a bug. The padding applied by the struct module is different depending on the format selected. With native format, an extra pad byte is inserted after the char to align it to an even boundary. With the other formats, no padding is done.
I was including the format information when building the string, but not when calculating the size of the structure, so this was always defaulting to native format, giving a _struct_size of 6 when the struct expected 5. This meant that the initialisation string was the wrong size, giving the error you saw.
I've now updated the recipe to fix this. The relevant change was changing the line:
to
Thanks.
TypeError: 'Element' object is unindexable. Hello, I'm a beginning Python programmer and am very interested using your 'A higher level struct module' code. My problem may be as simple as version confusion on my part, but I'm stuck. Here's a transcript. I've saved the text source into packclass.py minus the examples at the end.
I can load that file and can successfully do the example with the Point class. However, when I try the Shape example, a class that uses an array type, I get an error.
Is there a workaround?
Thank you,
TypeError: 'Element' object is unindexable.
I'm not sure. I tried that code here, and I don't get an error. From the exception you're getting, it looks like its not finding __getitem__ on the Element object (I think it is the line "name = Type.String[8]" that is failing)
Could you check that the line
is correctly copied below class Element. If it is missing, or not indented with the rest of the Element members, it would cause the error you're seeing.
Just a warning to anyone who seriously tries this recipe: it encodes structs to char buffers like so: a long ( l ) into 8s, which means that it will go back and forth a few times. If you profile a script that uses this underneath, you'll find that it spends about 2/3 of its time in this code. Just use the actual struct module with an __init__ method that unpacks the data into the desired attributes, just as this recipe would. You'll have a much more efficient script.