A higher level struct module « Python recipes

This recipe provides a higher-level wrapper around the struct module. It offers a more convenient syntax for defining and using structs, and adds features such as:

- Embedding structures within other structures
- Defining arrays of items (or of other structures)
- Class-based syntax, allowing access and updates by field name rather than by position
- Extension of structures by inheritance

      import struct

class Format(object):
    """Byte-order / size prefix characters accepted by the struct module.

    A Struct subclass selects one of these through its ``_format``
    class attribute; the value is prepended to the generated struct
    format string.
    """
    BigEndian       = ">"       # Big-endian byte order, standard size
    LittleEndian    = "<"       # Little-endian byte order, standard size
    StandardNative  = "="       # Native byte order, standard size
    Native          = "@"       # Native byte order, native size
    
class Element(object):
    """A single element (field) in a struct.

    Wraps one struct-module typecode.  Every instance receives a
    monotonically increasing ``id`` at construction time; the metaclass
    sorts a class's elements by that id to recover the order in which
    the fields were written in the class body.

    Attributes:
        id:       creation sequence number of this element.
        typecode: struct format string for this element.
        size:     ``struct.calcsize(typecode)``.  No byte-order prefix
                  is applied, so this is the *native* size.
    """
    id=0    # Class-level counter holding the id of the newest element
    def __init__(self, typecode):
        Element.id+=1           # Note: not thread safe
        self.id = Element.id
        self.typecode = typecode
        self.size = struct.calcsize(typecode)

    def __len__(self):
        """Return the size in bytes of this element."""
        return self.size

    def decode(self, format, s):
        """Additional decode steps once converted via struct.unpack"""
        return s

    def encode(self, format, val):
        """Additional encode steps to allow packing with struct.pack"""
        return val

    def __str__(self):
        """Return the struct typecode, for building format strings."""
        return self.typecode

    def __call__(self, num):
        """Define this as an array of ``num`` elements.

        String-like elements ('s' and 'p') are not turned into arrays:
        the struct module already treats a count in front of them as the
        length of one blob, so a single counted element is returned.
        Any other element is wrapped in an ArrayElement.
        """
        # Exact match only: a substring test such as "in 'sp'" would
        # also accept '' and 'sp'.
        if self.typecode in ('s', 'p'):
            # Keep the original typecode so a Pascal string stays a
            # Pascal string ('8p') rather than degrading to '8s'.
            return Element('%d%s' % (num, self.typecode))
        return ArrayElement(self, num)

    def __getitem__(self, num):
        """Allow ``element[num]`` as a synonym for ``element(num)``."""
        return self(num)

class ArrayElement(Element):
    """A fixed-length array of ``num`` copies of another element.

    To the containing struct the array is a single string blob whose
    length is ``len(basic_element) * num``; decode() and encode()
    convert between that blob and a Python list.

    NOTE: ``len(basic_element)`` is the element's native size (computed
    without a byte-order prefix), while decode()/encode() use the
    ``format`` prefix they are given.
    """
    def __init__(self, basic_element, num):
        blob_size = len(basic_element) * num
        Element.__init__(self, '%ds' % blob_size)
        self.num = num
        self.basic_element = basic_element

    def decode(self, format, s):
        """Unpack blob ``s`` into a list of ``num`` decoded items."""
        # The typecode is repeated rather than prefixed with a count, so
        # typecodes that already carry a number stay correct:
        # 2 * '4s' is '4s4s', whereas '%d%s' % (2, '4s') would be '24s'.
        repeated = self.num * self.basic_element.typecode
        raw_items = struct.unpack('%s%s' % (format, repeated), s)
        decoded = []
        for raw in raw_items:
            decoded.append(self.basic_element.decode(format, raw))
        return decoded

    def encode(self, format, vals):
        """Pack the sequence ``vals`` into a single string blob."""
        fmt = format + (self.basic_element.typecode * self.num)
        encoded = [self.basic_element.encode(format, item) for item in vals]
        return struct.pack(fmt, *encoded)

class EmbeddedStructElement(Element):
    """Element that stores another structure as a fixed-size string blob.

    ``structure`` is the struct class to embed; its ``_struct_size``
    determines the length of the blob.
    """
    def __init__(self, structure):
        blob_code = '%ds' % structure._struct_size
        Element.__init__(self, blob_code)
        self.struct = structure

    # Note: the embedded struct applies its own endianness format; the
    # parent's ``format`` argument is accepted but not used.
    def decode(self, format, s):
        """Build an instance of the embedded struct class from blob ``s``."""
        embedded_cls = self.struct
        return embedded_cls(s)

    def encode(self, format, s):
        """Pack instance ``s`` through the embedded struct class's _pack."""
        pack = self.struct._pack
        return pack(s)

# Maps the friendly type names exposed on ``Type`` to struct typecodes.
name_to_code = {
    'Char'             : 'c',
    'Byte'             : 'b',
    'UnsignedByte'     : 'B',
    'Int'              : 'i',
    'UnsignedInt'      : 'I',
    'Short'            : 'h',
    'UnsignedShort'    : 'H',
    'Long'             : 'l',
    'UnsignedLong'     : 'L',
    'String'           : 's',
    'PascalString'     : 'p',
    'Pointer'          : 'P',
    'Float'            : 'f',
    'Double'           : 'd',
    'LongLong'         : 'q',
    'UnsignedLongLong' : 'Q',
    }

class _UnknownTypeError(AttributeError, KeyError):
    """Raised when an unknown type name is looked up on ``Type``.

    It is an AttributeError so that hasattr(), getattr() with a default
    and other attribute protocols behave normally, and a KeyError so
    that code written against the earlier behaviour keeps working.
    """

class Type(object):
    """Syntax sugar for creating elements, e.g. ``x = Type.Int``.

    Every attribute access named in ``name_to_code`` returns a newly
    created Element, so each field gets its own creation id.
    """
    def __getattr__(self, name):
        """Return a new Element for the type called ``name``.

        Raises:
            AttributeError (also a KeyError): ``name`` is not a known
            type name.
        """
        try:
            typecode = name_to_code[name]
        except KeyError:
            raise _UnknownTypeError(name)
        return Element(typecode)

    def Struct(self, struct):
        """Return an element embedding the struct class ``struct``."""
        return EmbeddedStructElement(struct)

# Replace the class by its only instance so that ``Type.Int`` works.
Type=Type()

class MetaStruct(type):
    """Metaclass that turns Element class attributes into a struct layout.

    On class creation it stores three attributes on the new class:

        _struct_data: struct format string (without byte-order prefix)
                      for all fields, inherited ones first.
        _struct_info: list of (name, Element) pairs in packing order.
        _struct_size: ``struct.calcsize`` of ``_format + _struct_data``.

    The class (or one of its bases) must define ``_format``.
    """
    def __init__(cls, name, bases, d):
        type.__init__(cls, name, bases, d)
        if hasattr(cls, '_struct_data'):  # Allow extending by inheritance
            # Copy, so that the += below does not mutate the parent's list.
            cls._struct_info = list(cls._struct_info)
        else:
            cls._struct_data=''
            cls._struct_info=[]     # name / element pairs

        # Get each Element field, sorted by id, i.e. in the order the
        # elements were created.  items() rather than the Python-2-only
        # iteritems(), so the metaclass also runs on Python 3.
        elems = sorted(((k,v) for (k,v) in d.items()
                        if isinstance(v, Element)),
                        key=lambda x:x[1].id)

        # Strings are immutable, so += rebinds _struct_data on this class
        # only and leaves the parent's value untouched.
        cls._struct_data += ''.join(str(v) for (k,v) in elems)
        cls._struct_info += elems
        # Include the byte-order prefix: padding and standard sizes
        # depend on it.
        cls._struct_size = struct.calcsize(cls._format + cls._struct_data)

class Struct(object):
    """Represent a binary structure.

    Subclasses declare fields as class attributes holding Element
    objects (normally created through ``Type``) and may set ``_format``
    to one of the ``Format`` constants.  Instantiate with the packed
    data to decode it; ``str(instance)`` returns the packed data.
    """
    __metaclass__=MetaStruct
    _format = Format.Native  # Default to native format, native size

    def __init__(self, _data=None, **kwargs):
        """Decode ``_data`` into fields, then apply keyword overrides.

        Args:
            _data:    packed structure of exactly ``_struct_size`` bytes;
                      when None, an all-zero buffer is decoded instead.
            **kwargs: attribute values set after decoding.

        Raises:
            struct.error: ``_data`` does not match the struct layout.
        """
        if _data is None:
            _data ='\0' * self._struct_size

        fieldvals = zip(self._struct_info, struct.unpack(self._format +
                                             self._struct_data, _data))
        for (name, elem), val in fieldvals:
            setattr(self, name, elem.decode(self._format, val))

        for k,v in kwargs.iteritems():
            setattr(self, k, v)

    def _pack(self):
        """Return the packed representation of the current field values.

        Raises:
            struct.error: a field value cannot be packed.  Where the
            offending field can be identified, the message names it.
        """
        encoded = []
        for (name, elem) in self._struct_info:
            value = getattr(self, name)
            try:
                encoded.append(elem.encode(self._format, value))
            except struct.error:
                # Arrays pack themselves inside encode(); report which
                # field failed instead of a bare struct error.
                raise struct.error("cannot pack field %r of %s: bad value %r"
                                   % (name, self.__class__.__name__, value))
        try:
            return struct.pack(self._format + self._struct_data, *encoded)
        except struct.error:
            # Error path only: retry each field separately to find the
            # one that is at fault.
            for (name, elem), value in zip(self._struct_info, encoded):
                try:
                    struct.pack(self._format + str(elem), value)
                except struct.error:
                    raise struct.error(
                        "cannot pack field %r of %s: bad value %r"
                        % (name, self.__class__.__name__, value))
            raise   # No single field failed on its own: keep original error

    def __str__(self):
        """Return the packed structure."""
        return self._pack()

    def __repr__(self):
        """Return ``ClassName(<repr of packed data>)``."""
        return "%s(%r)" % (self.__class__.__name__, self._pack())
    
###################################################################
#  End of implementation - usage examples follow:
###################################################################

###################################################################
#
# Usage
#
# Using the above code, we can now define structures in a
# more readable class based syntax.  For example:
###################################################################
    
# Two little-endian shorts, packed in definition order: x, then y.
class Point(Struct):
    _format = Format.LittleEndian
    x = Type.Short
    y = Type.Short

# Decode from packed data: x = 1, y = 2.
p = Point('\x01\x00\x02\x00')

print p.x, p.y   # Prints "1 2"
p.x, p.y = 100,200
print repr(p)     # Prints "Point('d\x00\xc8\x00')"

# str() gives the packed data, identical to the hand-written struct call.
assert(struct.pack('<hh',100,200) == str(p))
    
###################################################################
#
# Arrays and Embedded structures
#
# You can also embed arrays, (and arrays of arrays), as well
# as other structures within your struct definition.
###################################################################

# An 8-byte string, a big-endian int, then an array of 4 embedded Points.
class Shape(Struct):
    _format = Format.BigEndian
    name      = Type.String[8]
    numpoints = Type.Int
    points    = Type.Struct(Point)[4] # Array of 4 points.

# 28 bytes: 8 (name) + 4 (numpoints) + 4 * 4 (points).
s=Shape('Triangle\x00\x00\x00\x03\x00\x00\x00\x00\x05\x00\x05\x00\n\x00'
        '\x00\x00\x00\x00\x00\x00')

# This will print "Triangle [(0, 0), (5, 5), (10, 0)]"
print s.name, [(p.x, p.y) for p in s.points[:s.numpoints]]

# The same structure could be created as:
# (the array holds 4 points, so all 4 must be given even though only
# numpoints of them are meaningful)
s2=Shape(name='Triangle', numpoints=3, points=[
                                         Point(x=0,y=0),
                                         Point(x=5,y=5),
                                         Point(x=10,y=0),
                                         Point(x=0,y=0)])

assert str(s2) == str(s)

# Note that even though Shape is in BigEndian format, the Points
# keep their LittleEndian setting, so mixing formats is possible,
# and the same struct will always have the same representation
# regardless of its context.  Hence the following is true:

assert str(s.points[1]) == str( Point(x=5, y=5))

# It is also possible to define multi-dimensional arrays,
# which will be unpacked as lists of lists.
# In addition, it is possible to add methods and non-struct
# instance variables without interfering with the structure
# (Unless you overwrite structure field names of course)

# A 9-byte structure decoded as a list of 3 rows of 3 chars each.
class TicTacToe(Struct):
    board = Type.Char[3][3] # 3x3 array of chars

    # Not an Element, so the structure neither packs nor unpacks it.
    ignored = 'This is not packed / unpacked by the structure'

    # Print the board, one row per line.
    def display(self):
        print '\n'.join(''.join(row) for row in self.board)

game = TicTacToe('X.O.X...O')
print game.board  # [['X', '.', 'O'], ['.', 'X', '.'], ['.', '.', 'O']]

game.display()
# Prints: X.O
#         .X.
#         ..O

# Rows are ordinary lists, so they can be modified in place.
game.board[0][1] = 'X'
game.display()
# Prints: XXO
#         .X.
#         ..O
print str(game) # Prints the packed board: XXO.X...O


###################################################################
#
# Inheritance
#
# Structures may also be inherited from, in which case, additional
# fields will occur after the existing ones.
#
###################################################################

# Inherits x, y and the little-endian format from Point; z is packed last.
class Point3D(Point):
    z = Type.Short

# No packed data given: fields start zeroed, then the keywords are applied.
p = Point3D(x=1, y=2, z=3)

print repr(p)   # Prints "Point3D('\x01\x00\x02\x00\x03\x00')"

      

The standard struct module is useful when dealing with C structs, and various file and network formats, but is rather awkward to work with. It is fairly low-level, using strings of character codes to describe the structure, and unpacks to a tuple, rather than an object with appropriately named fields. This means that access to items is always by position, rather than the field name, as would be used in the C code.

It also fares badly when dealing with arrays, and embedded structures. Dealing with these within the struct module means each element becomes effectively the same as a top-level field, flattening their structure and losing the organisation of elements. This all results in code that is much messier and hard to maintain than the equivalent C code.

This recipe builds on top of the struct module and provides syntax for declaration of structures that is closer to the corresponding C code, and allows for more complex structures to be defined.

Structures are read by instantiating the appropriate class with the binary struct data, and packed by calling str(aStruct). I use the above code as a module named "structure".

Implementation:

Structs are defined by lists of Element objects. Each element contains the struct code for the type, and a unique id, incremented after each instantiation, which is used so that the elements can be sorted into the same order that they were defined within a struct. They also define __getitem__, returning another Element subclass, ArrayElement, dealing with arrays of simple elements. Another Element subclass, EmbeddedStructElement is used to represent substructures.

The Type object provides some syntax sugar for constructing these elements, allowing "x = Type.Int", instead of "x = Element("i")"

The MetaStruct metaclass finds all class attributes that are instances of Element (or of one of its subclasses), sorts them into definition order, and uses them to build the corresponding struct format string and the corresponding list of (name, Element) pairs.

Finally, the Struct class defines appropriate __init__ and __str__ methods, which use the information generated by the metaclass to decode and encode the structure. Arrays and substructures are treated as strings of the appropriate size, and their elements implement encode() and decode() methods which transform between that string and the corresponding data.

Warnings and Caveats:

There are a few flaws in the above code currently. The main one is that there is no validation when setting struct fields. For instance, given a struct like:

>>> class MyStruct(Struct):
...     name = Type.String[8]
...     values = Type.Int[4]



>>> a=MyStruct()  # Initialises with everything zeroed.

There is nothing preventing you from doing:

>>> a.values = [1,2,3,4,5] # One too many items.

This will now fail when trying to pack the structure with a struct error which doesn't give you any clue as to which field is corrupt. Probably the main thing that should be done is either some kind of pre-validation of items, or at least error handling that mentions what field is invalid. Worse is that doing:

>>> a.name = "thisnameistoobig"

will not raise an error even when packing the structure, but will instead silently truncate the name in the packed representation.

Another thing to be careful of is that duplicating names will cause the last definition to be used, rather than resulting in multiple elements, or giving an error. For instance:

>>> class MyStruct(Struct):
...     item1=Type.Char[5]
...     pad = Type.Char[3]
...     item2=Type.Char[1]
...     pad  = Type.Char[3]
...     item3=Type.Char[5]

Here the first pad bytes will not be used, as the second definition of pad will override the name.

Also, note that you must create element objects (ie Type.Int) within the struct, rather than reusing them. For instance:

>>> myArrayType = Type.Int[8] # define an 8 element array
>>> class MyStruct(Struct):
...     name  = Type.String
...     items = myArrayType         # Don't do this

Here items will actually be packed before name, as it was created earlier, so the definition order will not match the packed order.

Finally, note that padding and alignment will be handled the same as the struct module.

Tags: database

21 comments

Victor Stinner 17 years, 7 months ago # | flag

Very interesting code :-). Class MetaStruct is interesting. I didn't know that it's possible to get attributes in the order that they are defined. I have to play with metaclasses :-) You should take a look at the pyConstruct project: http://pyconstruct.wikispaces.com

I'm working on the same subject but with different syntax (different approach): http://hachoir.org/

Hachoir is a lazy-parser and fault tolerant. It allows to edit data and have tree organization with nice Python API.

Haypo

Chris Niekel 17 years, 7 months ago # | flag

Dynamic arrays? Would it be possible to allow the arrays to be dynamic? I know some binary formats that use that, like

number_fields: integer
dates: date[number_fields]

I haven't found any struct-replacement that does that.

Brian McErlean (author) 17 years, 7 months ago # | flag

Dynamic arrays. I think this would be fairly hard to do. Currently I'm relying on knowing the sizes of various substructures etc. in advance, in order to know how to represent them in containing structures. I think it should be possible (given some restrictions like the count appearing before the array in the struct), but would probably require a different approach.

Brian McErlean (author) 17 years, 7 months ago # | flag

Definition order. Getting the definition order requires a bit of a cheat. The idea is to create an object, and track the order they were created in, and then later sort based on this order. It does limit the syntax you can use to something that can create and return a new object though: "x=Foo.attr", "x=Foo()" and "x=Foo[1]" would all work given an appropriate Foo , but you can't use just "x = Foo".

pyConstruct looks pretty neat. I'll check it out.

Victor Stinner 17 years, 7 months ago # | flag

Both Hachoir and pyConstruct allow dynamic structure. pyConstruct uses Python eval() function to access to other fields:

>>> # a TLV is a type-length-value entity. the length of the value is specified
... # by the 'length' field
... tlv = Struct("tlv",
...     Byte("type"),
...     Byte("length"),
...     MetaBytes("value", "_.length"),
... )

Hachoir's approach is different: you access the structure directly using the [] operator (self["name"]):

class Chunk(FieldSet):
   def createFields(self):
      yield UInt32(self, "size")
      yield String(self, "tag", 4, charset="ASCII")
      yield RawBytes(self, "data", self["size"].value)

Where self["size"] is a Field object and has many attributes: value, address, absolute_address, parent, display (unicode string), etc. I don't have enough place here to explain all Hachoir internals :-)

Haypo

Brian McErlean (author) 17 years, 7 months ago # | flag

Just to follow up. In fact, pyConstruct, linked to in the post above by Victor Stinner looks like it would handle this. I think the equivalent of your example would be something like:

Date=LittleFloat64("timestamp")

DynamicArray=Struct("dynamic_array",
            UInt32("number_fields"),
            MetaRepeater("_.number_fields", Date)
)

s= DynamicArray.build( Container(number_fields=2,
                           timestamp=[time.time(), time.time()]))

print repr(s)
print DynamicArray.parse(s)

Chris Niekel 17 years, 7 months ago # | flag

Cool, thanks for the tips, I'll look into them!

Alain Pointdexter 17 years, 7 months ago # | flag

Very nice recipe. This is a very nice recipe, worth the inclusion in the standard library!

I have a remark though. Is it possible to split the construction by keywords from the construction by decoding.

I mean:

s1=Shape(name='Triangle', numpoints=3, points=[ Point(x=0,y=0), Point(x=5,y=5), Point(x=10,y=0), Point(x=0,y=0)])

and

s2=Shape.decode('Triangle\x00\x00\x00\x03\x00\x00\x00\x00\x00\x05\x00\x05\x00\x0A' '\x00\x00\x00\x00\x00\x00')

This would be neater as you don't know in advance the contents of the string buffer and you might have a higher-level decode resting on several lower-level decode This is a very nice recipe, worth the inclusion in the standard library!

I have a remark though. Is it possible to split the construction by keywords from the construction by decoding.

I mean:

s1=Shape(name='Triangle', numpoints=3, points=[ Point(x=0,y=0), Point(x=5,y=5), Point(x=10,y=0), Point(x=0,y=0)])

and

s2=Shape.decode('Triangle\x00\x00\x00\x03\x00\x00\x00\x00\x00\x05\x00\x05\x00\x0A' '\x00\x00\x00\x00\x00\x00')

This would be neater as you don't know in advance the contents of the string buffer and you might have a higher-level decode resting on several lower-level decode

Alain Pointdexter 17 years, 7 months ago # | flag

inheritance bug.

Once you call Point3D, the parent class Point gets corrupted and it gets impossible to call it with only 2 arguments.
Alain

Brian McErlean (author) 17 years, 7 months ago # | flag

I'm not seeing it. Could you give some example code that fails. I've tried both

p = Point(x=1, y=2)
p = Point('\x01\x00\x02\x00')

after the same code as above, and both seem to work. I did have a similar bug when developing it, but fixed it by taking a copy of _struct_info for the subclass (previously it was mutating the parent class's list). Is this still happening somewhere?

Brian McErlean (author) 17 years, 7 months ago # | flag

decode method. It should be pretty simple to do - just move the _data parameter and all but the "for k,v in kwargs.iteritems():" loop out of __init__ and into a new decode classmethod. I'm not sure what you mean by your use case though - wouldn't that also be possible through __init__ too?

The main reason I went with __init__ and __str__ overloads rather than pack / unpack methods was that I didn't want to add anything into the public namespace of the class, as it would prevent defining struct fields with the same name. If there's a good reason though, perhaps this isn't that important.

Alain Pointdexter 17 years, 6 months ago # | flag

Sorry. Stupid me, I was experimenting with the code and i broke it myself ! Sorry !

Igor Lvovsky 17 years, 6 months ago # | flag

Strange behaviour of class Point. Can you please explain me some strange behaviour.

I try to play with the Point class.

The code is:

class Point(Struct):

_format = Format.LittleEndian

x = Type.Short

a = Type.Char        # the place is important

y = Type.Short

p = Point()

p.x, p.y = 100,200

print repr(p) # Prints "Point('d\x00\xc8\x00')

I have error (of struct.unpack(...)) in __init__ of Struct.

If I change the _format = Format.Native error is gone.

If I put the a = Type.Char after the y = Type.Short the error is gone too.

Igor Lvovsky 17 years, 6 months ago # | flag

Strange behaviour of class Point. Can you please explain me some strange behaviour.

I try to play with the Point class.

The code is:

class Point(Struct):

_format = Format.LittleEndian

x = Type.Short

a = Type.Char        # the place is important

y = Type.Short

p = Point()

p.x, p.y = 100,200

print repr(p) # Prints "Point('d\x00\xc8\x00')

I have error (of struct.unpack(...)) in __init__ of Struct.

If I change the _format = Format.Native error is gone.

If I put the a = Type.Char after the y = Type.Short the error is gone too.

Igor Lvovsky 17 years, 6 months ago # | flag

Strange behaviour of class Point. Can you please explain me some strange behaviour.

I try to play with the Point class.

The code is:

class Point(Struct):

_format = Format.LittleEndian

x = Type.Short

a = Type.Char        # the place is important

y = Type.Short

p = Point()

p.x, p.y = 100,200

print repr(p) # Prints "Point('d\x00\xc8\x00')

I have error (of struct.unpack(...)) in __init__ of Struct.

If I change the _format = Format.Native error is gone.

If I put the a = Type.Char after the y = Type.Short the error is gone too.

Igor Lvovsky 17 years, 6 months ago # | flag

Strange behaviour of class Point. Can you please explain me some strange behaviour.

I try to play with the Point class.

The code is:

class Point(Struct):

_format = Format.LittleEndian

x = Type.Short

a = Type.Char        # the place is important

y = Type.Short

p = Point()

p.x, p.y = 100,200

print repr(p) # Prints "Point('d\x00\xc8\x00')

I have error (of struct.unpack(...)) in __init__ of Struct.

If I change the _format = Format.Native error is gone.

If I put the a = Type.Char after the y = Type.Short the error is gone too.

Brian McErlean (author) 17 years, 6 months ago # | flag

Thats a bug. The padding applied by the struct module is different depending on the format selected. With native format, an extra pad byte is inserted after the char to align it to an even boundary. With the other formats, no padding is done.

I was including the format information when building the string, but not when calculating the size of the structure, so this was always defaulting to native format, giving a _struct_size of 6 when the struct expected 5. This meant that the initialisation string was the wrong size, giving the error you saw.

I've now updated the recipe to fix this. The relevant change was replacing the line:

cls._struct_size = struct.calcsize(cls._struct_data)

with:

cls._struct_size = struct.calcsize(cls._format + cls._struct_data)

Thanks.

Mark Shirley 17 years, 5 months ago # | flag

TypeError: 'Element' object is unindexable. Hello, I'm a beginning Python programmer and am very interested using your 'A higher level struct module' code. My problem may be as simple as version confusion on my part, but I'm stuck. Here's a transcript. I've saved the text source into packclass.py minus the examples at the end.

I can load that file and can successfully do the example with the Point class. However, when I try the Shape example, a class that uses an array type, I get an error.

Is there a workaround?

Thank you,

Mark Shirley

Python 2.5 (r25:51908, Sep 19 2006, 09:52:17) [MSC v.1310 32 bit (Intel)] on win32
Type "help", "copyright", "credits" or "license" for more information.
>>> from packclass import *
>>> class Point(Struct):
    _format = Format.LittleEndian
    x = Type.Short
    y = Type.Short

... ... ... ... >>> p = Point('\x01\x00\x02\x00')
>>> print p.x, p.y
1 2
>>> print repr(p)
Point('\x01\x00\x02\x00')
>>> class Shape(Struct):
    _format = Format.BigEndian
    name      = Type.String[8]
    numpoints = Type.Int
    points    = Type.Struct(Point)[4] # Array of 4 points.

... ... ... ... ... Traceback (most recent call last):
  File "", line 1, in
  File "", line 3, in Shape
TypeError: 'Element' object is unindexable
>>>

Mark Shirley 17 years, 5 months ago # | flag

TypeError: 'Element' object is unindexable.

[Sorry to repeat this.  I submitted it earlier and saw it in the
list of comments, but it's not there now.]

Hello,
    I'm a beginning Python programmer and am very interested using
your 'A higher level struct module' recipe.  My problem may be as
simple as version confusion on my part, but I'm stuck.  Here's a
transcript.  I've saved the text source into packclass.py minus the
examples at the end.

I can load that file and can successfully do the example with the
Point class.  However, when I try the Shape example, a class that
uses an array type, I get an error.

Is there a workaround?
 Thank you,
    Mark Shirley


Python 2.5 (r25:51908, Sep 19 2006, 09:52:17) [MSC v.1310 32 bit (Intel)] on win32
Type "help", "copyright", "credits" or "license" for more information.
>>> from packclass import *
>>> class Point(Struct):
    _format = Format.LittleEndian
    x = Type.Short
    y = Type.Short

... ... ... ... >>> p = Point('\x01\x00\x02\x00')
>>> print p.x, p.y
1 2
>>> print repr(p)
Point('\x01\x00\x02\x00')
>>> class Shape(Struct):
    _format = Format.BigEndian
    name      = Type.String[8]
    numpoints = Type.Int
    points    = Type.Struct(Point)[4] # Array of 4 points.

... ... ... ... ... Traceback (most recent call last):
  File "", line 1, in
  File "", line 3, in Shape
TypeError: 'Element' object is unindexable
>>>

Brian McErlean (author) 17 years, 5 months ago # | flag

I'm not sure. I tried that code here, and I don't get an error. From the exception you're getting, it looks like its not finding __getitem__ on the Element object (I think it is the line "name = Type.String[8]" that is failing)

Could you check that the line

def __getitem__(self, num): return self(num)

is correctly copied below class Element. If it is missing, or not indented with the rest of the Element members, it would cause the error you're seeing.

XE IO 15 years, 5 months ago # | flag

Just a warning to anyone who seriously tries this recipe, it encodes structs to char buffers like so: a long ( l ) into 8s, which means that it will go back and forth a few times. If you profile a script that uses this underneath, you'll find that it spends about 2/3's its time in this code. Just use the actual struct module with an __init__ method that unpacks something just this would into the desired attributes. You'll have a much more efficient script.

◄	Python recipes (4591)	►
◄	Brian McErlean's recipes (3)	►

A higher level struct module (Python recipe) by Brian McErlean
ActiveState Code (http://code.activestate.com/recipes/498149/)

21 comments

Tags

Required Modules

Other Information and Tasks

Accounts

Code Recipes

Feedback & Information

ActiveState

A higher level struct module (Python recipe) by Brian McErlean ActiveState Code (http://code.activestate.com/recipes/498149/)

21 comments

Tags

Required Modules

Other Information and Tasks

Accounts

Code Recipes

Feedback & Information

ActiveState

A higher level struct module (Python recipe) by Brian McErlean
ActiveState Code (http://code.activestate.com/recipes/498149/)