Welcome, guest | Sign In | My Account | Store | Cart

This recipe presents a way of serializing and de-serializing XML using the marshal module. The XML is first converted to an equivalent Python dictionary, which is then marshaled to serialize it. De-serialization unmarshals the dictionary from the file and reconstructs the XML from it.

Python, 217 lines
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
"""Simple XML marshaling (serializing) and
  unmarshaling(de-serializing) module using Python
  dictionaries and the marshal module.
"""

from xml.sax.handler import ContentHandler
from xml.sax.saxutils import XMLGenerator
from xml.sax.xmlreader import XMLReader
from xml.sax import make_parser
import marshal
import os,sys,zlib

class XMLDictionaryHandler(ContentHandler):
    """SAX handler which converts an XML document to a Python dictionary.

    Every element is represented as a single-key dictionary::

        {element_name: [attributes_dict, text, children_list]}

    where ``children_list`` holds the dictionaries of the child elements
    in document order.  The dictionary of the root element is available
    as ``self.d`` once parsing has finished.  At ``endDocument`` every
    ``text`` entry is replaced by its zlib-compressed UTF-8 encoding.

    ``stack`` (element names) and ``stack2`` (element dictionaries) track
    the chain of currently open elements.  ``curr``, ``currd``, ``parent``,
    ``parentd`` and ``count`` are bookkeeping attributes retained for
    backward compatibility; the two stacks are what the handler relies on.
    """

    # Only ASCII whitespace is trimmed from element text, so that other
    # Unicode space characters (e.g. U+00A0) survive a round trip.
    _WHITESPACE = ' \t\n\r\x0b\x0c'

    def __init__(self):
        ContentHandler.__init__(self)
        self._reset()

    def _reset(self):
        """ (Re)initialise all parsing state """

        self.curr = ''
        self.parent = ''
        self.count = 0
        self.d = {}
        self.currd = {}
        self.parentd = {}
        self.stack = []
        self.stack2 = []

    def startDocument(self):
        """ Start document handler """

        # Start from a clean slate so that one handler instance can be
        # used to parse several documents in a row.
        self._reset()

    def startElement(self, name, attrs):
        """ Start element handler

        Creates the ``[attributes, text, children]`` record for the new
        element and links it below the innermost open element (or makes
        it the root when no element is open).
        """

        node = [dict(attrs), '', []]

        if self.stack:
            self.parent = self.stack[-1]
            self.parentd = self.stack2[-1]
            entry = {name: node}
            self.parentd[self.parent][2].append(entry)
        else:
            # Root element: it lives directly in self.d
            self.parent = name
            self.d[name] = node
            entry = self.d

        self.currd = entry
        self.stack.append(name)
        self.stack2.append(entry)

        self.curr = name
        self.count += 1

    def endElement(self, name):
        """ End element handler """

        if not self.stack:
            return

        # Always close the innermost open element.  Removing "by name"
        # would pick the wrong entry when an element is nested inside
        # another element of the same name.
        closed_name = self.stack.pop()
        closed = self.stack2.pop()

        # The text of the element is complete now; trim it once here
        # rather than per SAX chunk, so whitespace at chunk boundaries
        # (e.g. around entity references) is preserved.
        node = closed[closed_name]
        node[1] = node[1].strip(self._WHITESPACE)

        if self.stack:
            self.curr = self.stack[-1]
            self.currd = self.stack2[-1]

    def characters(self, content):
        """ Character handler

        Appends ``content`` to the text of the innermost open element.
        SAX may deliver the text of one element in several chunks.
        """

        if content and self.stack:
            node = self.stack2[-1][self.stack[-1]]
            node[1] += content

    def endDocument(self):
        """ End document handler """

        # Compress all text items
        self.packtext(self.d)

    def packtext(self, map):
        """ Recursively replace the text of every element in ``map``
        by its zlib-compressed UTF-8 encoding.

        ``map`` is an element dictionary as built by this handler.  (The
        parameter name shadows the builtin but is kept so that keyword
        callers continue to work.)
        """

        for value in map.values():
            text = value[1]
            if not isinstance(text, bytes):
                text = text.encode('utf-8')
            value[1] = zlib.compress(text)
            for submap in value[2]:
                self.packtext(submap)
        
class BinXMLSAXParser(XMLReader):
    """A parser for Python binary marshal files representing
    XML information using SAX interfaces.

    The marshaled data is a nested dictionary in which every element is
    a single-key dictionary ``{name: [attributes, text, children]}`` and
    ``text`` is zlib-compressed UTF-8.  Parsing replays that structure as
    SAX events on the registered content handler.
    """

    def __init__(self):
        XMLReader.__init__(self)
        # Current nesting level, used only for indentation
        self.depth = 0

    def parse(self, stream):
        """ Parse Method

        ``stream`` is either an object with a ``read()`` method that
        returns the marshaled bytes, or the path of an existing file.

        Raises ``ValueError`` if ``stream`` is neither.  Errors raised
        while reading or unmarshaling the data propagate to the caller.
        """

        # NOTE: the marshal module is not safe against malicious input;
        # only load data from trusted sources.

        # Check if it is a file-like object
        if hasattr(stream, 'read'):
            data = stream.read()

        # Check if it is a file path
        elif (isinstance(stream, (type(''), type(u''), bytes))
              and os.path.exists(stream)):
            with open(stream, 'rb') as infile:
                data = infile.read()
        else:
            raise ValueError('BinXMLSAXParserException: Invalid Input Source')

        self.d = marshal.loads(data)
        self.depth = 0

        self._cont_handler.startDocument()
        self.__parse(self.d)
        self._cont_handler.endDocument()

    def __parse(self, node):
        """ Recursive parse method for
        XML dictionary """

        handler = self._cont_handler

        for key, value in node.items():
            attrs, text, children = value[0], value[1], value[2]

            # For pretty printing
            handler.ignorableWhitespace(" " * self.depth)
            # Fire startElement handler event for key
            handler.startElement(key, attrs)
            # Fire character handler event for value; the text was
            # stored as compressed UTF-8, handlers expect decoded text
            handler.characters(zlib.decompress(text).decode('utf-8'))
            # For pretty printing
            handler.ignorableWhitespace('\n')

            # Nested elements, recursively call this function...
            self.depth += 1
            for child in children:
                self.__parse(child)
            self.depth -= 1

            # For pretty printing
            handler.ignorableWhitespace(" " * self.depth)
            # Fire end element handler event
            handler.endElement(key)
            # For pretty printing
            handler.ignorableWhitespace('\n')

class XMLMarshal(object):
    """ The XML marshalling class

    All methods are static.  On any error they terminate the program
    via ``sys.exit`` with the exception as the exit message.
    """

    @staticmethod
    def _xml_to_dict(xmlfile):
        """ Parse the XML file at path 'xmlfile' and return its
        dictionary representation """

        handler = XMLDictionaryHandler()
        parser = make_parser()
        parser.setContentHandler(handler)
        # Binary mode lets the XML parser honour the document's own
        # encoding declaration; 'with' guarantees the file is closed.
        with open(xmlfile, 'rb') as source:
            parser.parse(source)
        return handler.d

    @staticmethod
    def dump(stream, xmlfile):
        """ Serialize XML data to a file

        'stream' is a writable binary file object, 'xmlfile' the path
        of the XML file to serialize.
        """

        try:
            stream.write(marshal.dumps(XMLMarshal._xml_to_dict(xmlfile)))
        except Exception as e:
            sys.exit(e)

    @staticmethod
    def dumps(stream, xmlfile):
        """ Serialize XML data to a string

        Returns the marshaled representation of the XML file at path
        'xmlfile'.  'stream' is not used; the parameter is retained
        only for backward compatibility.
        """

        try:
            return marshal.dumps(XMLMarshal._xml_to_dict(xmlfile))
        except Exception as e:
            sys.exit(e)

    @staticmethod
    def load(stream, out=sys.stdout):
        """ Load an XML binary stream
        and send XML text to the output
        stream 'out'

        'stream' is passed unchanged to BinXMLSAXParser.parse.
        """

        try:
            p = BinXMLSAXParser()
            p.setContentHandler(XMLGenerator(out))
            p.parse(stream)
        except Exception as e:
            sys.exit(e)

    @staticmethod
    def loads(stream):
        """ Load an XML binary stream
        and return XML text as string

        'stream' is passed unchanged to BinXMLSAXParser.parse.
        """

        # cStringIO only exists on Python 2
        try:
            from cStringIO import StringIO
        except ImportError:
            from io import StringIO

        c = StringIO()

        try:
            p = BinXMLSAXParser()
            p.setContentHandler(XMLGenerator(c))
            p.parse(stream)
        except Exception as e:
            sys.exit(e)

        return c.getvalue()

if __name__ == '__main__':

    # Round-trip demo: sample.xml -> sample.bin -> sample.xml.
    # Note that the second step overwrites the input file with the
    # regenerated XML.
    fname = 'sample.xml'
    binname = os.path.splitext(fname)[0] + '.bin'

    # Dump XML text to binary; the 'with' block makes sure the data is
    # flushed and the file closed before it is read back below
    with open(binname, 'wb') as binfile:
        XMLMarshal.dump(binfile, fname)
    # Dump XML binary to text
    with open(binname, 'rb') as binfile:
        with open(fname, 'w') as xmlout:
            XMLMarshal.load(binfile, xmlout)

This recipe provides a quick way to convert XML to an equivalent Python dictionary, without using too many objects as intermediate representations. The XML representation is pretty lightweight since we are using a standard Python data structure to represent it.

It also provides a class which can be used to serialize and de-serialize the equivalent dictionary to disk using the marshal module. The resulting binary file is about 87% of the size of the original XML file in most cases. Although this is not a huge saving in space, the binary file compresses nearly 85% better than the original XML text when using bzip2/gzip.

This recipe can be used in your program if you want a simple representation of an XML file without going through DOM or heavy XML 'objectifiers' such as xml_objectify or for 'quick n dirty' serialization.

version 1.1:

  • Take care of unicode encoded characters
  • Compress text using zlib to save space

2 comments

Hani Musallam 15 years, 7 months ago  # | flag

Hi Anand,

Is it possible to get a usage example. I'm a novice at programming and am having difficulty getting the code to work. Say I have a filelike xml object (just retrieved using urllib and urlib2) called brooklyn_weather, how would I then serialize it into a python object (composed of lists and dictionaries) that I can manipulate and use internally. I don't need to save it to disk or to output as xml after.

Thanks, Hani

designcurve 14 years, 7 months ago  # | flag

An example was provided! Take a look at lines 209-217.

Cheers