This recipe presents a way of serializing and de-serializing XML using the marshal module. The XML is first converted to an equivalent Python dictionary, which is then marshaled to serialize it. De-serialization unmarshals the dictionary from the file and then reconstructs the original XML from it.
"""Simple XML marshaling (serializing) and
unmarshaling (de-serializing) module using Python
dictionaries and the marshal module.
"""
from xml.sax.handler import ContentHandler
from xml.sax.saxutils import XMLGenerator
from xml.sax.xmlreader import XMLReader
from xml.sax import make_parser
import marshal
import os,sys,zlib
class XMLDictionaryHandler(ContentHandler):
    """SAX handler which converts an XML document to a Python dictionary.

    Every element is represented as a one-key dictionary::

        {name: [attributes_dict, text, [child_dict, ...]]}

    The dictionary of the document element is available as ``self.d``.
    While the document is being parsed ``text`` is a plain string (set
    when the element closes); when the document ends every ``text`` is
    replaced by its zlib-compressed UTF-8 encoding (see ``packtext``).
    """

    def __init__(self):
        ContentHandler.__init__(self)
        self._reset()

    def _reset(self):
        """Initialise (or clear) all parsing state."""
        self.curr = ''       # name of the most recently opened element
        self.parent = ''     # name of its parent (own name for the root)
        self.count = 0       # number of start tags seen so far
        self.d = {}          # dictionary of the document element
        self.currd = {}      # dictionary of the most recently opened element
        self.parentd = {}    # dictionary of its parent
        self.stack = []      # names of the currently open elements
        self.stack2 = []     # dictionaries of the currently open elements
        self._chunks = []    # one list of text chunks per open element

    def startDocument(self):
        """Reset state so that one handler can parse several documents."""
        self._reset()

    def startElement(self, name, attrs):
        """Start element handler: create the node and link it to its parent."""
        if not self.stack:
            # Document element: its node is the result dictionary itself.
            self.parent = name
            self.d[name] = [dict(attrs), '', []]
            node = self.d
        else:
            node = {name: [dict(attrs), '', []]}
            self.parent = self.stack[-1]
            self.parentd = self.stack2[-1]
            self.parentd[self.parent][2].append(node)
        self.currd = node
        self.curr = name
        self.stack.append(name)
        self.stack2.append(node)
        self._chunks.append([])
        self.count += 1

    def endElement(self, name):
        """End element handler: store the element's text and close it.

        The innermost open element is popped (rather than searched for by
        name) so that an element nested inside another element of the
        same name is handled correctly.
        """
        opened = self.stack.pop()
        node = self.stack2.pop()
        # Join first, strip afterwards: SAX may deliver one text run in
        # several chunks (e.g. around "&amp;"), and stripping each chunk
        # would delete spaces inside the text.  Only XML whitespace is
        # stripped.
        node[opened][1] = ''.join(self._chunks.pop()).strip(' \t\r\n')

    def characters(self, content):
        """Character handler: collect text for the innermost open element."""
        if self._chunks:
            self._chunks[-1].append(content)

    def endDocument(self):
        """End document handler."""
        # Compress all text items
        self.packtext(self.d)

    def packtext(self, map):
        """Recursively replace each node's text by its compressed form.

        ``map`` is a node dictionary as described in the class docstring
        (the parameter name is kept for backward compatibility although
        it shadows the builtin).  Text that is not already a byte string
        is encoded as UTF-8 before being passed to ``zlib.compress``.
        """
        for value in map.values():
            text = value[1]
            if not isinstance(text, bytes):
                text = text.encode('utf-8')
            value[1] = zlib.compress(text)
            for submap in value[2]:
                self.packtext(submap)
class BinXMLSAXParser(XMLReader):
    """A parser for Python binary marshal files representing
    XML information using SAX interfaces.

    The marshaled value is expected to be a dictionary of the form
    ``{name: [attributes_dict, compressed_text, [child_dict, ...]]}``
    where ``compressed_text`` is zlib-compressed, UTF-8 encoded text.
    The dictionary is replayed as SAX events on the content handler,
    with extra ``ignorableWhitespace`` events for pretty printing.
    """

    def __init__(self):
        XMLReader.__init__(self)
        self.depth = 0  # current nesting level, used for indentation

    def parse(self, stream):
        """Load the marshaled dictionary and fire SAX events for it.

        ``stream`` is either an object with a ``read()`` method returning
        the marshaled bytes, or the path of an existing file.

        Raises ``ValueError`` for any other input.  A failure while
        reading or unmarshaling terminates via ``sys.exit`` (raising
        ``SystemExit``), as this parser has always done.
        """
        if hasattr(stream, 'read'):
            # Any file-like object, not only a builtin file
            self.d = self.__unmarshal(stream)
        elif (isinstance(stream, (str, bytes, type(u'')))
                and os.path.exists(stream)):
            # A file path: open it ourselves and always close it again
            handle = open(stream, 'rb')
            try:
                self.d = self.__unmarshal(handle)
            finally:
                handle.close()
        else:
            raise ValueError('BinXMLSAXParserException: Invalid Input Source')
        self.depth = 0
        self._cont_handler.startDocument()
        self.__parse(self.d)
        self._cont_handler.endDocument()

    def __unmarshal(self, fileobj):
        """Return the object marshaled in ``fileobj``; exit on failure."""
        # NOTE(review): marshal is not safe against maliciously crafted
        # data - only feed this parser files from a trusted source.
        try:
            # read() + loads() works with every file-like object, whereas
            # marshal.load() insists on a real file on Python 2.
            return marshal.loads(fileobj.read())
        except Exception as e:
            sys.exit(e)

    def __parse(self, map):
        """Recursive parse method for XML dictionary."""
        handler = self._cont_handler
        for key, value in map.items():
            attrs = value[0]
            text = value[1]
            children = value[2]
            # For pretty printing
            handler.ignorableWhitespace(" " * self.depth)
            # Fire startElement handler event for key
            handler.startElement(key, attrs)
            # Fire character handler event for value; the stored bytes
            # are UTF-8, so decode them to text before reporting them.
            handler.characters(zlib.decompress(text).decode('utf-8'))
            # Nested elements, recursively call this function...
            self.depth += 1
            # For pretty printing
            handler.ignorableWhitespace('\n')
            for child in children:
                self.__parse(child)
            self.depth -= 1
            # For pretty printing
            handler.ignorableWhitespace(" " * self.depth)
            # Fire end element handler event
            handler.endElement(key)
            # For pretty printing
            handler.ignorableWhitespace('\n')
class XMLMarshal(object):
    """The XML marshalling class.

    Groups four static helpers: ``dump``/``dumps`` turn an XML file into
    marshaled data, ``load``/``loads`` turn marshaled data back into XML
    text.  All four terminate via ``sys.exit`` (raising ``SystemExit``)
    when anything goes wrong.
    """

    @staticmethod
    def _xml_to_dict(xmlfile):
        """Parse the XML file at path ``xmlfile`` and return its dictionary."""
        parser = make_parser()
        handler = XMLDictionaryHandler()
        parser.setContentHandler(handler)
        # Binary mode lets the XML parser itself work out the encoding;
        # the file is closed even if parsing fails.
        source = open(xmlfile, 'rb')
        try:
            parser.parse(source)
        finally:
            source.close()
        return handler.d

    @staticmethod
    def dump(stream, xmlfile):
        """Serialize XML data to a file.

        ``stream`` is a writable binary file-like object, ``xmlfile`` the
        path of the XML file to serialize.
        """
        try:
            # write(dumps(...)) produces the same bytes as marshal.dump()
            # but also works with file-like objects that are not files.
            stream.write(marshal.dumps(XMLMarshal._xml_to_dict(xmlfile)))
        except Exception as e:
            sys.exit(e)

    @staticmethod
    def dumps(stream, xmlfile=None):
        """Serialize XML data to a string and return it.

        Historically the signature is ``dumps(stream, xmlfile)``; the
        ``stream`` argument is not needed to build a string and is
        ignored in that form.  When called with a single argument, that
        argument is taken to be the XML file path.
        """
        if xmlfile is None:
            xmlfile = stream
        try:
            return marshal.dumps(XMLMarshal._xml_to_dict(xmlfile))
        except Exception as e:
            sys.exit(e)

    @staticmethod
    def load(stream, out=None):
        """Load an XML binary stream and send XML text to the output
        stream 'out' (``sys.stdout``, looked up at call time, when
        omitted)."""
        if out is None:
            out = sys.stdout
        try:
            p = BinXMLSAXParser()
            p.setContentHandler(XMLGenerator(out))
            p.parse(stream)
        except Exception as e:
            sys.exit(e)

    @staticmethod
    def loads(stream):
        """Load an XML binary stream and return XML text as string."""
        try:
            from cStringIO import StringIO  # Python 2
        except ImportError:
            from io import StringIO  # Python 3
        buf = StringIO()
        try:
            p = BinXMLSAXParser()
            p.setContentHandler(XMLGenerator(buf))
            p.parse(stream)
        except Exception as e:
            sys.exit(e)
        return buf.getvalue()
if __name__ == '__main__':
    # Demo: round-trip sample.xml through its binary representation.
    fname = 'sample.xml'
    binname = os.path.splitext(fname)[0] + '.bin'
    # Dump XML text to binary.  The file is closed (and so flushed) before
    # it is read back, instead of relying on garbage collection.
    with open(binname, 'wb') as binout:
        XMLMarshal.dump(binout, fname)
    # Dump XML binary to text.
    # NOTE: this overwrites the input file with the regenerated XML.
    with open(binname, 'rb') as binin:
        with open('sample.xml', 'w') as xmlout:
            XMLMarshal.load(binin, xmlout)
|
This recipe provides a quick way to convert XML to an equivalent Python dictionary, without using too many objects as intermediate representations. The XML representation is pretty lightweight since we are using a standard Python data structure to represent it.
It also provides a class which can be used to serialize and de-serialize the equivalent dictionary to and from disk using the marshal module. The resulting binary file is nearly 87% of the size of the original XML file in most cases. Though this is not a huge saving in space, the binary file compresses nearly 85% better than the original XML text when using bzip2/gzip.
This recipe can be used in your program if you want a simple representation of an XML file without going through DOM or heavy XML 'objectifiers' such as xml_objectify or for 'quick n dirty' serialization.
version 1.1:
- Take care of unicode encoded characters
- Compress text using zlib to save space
Hi Anand,
Is it possible to get a usage example? I'm a novice at programming and am having difficulty getting the code to work. Say I have a file-like XML object (just retrieved using urllib and urllib2) called brooklyn_weather; how would I then serialize it into a Python object (composed of lists and dictionaries) that I can manipulate and use internally? I don't need to save it to disk or output it as XML afterwards.
Thanks, Hani
An example was provided! Take a look at lines 209-217.
Cheers