If you want to serialize Python objects to XML, PyXML is a good choice, except when unicode strings come into play. In that case generic.Marshaller().dump() raises an ugly "AttributeError: Marshaller instance has no attribute 'm_unicode'". This recipe extends both the PyXML Marshaller and Unmarshaller to support the de-/serialization of unicode strings. Put the following code in a separate module and test it with the given example. The output will look like <marshal> <list id="i2"> <string>text</string> <unicode>german umlaut: ü ö &lt;&gt;&amp;</unicode> </list> </marshal>
----- unicodemarshal.py -----
from xml.marshal import generic
class UnicodeMarshaller(generic.Marshaller):
    """Marshaller that can additionally serialize ``unicode`` objects.

    The stock ``generic.Marshaller`` has no ``m_unicode`` method, so dumping
    a unicode string fails with an AttributeError. This subclass supplies
    that method.
    """

    # Element name that wraps a unicode value in the generated XML.
    tag_unicode = 'unicode'

    def m_unicode(self, value, dict):
        """Return the XML fragments representing the unicode string *value*.

        value -- the unicode object to serialize.
        dict  -- marshalling state handed in by the caller; not used here.

        Returns a list of three strings: the opening tag, the UTF-8 encoded
        and XML-escaped text, and the closing tag.
        """
        name = self.tag_unicode
        s = value.encode('utf-8')
        # Escape the XML markup characters. '&' has to be handled first,
        # otherwise the ampersands introduced by '&lt;' and '&gt;' would be
        # escaped a second time.
        s = s.replace('&', '&amp;')
        s = s.replace('<', '&lt;')
        s = s.replace('>', '&gt;')
        return ['<' + name + '>', s, '</' + name + '>']
class UnicodeUnmarshaller(generic.Unmarshaller):
    """Unmarshaller that additionally understands ``<unicode>`` elements."""

    def __init__(self):
        # Register the handlers on a private copy of the table. Before the
        # base initializer has run, ``self.unmarshal_meth`` still resolves to
        # the table inherited from the class; adding the entry in place would
        # also change it for every other unmarshaller sharing that table,
        # and those classes do not define the um_*_unicode methods.
        table = dict(self.unmarshal_meth)
        table['unicode'] = ('um_start_unicode', 'um_end_unicode')
        self.unmarshal_meth = table
        # super maps the method names to methods
        generic.Unmarshaller.__init__(self)

    # Opening a <unicode> element needs no special treatment, so the generic
    # start handler of the base class is reused.
    um_start_unicode = generic.Unmarshaller.um_start_generic

    def um_end_unicode(self, name):
        """Finish a ``<unicode>`` element.

        Replaces the list of collected text chunks on top of the data stack
        with their concatenation and switches character accumulation off.

        name -- the element name passed by the caller; not used here.
        """
        ds = self.data_stack
        # NOTE(review): the chunks are presumably unicode objects delivered
        # by the XML parser, which would make the joined result unicode as
        # well -- confirm against the parser in use.
        ds[-1] = ''.join(ds[-1])
        self.accumulating_chars = 0
---- example ----
>>> import sys,codecs
>>> from unicodemarshal import UnicodeMarshaller, UnicodeUnmarshaller
>>>
>>> if hasattr(sys, 'setdefaultencoding'):
...     sys.setdefaultencoding('utf-8')
...
>>>
>>> def openUTF8File(path, mode):
...     fp = codecs.open(filename=path, mode=mode, encoding='utf-8')
...     return fp
...
>>>
>>> myList = ['text',
... u'german umlaut: \xfc \xf6 <>&']
>>>
>>> fp = openUTF8File("test.xml", mode='w')
>>> UnicodeMarshaller().dump(myList, fp)
>>> fp.close()
>>>
>>> fp = openUTF8File("test.xml", mode='r')
>>> myList = UnicodeUnmarshaller().load(fp)
>>> for s in myList:
...     print type(s)
...
<type 'str'>
<type 'unicode'>
>>> fp.close()
|
When the sample runs, it sets the system-wide default encoding to utf-8. To make this possible I commented out the line "del sys.setdefaultencoding" in site.py (Python version 2.4.1). I also use codecs.open() to ensure that the file contents are correctly encoded.