Welcome, guest | Sign In | My Account | Store | Cart
'''
XML2Py - XML to Python de-serialization

This code transforms an XML document into a Python data structure

Usage:
    deserializer = XML2Py()
    python_object = deserializer.parse( xml_string )
    print( xml_string )
    print( python_object )
'''

from lxml import etree

class XML2Py( object ):
    '''
    De-serializes an XML document into nested Python dicts and lists.

    Mapping rules (implemented in _parseXMLNode):
      - an element WITH attributes becomes a dict holding those attributes,
        plus one entry per child element keyed by the child's tag
      - an element WITHOUT attributes becomes a list of its children's values
      - when an element has text, the text is stored INSTEAD of its children:
        appended to the list, or stored under TEXT_KEY in the dict

    Known limitations (kept so the output shape stays unchanged):
      - children of an element that has text are not visited
      - inside a dict, a child whose tag repeats (or equals an attribute
        name) replaces the earlier entry
    '''

    # dict key used for the text of an element that also has attributes;
    # '#' is not allowed in an XML name, so it cannot clash with an
    # attribute name or a child tag
    TEXT_KEY = "#text"

    def __init__( self ):
        '''
        sets up the parser; nothing is processed until parse() is called
        '''
        # blank text nodes between tags are discarded, otherwise the
        # indentation of a pretty-printed document would count as element
        # text and hide the children (see _parseXMLNode)
        self._parser = etree.XMLParser( remove_blank_text=True )
        self._root = None  # root of etree structure
        self.data = None   # where we store the processed Python structure

    def parse( self, xmlString ):
        '''
        processes XML string into Python data structure

        The result is also kept in self.data. It is a one-entry dict that
        maps the root tag to the root's dict/list container. Any error
        raised by the lxml parser propagates unchanged.
        '''
        self._root = etree.fromstring( xmlString, self._parser )
        self.data = self._parseXMLRoot()
        return self.data

    def tostring( self ):
        '''
        creates a string representation using our etree object

        returns None when no document has been parsed yet
        '''
        if self._root is None:
            return None
        return etree.tostring( self._root )

    def _parseXMLRoot( self ):
        '''
        starts processing, takes care of first level idiosyncrasies

        The root is always reported as { root tag : container }, whether
        its container is a dict or a list.
        '''
        rootDict = self._parseXMLNode( self._root )
        return { self._root.tag : rootDict["children"] }

    def _parseXMLNode( self, element ):
        '''
        converts one element (recursively) into a Python structure

        returns { "tag": element tag, "children": container } where the
        container is a dict when the element has attributes, a list
        otherwise
        '''
        # process any tag attributes
        # if we have attributes then the child container is a Dict
        #   otherwise a List
        attributes = element.items()
        if attributes:
            childContainer = dict( attributes )
        else:
            childContainer = []

        if element.text:
            # tag that contains text: the text wins over any children
            if isinstance( childContainer, dict ):
                # a dict has no append(); keep the text next to the
                # attributes under a key no attribute can use
                childContainer[ self.TEXT_KEY ] = element.text
            else:
                childContainer.append( element.text )

        else:
            # tag might have children, let's process them
            # (iterating the element yields its direct children)
            for child_elem in element:

                childDict = self._parseXMLNode( child_elem )

                # let's store our child based on container type
                if isinstance( childContainer, dict ):
                    # these children are lone tag entities ( eg, 'copyright' )
                    childContainer[ childDict["tag"] ] = childDict["children"]

                else:
                    # these children are repeated tag entities ( eg, 'format' )
                    childContainer.append( childDict["children"] )

        return { "tag": element.tag, "children": childContainer }


def main():
    '''
    demo: parses a sample XML document, then prints the XML source
    followed by the resulting Python structure
    '''

    xml_string = '''
    <documents>
        <document date="June 6, 2009" title="The Newness of Python" author="John Doe">
            <copyright type="CC" url="http://www.creativecommons.org/" date="June 24, 2009" />
            <text>Python is very nice. Very, very nice.</text>
            <formats>
                <format type="pdf">
                    <info uri="http://www.python.org/newness-of-python.pdf" pages="245" />
                </format>
                <format type="web">
                    <info uri="http://www.python.org/newness-of-python.html" />
                </format>
            </formats>
        </document>
    </documents>
    '''
    deserializer = XML2Py()
    python_object = deserializer.parse( xml_string )
    # single-argument print( ... ) gives the same output on Python 2 and 3,
    # whereas the bare print statement is a syntax error on Python 3
    print( xml_string )
    print( python_object )


# run the demo only when this file is executed as a script, not on import
if __name__ == '__main__':
    main()

Diff to Previous Revision

--- revision 1 2010-06-16 11:06:25
+++ revision 2 2010-06-16 11:39:44
@@ -4,106 +4,106 @@
 This code transforms an XML document into a Python data structure
 
 Usage:
-	deserializer = XML2Py()
-	python_object = deserializer.parse( xml_string )
-	print xml_string
-	print python_object
+    deserializer = XML2Py()
+    python_object = deserializer.parse( xml_string )
+    print xml_string
+    print python_object
 '''
 
 from lxml import etree
 
 class XML2Py():
 
-	def __init__( self ):
+    def __init__( self ):
 
-		self._parser = parser = etree.XMLParser( remove_blank_text=True )
-		self._root = None  # root of etree structure
-		self.data = None   # where we store the processed Python structure
+        self._parser = parser = etree.XMLParser( remove_blank_text=True )
+        self._root = None  # root of etree structure
+        self.data = None   # where we store the processed Python structure
 
-	def parse( self, xmlString ):
-		'''
-		processes XML string into Python data structure
-		'''
-		self._root = etree.fromstring( xmlString, self._parser )
-		self.data = self._parseXMLRoot()
-		return self.data
+    def parse( self, xmlString ):
+        '''
+        processes XML string into Python data structure
+        '''
+        self._root = etree.fromstring( xmlString, self._parser )
+        self.data = self._parseXMLRoot()
+        return self.data
 
-	def tostring( self ):
-		'''
-		creates a string representation using our etree object
-		'''
-		if self._root != None:
-			return etree.tostring( self._root )
+    def tostring( self ):
+        '''
+        creates a string representation using our etree object
+        '''
+        if self._root != None:
+            return etree.tostring( self._root )
 
-	def _parseXMLRoot( self ):
-		'''
-		starts processing, takes care of first level idisyncrasies
-		'''
-		childDict = self._parseXMLNode( self._root )
-		return { self._root.tag : childDict["children"] }
+    def _parseXMLRoot( self ):
+        '''
+        starts processing, takes care of first level idisyncrasies
+        '''
+        childDict = self._parseXMLNode( self._root )
+        return { self._root.tag : childDict["children"] }
 
-	def _parseXMLNode( self, element ):
-		'''
-		rest of the processing
-		'''
-		childContainer = None # either Dict or List
+    def _parseXMLNode( self, element ):
+        '''
+        rest of the processing
+        '''
+        childContainer = None # either Dict or List
 
-		# process any tag attributes
-		# if we have attributes then the child container is a Dict
-		#   otherwise a List
-		if element.items():
-			childContainer = {}
-			childContainer.update( dict( element.items() ) )
-		else:
-			childContainer = []
+        # process any tag attributes
+        # if we have attributes then the child container is a Dict
+        #   otherwise a List
+        if element.items():
+            childContainer = {}
+            childContainer.update( dict( element.items() ) )
+        else:
+            childContainer = []
 
 
-		if element.text:
-			# tag with no attributes and one that contains text
-			childContainer.append( element.text )
+        if element.text:
+            # tag with no attributes and one that contains text
+            childContainer.append( element.text )
 
-		else:
-			# tag might have children, let's process them
-			for child_elem in element.getchildren():
+        else:
+            # tag might have children, let's process them
+            for child_elem in element.getchildren():
 
-				childDict = self._parseXMLNode( child_elem )
+                childDict = self._parseXMLNode( child_elem )
 
-			  # let's store our child based on container type
-				#
-				if isinstance( childContainer, dict ):
-					# these children are lone tag entities ( eg, 'copyright' )
-					childContainer.update( { childDict["tag"] : childDict["children"] } )
+              # let's store our child based on container type
+                #
+                if isinstance( childContainer, dict ):
+                    # these children are lone tag entities ( eg, 'copyright' )
+                    childContainer.update( { childDict["tag"] : childDict["children"] } )
 
-				else:
-					# these children are repeated tag entities ( eg, 'format' )
-					childContainer.append( childDict["children"] )
+                else:
+                    # these children are repeated tag entities ( eg, 'format' )
+                    childContainer.append( childDict["children"] )
 
-		return { "tag":element.tag, "children": childContainer }
+        return { "tag":element.tag, "children": childContainer }
 
 
 def main():
 
-	xml_string = '''
-	<documents>
-		<document date="June 6, 2009" title="The Newness of Python" author="John Doe">
-			<copyright type="CC" url="http://www.creativecommons.org/" date="June 24, 2009" />
-			<text>Python is very nice. Very, very nice.</text>
-			<formats>
-				<format type="pdf">
-					<info uri="http://www.python.org/newness-of-python.pdf" pages="245" />
-				</format>
-				<format type="web">
-					<info uri="http://www.python.org/newness-of-python.html" />
-				</format>
-			</formats>
-		</document>
-	</documents>
-	'''
-	deserializer = XML2Py()
-	python_object = deserializer.parse( xml_string )
-	print xml_string
-	print python_object
+    xml_string = '''
+    <documents>
+        <document date="June 6, 2009" title="The Newness of Python" author="John Doe">
+            <copyright type="CC" url="http://www.creativecommons.org/" date="June 24, 2009" />
+            <text>Python is very nice. Very, very nice.</text>
+            <formats>
+                <format type="pdf">
+                    <info uri="http://www.python.org/newness-of-python.pdf" pages="245" />
+                </format>
+                <format type="web">
+                    <info uri="http://www.python.org/newness-of-python.html" />
+                </format>
+            </formats>
+        </document>
+    </documents>
+    '''
+    deserializer = XML2Py()
+    python_object = deserializer.parse( xml_string )
+    print xml_string
+    print python_object
 
 
 if __name__ == '__main__':
-	main()
+    main()

History