Welcome, guest | Sign In | My Account | Store | Cart

I updated John Bair's xml2obj so that it better supports parsing real XML files. Some of the information for this came from Uche Ogbuji's articles on the web. Enjoy.

Python, 403 lines
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
#!/usr/bin/env python


"""
Program:		XML Routines

Description:	This provides functions and classes to read an XML file building
				a tree of the data file's elements and attributes (to be added!).

Notes:
	--	Think about any of the following:
		--	Read an xml file using xmlData then parse with an LL1 type parser.
		--	Look up MacOSX schema for arrays etc.
		
History:
  2004/10/04	Added "return None" to getFirstChildByName().  Added class to
  				visit the Tree rather than having it embedded in the node
  				class which is a cleaner implementation.

!!! USE AT YOUR OWN RISK !!!
"""

# Module metadata.
# NOTE(review): __version__ is a float here rather than the more usual
# version string -- confirm nothing compares it as text before changing it.
__version__ = 1.0
__license__ = "none"




######################################################################
#								Imports
######################################################################

import		os
import		sys
import		optparse
import		string
import		xml.sax



######################################################################
#							Global Data
######################################################################

# Flags
# fDebug: when non-zero, xmlHandler prints a trace line for every element
# start and end.  Changed through setDebug() or the -d command-line option.
fDebug = 0
# fVerbose: reserved; no code shown here reads it.
fVerbose  = 0



		
######################################################################
#						set the Debug Flag.
######################################################################

def			setDebug( fValue ):
	"""Store 1 in the module-level fDebug flag when fValue is truthy, else 0."""
	global		fDebug

	# bool() applies the same truth test an "if" would; int() maps it to 1/0.
	fDebug = int( bool( fValue ) )



######################################################################
#				Read an XML file building a database tree.
######################################################################

# NOTE:		XML attributes are kept per element as a plain dict (see xmlElement.getAttribute).

#---------------------------------------------------------------------
#							XML Element Class
#---------------------------------------------------------------------

class		xmlElement:
	"""One element of the parsed XML tree.

	Instance attributes:
		name			-- element (tag) name
		parent			-- owning xmlElement, or None for the top of the tree
		dictAttributes	-- dict of attribute name -> value (never None)
		listChildren	-- child xmlElement objects in the order they were added
		data			-- the element's character data, or None if none was set
		dictData		-- empty dict created here; not used by this class
	"""

	def			__init__( self, name, parent, dictAttributes ):
		"""Create a childless element.

		dictAttributes may be None (xmlHandler passes None for its synthetic
		root node); it is replaced by an empty dict so that getAttribute()
		and getAttributeNames() work on every node.
		"""
		self.data = None
		self.name = name
		self.parent = parent
		if dictAttributes is None:
			dictAttributes = { }
		self.dictAttributes = dictAttributes
		self.listChildren = [ ]
		self.dictData = { }


	def			__str__( self ):
		"""Return "name{attributes}: data" plus a newline; the attribute and
		data parts are omitted when empty."""
		szStr = '%s' % (self.name)
		if self.dictAttributes:
			szStr += "%s" % (self.dictAttributes)
		if self.data:
			szStr += ": %s\n" % (self.data)
		else:
			szStr += "\n"
		return szStr


	def			addChild( self, node ):
		"""Append node to this element's children."""
		self.listChildren.append( node )


	def			getAttribute( self, name ):
		"""Return the value of attribute name; raises KeyError if it is absent."""
		return self.dictAttributes[ name ]


	def			getAttributeNames( self ):
		"""Return a list of this element's attribute names."""
		return list( self.dictAttributes.keys( ) )


	def			getChildren( self ):
		"""Return the live list of children (not a copy)."""
		return self.listChildren


	def			getData( self ):
		"""Return the element's character data, or None."""
		return self.data


	def			getFirstChildByName( self, name ):
		"""Return the first child whose name equals name, or None."""
		for oChild in self.iterChildrenByName( name ):
			return oChild
		return None


	def			getName( self ):
		"""Return the element name."""
		return self.name


	def			hasChildByName( self, name ):
		"""Return 1 if at least one child is called name, else 0."""
		for oChild in self.iterChildrenByName( name ):
			return 1
		return 0


	def			iterChildren( self ):
		"""Generate every child in order."""
		for oChild in self.listChildren:
			yield oChild


	def			iterChildrenByName( self, name ):
		"""Generate, in order, the children whose name equals name."""
		for oChild in self.listChildren:
			if name == oChild.name:
				yield oChild


	def			numChildren( self ):
		"""Return the number of children."""
		return len( self.listChildren )


	def			setData( self, data ):
		"""Store data as the element's character data.

		Empty, None or whitespace-only data is ignored, leaving the current
		value untouched.  The text is stored exactly as given: it is no longer
		pushed through encode(), which raised UnicodeEncodeError on non-ASCII
		text under the default ASCII codec, and the str method strip() replaces
		the legacy string.strip() function that Python 3 no longer provides.
		"""
		if data and data.strip( ):
			self.data = data



#---------------------------------------------------------------------
#						XML Element Handler Class
#---------------------------------------------------------------------

class		xmlHandler(xml.sax.ContentHandler):
	"""SAX content handler that builds a tree of xmlElement nodes.

	The tree hangs off a synthetic node named 'root' (returned by xmlRoot());
	every element the parser reports becomes a descendant of that node.
	Element names, attribute names, attribute values and text are stored as
	the parser delivers them; they are not pushed through encode(), which
	raised UnicodeEncodeError on non-ASCII content under the default codec.
	"""

	def			__init__( self ):
		xml.sax.ContentHandler.__init__( self )
		self.iLevel = 0
		self.listKeys = [ ]
		# Stack of the elements currently open; entry 0 is the synthetic root.
		self.listDataStack = [ xmlElement( u'root', None, None ) ]
		# One list of text chunks per open element, parallel to listDataStack,
		# so text seen before a child element is not lost when the child opens.
		self.listTextStack = [ [ ] ]


	def			startElement( self, name, oAttributes ):
		'SAX start element event handler'
		if fDebug:
			print( "startElement( %s )" % (name,) )
		# Copy the attributes into a plain dict, trimming the values and
		# skipping blank attribute names.
		attr = { }
		for oKey in oAttributes.getNames( ):
			if oKey.strip( ):
				attr[oKey] = oAttributes.getValue( oKey ).strip( )
		parent = self.listDataStack[ -1 ]
		node = xmlElement( name, parent, attr )
		parent.addChild( node )
		self.listDataStack.append( node )
		self.listTextStack.append( [ ] )


	def			characters( self, data ):
		'SAX character data event handler'
		# Keep every chunk, whitespace included: the parser may deliver one run
		# of text in several pieces, and discarding the whitespace-only pieces
		# would glue the surrounding words together.
		self.listTextStack[ -1 ].append( data )


	def			endElement( self, name ):
		'SAX end element event handler'
		if fDebug:
			print( "endElement( %s )" % (name,) )

		curData = self.listDataStack.pop( )
		# Join the element's own chunks and trim the surrounding layout
		# whitespace; setData() ignores the result when nothing is left.
		curData.setData( ''.join( self.listTextStack.pop( ) ).strip( ) )


	def			xmlRoot( self ):
		"""Return the synthetic 'root' node at the top of the tree."""
		return self.listDataStack[0]



#---------------------------------------------------------------------
#					Read an XML file function.
#---------------------------------------------------------------------

def			readXmlFile( szFileName ):
	"""Read the XML file szFileName and return the root of the element tree.

	The returned node is the one supplied by xmlHandler.xmlRoot().  Errors
	from open() and from the SAX parser propagate to the caller; the file is
	closed in either case.
	"""

	# Set up a non-namespace-aware parser that feeds an xmlHandler.
	fileParser = xml.sax.make_parser( )
	fileParser.setFeature( xml.sax.handler.feature_namespaces, 0 ) # Turn off namespaces.
	curHandler = xmlHandler( )
	fileParser.setContentHandler( curHandler )

	# Binary mode hands the parser the raw bytes, so the parser rather than
	# the platform's text layer decides how the document is decoded.
	fileIn = open( szFileName, 'rb' )
	try:
		fileParser.parse( fileIn )
	finally:
		# Previously skipped when parse() raised, leaking the handle.
		fileIn.close( )
	return curHandler.xmlRoot( )
		
		
#---------------------------------------------------------------------
#							XML Visit Class
#---------------------------------------------------------------------

class		xmlVisit:
	"""Base class for depth-first walks over a tree of nodes.

	A node only needs a getChildren() method returning its children.
	Subclasses override visitNode(), startChildren() and endChildren().
	"""

	def			_depthFirst( self, curNode ):
		"""Walk the subtree under curNode in depth-first order.

		For every node the calls are: visitNode(node), startChildren(node),
		the same sequence for each child in order, then endChildren(node).
		A true value returned by visitNode() stops the walk at once (no
		further visitNode/startChildren/endChildren calls are made) and is
		returned; otherwise None is returned.  The values returned by
		startChildren() and endChildren() are ignored.

		The walk keeps its own stack instead of recursing, so the depth of
		the tree is not limited by the interpreter's recursion limit.
		"""
		iRc = self.visitNode( curNode )
		if iRc:
			return iRc
		self.startChildren( curNode )
		# Each entry pairs an entered node with an iterator over the children
		# of that node which have not been visited yet.
		listOpen = [ ( curNode, iter( curNode.getChildren( ) ) ) ]
		while listOpen:
			oOwner, iterRest = listOpen[ -1 ]
			fDescended = 0
			for oChild in iterRest:
				iRc = self.visitNode( oChild )
				if iRc:
					return iRc
				self.startChildren( oChild )
				listOpen.append( ( oChild, iter( oChild.getChildren( ) ) ) )
				fDescended = 1
				# Handle the child's subtree before its remaining siblings;
				# iterRest resumes where it stopped once the child is done.
				break
			if not fDescended:
				listOpen.pop( )
				self.endChildren( oOwner )
		return None


	def			depthFirst( self, curNode ):
		"""Walk the tree under curNode; return the true value that stopped
		the walk early, or None when every node was visited."""
		return self._depthFirst( curNode )


	def			endChildren( self, curNode ):
		""" called when a node's children have been processed.  curNode is the
			node that owns the children.
			Override this if necessary
		"""
		pass


	def			startChildren( self, curNode ):
		""" called when a node's children are about to be processed.  curNode is the
			node that owns the children.
			Override this if necessary
		"""
		pass


	def			visitNode( self, curNode ):
		""" called when a node is to be processed; return a true value to stop
			the walk.
			Override this if necessary
		"""
		pass




#---------------------------------------------------------------------
#					Convert the XML Tree to a string.
#---------------------------------------------------------------------

class		xmlTree2String(xmlVisit):
	"""Visitor that renders a tree as text: one line per node, children
	indented two spaces further than their parent."""

	def			convert( self, xmlRoot ):
		""" Return the text rendering of the tree under xmlRoot.
		"""

		# Start from an empty result at indent 0, then let the walk fill it in.
		self.iIndent = 0
		self.szString = ''
		self.depthFirst( xmlRoot )
		return self.szString


	def			visitNode( self, curNode ):
		" Append the line for curNode and indent for its children; returns 0. "

		szLine = ' ' * self.iIndent + "%s" % (curNode.name)
		if curNode.dictAttributes:
			szLine = szLine + "%s" % (curNode.dictAttributes)
		# The line ends with ": data" only when the node carries data.
		szTail = "\n"
		if curNode.data:
			szTail = ": %s\n" % (curNode.data)
		self.szString = self.szString + szLine + szTail
		self.iIndent = self.iIndent + 2
		return 0


	def			endChildren( self, curNode ):
		" Undo the indent that visitNode added for curNode. "

		self.iIndent = self.iIndent - 2


#---------------------------------------------------------------------
#					Convert the XML Tree to keyed data.
#---------------------------------------------------------------------

class		xmlTree2KeyedData(xmlVisit):
	"""Visitor that flattens a tree into entries of an object's __dict__.

	Each node that carries data is stored under a key made of the node names
	on the path from the top of the walk down to that node, joined with '_'.
	"""

	def			makeKeyedData( self, nodeAccum, xmlRoot ):
		"""Walk the tree under xmlRoot, store every keyed value in
		nodeAccum.__dict__, and return nodeAccum."""
		self.nodeAccum = nodeAccum
		self.listKeys = [ ]
		self.depthFirst( xmlRoot )
		return self.nodeAccum


	def			visitNode( self, curNode ):
		"""Push curNode's name on the key path and record its data, if any;
		always returns 0."""
		self.listKeys.append( curNode.name )
		if not curNode.data:
			return 0
		# Key = the names on the current path, joined with underscores.
		szKey = '_'.join( self.listKeys )
		if szKey:
			self.nodeAccum.__dict__[szKey] = curNode.data
		return 0


	def			endChildren( self, curNode ):
		"""Drop curNode's name from the key path once its children are done."""
		self.listKeys.pop( )
		

				

######################################################################
#						Command-line interface
######################################################################


def			main( argV=None ):
	"""Command-line interface: parse an XML file and print its tree.

	argV is the full argument vector including the program name in argV[0];
	sys.argv is used when it is None.  The first positional argument is the
	XML file to read; -d/--debug turns on the debug trace.  Returns 0.  A
	missing file argument is reported through optparse, which prints the
	usage message and exits.
	"""

	if argV is None:
		argV = sys.argv

	# Parse the command line.
	szUsage = "usage: %prog [options] xmlfile"
	oCmdPrs = optparse.OptionParser( usage=szUsage )
	oCmdPrs.add_option( "-d", "--debug", action="store_true",
						dest="fDebug", default=False,
						help="Set debug mode"
	)
	# Leave out the program name so that oArgs holds only real arguments.
	(oOptions, oArgs) = oCmdPrs.parse_args( argV[1:] )
	if oOptions.fDebug:
		# Was "fahData.setDebug( )": fahData is not defined in this module
		# and setDebug() needs its flag argument.
		setDebug( 1 )
		print( "In DEBUG Mode..." )
	if not oArgs:
		oCmdPrs.error( "no XML file given" )

	# Build the XML Tree.  Use the parsed positional argument: argV[1] is an
	# option, not the file, when the command is run as "prog -d file.xml".
	xmlRoot = readXmlFile( oArgs[0] )

	# Now convert it to a string and print that.
	oTree2String = xmlTree2String( )
	szString = oTree2String.convert( xmlRoot )
	print( szString )

	return 0




# Run the command-line interface only when this file is executed as a script;
# main()'s return value (or 0 when it is falsy) becomes the exit status.
if __name__ == "__main__":
	sys.exit( main( sys.argv ) or 0 )

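Using this set of classes, I have successfully parsed several xml files including one that used a recursive definition. For instance, here is a short file and program excerpt to process it.

(The markup of the sample file was stripped when this page was rendered. The element names below are restored from the program excerpt that follows; the XML declaration and the name of the outermost element are a best guess.)

"<?xml version="1.0"?>"
"<packages>"
"  <package>"
"    <name>apache</name>"
"    <category>net-www</category>"
"    <cmd>rc-update add apache2 default</cmd>"
"    <option>fServerApache</option>"
"    <package>"
"      <name>mysql</name>"
"      <category>dev-db</category>"
"      <option>fServerApache</option>"
"    </package>"
"  </package>"
"</packages>"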

Note that you need to remove the quotes, which were not in the original file; they are here only to bypass problems with your browser. Program excerpt:

<pre>

#---------------------------------------------------------------------
#					Emerge a package as needed.
#---------------------------------------------------------------------

def emergePackage( oXmlPackage, oXmlComputer, oXmlUser, oXmlGroup ):
    """Emerge (or only fetch) one package, then do the same for its nested packages.

    oXmlPackage is a 'package' element with 'name' and 'category' children
    and optional 'option', 'cmd' and nested 'package' children.  The package
    is emerged only if oXmlComputer has a child named after every 'option'
    value; otherwise it is fetched when oXmlComputer has an 'fServerPortage'
    child, and skipped when it has not.  oXmlUser and oXmlGroup are only
    passed on to the recursive calls.  Any failing command ends the program
    with exit status 8.
    """

    # Check that the package should be installed.
    fFetch = 0
    if oXmlComputer.hasChildByName( 'fServerPortage' ):
        fFetch = 1
    fEmerge = 1
    for oOption in oXmlPackage.iterChildrenByName( 'option' ):
        if not oXmlComputer.hasChildByName( oOption.getData( ) ):
            fEmerge = 0
            break

    # Get the basic information.
    oName = oXmlPackage.getFirstChildByName( 'name' )
    oCat  = oXmlPackage.getFirstChildByName( 'category' )
    print( ">>>Package= %s" % ( oName.getData(), ) )

    # Emerge the package if needed.
    # NOTE(review): rmwCommon.doCmd() is assumed to return a true value on
    # success -- confirm against rmwCommon.
    if fEmerge:
        szCmd = "emerge --usepkg --noreplace %s/%s" % ( oCat.getData(), oName.getData() )
        if not rmwCommon.doCmd( szCmd ):
            print( "ERROR - failed to emerge %s/%s!" % ( oCat.getData(), oName.getData() ) )
            sys.exit( 8 )
        # Run the package's follow-up commands in document order.
        for oCmd in oXmlPackage.iterChildrenByName( 'cmd' ):
            szCmd = oCmd.getData( )
            if not rmwCommon.doCmd( szCmd ):
                print( "ERROR - cmd(%s) failed for emerge %s/%s!" % ( szCmd, oCat.getData(), oName.getData() ) )
                sys.exit( 8 )
    elif fFetch:
        szCmd = "emerge --fetchonly --noreplace %s/%s" % ( oCat.getData(), oName.getData() )
        if not rmwCommon.doCmd( szCmd ):
            print( "ERROR - failed to emerge --fetchonly %s/%s!" % ( oCat.getData(), oName.getData() ) )
            sys.exit( 8 )
    else:
        print( ">>>>>>>>>>>>>>>> NO MODE for %s/%s" % ( oCat.getData(), oName.getData() ) )

    # if we emerged this package, then we need to emerge its children packages.
    if fEmerge or fFetch:
        for oXmlChild in oXmlPackage.iterChildrenByName( 'package' ):
            emergePackage( oXmlChild, oXmlComputer, oXmlUser, oXmlGroup )

</pre>

1 comment

bob w (author) 17 years, 2 months ago  # | flag

Updated on 2004/10/04. I updated this example to fix a bug in xmlElement.getFirstChildByName() and moved the depth-first-search into its own class making it a cleaner way to use it.