import StringIO, sys
from xml import sax
from xml.sax import handler, saxutils, xmlreader
RDF_NS = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
class RDFFilter (saxutils.XMLFilterBase):
def __init__ (self, *args):
saxutils.XMLFilterBase.__init__(self, *args)
self.in_rdf_stack = [False]
def startElementNS (self, (uri, localname), qname, attrs):
if uri == RDF_NS or self.in_rdf_stack[0] == True:
self.in_rdf_stack.insert(0, True)
return
# Delete attributes that belong to the RDF namespace
dict = {}
for key, value in attrs.items():
uri, localname = key
if uri != RDF_NS:
dict[key] = value
attrs = xmlreader.AttributesNSImpl(dict, attrs.getQNames())
self.in_rdf_stack.insert(0, self.in_rdf_stack[0])
saxutils.XMLFilterBase.startElementNS(self,
(uri, localname), qname, attrs)
def characters(self, content):
if self.in_rdf_stack[0]:
return
saxutils.XMLFilterBase.characters(self, content)
def endElementNS (self, (uri, localname), qname):
if self.in_rdf_stack.pop(0) == True:
return
saxutils.XMLFilterBase.endElementNS(self,
(uri, localname), qname)
def filter_rdf (input, output):
"""filter_rdf(input:file, output:file)
Parses the XML input from the input stream, filtering out all
elements and attributes that are in the RDF namespace.
"""
output_gen = saxutils.XMLGenerator(output)
parser = sax.make_parser()
filter = RDFFilter(parser)
filter.setFeature(handler.feature_namespaces, True)
filter.setContentHandler(output_gen)
filter.setErrorHandler(handler.ErrorHandler())
filter.parse(input)
if __name__ == '__main__':
TEST_RDF = '''<?xml version="1.0"?>
<metadata xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:dc="http://purl.org/dc/elements/1.1/">
<title> This is non-RDF content </title>
<rdf:RDF>
<rdf:Description rdf:about="%s">
<dc:Creator>%s</dc:Creator>
</rdf:Description>
</rdf:RDF>
<element />
</metadata>
'''
input = StringIO.StringIO(TEST_RDF)
filter_rdf(input, sys.stdout)