Colorize xml source using the sax ContentHandler, LexicalHandler and Declhandler interfaces « Python recipes

This recipe take an xml file as input and output a colorized version of this file, using html or docbook (with emphasis elements and a particular role). It provides a little command line interface and it's really easy to configure your output.

      #!/usr/bin/python2.1 
import sys 
from xml.dom.ext import SplitQName 
from xml.sax.handler import ContentHandler 
from xml.sax.saxutils import escape

_ROOT, _STRING, _COMMENT, _NAME, _KEYWORD, _TEXT, _HEAD =0,1,2,3,4,5,6 
DOCBOOK = {
    _ROOT: ('<programlisting>','</programlisting>'),
    _STRING: ('<emphasis>', '</emphasis>'),
    _COMMENT:('<emphasis>', '</emphasis>'),
    _NAME:  ('', ''),
    _KEYWORD:('<emphasis role="bold">', '</emphasis>'),
    _TEXT:  ('', '')
    } HTML = {
    _ROOT: ('<div>', '</div>'),
    _STRING: ('<font color="#004080">', '</font>'),
    _COMMENT:('<font color="#008000">', '</font>'),
    _NAME:  ('', ''),
    _KEYWORD:('<font color="#C00000">', '</font>'),
    _TEXT:  ('', '')
    } 

class XmlFormatSaxHandler(ContentHandler):
    ''' format an xmlfile to docbook or html '''
    
    def __init__(self, head=1, output=sys.stdout, encoding='UTF-8'):
        self._out = output
        self._cod = encoding
        self._o_d = DOCBOOK
        self._in_cdata = 0
        self._in_entity = 0

    def set_format(self, format):
        if format == 'docbook':
            self._o_d = DOCBOOK
        if format == 'html':
            self._o_d = HTML
            
    ## content handler #####################################################
    def startDocument(self):
        self._out.write(self._o_d[_ROOT][0])
            
    def endDocument(self):
        self._out.write(self._o_d[_ROOT][1])
        
    def startElement(self, name, attrs):
        prefix, local = SplitQName(name)
        if prefix:
            self._out.write('&lt;%s%s%s:%s%s%s'.encode(self._cod) % (
                self._o_d[_KEYWORD][0], prefix, self._o_d[_KEYWORD][1],
                self._o_d[_NAME][0], local, self._o_d[_NAME][1]))
        else:
            self._out.write('&lt;%s%s%s'.encode(self._cod) % (
                self._o_d[_NAME][0], local, self._o_d[_NAME][1]))
        for key, val in attrs.items():
            prefix, local = SplitQName(key)
            if prefix:
                self._out.write('%s%s%s:%s%s%s=%s"%s"%s'.encode(self._cod) % (
                    self._o_d[_KEYWORD][0], prefix, self._o_d[_KEYWORD][1],
                    self._o_d[_NAME][0], local, self._o_d[_NAME][1],
                    self._o_d[_STRING][0], val, self._o_d[_STRING][1]))
            else:
                self._out.write(' %s%s%s=%s"%s"%s'.encode(self._cod) % (
                    self._o_d[_NAME][0], local, self._o_d[_NAME][1],
                    self._o_d[_STRING][0], val, self._o_d[_STRING][1]))
        self._out.write('>')
        
    def endElement(self, name):
        prefix, local = SplitQName(name)
        if prefix:
            self._out.write('&lt;/%s%s%s:%s%s%s>'.encode(self._cod) % (
                self._o_d[_KEYWORD][0], prefix, self._o_d[_KEYWORD][1],
                self._o_d[_NAME][0], local, self._o_d[_NAME][1]))
        else:
            self._out.write('&lt;/%s%s%s>'.encode(self._cod) % (
                self._o_d[_NAME][0], local, self._o_d[_NAME][1]))
        
    def processingInstruction(self, target, data):
        self._out.write('&lt;?%s%s%s %s%s%s>'.encode(self._cod) % (
            self._o_d[_NAME][0], target, self._o_d[_NAME][1],
            self._o_d[_STRING][0], data, self._o_d[_STRING][1]))
        
    def characters(self, ch):
        if self._in_entity: return
	elif not self._in_cdata: ch = escape(ch)
        self._out.write('%s%s%s' % (
            self._o_d[_TEXT][0], ch.encode(self._cod), self._o_d[_TEXT][1]))
        
    ## lexical handler #####################################################
    def comment(self, comment):
        self._out.write('%s&lt;!--%s-->%s' % (
            self._o_d[_COMMENT][0],
            comment.replace('<', '&lt;').encode(self._cod),
            self._o_d[_COMMENT][1]))
        
    def startCDATA(self):
        self._out.write('&lt;%s[CDATA[%s' % (
            self._o_d[_KEYWORD][0], self._o_d[_KEYWORD][1]))
        self._in_cdata = 1
        
    def endCDATA(self):
        self._out.write('%s]]%s>' % (
            self._o_d[_KEYWORD][0], self._o_d[_KEYWORD][1]))
        self._in_cdata = 0
        
    def startDTD(self, name, public_id, system_id):
        self._out.write('&lt;%s!DOCTYPE%s %s'.encode(self._cod) % (
            self._o_d[_KEYWORD][0], self._o_d[_KEYWORD][1], name))
        if public_id:
            self._out.write(' PUBLIC %s"%s"%s %s"%s"%s['.encode(self._cod) % (
                self._o_d[_STRING][0], public_id, self._o_d[_STRING][1],
                self._o_d[_STRING][0], system_id, self._o_d[_STRING][1]))
        else:
            self._out.write(' SYSTEM %s"%s"%s ['.encode(self._cod) % (
                self._o_d[_STRING][0], system_id, self._o_d[_STRING][1]))
            
    def endDTD(self):
        self._out.write(']>')

    def startEntity(self, name):
        self._out.write('%s&%s;%s'.encode(self._cod) % (
                        self._o_d[_NAME][0], name, self._o_d[_NAME][1]))
        self._in_entity = 1

    def endEntity(self, name):
        self._in_entity = 0
        
    ## decl handler ########################################################
    def internalEntityDecl(self, name, value):
        self._out.write('&lt;%s!ENTITY%s %s'.encode(self._cod) % (
            self._o_d[_KEYWORD][0], self._o_d[_KEYWORD][1], name))
        if public_id:
            self._out.write(' PUBLIC %s"%s"%s %s
                self._o_d[_STRING][0], public_id, self._o_d[_STRING][1],
                self._o_d[_STRING][0], system_id, self._o_d[_STRING][1]))
        else:
            self._out.write(' SYSTEM %s"%s"%s>'.encode(self._cod) % (
                self._o_d[_STRING][0], system_id, self._o_d[_STRING][1]))
            
    def externalEntityDecl(self, name, public_id, system_id):
        self._out.write('&lt;%s!ENTITY%s %s'.encode(self._cod) % (
            self._o_d[_KEYWORD][0], self._o_d[_KEYWORD][1], name))
        if public_id:
            self._out.write(' PUBLIC %s"%s"%s %s"%s"%s>'.encode(self._cod)%(
                self._o_d[_STRING][0], public_id, self._o_d[_STRING][1],
                self._o_d[_STRING][0], system_id, self._o_d[_STRING][1]))
        else:
            self._out.write(' SYSTEM %s"%s"%s>'.encode(self._cod) % (
                self._o_d[_STRING][0], system_id, self._o_d[_STRING][1]))

    def elementDecl(self, elem_name, content_model):
        c_m = _decode_content_model(content_model)
        self._out.write('&lt;%s!ELEMENT%s %s %s>'.encode(self._cod) % (
            self._o_d[_KEYWORD][0], self._o_d[_KEYWORD][1], elem_name,
            c_m))
        
    def attributeDecl(self,elem_name,attr_name,type_d,value_def,value):
        import types
        if type(type_d) is types.ListType:
            s = ''
            for pos in type_d:
                if not s:
                    s = '(%s' % pos
                else:
                    s = '%s|%s' % (s, pos)
            s = '%s)' % s
            self._out.write('&lt;%s!ATTLIST%s %s %s %s%s>'.encode(self._cod)%(
                self._o_d[_KEYWORD][0], self._o_d[_KEYWORD][1], elem_name,
                attr_name, s , value_def))
        else:
            self._out.write('&lt;%s!ATTLIST%s %s %s%s>'.encode(self._cod)%(
                self._o_d[_KEYWORD][0], self._o_d[_KEYWORD][1], elem_name,
                attr_name, type))
            
C_OP, C_VAL, C_NUM = 0, 1, 2 
def _decode_content_model(content_m):
    ''' recursively decode a content_model returned by parsers in 
	elementDecl '''
    s = ''
    if content_m[C_OP] == ',':
        for c_m in content_m[C_VAL]:
            if not s:
                s = '(%s' % _decode_content_model(c_m)
            else:
                s = '%s, %s' % (s, _decode_content_model(c_m))
        s = '%s)%s' % (s, content_m[C_NUM] )
    elif content_m[C_OP] == '|':
        for c_m in content_m[C_VAL]:
            if not s:
                s = '(%s' % _decode_content_model(c_m)
            else:
                s = '%s|%s' % (s, _decode_content_model(c_m))
        s = '%s)%s' % (s, content_m[C_NUM] )
    else:
        s = '%s%s' % (s, content_m[C_OP])
        s = '%s%s' % (s, content_m[-1])
    return s
            
USAGE = '''xml2dcbk: format xml source code to xml docbook using roles
Usage: xml2dcbk [options] source.py..., parse XML file(s)
       xml2dcbk -h/--help, print this help message and exit Options:
       _ -e/--encoding iso-8859-1, specify encoding to use in outputs
       _ -d/--docbook, format output as docbook xml (default)
       _ -w/--html, format output in html instead of docbook ''' 

def run(args):
    import getopt, os
    from xml.sax import make_parser
    from xml.sax.handler import property_lexical_handler,\
         property_declaration_handler
    ## get options
    (opt, args) = getopt.getopt(args, 'he:dw',
                                ['help', 'encoding=', 'docbook', 'html'])
    encod, format = 'UTF-8', 'docbook'
    for o in opt:
        if o[0] == '-h' or o[0] == '--help':
            print USAGE
            return
        elif o[0] == '-d' or o[0] == '--docbook':
            format = 'docbook'
        elif o[0] == '-w' or o[0] == '--html':
            format = 'html'
        elif o[0] == '-e' or o[0] == '--encoding':
            encod = o[1]
                
    ## transforms source files (xmlproc support property_lexical_handler while
    ## pyexpat doen't)
    p = make_parser(['xml.sax.drivers2.drv_xmlproc'])
    for file in args:
        source = open(file, 'r')
        ## prepare handler
        if file[-4:] != '.xml':
            print >>sys.stderr, 'Unknown extension %s, ignored file %s'%(
                file[-4:], file)
            continue
        dest = open('%s_dcbk.xml' % os.path.basename(file)[:-4], 'w+')
        h = XmlFormatSaxHandler(dest, encod)
        h.set_format(format)
        p.setContentHandler(h)
        try:
            p.setProperty(property_lexical_handler, h)
        except Exception, e:
            print e
        try:
            p.setProperty(property_declaration_handler, h)
        except Exception, e:
            print e
        print >>sys.stderr, "Formatting %s ..." % file
        ## parse and write colorized version to output file
        p.parse(source)
        
        source.close()
        dest.close()
    
if __name__ == "__main__":
    run(sys.argv[1:])

      

The main advantage of this recipe is that you have here a full Sax handler, using the standard ContentHandler with the exotics LexicalHandler and DeclHandler properties. Note that at this time, parsers do not support those properties fully. The only functions missing in this handlers are those in the ErrorHandler interface and setDocumentLocator and ignorableWhitespace in the ContentHandler interface (but inherited from the default ContentHandler).

Tags: xml

4 comments

Jürgen Hermann 22 years, 4 months ago # | flag

Always provide full interfaces. > ... startEntity and endEntity in the LexicalHandler interface (no parser call them!!) ...

Not true, PIRXX _does_ call them, besides you cannot know whether your assumption holds true in the future. So you either have to inherit from the interfaces (so you get empty implementations for methods you do not overload) or provide the empty callbacks yourself. Anything else is likely to break.

Sylvain Thenault (author) 22 years, 4 months ago # | flag

Always provide full interfaces. very true ! I'll add the missing callbacks...

Jürgen Hermann 22 years, 3 months ago # | flag

Wrong CDATA handling. You have to remember you are in a CDATA section and switch off escaping in the characters() event, see

http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/84516

for the correct handling.

Sylvain Thenault (author) 22 years, 3 months ago # | flag

Wrong CDATA handling, now fixed

◄	Python recipes (4591)	►
◄	Sylvain Thenault's recipes (2)	►

Colorize xml source using the sax ContentHandler, LexicalHandler and Declhandler interfaces (Python recipe) by Sylvain Thenault
ActiveState Code (http://code.activestate.com/recipes/102109/)

4 comments

Tags

Required Modules

Other Information and Tasks

Accounts

Code Recipes

Feedback & Information

ActiveState

Colorize xml source using the sax ContentHandler, LexicalHandler and Declhandler interfaces (Python recipe) by Sylvain Thenault ActiveState Code (http://code.activestate.com/recipes/102109/)

4 comments

Tags

Required Modules

Other Information and Tasks

Accounts

Code Recipes

Feedback & Information

ActiveState

Colorize xml source using the sax ContentHandler, LexicalHandler and Declhandler interfaces (Python recipe) by Sylvain Thenault
ActiveState Code (http://code.activestate.com/recipes/102109/)