This recipe takes an XML file as input and outputs a colorized version of that file, using HTML or DocBook (with emphasis elements and a particular role). It provides a small command-line interface, and the output is easy to configure.
#!/usr/bin/python2.1
import sys 
from xml.dom.ext import SplitQName 
from xml.sax.handler import ContentHandler 
from xml.sax.saxutils import escape
# Roles of the pieces of an XML document that the formatter can decorate.
# They are the keys of the markup tables below.  _HEAD has no entry in
# either table.
_ROOT, _STRING, _COMMENT, _NAME, _KEYWORD, _TEXT, _HEAD = 0, 1, 2, 3, 4, 5, 6

# Each table maps a role to the (opening, closing) markup written around
# text of that role.  An ('', '') pair leaves the text undecorated.

# DocBook output: a <programlisting> with <emphasis> elements.
DOCBOOK = {
    _ROOT: ('<programlisting>', '</programlisting>'),
    _STRING: ('<emphasis>', '</emphasis>'),
    _COMMENT: ('<emphasis>', '</emphasis>'),
    _NAME: ('', ''),
    _KEYWORD: ('<emphasis role="bold">', '</emphasis>'),
    _TEXT: ('', '')
    }

# HTML output: a <div> with coloured <font> elements.
HTML = {
    _ROOT: ('<div>', '</div>'),
    _STRING: ('<font color="#004080">', '</font>'),
    _COMMENT: ('<font color="#008000">', '</font>'),
    _NAME: ('', ''),
    _KEYWORD: ('<font color="#C00000">', '</font>'),
    _TEXT: ('', '')
    }
class XmlFormatSaxHandler(ContentHandler):
    ''' format an xmlfile to docbook or html

    The handler rewrites the SAX events it receives as a listing of the
    XML source.  Every piece of the listing is wrapped in the markup that
    the active table (DOCBOOK or HTML) associates with its role, and all
    text taken from the document is escaped, so that the result is
    displayed as source text instead of being interpreted as markup.

    Besides the ContentHandler callbacks, the class provides the
    LexicalHandler callbacks (comment, startCDATA, endCDATA, startDTD,
    endDTD, startEntity, endEntity) and the DeclHandler callbacks
    (internalEntityDecl, externalEntityDecl, elementDecl, attributeDecl).
    '''

    def __init__(self, head=1, output=sys.stdout, encoding='UTF-8'):
        ''' create a handler writing to `output` with encoding `encoding`

        head     -- not used; kept so that existing callers keep working
        output   -- object with a write() method receiving the listing
        encoding -- name of the encoding applied to everything written
        '''
        ContentHandler.__init__(self)
        self._out = output
        self._cod = encoding
        self._o_d = DOCBOOK
        self._in_cdata = 0
        self._in_entity = 0

    def set_format(self, format):
        ''' select the markup table: 'docbook' or 'html'

        Any other value leaves the current table unchanged.
        '''
        if format == 'docbook':
            self._o_d = DOCBOOK
        elif format == 'html':
            self._o_d = HTML

    ## output helpers ######################################################
    def _write(self, text):
        ''' encode `text` with the output encoding and write it '''
        # The finished text is encoded, not the format string, so that
        # non-ASCII names and values reach the output in self._cod.
        self._out.write(text.encode(self._cod))

    def _styled(self, role, text):
        ''' return `text` escaped and wrapped in the markup of `role` '''
        opening, closing = self._o_d[role]
        return '%s%s%s' % (opening, escape(text), closing)

    def _quoted(self, value):
        ''' return `value` escaped, double-quoted, in _STRING markup '''
        opening, closing = self._o_d[_STRING]
        return '%s"%s"%s' % (opening, escape(value), closing)

    def _qname(self, name):
        ''' return a qualified name, its prefix (if any) as a keyword '''
        prefix, local = SplitQName(name)
        if prefix:
            return '%s:%s' % (self._styled(_KEYWORD, prefix),
                              self._styled(_NAME, local))
        return self._styled(_NAME, local)

    def _external_id(self, public_id, system_id):
        ''' return the PUBLIC/SYSTEM part of a declaration

        The result starts with a space, or is empty when both identifiers
        are missing.
        '''
        if public_id:
            text = ' PUBLIC %s' % self._quoted(public_id)
            if system_id:
                text = '%s %s' % (text, self._quoted(system_id))
            return text
        if system_id:
            return ' SYSTEM %s' % self._quoted(system_id)
        return ''

    ## content handler #####################################################
    def startDocument(self):
        ''' write the opening markup of the listing '''
        self._write(self._o_d[_ROOT][0])

    def endDocument(self):
        ''' write the closing markup of the listing '''
        self._write(self._o_d[_ROOT][1])

    def startElement(self, name, attrs):
        ''' write a start tag with its attributes '''
        self._write('&lt;%s' % self._qname(name))
        for key, val in attrs.items():
            # every attribute, prefixed or not, is separated by a space
            self._write(' %s=%s' % (self._qname(key), self._quoted(val)))
        self._write('&gt;')

    def endElement(self, name):
        ''' write an end tag '''
        self._write('&lt;/%s&gt;' % self._qname(name))

    def processingInstruction(self, target, data):
        ''' write a processing instruction '''
        self._write('&lt;?%s %s?&gt;' % (
            self._styled(_NAME, target), self._styled(_STRING, data)))

    def characters(self, ch):
        ''' write character data

        Nothing is written between startEntity and endEntity: the entity
        reference written by startEntity stands for that text.
        '''
        if self._in_entity:
            return
        # Text is escaped inside CDATA sections too: the listing is markup
        # itself, so a raw '<' or '&' would not be displayed as text.
        self._write(self._styled(_TEXT, ch))

    ## lexical handler #####################################################
    def comment(self, comment):
        ''' write a comment '''
        opening, closing = self._o_d[_COMMENT]
        self._write('%s&lt;!--%s--&gt;%s' % (
            opening, escape(comment), closing))

    def startCDATA(self):
        ''' write the opening of a CDATA section '''
        self._write('&lt;%s' % self._styled(_KEYWORD, '![CDATA['))
        self._in_cdata = 1

    def endCDATA(self):
        ''' write the end of a CDATA section '''
        self._write('%s&gt;' % self._styled(_KEYWORD, ']]'))
        self._in_cdata = 0

    def startDTD(self, name, public_id, system_id):
        ''' write the document type declaration up to the opening '[' '''
        self._write('&lt;%s %s%s [' % (
            self._styled(_KEYWORD, '!DOCTYPE'), escape(name),
            self._external_id(public_id, system_id)))

    def endDTD(self):
        ''' close the document type declaration '''
        self._write(']&gt;')

    def startEntity(self, name):
        ''' write an entity reference and mute its replacement text '''
        self._write(self._styled(_NAME, '&%s;' % name))
        self._in_entity = 1

    def endEntity(self, name):
        ''' resume the output of character data '''
        self._in_entity = 0

    ## decl handler ########################################################
    def internalEntityDecl(self, name, value):
        ''' write the declaration of an internal entity '''
        self._write('&lt;%s %s %s&gt;' % (
            self._styled(_KEYWORD, '!ENTITY'), escape(name),
            self._quoted(value)))

    def externalEntityDecl(self, name, public_id, system_id):
        ''' write the declaration of an external entity '''
        self._write('&lt;%s %s%s&gt;' % (
            self._styled(_KEYWORD, '!ENTITY'), escape(name),
            self._external_id(public_id, system_id)))

    def elementDecl(self, elem_name, content_model):
        ''' write an element declaration '''
        c_m = _decode_content_model(content_model)
        self._write('&lt;%s %s %s&gt;' % (
            self._styled(_KEYWORD, '!ELEMENT'), escape(elem_name),
            escape(c_m)))

    def attributeDecl(self, elem_name, attr_name, type_d, value_def, value):
        ''' write an attribute declaration

        type_d    -- the attribute type; a list or tuple is written as an
                     enumeration '(a|b|c)', anything else as it is
        value_def -- written after the type when it is not empty
        value     -- written last, as a quoted string, unless it is None
        '''
        # type() is compared instead of calling isinstance() with the list
        # type, which the interpreter named in the shebang (2.1) rejects.
        if type(type_d) in (type([]), type(())):
            type_text = '(%s)' % '|'.join(['%s' % pos for pos in type_d])
        else:
            type_text = '%s' % (type_d,)
        pieces = [escape(elem_name), escape(attr_name), escape(type_text)]
        if value_def:
            pieces.append(escape(value_def))
        if value is not None:
            pieces.append(self._quoted(value))
        self._write('&lt;%s %s&gt;' % (
            self._styled(_KEYWORD, '!ATTLIST'), ' '.join(pieces)))
            
# Positions of the operator, the children and the occurrence indicator in
# a content model.
C_OP, C_VAL, C_NUM = 0, 1, 2


def _decode_content_model(content_m):
    ''' recursively decode a content_model returned by parsers in
    elementDecl

    content_m is an indexable sequence.  When its first item is ',' or
    '|', it is a group: content_m[C_VAL] holds the nested content models
    and content_m[C_NUM] the occurrence indicator, and the group is
    rendered as '(a, b)' or '(a|b)' followed by the indicator.  Otherwise
    the result is the first item followed by the last one.

    Returns the decoded content model as a string.
    '''
    operator = content_m[C_OP]
    if operator == ',':
        separator = ', '
    elif operator == '|':
        separator = '|'
    else:
        # not a group: first item followed by the last item
        return '%s%s' % (operator, content_m[-1])
    children = [_decode_content_model(c_m) for c_m in content_m[C_VAL]]
    # join() keeps the parentheses balanced even when the group is empty
    return '(%s)%s' % (separator.join(children), content_m[C_NUM])
            
# Help text of the command line interface, printed for -h/--help.
USAGE = '''xml2dcbk: format xml source code to xml docbook using roles
Usage: xml2dcbk [options] source.xml..., parse XML file(s)
       xml2dcbk -h/--help, print this help message and exit
Options:
       -e/--encoding iso-8859-1, specify encoding to use in outputs
       -d/--docbook, format output as docbook xml (default)
       -w/--html, format output in html instead of docbook'''
def run(args):
    ''' command line interface: format each XML file named in `args`

    args is the list of command line arguments without the program name.
    Options: -h/--help prints USAGE and returns, -e/--encoding sets the
    output encoding (default UTF-8), -d/--docbook and -w/--html select
    the output format (default docbook).

    For every argument ending in '.xml', a file named
    '<basename>_dcbk.xml' is written in the current directory.  Other
    arguments are reported on stderr and skipped.  Errors raised by
    getopt, by open() or by the parser are not caught.
    '''
    import getopt
    import os
    from xml.sax import make_parser
    from xml.sax.handler import property_lexical_handler,\
         property_declaration_handler
    ## get options
    (opt, args) = getopt.getopt(args, 'he:dw',
                                ['help', 'encoding=', 'docbook', 'html'])
    encod, out_format = 'UTF-8', 'docbook'
    for name, value in opt:
        if name in ('-h', '--help'):
            sys.stdout.write('%s\n' % USAGE)
            return
        elif name in ('-d', '--docbook'):
            out_format = 'docbook'
        elif name in ('-w', '--html'):
            out_format = 'html'
        elif name in ('-e', '--encoding'):
            encod = value

    ## transforms source files (xmlproc support property_lexical_handler while
    ## pyexpat doesn't)
    p = make_parser(['xml.sax.drivers2.drv_xmlproc'])
    for path in args:
        ## check the extension before opening anything, so that a skipped
        ## file leaves no open file object behind
        if path[-4:] != '.xml':
            sys.stderr.write('Unknown extension %s, ignored file %s\n' % (
                path[-4:], path))
            continue
        source = open(path, 'r')
        try:
            dest = open('%s_dcbk.xml' % os.path.basename(path)[:-4], 'w+')
            try:
                ## prepare handler; keyword arguments are required because
                ## the first positional parameter of the handler is `head`
                h = XmlFormatSaxHandler(output=dest, encoding=encod)
                h.set_format(out_format)
                p.setContentHandler(h)
                for prop in (property_lexical_handler,
                             property_declaration_handler):
                    ## a parser without the property still produces the
                    ## content events: report the problem and go on
                    try:
                        p.setProperty(prop, h)
                    except Exception:
                        ## sys.exc_info() is used instead of binding the
                        ## exception in the except clause, whose syntax
                        ## differs between Python versions
                        sys.stderr.write('%s\n' % (sys.exc_info()[1],))
                sys.stderr.write('Formatting %s ...\n' % path)
                ## parse and write colorized version to output file
                p.parse(source)
            finally:
                dest.close()
        finally:
            source.close()
    
# Script entry point: run the command line interface with the arguments
# that follow the program name.  Nothing happens when the module is imported.
if __name__ == "__main__":
    run(sys.argv[1:])
 | 
The main advantage of this recipe is that it gives you a full SAX handler: it uses the standard ContentHandler together with the more exotic LexicalHandler and DeclHandler properties. Note that, at the time of writing, parsers do not fully support those properties. The only functions missing from this handler are those of the ErrorHandler interface, plus setDocumentLocator and ignorableWhitespace from the ContentHandler interface (which are inherited from the default ContentHandler).

 Download
Download Copy to clipboard
Copy to clipboard
Always provide full interfaces. > ... startEntity and endEntity in the LexicalHandler interface (no parser call them!!) ...
Not true: PIRXX _does_ call them. Besides, you cannot know whether your assumption will still hold in the future. So you either have to inherit from the interfaces (so that you get empty implementations for the methods you do not override) or provide the empty callbacks yourself. Anything else is likely to break.
Always provide full interfaces. very true ! I'll add the missing callbacks...
Wrong CDATA handling. You have to remember you are in a CDATA section and switch off escaping in the characters() event, see
http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/84516
for the correct handling.
Wrong CDATA handling, now fixed