Welcome, guest | Sign In | My Account | Store | Cart

This recipe takes an XML file as input and outputs a colorized version of that file, using either HTML or DocBook (with emphasis elements and a particular role). It provides a small command-line interface, and the output is easy to configure.

Python, 257 lines
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
#!/usr/bin/python2.1 
import sys 
from xml.dom.ext import SplitQName 
from xml.sax.handler import ContentHandler 
from xml.sax.saxutils import escape

_ROOT, _STRING, _COMMENT, _NAME, _KEYWORD, _TEXT, _HEAD =0,1,2,3,4,5,6 
DOCBOOK = {
    _ROOT: ('<programlisting>','</programlisting>'),
    _STRING: ('<emphasis>', '</emphasis>'),
    _COMMENT:('<emphasis>', '</emphasis>'),
    _NAME:  ('', ''),
    _KEYWORD:('<emphasis role="bold">', '</emphasis>'),
    _TEXT:  ('', '')
    } HTML = {
    _ROOT: ('<div>', '</div>'),
    _STRING: ('<font color="#004080">', '</font>'),
    _COMMENT:('<font color="#008000">', '</font>'),
    _NAME:  ('', ''),
    _KEYWORD:('<font color="#C00000">', '</font>'),
    _TEXT:  ('', '')
    } 

class XmlFormatSaxHandler(ContentHandler):
    '''Format an XML file as colorized DocBook or HTML.

    The handler implements the SAX content-handler callbacks as well as
    the lexical-handler and declaration-handler callbacks, so one
    instance can be registered for all three roles.  Every callback
    writes a marked-up rendition of the event to the output stream; the
    markup comes from the DOCBOOK or HTML table (see set_format).
    '''

    def __init__(self, head=1, output=sys.stdout, encoding='UTF-8'):
        '''Create a handler writing to `output`.

        head     -- unused; kept so existing callers keep working
        output   -- object with a write() method receiving the result
        encoding -- encoding applied to text that is not a native str
                    before it is written (see _write)
        '''
        ContentHandler.__init__(self)
        self._out = output
        self._cod = encoding
        self._o_d = DOCBOOK
        self._in_cdata = 0
        self._in_entity = 0

    def set_format(self, format):
        '''Select the markup table: 'docbook' or 'html'.

        Any other value leaves the current table unchanged.
        '''
        if format == 'docbook':
            self._o_d = DOCBOOK
        elif format == 'html':
            self._o_d = HTML

    ## helpers #############################################################
    def _write(self, text):
        '''Write `text` to the output, encoding it first when needed.

        The whole formatted fragment is encoded, not just the template:
        text that is not a native str (on Python 2, the unicode objects
        SAX parsers typically deliver) is encoded with the configured
        encoding; native str objects are written unchanged.
        '''
        if not isinstance(text, str):
            text = text.encode(self._cod)
        self._out.write(text)

    def _wrap(self, kind, text):
        '''Return `text` enclosed in the markup pair registered for `kind`.'''
        opening, closing = self._o_d[kind]
        return '%s%s%s' % (opening, text, closing)

    def _quoted(self, value):
        '''Return `value` escaped, double-quoted and marked up as a string.'''
        return self._wrap(_STRING, '"%s"' % escape(value))

    def _qname(self, name):
        '''Return the markup for a possibly prefixed name.

        The prefix, if any, is rendered as a keyword and the local part
        as a name.
        '''
        prefix, local = SplitQName(name)
        local_markup = self._wrap(_NAME, local)
        if prefix:
            return '%s:%s' % (self._wrap(_KEYWORD, prefix), local_markup)
        return local_markup

    def _external_id(self, public_id, system_id):
        '''Return the ' PUBLIC ...' / ' SYSTEM ...' part of a declaration.

        Returns an empty string when neither identifier is given, so
        that a missing identifier is never rendered as "None".
        '''
        if public_id:
            text = ' PUBLIC %s' % self._quoted(public_id)
            if system_id:
                text = '%s %s' % (text, self._quoted(system_id))
            return text
        if system_id:
            return ' SYSTEM %s' % self._quoted(system_id)
        return ''

    ## content handler #####################################################
    def startDocument(self):
        '''Write the opening markup of the root container.'''
        self._write(self._o_d[_ROOT][0])

    def endDocument(self):
        '''Write the closing markup of the root container.'''
        self._write(self._o_d[_ROOT][1])

    def startElement(self, name, attrs):
        '''Write a start tag with all of its attributes.'''
        pieces = ['&lt;', self._qname(name)]
        for key, val in attrs.items():
            # every attribute, prefixed or not, is preceded by one space
            pieces.append(' %s=%s' % (self._qname(key), self._quoted(val)))
        pieces.append('>')
        self._write(''.join(pieces))

    def endElement(self, name):
        '''Write an end tag.'''
        self._write('&lt;/%s>' % self._qname(name))

    def processingInstruction(self, target, data):
        '''Write a processing instruction, closed by '?>'.'''
        self._write('&lt;?%s %s?>' % (
            self._wrap(_NAME, target),
            self._wrap(_STRING, escape(data))))

    def characters(self, ch):
        '''Write character data, unless it is the expansion of an entity.'''
        if self._in_entity:
            # the entity reference itself was already written
            return
        if not self._in_cdata:
            ch = escape(ch)
        # NOTE(review): text inside a CDATA section is written unescaped,
        # so markup characters it contains reach the output as-is --
        # confirm this is the intended rendering.
        self._write(self._wrap(_TEXT, ch))

    ## lexical handler #####################################################
    def comment(self, comment):
        '''Write a comment with its content escaped.'''
        self._write(self._wrap(_COMMENT, '&lt;!--%s-->' % escape(comment)))

    def startCDATA(self):
        '''Write the CDATA opening delimiter and stop escaping text.'''
        self._write('&lt;%s' % self._wrap(_KEYWORD, '![CDATA['))
        self._in_cdata = 1

    def endCDATA(self):
        '''Write the CDATA closing delimiter and resume escaping text.'''
        self._write('%s>' % self._wrap(_KEYWORD, ']]'))
        self._in_cdata = 0

    def startDTD(self, name, public_id, system_id):
        '''Write the start of the DOCTYPE declaration, up to '['.'''
        self._write('&lt;%s %s%s [' % (
            self._wrap(_KEYWORD, '!DOCTYPE'), name,
            self._external_id(public_id, system_id)))

    def endDTD(self):
        '''Write the end of the DOCTYPE declaration.'''
        self._write(']>')

    def startEntity(self, name):
        '''Write an entity reference and suppress its expansion.'''
        # the ampersand is escaped so the reference is displayed by the
        # output document rather than interpreted by it
        self._write(self._wrap(_NAME, '&amp;%s;' % name))
        self._in_entity = 1

    def endEntity(self, name):
        '''Resume writing character data.'''
        self._in_entity = 0

    ## decl handler ########################################################
    def internalEntityDecl(self, name, value):
        '''Write an internal entity declaration with its quoted value.'''
        self._write('&lt;%s %s %s>' % (
            self._wrap(_KEYWORD, '!ENTITY'), name, self._quoted(value)))

    def externalEntityDecl(self, name, public_id, system_id):
        '''Write an external entity declaration with its identifiers.'''
        self._write('&lt;%s %s%s>' % (
            self._wrap(_KEYWORD, '!ENTITY'), name,
            self._external_id(public_id, system_id)))

    def elementDecl(self, elem_name, content_model):
        '''Write an element declaration with its decoded content model.'''
        c_m = _decode_content_model(content_model)
        self._write('&lt;%s %s %s>' % (
            self._wrap(_KEYWORD, '!ELEMENT'), elem_name, c_m))

    def attributeDecl(self, elem_name, attr_name, type_d, value_def, value):
        '''Write an attribute declaration.

        A list (or tuple) `type_d` is rendered as an enumeration
        '(a|b|c)'; anything else is written as given.
        '''
        if isinstance(type_d, (list, tuple)):
            type_text = '(%s)' % '|'.join(type_d)
        else:
            type_text = type_d
        parts = [elem_name, attr_name, type_text]
        # presumably value_def is '#REQUIRED', '#IMPLIED', '#FIXED' or
        # None and value is the default value or None -- verify against
        # the parser driver in use
        if value_def:
            parts.append(value_def)
        if value is not None:
            parts.append(self._quoted(value))
        self._write('&lt;%s %s>' % (
            self._wrap(_KEYWORD, '!ATTLIST'), ' '.join(parts)))
            
C_OP, C_VAL, C_NUM = 0, 1, 2 
def _decode_content_model(content_m):
    ''' recursively decode a content_model returned by parsers in 
	elementDecl '''
    s = ''
    if content_m[C_OP] == ',':
        for c_m in content_m[C_VAL]:
            if not s:
                s = '(%s' % _decode_content_model(c_m)
            else:
                s = '%s, %s' % (s, _decode_content_model(c_m))
        s = '%s)%s' % (s, content_m[C_NUM] )
    elif content_m[C_OP] == '|':
        for c_m in content_m[C_VAL]:
            if not s:
                s = '(%s' % _decode_content_model(c_m)
            else:
                s = '%s|%s' % (s, _decode_content_model(c_m))
        s = '%s)%s' % (s, content_m[C_NUM] )
    else:
        s = '%s%s' % (s, content_m[C_OP])
        s = '%s%s' % (s, content_m[-1])
    return s
            
# Help text printed by run() for the -h/--help option.  Only files whose
# name ends in '.xml' are processed, hence 'source.xml' in the synopsis.
USAGE = '''xml2dcbk: format xml source code to xml docbook using roles
Usage: xml2dcbk [options] source.xml..., parse XML file(s)
       xml2dcbk -h/--help, print this help message and exit
Options:
       _ -e/--encoding iso-8859-1, specify encoding to use in outputs
       _ -d/--docbook, format output as docbook xml (default)
       _ -w/--html, format output in html instead of docbook'''

def run(args):
    '''Command-line entry point: colorize each XML file named in `args`.

    Recognised options are -h/--help, -e/--encoding ENC, -d/--docbook
    and -w/--html.  For every remaining argument ending in '.xml', the
    colorized rendition is written to '<basename>_dcbk.xml' in the
    current directory; other arguments are reported on stderr and
    skipped.  Option errors raised by getopt propagate to the caller.
    '''
    import getopt, os
    from xml.sax import make_parser
    from xml.sax.handler import property_lexical_handler,\
         property_declaration_handler
    ## get options
    (opt, args) = getopt.getopt(args, 'he:dw',
                                ['help', 'encoding=', 'docbook', 'html'])
    encod, format = 'UTF-8', 'docbook'
    for name, value in opt:
        if name in ('-h', '--help'):
            sys.stdout.write('%s\n' % USAGE)
            return
        elif name in ('-d', '--docbook'):
            format = 'docbook'
        elif name in ('-w', '--html'):
            format = 'html'
        elif name in ('-e', '--encoding'):
            encod = value

    ## transforms source files (xmlproc supports property_lexical_handler
    ## while pyexpat doesn't)
    p = make_parser(['xml.sax.drivers2.drv_xmlproc'])
    for path in args:
        ## check the extension before opening anything, so that a skipped
        ## file does not leave an open file object behind
        if path[-4:] != '.xml':
            sys.stderr.write('Unknown extension %s, ignored file %s\n' % (
                path[-4:], path))
            continue
        ## binary mode: let the parser decode the document itself
        source = open(path, 'rb')
        try:
            dest = open('%s_dcbk.xml' % os.path.basename(path)[:-4], 'w+')
            try:
                ## prepare handler; keyword arguments are required because
                ## the first positional parameter of the handler is `head`
                h = XmlFormatSaxHandler(output=dest, encoding=encod)
                h.set_format(format)
                p.setContentHandler(h)
                for prop in (property_lexical_handler,
                             property_declaration_handler):
                    try:
                        p.setProperty(prop, h)
                    except Exception:
                        ## best effort: a parser lacking the property
                        ## still yields the content-handler events
                        ## (sys.exc_info() works on every Python version)
                        sys.stderr.write('%s\n' % sys.exc_info()[1])
                sys.stderr.write('Formatting %s ...\n' % path)
                ## parse and write colorized version to output file
                p.parse(source)
            finally:
                dest.close()
        finally:
            source.close()
    
# Script entry point: when the module is executed directly, hand the
# command-line arguments (without the program name) to run().
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    run(cli_args)

The main advantage of this recipe is that it provides a full SAX handler, using the standard ContentHandler together with the more exotic LexicalHandler and DeclHandler properties. Note that, at this time, parsers do not fully support those properties. The only functions missing from this handler are those of the ErrorHandler interface, and setDocumentLocator and ignorableWhitespace of the ContentHandler interface (which are nevertheless inherited from the default ContentHandler).

4 comments

Jürgen Hermann 22 years, 4 months ago  # | flag

Always provide full interfaces. > ... startEntity and endEntity in the LexicalHandler interface (no parser call them!!) ...

Not true, PIRXX _does_ call them, besides you cannot know whether your assumption holds true in the future. So you either have to inherit from the interfaces (so you get empty implementations for methods you do not overload) or provide the empty callbacks yourself. Anything else is likely to break.

Sylvain Thenault (author) 22 years, 4 months ago  # | flag

Always provide full interfaces. very true ! I'll add the missing callbacks...

Jürgen Hermann 22 years, 3 months ago  # | flag

Wrong CDATA handling. You have to remember you are in a CDATA section and switch off escaping in the characters() event, see

http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/84516

for the correct handling.

Sylvain Thenault (author) 22 years, 3 months ago  # | flag

Wrong CDATA handling, now fixed

Created by Sylvain Thenault on Wed, 12 Dec 2001 (PSF)
Python recipes (4591)
Sylvain Thenault's recipes (2)

Required Modules

  • (none specified)

Other Information and Tasks