Converts Python source to a portable XHTML 1.0 strict document that includes a basic set of Dublin Core metadata. Based on the MoinMoin source colorizer.
# -*- coding: utf-8 -*-
"""
Colorize - Python source formatter that outputs Python code in XHTML.

This script is based on MoinMoin - The Python Source Parser.

Usage: [source file name] [optional author name]
"""
# Imports
import cgi
import string
import sys
import cStringIO
import keyword
import token
import tokenize
import re
import os
#Filepath of source file from command line parameter.
sourcefile = sys.argv[1]
#Get file name of source file (used for the document title and DC.Title).
filename = os.path.basename(sourcefile)
#Get optional author name parameter (it is added to the DC metadata).
#"Unknown" is only a fallback: it must not overwrite a supplied name.
if len(sys.argv) > 2:
    authorname = sys.argv[2]
else:
    authorname = "Unknown"
#Set up basic values.
#Two pseudo token types placed above token.NT_OFFSET so they cannot
#collide with the token numbers produced by the tokenizer.
_KEYWORD = token.NT_OFFSET + 1
_TEXT = token.NT_OFFSET + 2

#Maps a token type to the CSS class written on its <span> element.
_classes = {
    token.NUMBER: 'token_number',
    token.OP: 'token_op',
    token.STRING: 'token_string',
    tokenize.COMMENT: 'token_comment',
    token.NAME: 'token_name',
    token.ERRORTOKEN: 'token_error',
    _KEYWORD: 'keyword',
    _TEXT: 'text',
}
#Define start of XHTML document. The public identifier, DTD location and
#namespace are the standard ones for an XHTML 1.0 Strict document.
#<head> is opened here and left open: the title and metadata elements
#are written after this prefix.
docstart = """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">
\t<head>"""
#Add css from file to output document.
docstart += """\n\t<style type="text/css">\n"""
#The stylesheet is read from the current working directory; the context
#manager guarantees the handle is closed again.
with open("colorize.css", "r") as cssfile:
    docstart += cssfile.read()
docstart += """\n\t</style>\n"""
#Close XHTML document.
docend = "\n\t</body>\n</html>"
#Set default encoding for output document.
#Python 2 treats a source file without an encoding declaration as ASCII
#(PEP 263), so ASCII is the fallback reported for such files.
#NOTE(review): the original value of _DEFAULTENCODING is not visible in
#this file - confirm "ascii" is the intended default.
_DEFAULTENCODING = "ascii"
docencoding = _DEFAULTENCODING
def getEncodingOfFile(sourcefile):
    """Get encoding of source file.

    The first two lines of the file are checked for an encoding
    declaration (the only positions PEP 263 allows). If neither line
    declares one, the ``encoding`` attribute of the file object is used
    when it is set.

    sourcefile -- path of the file to inspect.

    Returns the encoding name as a string. If no encoding is found,
    returns _DEFAULTENCODING.
    """
    with open(sourcefile) as myfile:
        #try line one, then line two
        for _ in range(2):
            encoding = parseEncoding(myfile.readline())
            #encoding found?
            if encoding:
                return encoding
        #if not - fall back to what the file object itself reports.
        #NOTE(review): this attribute does not sniff a byte order mark;
        #on Python 2 it is normally None for files on disk.
        if myfile.encoding is not None:
            return myfile.encoding
    #if not - return default encoding
    return _DEFAULTENCODING
def parseEncoding(textline):
    """Parse encoding from textline.

    textline -- one line of source text.

    Returns the name that follows ``coding:`` or ``coding=`` (for example
    "utf-8" for ``# -*- coding: utf-8 -*-``), or an empty string when
    the line contains no such declaration.
    """
    #Check line for encoding match. Raw string so that \s and \w reach
    #the regex engine instead of being treated as string escapes.
    match = re.search(r"coding[=:]\s*([-\w.]+)", textline)
    if match is not None:
        #Return found encoding
        return match.group(1)
    #Nothing found
    return ""
class Parser:
    """Send colored Python source as an XHTML document.

    An instance is its own token handler: ``format`` passes the instance
    to ``tokenize.tokenize``, which calls it once per token.
    """

    def __init__(self, raw, out=sys.stdout):
        """Store the source text.

        raw -- Python source to colorize; tabs are expanded and leading
               and trailing whitespace is stripped.
        out -- object with a ``write`` method that receives the output
               (default: sys.stdout).
        """
        self.raw = raw.expandtabs().strip()
        self.out = out

    def format(self, formatter, form):
        """Parse and send the colored source.

        Writes the document start, the Dublin Core metadata, the
        colorized source and the document end to ``self.out``.
        ``formatter`` and ``form`` are not used by this method.
        """
        # store line offsets in self.lines; tokenizer rows are 1-based,
        # so index 0 is only a placeholder
        self.lines = [0, 0]
        pos = 0
        while 1:
            pos = self.raw.find('\n', pos) + 1
            if not pos:
                break
            self.lines.append(pos)
        self.lines.append(len(self.raw))

        # parse the source and write it
        self.out.write(docstart)

        # write metadata; values are escaped because file and author
        # names come from the command line and may contain markup
        # characters
        self.out.write("\n<title>" + cgi.escape(filename) + "</title>\n")
        self.out.write('<link rel="schema.DC" href="http://purl.org/dc/elements/1.1/" />\n')
        self.out.write('<meta name="DC.Language" content="en" />\n')
        self.out.write('<meta name="DC.Format" content="text/html" />\n')
        self.out.write('<meta name="DC.Type" content="Software" />\n')
        self.out.write('<meta name="DC.Title" content="Python source of %s" />\n' % cgi.escape(filename, True))
        self.out.write('<meta name="DC.Creator" content="%s" />\n' % cgi.escape(authorname, True))
        self.out.write('<meta http-equiv="Content-Type" content="text/html; charset=%s" />\n' % cgi.escape(docencoding, True))

        #Close head and begin body.
        self.out.write('</head>\n<body>\n<pre>')

        self.pos = 0
        text = cStringIO.StringIO(self.raw)
        try:
            tokenize.tokenize(text.readline, self)
        except tokenize.TokenError as ex:
            # args are (message, (row, column)) of the failing token
            msg = ex.args[0]
            line = ex.args[1][0]
            # the remaining source is emitted as plain text, so it has
            # to be escaped like every other piece of source
            self.out.write("<h3>ERROR: %s</h3>%s\n" % (
                msg, cgi.escape(self.raw[self.lines[line]:])))

        self.out.write('</pre>')
        self.out.write(docend)

    def __call__(self, toktype, toktext, start, end, line):
        """Token handler.

        toktype -- token type number.
        toktext -- text of the token.
        start   -- (row, column) where the token begins.
        end     -- (row, column) where the token ends (not used).
        line    -- the source line containing the token (not used).
        """
        srow, scol = start

        # calculate new positions
        oldpos = self.pos
        newpos = self.lines[srow] + scol
        self.pos = newpos + len(toktext)

        # handle newlines
        if toktype in (token.NEWLINE, tokenize.NL):
            self.out.write('\n')
            return

        # send the original whitespace, if needed
        if newpos > oldpos:
            self.out.write(self.raw[oldpos:newpos])

        # skip indenting tokens
        if toktype in (token.INDENT, token.DEDENT):
            self.pos = newpos
            return

        # map token type to a color/class group
        if token.LPAR <= toktype <= token.OP:
            toktype = token.OP
        elif toktype == token.NAME and keyword.iskeyword(toktext):
            toktype = _KEYWORD
        classval = _classes.get(toktype, _classes[_TEXT])

        style = ''
        if toktype == token.ERRORTOKEN:
            style = ' style="border: solid 1.5pt #FF0000;"'

        # send text
        self.out.write('<span class="%s"%s>' % (classval, style))
        self.out.write(cgi.escape(toktext))
        self.out.write('</span>')
if __name__ == "__main__":
    print("Formatting " + sourcefile)

    #Set up encoding (used for the charset in the Content-Type meta tag)
    docencoding = getEncodingOfFile(sourcefile)

    # read the source; the context manager closes the file again
    with open(sourcefile) as infile:
        source = infile.read()

    # write colorized version to "[filename].py.html"; closing the file
    # makes sure the complete document is flushed to disk
    with open(sourcefile + '.html', 'wt') as outfile:
        Parser(source, outfile).format(None, None)

    # done!
    print("Done! Wrote result file " + sourcefile + ".html")
The script reads its stylesheet from a file named colorize.css in the current working directory. For a sample CSS file, use:
pre { font-family: verdana, arial, helvetica, sans-serif; }
.token_number { color: #0080C0; }
.token_op { color: #0000DD; }
.token_string { color: #cc0000; }
.token_comment { color: #999; }
.token_name { color: #000000; }
.token_error { color: #FF8080; }
.keyword { color: blue; font-weight: bold; }
.text { color: #000000; }