This is a stab at converting to HTML a transcript that was generated by the Unix script
command and contains the ANSI escape sequences used to colour the terminal.
#!/usr/bin/env python
from __future__ import with_statement
import re
import cgi
# ANSI escape sequences for each supported style.  For every style name the
# True entry is the sequence that switches the style on and the False entry is
# the sequence that switches it off again.
colorcodes = dict(
    bold={True: '\033[1m', False: '\033[22m'},
    cyan={True: '\033[36m', False: '\033[39m'},
    blue={True: '\033[34m', False: '\033[39m'},
    red={True: '\033[31m', False: '\033[39m'},
    magenta={True: '\033[35m', False: '\033[39m'},
    green={True: '\033[32m', False: '\033[39m'},
    underline={True: '\033[4m', False: '\033[24m'},
)
def recolor(color, text):
    """Turn ANSI-coloured runs of *color* in *text* into HTML spans.

    color -- a key of ``colorcodes``; it is also used verbatim as the CSS
             colour name in the generated ``style`` attribute.
    text  -- the string to scan.

    Every run that starts with the "on" sequence for *color* and ends with
    the nearest following "off" sequence is replaced by a ``<span>`` holding
    the text in between.  Returns the resulting string.
    """
    codes = colorcodes[color]
    # '[' is the only regex metacharacter in the escape sequences.
    opening = codes[True].replace('[', r'\[')
    closing = codes[False].replace('[', r'\[')
    pattern = "(?:" + opening + ")(.*?)(?:" + closing + ")"
    replacement = r'''<span style="color: %s">\1</span>''' % color
    return re.sub(pattern, replacement, text)
def bold(text):
    """Replace ANSI bold-on ... bold-off runs in *text* with bold HTML spans.

    Returns the string with each such run rewritten as
    ``<span style="font-weight:bold">...</span>``.
    """
    codes = colorcodes['bold']
    # Escape the '[' of each sequence so it is matched literally.
    on, off = [codes[state].replace('[', r'\[') for state in (True, False)]
    matcher = re.compile("(?:%s)(.*?)(?:%s)" % (on, off))
    return matcher.sub(r'<span style="font-weight:bold">\1</span>', text)
def underline(text):
    """Replace ANSI underline-on ... underline-off runs with underlined spans.

    Returns *text* with each such run rewritten as
    ``<span style="text-decoration: underline">...</span>``.
    """
    start_code = colorcodes['underline'][True]
    end_code = colorcodes['underline'][False]
    pattern = "(?:%s)(.*?)(?:%s)" % (start_code, end_code)
    # Escape the '[' of the escape sequences so they are matched literally.
    pattern = pattern.replace('[', r'\[')
    html = r'<span style="text-decoration: underline">\1</span>'
    return re.sub(pattern, html, text)
def removebells(text):
    """Return *text* with every BEL character (octal 07) deleted."""
    bell = '\07'
    # Splitting on the bell and re-joining the pieces drops every occurrence.
    return ''.join(text.split(bell))
def removebackspaces(text):
    """Apply the backspaces recorded in *text* and drop erase-to-end-of-line codes.

    Two things are removed, repeatedly, until neither occurs any more:

    * a backspace (octal 010) together with the character in front of it,
      unless that character is a newline;
    * the three-character sequence ESC ``[`` ``K``.

    The text is processed in a single left-to-right pass.  The characters kept
    so far never contain a removable sequence, so a new one can only end at
    the character just appended; removing it there yields the same result as
    deleting the leftmost occurrence and rescanning from the start, without
    the quadratic cost of doing so.

    Returns the cleaned string.
    """
    kept = []
    for char in text:
        kept.append(char)
        if char == '\010':
            # A backspace needs a preceding character other than a newline.
            if len(kept) >= 2 and kept[-2] != '\n':
                del kept[-2:]
        elif char == 'K' and kept[-3:] == ['\033', '[', 'K']:
            del kept[-3:]
    # Joining on an empty slice of the input keeps the input's string type.
    return text[:0].join(kept)
# Skeleton of the generated page.  The single %s placeholder receives the
# converted transcript markup; the style sheet selects a monospaced font so
# the output lines up the way it did in the terminal.
template = '''\
<html>
<head>
<style type="text/css">
body {
font-family: Monaco,
"Bitstream Vera Sans Mono",
"Lucida Console",
Terminal,
monospace;
font-size: 14px;
}
</style>
</head>
<body>
%s
</body>
</html>
'''
# One pattern covering everything plaintext2html rewrites.  The literals below
# are concatenated into a single regular expression, one alternative per line.
re_string = re.compile(
    # characters that must be escaped in HTML
    r'(?P<htmlchars>[<&>])'
    # spaces and tabs at the start of a line
    r'|(?P<space>^[ \t]+)'
    # any style of line ending
    r'|(?P<lineend>\r\n|\r|\n)'
    # an http:// or ftp:// URL, preceded by whitespace or the start of a line
    # and followed by whitespace or the end of a line
    r'|(?P<protocal>(^|\s)((http|ftp)://.*?))(\s|$)',
    re.I | re.M | re.S,
)
def plaintext2html(text, tabstop=4):
    """Render a terminal transcript as a complete HTML page.

    Bells and backspaced-over characters are removed first, so that the
    backspace handling sees the raw characters and cannot eat part of an HTML
    entity or tag produced later.  Then ``<``, ``&`` and ``>`` are escaped,
    line endings become ``<br>``, leading indentation becomes non-breaking
    spaces, and the ANSI colour, bold and underline sequences are turned into
    ``<span>`` elements.  The markup is finally inserted into ``template``.

    text    -- the transcript, as a string.
    tabstop -- number of non-breaking spaces substituted for each leading tab.

    Returns the HTML document as a string.
    """
    def escape(fragment):
        # '&' has to be handled first so that the entities generated for '<'
        # and '>' are not escaped a second time.
        return (fragment.replace('&', '&amp;')
                        .replace('<', '&lt;')
                        .replace('>', '&gt;'))

    def do_sub(m):
        c = m.groupdict()
        if c['htmlchars']:
            return escape(c['htmlchars'])
        if c['lineend']:
            return '<br>'
        if c['space']:
            # Ordinary spaces collapse when rendered, so indentation has to
            # be written as non-breaking spaces to survive.
            indent = c['space'].replace('\t', ' ' * tabstop)
            return indent.replace(' ', '&nbsp;')
        # Only the URL alternative remains.  Its match also consumed the
        # whitespace that ended the URL (the pattern's last group), which has
        # to be written back or the URL runs into whatever follows it.
        terminator = m.groups()[-1] or ''
        if terminator == '\r' and m.string[m.end():m.end() + 1] == '\n':
            # First half of a CRLF pair: the '\n' is matched next and
            # produces the line break, so emit nothing here.
            terminator = ''
        elif terminator in ('\n', '\r'):
            terminator = '<br>'
        # The URL is matched as one unit, so the characters inside it never
        # reach the 'htmlchars' branch and must be escaped here.
        return escape(m.group('protocal')) + terminator

    cleaned = removebackspaces(removebells(text))
    result = re_string.sub(do_sub, cleaned)
    for color in ('cyan', 'blue', 'red', 'magenta', 'green'):
        result = recolor(color, result)
    result = bold(result)
    result = underline(result)
    return template % result
if __name__ == '__main__':
    # Command-line entry point: convert the transcript named by the last
    # argument and write the resulting HTML page to standard output.
    import sys
    if len(sys.argv) < 2:
        # With no argument sys.argv[-1] is this script itself, which would
        # silently be converted instead of reporting the mistake.
        sys.stderr.write('usage: %s TRANSCRIPT\n' % sys.argv[0])
        sys.exit(2)
    with open(sys.argv[-1]) as f:
        text = f.read()
    # Parenthesised so the line is valid both as a Python 2 print statement
    # and as a Python 3 function call.
    print(plaintext2html(text))
|
This was developed in answer to my question here, for printing such transcripts.
The code is based on the snippet here which converts text to html.
The code is ugly at the moment and doesn't cover all ANSI escape sequences. It covers the ones used by cmd2 for colouring, as that is where I need it. A small complexity, which now works by means of an ugly hack, is how to remove backspaces embedded in the transcript. This is the very inefficient removebackspaces function.
I tried it with Python 3.1 and it breaks in subtle ways: it runs, but gives mildly incorrect results. As I am not using Python 3 I won't bother further. I am happy to have any correction to this or other issues though.