Welcome, guest | Sign In | My Account | Store | Cart

This is a stab at converting a transcript generated by the Unix script command, which contains the ANSI escape sequences used to colour the terminal, into HTML.

Python, 103 lines
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
#!/usr/bin/env python
from __future__ import with_statement

import re
import cgi

# ANSI escape sequences keyed by attribute name.  For every attribute the
# True entry is the sequence that switches it on and the False entry is the
# sequence that switches it off again.  All colours share one "off" sequence.
_foreground_off = '\033[39m'
colorcodes = {
    'bold': {True: '\033[1m', False: '\033[22m'},
    'cyan': {True: '\033[36m', False: _foreground_off},
    'blue': {True: '\033[34m', False: _foreground_off},
    'red': {True: '\033[31m', False: _foreground_off},
    'magenta': {True: '\033[35m', False: _foreground_off},
    'green': {True: '\033[32m', False: _foreground_off},
    'underline': {True: '\033[4m', False: '\033[24m'},
}

def recolor(color, text):
    """Wrap each run of *text* printed in *color* in a coloured HTML span.

    A run starts at the "on" escape sequence registered for *color* in
    ``colorcodes`` and ends at the nearest following "off" sequence; both
    sequences are dropped and the text between them is kept inside
    ``<span style="color: ...">``.  Raises KeyError when *color* is not a
    key of ``colorcodes``.
    """
    codes = colorcodes[color]
    # '[' is the only regex metacharacter in the sequences, so escape just that.
    start = codes[True].replace('[', r'\[')
    stop = codes[False].replace('[', r'\[')
    pattern = '(?:' + start + ')(.*?)(?:' + stop + ')'
    span = r'''<span style="color: %s">\1</span>''' % color
    return re.sub(pattern, span, text)

def bold(text):
    """Turn every bold-on ... bold-off run in *text* into a bold HTML span.

    The escape sequences come from ``colorcodes['bold']``; they are removed
    and the enclosed text is wrapped in ``<span style="font-weight:bold">``.
    """
    sequences = colorcodes['bold']
    # Escape the literal '[' of each sequence before using it in a pattern.
    opener, closer = [sequences[flag].replace('[', r'\[') for flag in (True, False)]
    pattern = '(?:%s)(.*?)(?:%s)' % (opener, closer)
    return re.sub(pattern, r'<span style="font-weight:bold">\1</span>', text)

def underline(text):
    """Turn every underline-on ... underline-off run in *text* into a span.

    The escape sequences come from ``colorcodes['underline']``; they are
    removed and the enclosed text is wrapped in
    ``<span style="text-decoration: underline">``.
    """
    switch = colorcodes['underline']
    begin = switch[True].replace('[', r'\[')    # '[' must be literal in the regex
    finish = switch[False].replace('[', r'\[')
    matcher = re.compile('(?:' + begin + ')(.*?)(?:' + finish + ')')
    return matcher.sub(r'<span style="text-decoration: underline">\1</span>', text)

def removebells(text):
    """Return *text* with every BEL (ASCII 7) character deleted."""
    bell = '\07'
    # Splitting on the bell and re-joining drops each occurrence.
    return ''.join(text.split(bell))

def removebackspaces(text):
    r"""Apply the backspaces in *text* and strip the ``ESC [ K`` sequences.

    A backspace (``\010``) is deleted together with the character in front
    of it.  A backspace that has nothing in front of it, or only a newline,
    is left in place.  Every ``\033[K`` sequence is deleted.  Deletions
    cascade: text that only becomes a backspace pair or an ``\033[K``
    sequence after an earlier deletion is deleted as well.

    The result is the same as repeatedly deleting the leftmost match of
    ``(.\010)|(\033\[K)`` until none is left, but it is computed in one pass
    over the input instead of rescanning and copying the string once per
    deletion.

    text -- the transcript text to clean up.
    Returns the cleaned-up string.
    """
    kept = []  # characters that have survived so far, used as a stack
    for ch in text:
        # '.' in the reference pattern never matches a newline, so a
        # backspace directly after '\n' deletes nothing and is kept.
        if ch == '\010' and kept and kept[-1] != '\n':
            kept.pop()
            continue
        kept.append(ch)
        # A sequence can only be completed by the 'K' that was just pushed.
        if ch == 'K' and kept[-3:] == ['\033', '[', 'K']:
            del kept[-3:]
    return ''.join(kept)

# Skeleton of the generated HTML page.  The single %s placeholder is filled
# with the converted transcript markup.  The page uses a monospaced font so
# the transcript keeps its terminal-like alignment.
template = '''\
<html>
<head>
    <style type="text/css">
    body {
        font-family:    Monaco,
                        "Bitstream Vera Sans Mono",
                        "Lucida Console",
                        Terminal,
                        monospace;
        font-size:      14px;
    }
    </style>
</head>
<body>
%s
</body>
</html>
'''

# One alternative per kind of token that has to be rewritten for HTML:
#   htmlchars - a character that is special in HTML markup
#   space     - the run of blanks/tabs that starts a line
#   lineend   - any style of line break
#   protocal  - an http:// or ftp:// URL together with the whitespace
#               character (if any) in front of it
# The whitespace that ends a URL is only looked at, not consumed, so that a
# line break following a URL is still seen and converted by 'lineend'.
re_string = re.compile(
    r'(?P<htmlchars>[<&>])'
    r'|(?P<space>^[ \t]+)'
    r'|(?P<lineend>\r\n|\r|\n)'
    r'|(?P<protocal>(^|\s)((http|ftp)://.*?))(?=\s|$)',
    re.S | re.M | re.I)


def _escape_html(s):
    """Return *s* with the characters &, < and > replaced by HTML entities."""
    # '&' has to go first, otherwise the entities produced for '<' and '>'
    # would themselves be escaped again.
    return s.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')


def plaintext2html(text, tabstop=4):
    """Convert a terminal transcript into a complete HTML page.

    Steps, in order:
      1. bells are removed and backspaces are applied while the text is
         still raw, so a backspace can never eat part of generated markup
         such as ``&lt;`` or ``<br>``;
      2. HTML special characters are escaped, leading blanks/tabs become
         non-breaking spaces, line breaks become ``<br>`` and URLs are
         passed through with their special characters escaped;
      3. the colour, bold and underline escape sequences are turned into
         ``<span>`` elements.

    text    -- the raw transcript.
    tabstop -- number of non-breaking spaces a leading tab expands to.
    Returns the page as a string: ``template`` filled with the markup.
    """
    def do_sub(m):
        c = m.groupdict()
        if c['htmlchars']:
            return _escape_html(c['htmlchars'])
        if c['lineend']:
            return '<br>'
        if c['space']:
            # Leading whitespace would collapse in HTML; keep the indent.
            t = m.group().replace('\t', '&nbsp;' * tabstop)
            return t.replace(' ', '&nbsp;')
        # Only the URL alternative is left.  The URL swallows any '&', '<'
        # or '>' it contains, so these have to be escaped here.
        return _escape_html(m.group('protocal'))

    text = removebackspaces(removebells(text))
    result = re_string.sub(do_sub, text)
    for color in ('cyan', 'blue', 'red', 'magenta', 'green'):
        result = recolor(color, result)
    result = bold(result)
    result = underline(result)

    return template % result


# Command-line entry point: convert the transcript file named by the last
# argument and write the resulting HTML page to standard output.
if __name__ == '__main__':
    import sys
    # With no argument sys.argv[-1] is this script's own path, which would
    # silently convert the source code instead of a transcript.
    if len(sys.argv) < 2:
        sys.stderr.write('usage: %s TRANSCRIPT\n' % sys.argv[0])
        sys.exit(2)
    with open(sys.argv[-1]) as f:
        text = f.read()
    # sys.stdout.write is valid in both Python 2 and Python 3, unlike the
    # print statement; the newline print used to add is appended by hand.
    sys.stdout.write(plaintext2html(text) + '\n')

This was developed in answer to my question here, for printing such transcripts.

The code is based on the snippet here which converts text to html.

The code is ugly at the moment and doesn't cover all ANSI escape sequences. It covers the ones used by cmd2 for colouring, as that is where I need it. One small complication, currently handled by an ugly hack, is removing the backspaces embedded in the transcript; this is done by the very inefficient removebackspaces.

I tried it with Python 3.1 and it breaks in subtle ways: it runs, but gives mildly incorrect results. As I am not using Python 3, I won't pursue this further. I am happy to receive corrections for this or any other issue, though.