# Welcome, guest | Sign In | My Account | Store | Cart
"""
Convert text/enriched MIME type to text/html MIME type.

Based on the program in Appendix B of RFC 1896 -- see
http://www.rfc-editor.org/rfc/rfc1896.txt. 

However, it was entirely rewritten for Python and 
refactored for flexibility and comprehension. Support 
for color and fontfamily was added.

Usage: python enriched2html.py < source.txt > target.html
"""
# Module metadata: list of author contact strings and a version label.
__author__=["Jack Trainor (jacktrainor@gmail.com)",]
__version__="May 2009"

import array
import sys
import os, os.path


class StdIo(array.array):
    """Character buffer with a read cursor and C stdio-style helpers.

    The buffer is an ``array.array`` of single characters.  ``position`` is
    the index of the next character ``getc`` will return; writes made with
    ``putc``/``puts`` always append at the end and do not move the cursor.

    The array typecode is chosen at construction time so the class works on
    both old and new interpreters: ``'c'`` where it exists (Python 2), else
    ``'w'`` or ``'u'`` (Python 3 unicode character arrays).
    """

    def __new__(cls, tc='c'):
        # NOTE: the first positional argument is really the initial text
        # (it is handed to __init__ as ``s``); it is deliberately ignored
        # here and only kept so the constructor signature stays the same.
        for typecode in ('c', 'w', 'u'):
            try:
                return super(StdIo, cls).__new__(cls, typecode)
            except ValueError:
                # This interpreter does not know the typecode; try the next.
                continue
        raise ValueError("array module offers no character typecode")

    def __init__(self, s=""):
        """Fill the buffer with the text ``s`` and rewind the read cursor."""
        self._append_text(s)
        self.position = 0

    def __repr__(self):
        """Return the whole buffer contents as a string.

        ``str(buffer)`` falls back to this method, which is how callers
        retrieve the accumulated output.
        """
        if self.typecode == 'c':
            return self.tostring()
        return self.tounicode()

    def _append_text(self, s):
        """Append every character of ``s`` using the typecode's own API."""
        if self.typecode == 'c':
            self.fromstring(s)
        else:
            self.fromunicode(s)

    def getc(self):
        """Return the next character and advance, or None at end of buffer."""
        if self.position >= len(self):
            return None
        c = self[self.position]
        self.position += 1
        return c

    def lookaheadc(self):
        """Return the next character without consuming it (None at end)."""
        c = self.getc()
        self.ungetc(c)
        return c

    def ungetc(self, c):
        """Step the cursor back one character.

        ``c`` is only used as a flag: passing None (the end-of-buffer value
        returned by ``getc``) leaves the cursor untouched, because ``getc``
        did not advance in that case.
        """
        if c is not None and self.position > 0:
            self.position -= 1

    def putc(self, c):
        """Append the single character ``c`` to the end of the buffer."""
        self.append(c)

    def puts(self, s):
        """Append the string ``s`` to the end of the buffer."""
        self._append_text(s)

    def get_text_until(self, delimiters=()):
        """Read characters up to a delimiter or the end of the buffer.

        Returns a ``(c, text)`` pair: ``text`` holds the characters read
        before the stop, and ``c`` is the delimiter that stopped the scan
        (already consumed) or None when the buffer ran out.
        """
        chars = []
        c = self.getc()
        while c is not None and c not in delimiters:
            chars.append(c)
            c = self.getc()
        return c, "".join(chars)
        
        
# Maps a lower-cased text/enriched command name to the HTML tag name that
# is written for it.  An empty value means there is no HTML equivalent; the
# writer then emits the command name prefixed with "?" instead of a tag.
COMMAND_MAP = {
    # "param" carries arguments for the preceding command rather than markup.
    "param": "",
    "nofill": "pre",
    "bold": "b",
    "italic": "i",
    "underline": "u",
    "fixed": "tt",
    "center": "center",
    "excerpt": "blockquote",
    # Both of these become <font>; the attribute comes from their <param>.
    "color": "font",
    "fontfamily": "font",
# Commands below are recognised but not implemented (no HTML tag assigned).
    "paraindent": "",
    "indentright": "",
    "flushleft": "",
    "flushright": "",
    "flushboth": "",
    "bigger": "",
    "smaller": "",
    "indent": ""
}

class TextBlock(object):
    """A run of literal text; the content lives in the ``text`` attribute."""

    def __init__(self):
        # Starts empty; the creator assigns the real content afterwards.
        self.text = ""

    def __repr__(self):
        # Debug form: the text wrapped in square brackets.
        contents = self.text
        return "[%s]" % contents
        
class CommandBlock(TextBlock):
    """A formatting command; ``text`` holds the command name.

    ``end`` tells an opening command (False) from a closing one (True) and
    ``params`` collects the parameter strings attached to the command.
    """

    def __init__(self):
        super(CommandBlock, self).__init__()
        self.params = []
        self.end = False

    def __repr__(self):
        # Debug form: command name plus the end flag rendered as 0 or 1.
        fields = (self.text, self.end)
        return "[cmd: %s end: %d]" % fields

                
def is_text_block(block):
    """Tell whether ``block`` is a TextBlock (or a subclass instance).

    A falsy ``block`` (e.g. None) is handed back unchanged instead of False.
    """
    if not block:
        return block
    return isinstance(block, TextBlock)

def is_command_block(block):
    """Tell whether ``block`` is a CommandBlock.

    A falsy ``block`` (e.g. None) is handed back unchanged instead of False.
    """
    if not block:
        return block
    return isinstance(block, CommandBlock)

def is_param_block(block):
    """Tell whether ``block`` is a command block whose command is "param".

    When ``block`` is not a command block, the falsy result of that check
    is returned as-is.
    """
    verdict = is_command_block(block)
    if verdict:
        verdict = (block.text == "param")
    return verdict

def is_end_param_block(block):
    """Tell whether ``block`` is a closing "param" command block.

    When ``block`` is not a param block, the falsy result of that check is
    returned as-is.
    """
    verdict = is_param_block(block)
    if verdict:
        verdict = (block.end == True)
    return verdict


class Converter(object):
    """Convert text/enriched text to text/html.

    Conversion runs in two passes: ``read_blocks`` splits the input into a
    list of text blocks and command blocks, then ``write_blocks`` walks that
    list and writes the HTML for each block to the output buffer.
    """

    # HTML attribute that receives the first <param> of a command.
    _ATTRIBUTE_BY_COMMAND = {
        "color": "color",
        "fontfamily": "face",
    }

    # Characters that must be escaped when copied into HTML text.
    _TEXT_ESCAPES = {
        '<': "&lt;",
        '>': "&gt;",
        '&': "&amp;",
    }

    def __init__(self, text):
        """Remember ``text`` as the source to convert; nothing runs yet."""
        self.text = text
        self.input = None
        self.output = None
        self.blocks = []
        self.block_index = 0
        # Nesting depth of <nofill>; newlines are kept verbatim while > 0.
        self.no_fill_count = 0

    def execute(self):
        """Convert the text given to the constructor and return the HTML."""
        return self.convert(self.text)

    def convert(self, text):
        """Convert ``text`` and return the resulting HTML string.

        All per-run state is reset first so the same Converter can be used
        for several conversions without earlier blocks leaking into later
        output.  A single newline is appended to the generated HTML.
        """
        self.blocks = []
        self.block_index = 0
        self.no_fill_count = 0
        self.input = StdIo(text)
        self.output = StdIo()
        self.read_blocks()
        self.write_blocks()
        self.output.puts('\n')
        return str(self.output)

    def get_block(self):
        """Return the next block and advance, or None past the last block."""
        if self.block_index >= len(self.blocks):
            return None
        block = self.blocks[self.block_index]
        self.block_index += 1
        return block

    def unget_block(self, block):
        """Step back one block.

        ``block`` is only a flag: a falsy value (the None that ``get_block``
        returns at the end) means nothing was consumed, so nothing is undone.
        """
        if block and self.block_index > 0:
            self.block_index -= 1

    def get_params(self, command_block):
        """Collect the parameters that directly follow ``command_block``.

        Consumes consecutive (param command, text, closing param command)
        triples and appends each text to ``command_block.params``.  The
        first triple that does not match is pushed back and ends the scan.
        """
        while True:
            block_1 = self.get_block()
            block_2 = self.get_block()
            block_3 = self.get_block()
            if (is_param_block(block_1) and is_text_block(block_2)
                    and is_end_param_block(block_3)):
                command_block.params.append(block_2.text)
            else:
                # Undo in reverse order so the index lands where it started.
                self.unget_block(block_3)
                self.unget_block(block_2)
                self.unget_block(block_1)
                break

    def write_blocks(self):
        """Write HTML for every parsed block, starting from the first."""
        self.block_index = 0
        block = self.get_block()
        while block:
            if is_command_block(block):
                if not block.end:
                    # Opening commands may carry <param> arguments.
                    self.get_params(block)
                self.write_command_block(block)
            elif is_text_block(block):
                self.write_text_block(block)
            block = self.get_block()

    @staticmethod
    def _escape_attribute(value):
        """Escape ``value`` for use inside a double-quoted HTML attribute."""
        # '&' must be handled first so later entities are not re-escaped.
        for raw, entity in (('&', "&amp;"), ('<', "&lt;"),
                            ('>', "&gt;"), ('"', "&quot;")):
            value = value.replace(raw, entity)
        return value

    def write_command_block(self, block):
        """Write the opening or closing HTML tag for a command block.

        Commands without an HTML mapping are written as the command name
        prefixed with "?".  "color" and "fontfamily" put their first
        parameter into the tag's attribute; when no parameter was supplied
        the bare tag is written instead of failing.  "nofill" additionally
        adjusts the nofill nesting depth.
        """
        command = block.text
        mapped_command = COMMAND_MAP.get(command, "")
        if not mapped_command:
            mapped_command = "?" + command

        if block.end:
            html_command = "</%s>" % mapped_command
        else:
            attribute = self._ATTRIBUTE_BY_COMMAND.get(command)
            if attribute and block.params:
                # Parameter text comes straight from the input document, so
                # it is escaped before being placed inside the quotes.
                html_command = '<%s %s="%s">' % (
                    mapped_command, attribute,
                    self._escape_attribute(block.params[0]))
            else:
                html_command = "<%s>" % mapped_command

        if command == "nofill":
            if block.end:
                self.no_fill_count -= 1
            else:
                self.no_fill_count += 1

        self.output.puts(html_command)

    def write_text_block(self, block):
        """Write a text block, escaping markup characters.

        Outside <nofill>, the first newline of a run becomes a space and
        each further newline of the same run becomes "<br/>".  Inside
        <nofill>, newlines are copied unchanged.
        """
        filling = self.no_fill_count <= 0
        escapes = self._TEXT_ESCAPES
        pieces = []
        newline_run = 0
        for c in block.text:
            if c == '\n' and filling:
                pieces.append(' ' if newline_run == 0 else "<br/>")
                newline_run += 1
                continue
            newline_run = 0
            pieces.append(escapes.get(c, c))
        # One write for the whole block instead of one per character.
        self.output.puts("".join(pieces))

    def debug_blocks(self):
        """Print every block from the current block index to the end.

        Debugging aid; it leaves the block index at the end of the list.
        """
        block = self.get_block()
        while block:
            print(block)
            block = self.get_block()

    def read_command_block(self, c):
        """Read a command whose opening '<' was already consumed.

        Reads up to the closing '>' (or end of input).  A leading '/' marks
        a closing command and is stripped; the name is lower-cased.  Returns
        ``(stop_char, block)``.  The ``c`` argument is not used.
        """
        block = CommandBlock()
        c, text = self.input.get_text_until(">")
        if text and text[0] == '/':
            text = text[1:]
            block.end = True
        block.text = text.lower()
        return c, block

    def read_text_block(self, c):
        """Read literal text up to the next '<' or the end of input.

        ``c`` is the already-consumed first character of the block.  A
        terminating '<' is pushed back so the caller sees it again.  Returns
        ``(stop_char, block)``.
        """
        block = TextBlock()
        c2, text = self.input.get_text_until('<')
        block.text = c + text
        if c2 == '<':
            self.input.ungetc(c2)
        return c2, block

    def read_blocks(self):
        """Split the input into text and command blocks, appended in order."""
        c = self.input.getc()
        while c:
            if c == '<':
                if self.input.lookaheadc() == '<':
                    # "<<" stands for a literal '<': drop the second one and
                    # start a text block with a single '<'.
                    self.input.getc()
                    c, block = self.read_text_block('<')
                else:
                    c, block = self.read_command_block('<')
            else:
                c, block = self.read_text_block(c)
            self.blocks.append(block)
            c = self.input.getc()

def convert_file(fp_in=sys.stdin):
    """Read all text from ``fp_in`` and return it converted to HTML.

    ``fp_in`` is any object with a ``read()`` method; it defaults to the
    ``sys.stdin`` object that was current when this function was defined.
    """
    converter = Converter(fp_in.read())
    return converter.execute()

def output_html(html, fp_out=sys.stdout):
    """Write the ``html`` string to ``fp_out``.

    ``fp_out`` is any object with a ``write()`` method; it defaults to the
    ``sys.stdout`` object that was current when this function was defined.
    """
    emit = fp_out.write
    emit(html)

def main():
    """Convert the default input stream and write the HTML to the default output."""
    output_html(convert_file())
    
# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    main()

# History