"""latex.py

Character translation utilities for LaTeX-formatted text.

Usage:
 - unicode(string,'latex')
 - ustring.decode('latex')
are both available just by letting "import latex" find this file.
 - unicode(string,'latex+latin1')
 - ustring.decode('latex+latin1')
where latin1 can be replaced by any other known encoding, also
become available by calling latex.register().

We also make public a dictionary latex_equivalents,
mapping ord(unicode char) to LaTeX code.

D. Eppstein, October 2003.
"""

from __future__ import generators
import codecs
import re
from sets import Set

def register():
    """Enable encodings of the form 'latex+x' where x describes another encoding.
    Unicode characters are translated to or from x when possible, otherwise
    expanded to latex.
    """
    codecs.register(_registry)
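
# A minimal sketch of the hybrid form enabled by register(), assuming the
# latin-1 codec is available: characters latin-1 can represent pass through
# unchanged, and everything else is expanded to LaTeX:
#
#   latex.register()
#   u'caf\xe9 \u2264 10'.encode('latex+latin1')
#   # -> 'caf\xe9 {\\mbox{$\\leq$}} 10'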

def getregentry():
    """Encodings module API."""
    return _registry('latex')

def _registry(encoding):
    if encoding == 'latex':
        encoding = None
    elif encoding.startswith('latex+'):
        encoding = encoding[6:]
    else:
        return None
        
    class Codec(codecs.Codec):
        def encode(self,input,errors='strict'):
            """Convert unicode string to latex."""
            output = []
            for c in input:
                if encoding:
                    try:
                        output.append(c.encode(encoding))
                        continue
                    except (UnicodeError, LookupError):
                        pass    # fall back to the LaTeX expansion below
                if ord(c) in latex_equivalents:
                    output.append(latex_equivalents[ord(c)])
                else:
                    output += ['{\\char', str(ord(c)), '}']
            return ''.join(output), len(input)
            
        def decode(self,input,errors='strict'):
            """Convert latex source string to unicode."""
            if encoding:
                input = unicode(input,encoding,errors)

            # Note: we may get buffer objects here.
            # It is not permissible to call join on buffer objects
            # but we can make them joinable by calling unicode.
            # This should always be safe since we are supposed
            # to be producing unicode output anyway.
            x = map(unicode,_unlatex(input))
            return u''.join(x), len(input)
    
    class StreamWriter(Codec,codecs.StreamWriter):
        pass
            
    class StreamReader(Codec,codecs.StreamReader):
        pass

    return (Codec().encode,Codec().decode,StreamReader,StreamWriter)

def _tokenize(tex):
    """Convert latex source into sequence of single-token substrings."""
    start = 0
    try:
        # skip quickly across boring stuff
        pos = _stoppers.finditer(tex).next().span()[0]
    except StopIteration:
        yield tex
        return

    while 1:
        if pos > start:
            yield tex[start:pos]
            if tex[start] == '\\' and not (tex[pos-1].isdigit() and tex[start+1].isalpha()):
                while pos < len(tex) and tex[pos].isspace(): # skip blanks after csname
                    pos += 1

        while pos < len(tex) and tex[pos] in _ignore:
            pos += 1    # flush control characters
        if pos >= len(tex):
            return
        start = pos
        if tex[pos:pos+2] in ('$$', '/~'):    # protect ~ in urls
            pos += 2
        elif tex[pos].isdigit():
            while pos < len(tex) and tex[pos].isdigit():
                pos += 1
        elif tex[pos] == '-':
            while pos < len(tex) and tex[pos] == '-':
                pos += 1
        elif tex[pos] != '\\' or pos == len(tex) - 1:
            pos += 1
        elif not tex[pos+1].isalpha():
            pos += 2
        else:
            pos += 1
            while pos < len(tex) and tex[pos].isalpha():
                pos += 1
            if tex[start:pos] == '\\char' or tex[start:pos] == '\\accent':
                while pos < len(tex) and tex[pos].isdigit():
                    pos += 1
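
# For reference, a small illustration of the token stream this produces:
#   list(_tokenize("{\\'e}"))  ->  ['{', "\\'", 'e', '}']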

class _unlatex:
    """Convert tokenized tex into sequence of unicode strings.  Helper for decode()."""

    def __iter__(self):
        """Turn self into an iterator.  It already is one, nothing to do."""
        return self

    def __init__(self,tex):
        """Create a new token converter from a string."""
        self.tex = tuple(_tokenize(tex))  # turn tokens into indexable list
        self.pos = 0                    # index of first unprocessed token 
        self.lastoutput = 'x'           # lastoutput must always be nonempty string
    
    def __getitem__(self,n):
        """Return token at offset n from current pos."""
        p = self.pos + n
        t = self.tex
        return p < len(t) and t[p] or None

    def next(self):
        """Find and return another piece of converted output."""
        if self.pos >= len(self.tex):
            raise StopIteration
        nextoutput = self.chunk()
        if self.lastoutput[0] == '\\' and self.lastoutput[-1].isalpha() and nextoutput[0].isalpha():
            nextoutput = ' ' + nextoutput   # add extra space to terminate csname
        self.lastoutput = nextoutput
        return nextoutput
    
    def chunk(self):
        """Grab another set of input tokens and convert them to an output string."""
        for delta,c in self.candidates(0):
            if c in _l2u:
                self.pos += delta
                return unichr(_l2u[c])
            elif len(c) == 2 and c[1] == 'i' and (c[0],'\\i') in _l2u:
                self.pos += delta       # correct failure to undot i
                return unichr(_l2u[(c[0],'\\i')])
            elif len(c) == 1 and c[0].startswith('\\char') and c[0][5:].isdigit():
                self.pos += delta
                return unichr(int(c[0][5:]))
    
        # nothing matches, just pass through token as-is
        self.pos += 1
        return self[-1]
    
    def candidates(self,offset):
        """Generate pairs delta,c where c is a token or tuple of tokens from tex
        (after deleting extraneous brackets starting at pos) and delta
        is the length of the tokens prior to bracket deletion.
        """
        t = self[offset]
        if t in _blacklist:
            return
        elif t == '{':
            for delta,c in self.candidates(offset+1):
                if self[offset+delta+1] == '}':
                    yield delta+2,c
        elif t == '\\mbox':
            for delta,c in self.candidates(offset+1):
                yield delta+1,c
        elif t == '$' and self[offset+2] == '$':
            yield 3, (t,self[offset+1],t)
        else:
            q = self[offset+1]
            if q == '{' and self[offset+3] == '}':
                yield 4, (t,self[offset+2])
            elif q:
                yield 2, (t,q)
            yield 1, t

latex_equivalents = {
    0x0009: ' ',
    0x000a: '\n',
    0x0023: '{\\#}',
    0x0026: '{\\&}',
    0x00a0: '{~}',
    0x00a1: '{!`}',
    0x00a2: '{\\not{c}}',
    0x00a3: '{\\pounds}',
    0x00a7: '{\\S}',
    0x00a8: '{\\"{}}',
    0x00a9: '{\\copyright}',
    0x00af: '{\\={}}',
    0x00ac: '{\\neg}',
    0x00ad: '{\\-}',
    0x00b0: '{\\mbox{$^\\circ$}}',
    0x00b1: '{\\mbox{$\\pm$}}',
    0x00b2: '{\\mbox{$^2$}}',
    0x00b3: '{\\mbox{$^3$}}',
    0x00b4: "{\\'{}}",
    0x00b5: '{\\mbox{$\\mu$}}',
    0x00b6: '{\\P}',
    0x00b7: '{\\mbox{$\\cdot$}}',
    0x00b8: '{\\c{}}',
    0x00b9: '{\\mbox{$^1$}}',
    0x00bf: '{?`}',
    0x00c0: '{\\`A}',
    0x00c1: "{\\'A}",
    0x00c2: '{\\^A}',
    0x00c3: '{\\~A}',
    0x00c4: '{\\"A}',
    0x00c5: '{\\AA}',
    0x00c6: '{\\AE}',
    0x00c7: '{\\c{C}}',
    0x00c8: '{\\`E}',
    0x00c9: "{\\'E}",
    0x00ca: '{\\^E}',
    0x00cb: '{\\"E}',
    0x00cc: '{\\`I}',
    0x00cd: "{\\'I}",
    0x00ce: '{\\^I}',
    0x00cf: '{\\"I}',
    0x00d1: '{\\~N}',
    0x00d2: '{\\`O}',
    0x00d3: "{\\'O}",
    0x00d4: '{\\^O}',
    0x00d5: '{\\~O}',
    0x00d6: '{\\"O}',
    0x00d7: '{\\mbox{$\\times$}}',
    0x00d8: '{\\O}',
    0x00d9: '{\\`U}',
    0x00da: "{\\'U}",
    0x00db: '{\\^U}',
    0x00dc: '{\\"U}',
    0x00dd: "{\\'Y}",
    0x00df: '{\\ss}',
    0x00e0: '{\\`a}',
    0x00e1: "{\\'a}",
    0x00e2: '{\\^a}',
    0x00e3: '{\\~a}',
    0x00e4: '{\\"a}',
    0x00e5: '{\\aa}',
    0x00e6: '{\\ae}',
    0x00e7: '{\\c{c}}',
    0x00e8: '{\\`e}',
    0x00e9: "{\\'e}",
    0x00ea: '{\\^e}',
    0x00eb: '{\\"e}',
    0x00ec: '{\\`\\i}',
    0x00ed: "{\\'\\i}",
    0x00ee: '{\\^\\i}',
    0x00ef: '{\\"\\i}',
    0x00f1: '{\\~n}',
    0x00f2: '{\\`o}',
    0x00f3: "{\\'o}",
    0x00f4: '{\\^o}',
    0x00f5: '{\\~o}',
    0x00f6: '{\\"o}',
    0x00f7: '{\\mbox{$\\div$}}',
    0x00f8: '{\\o}',
    0x00f9: '{\\`u}',
    0x00fa: "{\\'u}",
    0x00fb: '{\\^u}',
    0x00fc: '{\\"u}',
    0x00fd: "{\\'y}",
    0x00ff: '{\\"y}',
    
    0x0100: '{\\=A}',
    0x0101: '{\\=a}',
    0x0102: '{\\u{A}}',
    0x0103: '{\\u{a}}',
    0x0104: '{\\c{A}}',
    0x0105: '{\\c{a}}',
    0x0106: "{\\'C}",
    0x0107: "{\\'c}",
    0x0108: "{\\^C}",
    0x0109: "{\\^c}",
    0x010a: "{\\.C}",
    0x010b: "{\\.c}",
    0x010c: "{\\v{C}}",
    0x010d: "{\\v{c}}",
    0x010e: "{\\v{D}}",
    0x010f: "{\\v{d}}",
    0x0112: '{\\=E}',
    0x0113: '{\\=e}',
    0x0114: '{\\u{E}}',
    0x0115: '{\\u{e}}',
    0x0116: '{\\.E}',
    0x0117: '{\\.e}',
    0x0118: '{\\c{E}}',
    0x0119: '{\\c{e}}',
    0x011a: "{\\v{E}}",
    0x011b: "{\\v{e}}",
    0x011c: '{\\^G}',
    0x011d: '{\\^g}',
    0x011e: '{\\u{G}}',
    0x011f: '{\\u{g}}',
    0x0120: '{\\.G}',
    0x0121: '{\\.g}',
    0x0122: '{\\c{G}}',
    0x0123: '{\\c{g}}',
    0x0124: '{\\^H}',
    0x0125: '{\\^h}',
    0x0128: '{\\~I}',
    0x0129: '{\\~\\i}',
    0x012a: '{\\=I}',
    0x012b: '{\\=\\i}',
    0x012c: '{\\u{I}}',
    0x012d: '{\\u\\i}',
    0x012e: '{\\c{I}}',
    0x012f: '{\\c{i}}',
    0x0130: '{\\.I}',
    0x0131: '{\\i}',
    0x0132: '{IJ}',
    0x0133: '{ij}',
    0x0134: '{\\^J}',
    0x0135: '{\\^\\j}',
    0x0136: '{\\c{K}}',
    0x0137: '{\\c{k}}',
    0x0139: "{\\'L}",
    0x013a: "{\\'l}",
    0x013b: "{\\c{L}}",
    0x013c: "{\\c{l}}",
    0x013d: "{\\v{L}}",
    0x013e: "{\\v{l}}",
    0x0141: '{\\L}',
    0x0142: '{\\l}',
    0x0143: "{\\'N}",
    0x0144: "{\\'n}",
    0x0145: "{\\c{N}}",
    0x0146: "{\\c{n}}",
    0x0147: "{\\v{N}}",
    0x0148: "{\\v{n}}",
    0x014c: '{\\=O}',
    0x014d: '{\\=o}',
    0x014e: '{\\u{O}}',
    0x014f: '{\\u{o}}',
    0x0150: '{\\H{O}}',
    0x0151: '{\\H{o}}',
    0x0152: '{\\OE}',
    0x0153: '{\\oe}',
    0x0154: "{\\'R}",
    0x0155: "{\\'r}",
    0x0156: "{\\c{R}}",
    0x0157: "{\\c{r}}",
    0x0158: "{\\v{R}}",
    0x0159: "{\\v{r}}",
    0x015a: "{\\'S}",
    0x015b: "{\\'s}",
    0x015c: "{\\^S}",
    0x015d: "{\\^s}",
    0x015e: "{\\c{S}}",
    0x015f: "{\\c{s}}",
    0x0160: "{\\v{S}}",
    0x0161: "{\\v{s}}",
    0x0162: "{\\c{T}}",
    0x0163: "{\\c{t}}",
    0x0164: "{\\v{T}}",
    0x0165: "{\\v{t}}",
    0x0168: "{\\~U}",
    0x0169: "{\\~u}",
    0x016a: "{\\=U}",
    0x016b: "{\\=u}",
    0x016c: "{\\u{U}}",
    0x016d: "{\\u{u}}",
    0x016e: "{\\r{U}}",
    0x016f: "{\\r{u}}",
    0x0170: "{\\H{U}}",
    0x0171: "{\\H{u}}",
    0x0172: "{\\c{U}}",
    0x0173: "{\\c{u}}",
    0x0174: "{\\^W}",
    0x0175: "{\\^w}",
    0x0176: "{\\^Y}",
    0x0177: "{\\^y}",
    0x0178: '{\\"Y}',
    0x0179: "{\\'Z}",
    0x017a: "{\\'z}",
    0x017b: "{\\.Z}",
    0x017c: "{\\.z}",
    0x017d: "{\\v{Z}}",
    0x017e: "{\\v{z}}",

    0x01c4: "{D\\v{Z}}",
    0x01c5: "{D\\v{z}}",
    0x01c6: "{d\\v{z}}",
    0x01c7: "{LJ}",
    0x01c8: "{Lj}",
    0x01c9: "{lj}",
    0x01ca: "{NJ}",
    0x01cb: "{Nj}",
    0x01cc: "{nj}",
    0x01cd: "{\\v{A}}",
    0x01ce: "{\\v{a}}",
    0x01cf: "{\\v{I}}",
    0x01d0: "{\\v\\i}",
    0x01d1: "{\\v{O}}",
    0x01d2: "{\\v{o}}",
    0x01d3: "{\\v{U}}",
    0x01d4: "{\\v{u}}",
    0x01e6: "{\\v{G}}",
    0x01e7: "{\\v{g}}",
    0x01e8: "{\\v{K}}",
    0x01e9: "{\\v{k}}",
    0x01ea: "{\\c{O}}",
    0x01eb: "{\\c{o}}",
    0x01f0: "{\\v\\j}",
    0x01f1: "{DZ}",
    0x01f2: "{Dz}",
    0x01f3: "{dz}",
    0x01f4: "{\\'G}",
    0x01f5: "{\\'g}",
    0x01fc: "{\\'\\AE}",
    0x01fd: "{\\'\\ae}",
    0x01fe: "{\\'\\O}",
    0x01ff: "{\\'\\o}",

    0x02c6: '{\\^{}}',
    0x02dc: '{\\~{}}',
    0x02d8: '{\\u{}}',
    0x02d9: '{\\.{}}',
    0x02da: "{\\r{}}",
    0x02dd: '{\\H{}}',
    0x02db: '{\\c{}}',
    0x02c7: '{\\v{}}',
    
    0x03c0: '{\\mbox{$\\pi$}}',
    # consider adding more Greek here
    
    0xfb01: '{fi}',
    0xfb02: '{fl}',
    
    0x2013: '{--}',
    0x2014: '{---}',
    0x2018: "{`}",
    0x2019: "{'}",
    0x201c: "{``}",
    0x201d: "{''}",
    0x2020: "{\\dag}",
    0x2021: "{\\ddag}",
    0x2122: "{\\mbox{$^\\mbox{TM}$}}",
    0x2022: "{\\mbox{$\\bullet$}}",
    0x2026: "{\\ldots}",
    0x2202: "{\\mbox{$\\partial$}}",
    0x220f: "{\\mbox{$\\prod$}}",
    0x2211: "{\\mbox{$\\sum$}}",
    0x221a: "{\\mbox{$\\surd$}}",
    0x221e: "{\\mbox{$\\infty$}}",
    0x222b: "{\\mbox{$\\int$}}",
    0x2248: "{\\mbox{$\\approx$}}",
    0x2260: "{\\mbox{$\\neq$}}",
    0x2264: "{\\mbox{$\\leq$}}",
    0x2265: "{\\mbox{$\\geq$}}",
}
for _i in range(0x0020):
    if _i not in latex_equivalents:
        latex_equivalents[_i] = ''
for _i in range(0x0020,0x007f):
    if _i not in latex_equivalents:
        latex_equivalents[_i] = chr(_i)

# Characters that should be ignored and not output in tokenization
_ignore = Set([chr(i) for i in range(32)+[127]]) - Set('\t\n\r')

# Regexp of chars not in blacklist, for quick start of tokenize
_stoppers = re.compile('[\x00-\x1f!$\\-?\\{~\\\\`\']')

_blacklist = Set(' \n\r')
_blacklist.add(None)    # shortcut candidate generation at end of data

# Construction of inverse translation table
_l2u = {
    '\\ ': ord(' ')   # unexpanding space makes no sense in non-TeX contexts
}

for _tex in latex_equivalents:
    if _tex <= 0x0020 or (_tex <= 0x007f and len(latex_equivalents[_tex]) <= 1):
        continue    # boring entry
    _toks = tuple(_tokenize(latex_equivalents[_tex]))
    if _toks[0] == '{' and _toks[-1] == '}':
        _toks = _toks[1:-1]
    if _toks[0].isalpha():
        continue    # don't turn ligatures into single chars
    if len(_toks) == 1 and (_toks[0] == "'" or _toks[0] == "`"):
        continue    # don't turn ascii quotes into curly quotes
    if _toks[0] == '\\mbox' and _toks[1] == '{' and _toks[-1] == '}':
        _toks = _toks[2:-1]
    if len(_toks) == 4 and _toks[1] == '{' and _toks[3] == '}':
        _toks = (_toks[0],_toks[2])
    if len(_toks) == 1:
        _toks = _toks[0]
    _l2u[_toks] = _tex

# Shortcut candidate generation for certain useless candidates:
# a character is in _blacklist if it cannot be at the start
# of any translation in _l2u.  We use this to quickly skip over
# such characters before getting to the more difficult-to-translate parts.
# _blacklist is defined several lines up from here because it must
# be defined before _tokenize is called; however, it is safe to
# delay filling it out until now.

for i in range(0x0020,0x007f):
    _blacklist.add(chr(i))
_blacklist.remove('{')
_blacklist.remove('$')
for candidate in _l2u:
    if isinstance(candidate,tuple):
        if not candidate or not candidate[0]:
            continue
        firstchar = candidate[0][0]
    else:
        firstchar = candidate[0]
    _blacklist.discard(firstchar)
