LaTeX codec « Python recipes « ActiveState Code

Codec for converting unicodes to LaTeX markup and vice versa.

      """latex.py

Character translation utilities for LaTeX-formatted text.

Usage:
 - unicode(string,'latex')
 - ustring.decode('latex')
are both available just by letting "import latex" find this file.
 - unicode(string,'latex+latin1')
 - ustring.decode('latex+latin1')
where latin1 can be replaced by any other known encoding, also
become available by calling latex.register().

We also make public a dictionary latex_equivalents,
mapping ord(unicode char) to LaTeX code.

D. Eppstein, October 2003.
"""

from __future__ import generators
import codecs
import re
from sets import Set

def register():
    """Enable encodings of the form 'latex+x' where x describes another encoding.
    Unicode characters are translated to or from x when possible, otherwise
    expanded to latex.
    """
    codecs.register(_registry)

def getregentry():
    """Encodings module API."""
    return _registry('latex')

def _registry(encoding):
    if encoding == 'latex':
        encoding = None
    elif encoding.startswith('latex+'):
        encoding = encoding[6:]
    else:
        return None
        
    class Codec(codecs.Codec):
        def encode(self,input,errors='strict'):
            """Convert unicode string to latex."""
            output = []
            for c in input:
                if encoding:
                    try:
                        output.append(c.encode(encoding))
                        continue
                    except:
                        pass
                if ord(c) in latex_equivalents:
                    output.append(latex_equivalents[ord(c)])
                else:
                    output += ['{\\char', str(ord(c)), '}']
            return ''.join(output), len(input)
            
        def decode(self,input,errors='strict'):
            """Convert latex source string to unicode."""
            if encoding:
                input = unicode(input,encoding,errors)

            # Note: we may get buffer objects here.
            # It is not permussable to call join on buffer objects
            # but we can make them joinable by calling unicode.
            # This should always be safe since we are supposed
            # to be producing unicode output anyway.
            x = map(unicode,_unlatex(input))
            return u''.join(x), len(input)
    
    class StreamWriter(Codec,codecs.StreamWriter):
        pass
            
    class StreamReader(Codec,codecs.StreamReader):
        pass

    return (Codec().encode,Codec().decode,StreamReader,StreamWriter)

def _tokenize(tex):
    """Convert latex source into sequence of single-token substrings."""
    start = 0
    try:
        # skip quickly across boring stuff
        pos = _stoppers.finditer(tex).next().span()[0]
    except StopIteration:
        yield tex
        return

    while 1:
        if pos > start:
            yield tex[start:pos]
            if tex[start] == '\\' and not (tex[pos-1].isdigit() and tex[start+1].isalpha()):
                while pos < len(tex) and tex[pos].isspace(): # skip blanks after csname
                    pos += 1

        while pos < len(tex) and tex[pos] in _ignore:
            pos += 1    # flush control characters
        if pos >= len(tex):
            return
        start = pos
        if tex[pos:pos+2] in {'$$':None, '/~':None}:    # protect ~ in urls
            pos += 2
        elif tex[pos].isdigit():
            while pos < len(tex) and tex[pos].isdigit():
                pos += 1
        elif tex[pos] == '-':
            while pos < len(tex) and tex[pos] == '-':
                pos += 1
        elif tex[pos] != '\\' or pos == len(tex) - 1:
            pos += 1
        elif not tex[pos+1].isalpha():
            pos += 2
        else:
            pos += 1
            while pos < len(tex) and tex[pos].isalpha():
                pos += 1
            if tex[start:pos] == '\\char' or tex[start:pos] == '\\accent':
                while pos < len(tex) and tex[pos].isdigit():
                    pos += 1

class _unlatex:
    """Convert tokenized tex into sequence of unicode strings.  Helper for decode()."""

    def __iter__(self):
        """Turn self into an iterator.  It already is one, nothing to do."""
        return self

    def __init__(self,tex):
        """Create a new token converter from a string."""
        self.tex = tuple(_tokenize(tex))  # turn tokens into indexable list
        self.pos = 0                    # index of first unprocessed token 
        self.lastoutput = 'x'           # lastoutput must always be nonempty string
    
    def __getitem__(self,n):
        """Return token at offset n from current pos."""
        p = self.pos + n
        t = self.tex
        return p < len(t) and t[p] or None

    def next(self):
        """Find and return another piece of converted output."""
        if self.pos >= len(self.tex):
            raise StopIteration
        nextoutput = self.chunk()
        if self.lastoutput[0] == '\\' and self.lastoutput[-1].isalpha() and nextoutput[0].isalpha():
            nextoutput = ' ' + nextoutput   # add extra space to terminate csname
        self.lastoutput = nextoutput
        return nextoutput
    
    def chunk(self):
        """Grab another set of input tokens and convert them to an output string."""
        for delta,c in self.candidates(0):
            if c in _l2u:
                self.pos += delta
                return unichr(_l2u[c])
            elif len(c) == 2 and c[1] == 'i' and (c[0],'\\i') in _l2u:
                self.pos += delta       # correct failure to undot i
                return unichr(_l2u[(c[0],'\\i')])
            elif len(c) == 1 and c[0].startswith('\\char') and c[0][5:].isdigit():
                self.pos += delta
                return unichr(int(c[0][5:]))
    
        # nothing matches, just pass through token as-is
        self.pos += 1
        return self[-1]
    
    def candidates(self,offset):
        """Generate pairs delta,c where c is a token or tuple of tokens from tex
        (after deleting extraneous brackets starting at pos) and delta
        is the length of the tokens prior to bracket deletion.
        """
        t = self[offset]
        if t in _blacklist:
            return
        elif t == '{':
            for delta,c in self.candidates(offset+1):
                if self[offset+delta+1] == '}':
                    yield delta+2,c
        elif t == '\\mbox':
            for delta,c in self.candidates(offset+1):
                yield delta+1,c
        elif t == '$' and self[offset+2] == '$':
            yield 3, (t,self[offset+1],t)
        else:
            q = self[offset+1]
            if q == '{' and self[offset+3] == '}':
                yield 4, (t,self[offset+2])
            elif q:
                yield 2, (t,q)
            yield 1, t

latex_equivalents = {
    0x0009: ' ',
    0x000a: '\n',
    0x0023: '{\#}',
    0x0026: '{\&}',
    0x00a0: '{~}',
    0x00a1: '{!`}',
    0x00a2: '{\\not{c}}',
    0x00a3: '{\\pounds}',
    0x00a7: '{\\S}',
    0x00a8: '{\\"{}}',
    0x00a9: '{\\copyright}',
    0x00af: '{\\={}}',
    0x00ac: '{\\neg}',
    0x00ad: '{\\-}',
    0x00b0: '{\\mbox{$^\\circ$}}',
    0x00b1: '{\\mbox{$\\pm$}}',
    0x00b2: '{\\mbox{$^2$}}',
    0x00b3: '{\\mbox{$^3$}}',
    0x00b4: "{\\'{}}",
    0x00b5: '{\\mbox{$\\mu$}}',
    0x00b6: '{\\P}',
    0x00b7: '{\\mbox{$\\cdot$}}',
    0x00b8: '{\\c{}}',
    0x00b9: '{\\mbox{$^1$}}',
    0x00bf: '{?`}',
    0x00c0: '{\\`A}',
    0x00c1: "{\\'A}",
    0x00c2: '{\\^A}',
    0x00c3: '{\\~A}',
    0x00c4: '{\\"A}',
    0x00c5: '{\\AA}',
    0x00c6: '{\\AE}',
    0x00c7: '{\\c{C}}',
    0x00c8: '{\\`E}',
    0x00c9: "{\\'E}",
    0x00ca: '{\\^E}',
    0x00cb: '{\\"E}',
    0x00cc: '{\\`I}',
    0x00cd: "{\\'I}",
    0x00ce: '{\\^I}',
    0x00cf: '{\\"I}',
    0x00d1: '{\\~N}',
    0x00d2: '{\\`O}',
    0x00d3: "{\\'O}",
    0x00d4: '{\\^O}',
    0x00d5: '{\\~O}',
    0x00d6: '{\\"O}',
    0x00d7: '{\\mbox{$\\times$}}',
    0x00d8: '{\\O}',
    0x00d9: '{\\`U}',
    0x00da: "{\\'U}",
    0x00db: '{\\^U}',
    0x00dc: '{\\"U}',
    0x00dd: "{\\'Y}",
    0x00df: '{\\ss}',
    0x00e0: '{\\`a}',
    0x00e1: "{\\'a}",
    0x00e2: '{\\^a}',
    0x00e3: '{\\~a}',
    0x00e4: '{\\"a}',
    0x00e5: '{\\aa}',
    0x00e6: '{\\ae}',
    0x00e7: '{\\c{c}}',
    0x00e8: '{\\`e}',
    0x00e9: "{\\'e}",
    0x00ea: '{\\^e}',
    0x00eb: '{\\"e}',
    0x00ec: '{\\`\\i}',
    0x00ed: "{\\'\\i}",
    0x00ee: '{\\^\\i}',
    0x00ef: '{\\"\\i}',
    0x00f1: '{\\~n}',
    0x00f2: '{\\`o}',
    0x00f3: "{\\'o}",
    0x00f4: '{\\^o}',
    0x00f5: '{\\~o}',
    0x00f6: '{\\"o}',
    0x00f7: '{\\mbox{$\\div$}}',
    0x00f8: '{\\o}',
    0x00f9: '{\\`u}',
    0x00fa: "{\\'u}",
    0x00fb: '{\\^u}',
    0x00fc: '{\\"u}',
    0x00fd: "{\\'y}",
    0x00ff: '{\\"y}',
    
    0x0100: '{\\=A}',
    0x0101: '{\\=a}',
    0x0102: '{\\u{A}}',
    0x0103: '{\\u{a}}',
    0x0104: '{\\c{A}}',
    0x0105: '{\\c{a}}',
    0x0106: "{\\'C}",
    0x0107: "{\\'c}",
    0x0108: "{\\^C}",
    0x0109: "{\\^c}",
    0x010a: "{\\.C}",
    0x010b: "{\\.c}",
    0x010c: "{\\v{C}}",
    0x010d: "{\\v{c}}",
    0x010e: "{\\v{D}}",
    0x010f: "{\\v{d}}",
    0x0112: '{\\=E}',
    0x0113: '{\\=e}',
    0x0114: '{\\u{E}}',
    0x0115: '{\\u{e}}',
    0x0116: '{\\.E}',
    0x0117: '{\\.e}',
    0x0118: '{\\c{E}}',
    0x0119: '{\\c{e}}',
    0x011a: "{\\v{E}}",
    0x011b: "{\\v{e}}",
    0x011c: '{\\^G}',
    0x011d: '{\\^g}',
    0x011e: '{\\u{G}}',
    0x011f: '{\\u{g}}',
    0x0120: '{\\.G}',
    0x0121: '{\\.g}',
    0x0122: '{\\c{G}}',
    0x0123: '{\\c{g}}',
    0x0124: '{\\^H}',
    0x0125: '{\\^h}',
    0x0128: '{\\~I}',
    0x0129: '{\\~\\i}',
    0x012a: '{\\=I}',
    0x012b: '{\\=\\i}',
    0x012c: '{\\u{I}}',
    0x012d: '{\\u\\i}',
    0x012e: '{\\c{I}}',
    0x012f: '{\\c{i}}',
    0x0130: '{\\.I}',
    0x0131: '{\\i}',
    0x0132: '{IJ}',
    0x0133: '{ij}',
    0x0134: '{\\^J}',
    0x0135: '{\\^\\j}',
    0x0136: '{\\c{K}}',
    0x0137: '{\\c{k}}',
    0x0139: "{\\'L}",
    0x013a: "{\\'l}",
    0x013b: "{\\c{L}}",
    0x013c: "{\\c{l}}",
    0x013d: "{\\v{L}}",
    0x013e: "{\\v{l}}",
    0x0141: '{\\L}',
    0x0142: '{\\l}',
    0x0143: "{\\'N}",
    0x0144: "{\\'n}",
    0x0145: "{\\c{N}}",
    0x0146: "{\\c{n}}",
    0x0147: "{\\v{N}}",
    0x0148: "{\\v{n}}",
    0x014c: '{\\=O}',
    0x014d: '{\\=o}',
    0x014e: '{\\u{O}}',
    0x014f: '{\\u{o}}',
    0x0150: '{\\H{O}}',
    0x0151: '{\\H{o}}',
    0x0152: '{\\OE}',
    0x0153: '{\\oe}',
    0x0154: "{\\'R}",
    0x0155: "{\\'r}",
    0x0156: "{\\c{R}}",
    0x0157: "{\\c{r}}",
    0x0158: "{\\v{R}}",
    0x0159: "{\\v{r}}",
    0x015a: "{\\'S}",
    0x015b: "{\\'s}",
    0x015c: "{\\^S}",
    0x015d: "{\\^s}",
    0x015e: "{\\c{S}}",
    0x015f: "{\\c{s}}",
    0x0160: "{\\v{S}}",
    0x0161: "{\\v{s}}",
    0x0162: "{\\c{T}}",
    0x0163: "{\\c{t}}",
    0x0164: "{\\v{T}}",
    0x0165: "{\\v{t}}",
    0x0168: "{\\~U}",
    0x0169: "{\\~u}",
    0x016a: "{\\=U}",
    0x016b: "{\\=u}",
    0x016c: "{\\u{U}}",
    0x016d: "{\\u{u}}",
    0x016e: "{\\r{U}}",
    0x016f: "{\\r{u}}",
    0x0170: "{\\H{U}}",
    0x0171: "{\\H{u}}",
    0x0172: "{\\c{U}}",
    0x0173: "{\\c{u}}",
    0x0174: "{\\^W}",
    0x0175: "{\\^w}",
    0x0176: "{\\^Y}",
    0x0177: "{\\^y}",
    0x0178: '{\\"Y}',
    0x0179: "{\\'Z}",
    0x017a: "{\\'Z}",
    0x017b: "{\\.Z}",
    0x017c: "{\\.Z}",
    0x017d: "{\\v{Z}}",
    0x017e: "{\\v{z}}",

    0x01c4: "{D\\v{Z}}",
    0x01c5: "{D\\v{z}}",
    0x01c6: "{d\\v{z}}",
    0x01c7: "{LJ}",
    0x01c8: "{Lj}",
    0x01c9: "{lj}",
    0x01ca: "{NJ}",
    0x01cb: "{Nj}",
    0x01cc: "{nj}",
    0x01cd: "{\\v{A}}",
    0x01ce: "{\\v{a}}",
    0x01cf: "{\\v{I}}",
    0x01d0: "{\\v\\i}",
    0x01d1: "{\\v{O}}",
    0x01d2: "{\\v{o}}",
    0x01d3: "{\\v{U}}",
    0x01d4: "{\\v{u}}",
    0x01e6: "{\\v{G}}",
    0x01e7: "{\\v{g}}",
    0x01e8: "{\\v{K}}",
    0x01e9: "{\\v{k}}",
    0x01ea: "{\\c{O}}",
    0x01eb: "{\\c{o}}",
    0x01f0: "{\\v\\j}",
    0x01f1: "{DZ}",
    0x01f2: "{Dz}",
    0x01f3: "{dz}",
    0x01f4: "{\\'G}",
    0x01f5: "{\\'g}",
    0x01fc: "{\\'\\AE}",
    0x01fd: "{\\'\\ae}",
    0x01fe: "{\\'\\O}",
    0x01ff: "{\\'\\o}",

    0x02c6: '{\\^{}}',
    0x02dc: '{\\~{}}',
    0x02d8: '{\\u{}}',
    0x02d9: '{\\.{}}',
    0x02da: "{\\r{}}",
    0x02dd: '{\\H{}}',
    0x02db: '{\\c{}}',
    0x02c7: '{\\v{}}',
    
    0x03c0: '{\\mbox{$\\pi$}}',
    # consider adding more Greek here
    
    0xfb01: '{fi}',
    0xfb02: '{fl}',
    
    0x2013: '{--}',
    0x2014: '{---}',
    0x2018: "{`}",
    0x2019: "{'}",
    0x201c: "{``}",
    0x201d: "{''}",
    0x2020: "{\\dag}",
    0x2021: "{\\ddag}",
    0x2122: "{\\mbox{$^\\mbox{TM}$}}",
    0x2022: "{\\mbox{$\\bullet$}}",
    0x2026: "{\\ldots}",
    0x2202: "{\\mbox{$\\partial$}}",
    0x220f: "{\\mbox{$\\prod$}}",
    0x2211: "{\\mbox{$\\sum$}}",
    0x221a: "{\\mbox{$\\surd$}}",
    0x221e: "{\\mbox{$\\infty$}}",
    0x222b: "{\\mbox{$\\int$}}",
    0x2248: "{\\mbox{$\\approx$}}",
    0x2260: "{\\mbox{$\\neq$}}",
    0x2264: "{\\mbox{$\\leq$}}",
    0x2265: "{\\mbox{$\\geq$}}",
}
for _i in range(0x0020):
    if _i not in latex_equivalents:
        latex_equivalents[_i] = ''
for _i in range(0x0020,0x007f):
    if _i not in latex_equivalents:
        latex_equivalents[_i] = chr(_i)

# Characters that should be ignored and not output in tokenization
_ignore = Set([chr(i) for i in range(32)+[127]]) - Set('\t\n\r')

# Regexp of chars not in blacklist, for quick start of tokenize
_stoppers = re.compile('[\x00-\x1f!$\\-?\\{~\\\\`\']')

_blacklist = Set(' \n\r')
_blacklist.add(None)    # shortcut candidate generation at end of data

# Construction of inverse translation table
_l2u = {
    '\ ':ord(' ')   # unexpanding space makes no sense in non-TeX contexts
}

for _tex in latex_equivalents:
    if _tex <= 0x0020 or (_tex <= 0x007f and len(latex_equivalents[_tex]) <= 1):
        continue    # boring entry
    _toks = tuple(_tokenize(latex_equivalents[_tex]))
    if _toks[0] == '{' and _toks[-1] == '}':
        _toks = _toks[1:-1]
    if _toks[0].isalpha():
        continue    # don't turn ligatures into single chars
    if len(_toks) == 1 and (_toks[0] == "'" or _toks[0] == "`"):
        continue    # don't turn ascii quotes into curly quotes
    if _toks[0] == '\\mbox' and _toks[1] == '{' and _toks[-1] == '}':
        _toks = _toks[2:-1]
    if len(_toks) == 4 and _toks[1] == '{' and _toks[3] == '}':
        _toks = (_toks[0],_toks[2])
    if len(_toks) == 1:
        _toks = _toks[0]
    _l2u[_toks] = _tex

# Shortcut candidate generation for certain useless candidates:
# a character is in _blacklist if it can not be at the start
# of any translation in _l2u.  We use this to quickly skip through
# such characters before getting to more difficult-translate parts.
# _blacklist is defined several lines up from here because it must
# be defined in order to call _tokenize, however it is safe to
# delay filling it out until now.

for i in range(0x0020,0x007f):
    _blacklist.add(chr(i))
_blacklist.remove('{')
_blacklist.remove('$')
for candidate in _l2u:
    if isinstance(candidate,tuple):
        if not candidate or not candidate[0]:
            continue
        firstchar = candidate[0][0]
    else:
        firstchar = candidate[0]
    _blacklist.discard(firstchar)

      

Some attention has been paid to making this efficient (e.g. see the usage of the _blacklist and _stoppers variables). The markup one gets from encoding into LaTeX surrounds each translated character with brackets; this is important e.g. when using the markup within BibTeX (BibTeX uses the brackets to recognize individual characters in its collation routines). This codec is mostly aimed at translating accented Roman scripts; I haven't included other scripts such as Greek or Cyrillic because I'm not aware of a standard set of macros for making these work out-of-the-box in typical latex installations.

Tags: text

2 comments

Oleg Paraschenko 19 years, 10 months ago # | flag

mapping from a TeXML. You can also take a mapping table from TeXML:

http://getfo.org/texml/

TeXML has a mapping for ~2500 unicode characters. But it doesn't bother for a plain TeX and uses LaTeX constructions when it is reasonable.

Peter Bengtsson 19 years, 9 months ago # | flag

How does one use it? I'm really poor unicoding despite being Swedish native. How do I use this?

I tried

>>> import latex



>>> latex.register()



>>> unicode("_pe$er_", "latex+latin1")

u'_pe$er_'

I was expecting something like:

u'_pe\$er\'

◄	Python recipes (4591)	►
◄	David Eppstein's recipes (14)	►

LaTeX codec (Python recipe) by David Eppstein
ActiveState Code (http://code.activestate.com/recipes/252124/)

2 comments

Tags

Required Modules

Other Information and Tasks

Accounts

Code Recipes

Feedback & Information

ActiveState

LaTeX codec (Python recipe) by David Eppstein ActiveState Code (http://code.activestate.com/recipes/252124/)

2 comments

Tags

Required Modules

Other Information and Tasks

Accounts

Code Recipes

Feedback & Information

ActiveState

LaTeX codec (Python recipe) by David Eppstein
ActiveState Code (http://code.activestate.com/recipes/252124/)