Recipe 326576 revision 2 « ActiveState Code

#!/usr/bin/python

import random
from urllib import urlopen

class Trigram:
    """From one or more text files, the frequency of three character
    sequences is calculated.  When treated as a vector, this information
    can be compared to other trigrams, and the difference between them
    seen as an angle.  The cosine of this angle varies between 1 for
    complete similarity, and 0 for utter difference.  Since letter
    combinations are characteristic to a language, this can be used to
    determine the language of a body of text. For example:

        >>> reference_en = Trigram('/path/to/reference/text/english')
        >>> reference_de = Trigram('/path/to/reference/text/german')
        >>> unknown = Trigram('url://pointing/to/unknown/text')
        >>> unknown.similarity(reference_de)
        0.4
        >>> unknown.similarity(reference_en)
        0.95

    would indicate the unknown text is almost cetrtainly English.  As
    syntax sugar, the minus sign is overloaded to return the difference
    between texts, so the above objects would give you:

        >>> unknown - reference_de
        0.6
        >>> reference_en - unknown    # order doesn't matter.
        0.05

    As it stands, the Trigram ignores character set information, which
    means you can only accurately compare within a single encoding
    (iso-8859-1 in the examples).  A more complete implementation might
    convert to unicode first.

    As an extra bonus, there is a method to make up nonsense words in the
    style of the Trigram's text.

        >>> reference_en.makeWords(30)
        My withillonquiver and ald, by now wittlectionsurper, may sequia,
        tory, I ad my notter. Marriusbabilly She lady for rachalle spen
        hat knong al elf

    Beware when using urls: HTML won't be parsed out.

    Most methods chatter away to standard output, to let you know they're
    still there.
    """
    length = 0

    def __init__(self, fn=None):
        self.lut = {}
        if fn is not None:
            self.parseFile(fn)

    def parseFile(self, fn):
        pair = '  '
        if '://' in fn:
            print "trying to fetch url, may take time..."
            f = urlopen(fn)
        else:
            f = open(fn)
        for z, line in enumerate(f):
            if not z % 1000:
                print "line %s" % z
            # \n's are spurious in a prose context
            for letter in line.strip() + ' ':
                d = self.lut.setdefault(pair, {})
                d[letter] = d.get(letter, 0) + 1
                pair = pair[1] + letter
        f.close()
        self.measure()


    def measure(self):
        """calculates the scalar length of the trigram vector and
        stores it in self.length."""
        total = 0
        for y in self.lut.values():
            total += sum([ x * x for x in y.values() ])
        self.length = total ** 0.5

    def similarity(self, other):
        """returns a number between 0 and 1 indicating similarity.
        1 means an identical ratio of trigrams;
        0 means no trigrams in common.
        """
        if not isinstance(other, Trigram):
            raise TypeError("can't compare Trigram with non-Trigram")
        lut1 = self.lut
        lut2 = other.lut
        total = 0
        for k in lut1.keys():
            if k in lut2:
                a = lut1[k]
                b = lut2[k]
                for x in a:
                    if x in b:
                        total += a[x] * b[x]

        return float(total) / (self.length * other.length)

    def __sub__(self, other):
        """indicates difference between trigram sets; 1 is entirely
        different, 0 is entirely the same."""
        return 1 - self.similarity(other)


    def makeWords(self, count):
        """returns a string of made-up words based on the known text."""
        text = []
        k = '  '
        while count:
            n = self.likely(k)
            text.append(n)
            k = k[1] + n
            if n in ' \t':
                count -= 1
        return ''.join(text)


    def likely(self, k):
        """Returns a character likely to follow the given string
        two character string, or a space if nothing is found."""
        if k not in self.lut:
            return ' '
        # if you were using this a lot, caching would a good idea.
        letters = []
        for k, v in self.lut[k].items():
            letters.append(k * v)
        letters = ''.join(letters)
        return random.choice(letters)


def test():
    en = Trigram('http://gutenberg.net/dirs/etext97/lsusn11.txt')
   #NB fr and some others have English license text.
    #   no has english excerpts.
    fr = Trigram('http://gutenberg.net/dirs/etext03/candi10.txt')
    fi = Trigram('http://gutenberg.net/dirs/1/0/4/9/10492/10492-8.txt')
    no = Trigram('http://gutenberg.net/dirs/1/2/8/4/12844/12844-8.txt')
    se = Trigram('http://gutenberg.net/dirs/1/0/1/1/10117/10117-8.txt')
    no2 = Trigram('http://gutenberg.net/dirs/1/3/0/4/13041/13041-8.txt')
    en2 = Trigram('http://gutenberg.net/dirs/etext05/cfgsh10.txt')
    fr2 = Trigram('http://gutenberg.net/dirs/1/3/7/0/13704/13704-8.txt')
    print "calculating difference:"
    print "en - fr is %s" % (en - fr)
    print "fr - en is %s" % (fr - en)
    print "en - en2 is %s" % (en - en2)
    print "en - fr2 is %s" % (en - fr2)
    print "fr - en2 is %s" % (fr - en2)
    print "fr - fr2 is %s" % (fr - fr2)
    print "fr2 - en2 is %s" % (fr2 - en2)
    print "fi - fr  is %s" % (fi - fr)
    print "fi - en  is %s" % (fi - en)
    print "fi - se  is %s" % (fi - se)
    print "no - se  is %s" % (no - se)
    print "en - no  is %s" % (en - no)
    print "no - no2  is %s" % (no - no2)
    print "se - no2  is %s" % (se - no2)
    print "en - no2  is %s" % (en - no2)
    print "fr - no2  is %s" % (fr - no2)

    print "\nmaking up English"
    print en.makeWords(30)
    print "\nmaking up French"
    print fr.makeWords(30)


if __name__ == '__main__':
    test()
Recipe 326576 revision 2

History

Accounts

Code Recipes

Feedback & Information

ActiveState