#!/usr/bin/python import random from urllib import urlopen class Trigram: """From one or more text files, the frequency of three character sequences is calculated. When treated as a vector, this information can be compared to other trigrams, and the difference between them seen as an angle. The cosine of this angle varies between 1 for complete similarity, and 0 for utter difference. Since letter combinations are characteristic to a language, this can be used to determine the language of a body of text. For example: >>> reference_en = Trigram('/path/to/reference/text/english') >>> reference_de = Trigram('/path/to/reference/text/german') >>> unknown = Trigram('url://pointing/to/unknown/text') >>> unknown.similarity(reference_de) 0.4 >>> unknown.similarity(reference_en) 0.95 would indicate the unknown text is almost cetrtainly English. As syntax sugar, the minus sign is overloaded to return the difference between texts, so the above objects would give you: >>> unknown - reference_de 0.6 >>> reference_en - unknown # order doesn't matter. 0.05 As it stands, the Trigram ignores character set information, which means you can only accurately compare within a single encoding (iso-8859-1 in the examples). A more complete implementation might convert to unicode first. As an extra bonus, there is a method to make up nonsense words in the style of the Trigram's text. >>> reference_en.makeWords(30) My withillonquiver and ald, by now wittlectionsurper, may sequia, tory, I ad my notter. Marriusbabilly She lady for rachalle spen hat knong al elf Beware when using urls: HTML won't be parsed out. Most methods chatter away to standard output, to let you know they're still there. """ length = 0 def __init__(self, fn=None): self.lut = {} if fn is not None: self.parseFile(fn) def parseFile(self, fn): pair = ' ' if '://' in fn: print "trying to fetch url, may take time..." f = urlopen(fn) else: f = open(fn) for z, line in enumerate(f): if not z % 1000: print "line %s" % z # \n's are spurious in a prose context for letter in line.strip() + ' ': d = self.lut.setdefault(pair, {}) d[letter] = d.get(letter, 0) + 1 pair = pair[1] + letter f.close() self.measure() def measure(self): """calculates the scalar length of the trigram vector and stores it in self.length.""" total = 0 for y in self.lut.values(): total += sum([ x * x for x in y.values() ]) self.length = total ** 0.5 def similarity(self, other): """returns a number between 0 and 1 indicating similarity. 1 means an identical ratio of trigrams; 0 means no trigrams in common. """ if not isinstance(other, Trigram): raise TypeError("can't compare Trigram with non-Trigram") lut1 = self.lut lut2 = other.lut total = 0 for k in lut1.keys(): if k in lut2: a = lut1[k] b = lut2[k] for x in a: if x in b: total += a[x] * b[x] return float(total) / (self.length * other.length) def __sub__(self, other): """indicates difference between trigram sets; 1 is entirely different, 0 is entirely the same.""" return 1 - self.similarity(other) def makeWords(self, count): """returns a string of made-up words based on the known text.""" text = [] k = ' ' while count: n = self.likely(k) text.append(n) k = k[1] + n if n in ' \t': count -= 1 return ''.join(text) def likely(self, k): """Returns a character likely to follow the given string two character string, or a space if nothing is found.""" if k not in self.lut: return ' ' # if you were using this a lot, caching would a good idea. letters = [] for k, v in self.lut[k].items(): letters.append(k * v) letters = ''.join(letters) return random.choice(letters) def test(): en = Trigram('http://gutenberg.net/dirs/etext97/lsusn11.txt') #NB fr and some others have English license text. # no has english excerpts. fr = Trigram('http://gutenberg.net/dirs/etext03/candi10.txt') fi = Trigram('http://gutenberg.net/dirs/1/0/4/9/10492/10492-8.txt') no = Trigram('http://gutenberg.net/dirs/1/2/8/4/12844/12844-8.txt') se = Trigram('http://gutenberg.net/dirs/1/0/1/1/10117/10117-8.txt') no2 = Trigram('http://gutenberg.net/dirs/1/3/0/4/13041/13041-8.txt') en2 = Trigram('http://gutenberg.net/dirs/etext05/cfgsh10.txt') fr2 = Trigram('http://gutenberg.net/dirs/1/3/7/0/13704/13704-8.txt') print "calculating difference:" print "en - fr is %s" % (en - fr) print "fr - en is %s" % (fr - en) print "en - en2 is %s" % (en - en2) print "en - fr2 is %s" % (en - fr2) print "fr - en2 is %s" % (fr - en2) print "fr - fr2 is %s" % (fr - fr2) print "fr2 - en2 is %s" % (fr2 - en2) print "fi - fr is %s" % (fi - fr) print "fi - en is %s" % (fi - en) print "fi - se is %s" % (fi - se) print "no - se is %s" % (no - se) print "en - no is %s" % (en - no) print "no - no2 is %s" % (no - no2) print "se - no2 is %s" % (se - no2) print "en - no2 is %s" % (en - no2) print "fr - no2 is %s" % (fr - no2) print "\nmaking up English" print en.makeWords(30) print "\nmaking up French" print fr.makeWords(30) if __name__ == '__main__': test()