# Recipe 576699 revision 1 « ActiveState Code

import re

def wordList(text, wordChars = None, ignoreCase = True):
    """Split text simple-mindedly into a list of words.

    Whitespace characters, full stops and commas are treated as word
    boundaries.  Every other character that is not listed in wordChars is
    silently removed, so the characters around it are joined together
    (for example "don't" becomes "dont").

    text       -- the string to split.
    wordChars  -- iterable of single characters allowed inside a word.
                  Defaults to the ASCII letters plus the hyphen.  A string,
                  list or set are all accepted; the argument is never
                  modified.
    ignoreCase -- when true (the default) the text is lower-cased first, so
                  upper-case characters in wordChars are irrelevant.

    Returns the list of words in order of appearance (duplicates kept,
    never containing empty strings).
    """
    if wordChars is None:
        lcLetters = "abcdefghijklmnopqrstuvwxyz"
        wordChars = lcLetters + lcLetters.upper() + "-"
    # Work on a private copy: this accepts any iterable of characters and
    # leaves a caller-supplied set untouched.  The space is always allowed
    # because it is the internal word separator.
    allowed = set(wordChars)
    allowed.add(" ")
    # Turn word boundaries into spaces BEFORE dropping unwanted characters;
    # doing it afterwards would delete the boundaries and fuse the words
    # on either side of them.
    for boundary in "\n\r\f\t\v" + ".,":
        text = text.replace(boundary, " ")
    if ignoreCase:
        text = text.lower()
    # drop characters that neither belong to words nor separate them
    text = "".join([ch for ch in text if ch in allowed])
    # runs of spaces yield empty strings when splitting; discard them
    return [word for word in text.split(" ") if word]

def reWordList(text, wordChars = None, ignoreCase = True):
    """Split text simple-mindedly into a list of words, using regexes.

    Whitespace, full stops and commas are treated as word boundaries.
    Every other character that is not listed in wordChars is silently
    removed, so the characters around it are joined together.

    text       -- the string to split.
    wordChars  -- iterable of single characters allowed inside a word
                  (string, list or set).  Defaults to the ASCII letters
                  plus the hyphen.
    ignoreCase -- when true (the default) the text is lower-cased before
                  filtering, so upper-case characters in wordChars are
                  irrelevant.

    Returns the list of words in order of appearance (duplicates kept,
    never containing empty strings).
    """
    if wordChars is None:
        lcLetters = "abcdefghijklmnopqrstuvwxyz"
        wordChars = lcLetters + lcLetters.upper() + "-"
    # convert all word boundaries (whitespace, "." and ",") to spaces
    text = re.sub(r"[\s.,]", " ", text)
    if ignoreCase:
        text = text.lower()
    # Escape every allowed character so that "-", "]", "^" or "\" are taken
    # literally inside the character class instead of altering the pattern.
    allowed = "".join([re.escape(ch) for ch in wordChars])
    # drop characters that neither belong to words nor separate them;
    # the space is kept because it is the word separator
    text = re.sub("[^" + allowed + " ]", "", text)
    # runs of spaces yield empty strings when splitting; discard them
    return [word for word in text.split(" ") if word]

def wordFreqs(text, wordChars = None):
    """Return the unique words of text together with their frequencies.

    text      -- the string to analyse; it is split with wordList().
    wordChars -- passed through unchanged to wordList().

    Returns a list of (word, count) tuples sorted by word.
    """
    # Count in a single pass; calling list.count() once per unique word
    # would rescan the whole word list each time.
    counts = {}
    for word in wordList(text, wordChars):
        counts[word] = counts.get(word, 0) + 1
    # words are unique keys, so sorting the pairs orders them by word
    return sorted(counts.items())

if __name__ == "__main__":
    # Demo: print the word frequencies of a text followed by summary counts.
    # to load a string from a file here, loadFile = True;
    # set it to False to use the built-in sample text instead
    loadFile = True
    if loadFile:
        # read the whole file at once; the with-block closes the handle
        with open(r"d:\partdb.sql") as f:
            a = f.read()
    else:
        a = """ Nor again is there anyone who loves or pursues or desires to obtain pain of itself
                because it is pain, but because occasionally circumstances occur in which toil and pain can
                procure him some great pleasure. To take a trivial example, which of us ever undertakes
                laborious physical exercise, except to obtain some advantage from it?"""
    b = wordFreqs(a)
    # example: find integers only
    #b = wordFreqs(a, wordChars = "0123456789")
    # single-argument print(...) calls behave the same on Python 2 and 3
    print(b)
    print("\n" * 5)
    print("%s %s" % ("number of words in original: ", len(wordList(a))))
    print("%s %s" % ("Number of unique words: ", len(b)))
# Recipe 576699 revision 1

# History

# Accounts

# Code Recipes

# Feedback & Information

# ActiveState