# bentley_knuth.py
# Author: Vasudev Ram - http://www.dancingbison.com
# Version: 0.1
# The problem this program tries to solve is from the page:
# http://www.leancrew.com/all-this/2011/12/more-shell-less-egg/
# Description: The program Bentley asked Knuth to write:
# Read a file of text, determine the n most frequently
# used words, and print out a sorted list of those words
# along with their frequencies.

import sys
import os
import string

sys_argv = sys.argv


def usage():
    """Write the command-line usage message to standard error."""
    sys.stderr.write("Usage: %s n file\n" % sys_argv[0])
    sys.stderr.write("where n is the number of most frequently\n")
    sys.stderr.write("used words you want to find, and \n")
    sys.stderr.write("file is the name of the file in which to look.\n")


def count_words(lines):
    """Return a dict mapping each word to its number of occurrences.

    lines is any iterable of strings (an open text file works). Words are
    whatever str.split() yields: runs of non-whitespace characters, taken
    verbatim (case and punctuation are not normalised).
    """
    word_freq_dict = {}
    for line in lines:
        for word in line.split():
            # dict.get replaces the Python-2-only dict.has_key() test.
            word_freq_dict[word] = word_freq_dict.get(word, 0) + 1
    return word_freq_dict


def most_frequent(word_freq_dict, n):
    """Return up to n (word, frequency) pairs, most frequent first.

    The sort is stable, so words with equal frequency keep the order in
    which the dict yields them. A non-positive n gives an empty list.
    """
    ranked = sorted(word_freq_dict.items(),
                    key=lambda pair: pair[1], reverse=True)
    return ranked[:max(n, 0)]


def main(argv=None):
    """Run the program and return its process exit status.

    argv is the full argument vector (program name, n, file name); it
    defaults to sys_argv. Returns 0 on success and 1 on a usage error, a
    non-numeric or non-positive n, or an input file that cannot be opened.
    """
    if argv is None:
        argv = sys_argv
    if len(argv) < 3:
        usage()
        return 1
    prog = argv[0]

    try:
        n = int(argv[1])
    except ValueError:
        sys.stderr.write("%s: Error: %s is not a decimal numeric value\n" %
                         (prog, argv[1]))
        return 1

    # Single-argument print(...) produces the same output on Python 2 and 3.
    print("n = %d" % n)
    if n < 1:
        # The original reported this error but carried on; stop here instead.
        sys.stderr.write("%s: Error: %s is not a positive value\n" %
                         (prog, argv[1]))
        return 1

    in_filename = argv[2]
    print("%s: Finding %d most frequent words in file %s" %
          (prog, n, in_filename))

    try:
        fil_in = open(in_filename)
    except IOError:
        sys.stderr.write("%s: ERROR: Could not open in_filename %s\n" %
                         (prog, in_filename))
        return 1
    # The with-block closes the file even if reading raises.
    with fil_in:
        word_freq_dict = count_words(fil_in)

    print("The %d most frequent words sorted by decreasing frequency:" % n)
    len_wfl = len(word_freq_dict)
    if n > len_wfl:
        print("n = %d, file has only %d unique words," % (n, len_wfl))
        print("so printing %d words" % len_wfl)
    print("Word: Frequency")
    for word, freq in most_frequent(word_freq_dict, n):
        # Spacing matches the original "print word, ': ', freq" output.
        print("%s :  %s" % (word, freq))
    return 0


if __name__ == "__main__":
    sys.exit(main())