Welcome, guest | Sign In | My Account | Store | Cart
import collections
import re
import urllib2

# Listing of all recipe authors; %s is replaced by the 1-based page number.
USERS_URL = "http://code.activestate.com/recipes/users/?page=%s"

# Matches one author entry: group 1 is the user name, group 2 the number of
# submitted recipes.  The href uses a negated character class and the name is
# matched non-greedily so that several entries sharing one line of HTML are
# reported separately instead of being swallowed by a single greedy match.
CONTRIB_RE = re.compile(
    r'<li><a href="/recipes/users/[^"]*/">(.*?)</a>\s*'
    r'<span class="secondary">\((\d+) recipes?\)</span>')

# Found in the page markup at the last page of the listing.
LAST_PAGE_MARKER = '<span class="next disabled">'


def fetch_page(page):
    """Download listing page number `page` and return its HTML."""
    response = urllib2.urlopen(USERS_URL % (page))
    try:
        return response.read()
    finally:
        # Close the connection even when read() fails.
        response.close()


def parse_contributors(html):
    """Return the (user name, recipe count) tuples found in `html`.

    Both tuple members are strings exactly as captured from the markup.
    """
    return CONTRIB_RE.findall(html)


def collect_contributors(fetch=fetch_page):
    """Walk the listing page by page and return every contributor tuple.

    `fetch` is called with the page number and must return that page's HTML;
    it defaults to downloading from the site.  Paging stops on the first page
    containing LAST_PAGE_MARKER.
    """
    contrib = []  # each element is a (user name, number of recipes) tuple
    page = 1
    while True:  # loop over pages
        print("Processing page %s" % (page))
        html = fetch(page)
        contrib.extend(parse_contributors(html))
        if LAST_PAGE_MARKER in html:
            return contrib
        page += 1


def recipe_distribution(contrib):
    """Return sorted (recipes, people) pairs: how many people share a count.

    Only counts of at least one recipe are reported.  An empty `contrib`
    yields an empty list.
    """
    tally = collections.Counter(int(count) for _name, count in contrib)
    return sorted(
        (recipes, people) for recipes, people in tally.items() if recipes >= 1)


def format_distribution(distribution):
    """Return one report line per (recipes, people) pair."""
    return ["%s people contribute %s recipes each" % (people, recipes)
            for recipes, people in distribution]


def main():
    """Scrape the user listing and print the recipe-count distribution."""
    contrib = collect_contributors()
    # To list individual users instead, print each (name, count) pair
    # in contrib.
    for line in format_distribution(recipe_distribution(contrib)):
        print(line)


if __name__ == "__main__":
    main()

Diff to Previous Revision

--- revision 1 2011-06-02 10:58:47
+++ revision 2 2011-06-02 14:52:50
@@ -13,7 +13,7 @@
     pattern = '<li><a href="/recipes/users/.*/">(.*)</a>\s*<span class="secondary">\((.*) recipe[s]?\)</span>'
     res = re.findall(pattern, html)
     if res:
-        m.extend(res)
+        contrib.extend(res)
 
     if html.find('<span class="next disabled">') != -1: # found at the last page
         break

History