This recipe uses the mechanize module to grab the "number of projects per progamming language" information from two large opensource software release sites. The information is interesting when comparing popularity of programming languages.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 | import sys
import re
import mechanize
import pullparser
SOURCEFORGE_URL = 'http://sourceforge.net/softwaremap/trove_list.php?form_cat=160'
# sourceforge.net, projects per programming language, parser tokens:
"""
Token('starttag', 'a', [('href', 'trove_list.php?form_cat=163')]),
Token('starttag', 'img', [('src', 'http://images.sourceforge.net..'), ...)
Token('entityref', 'nbsp', None),
Token('data', ' Ada', None),
Token('endtag', 'a', None),
Token('data', ' ', None)]
Token('starttag', 'i', []),
Token('data', '(100 projects)', None),
"""
FRESHMEAT_URL = 'http://freshmeat.net/browse/160/?topic_id=160'
# projects per programming language, parser tokens:
"""
Token('data', 'Programming Language', None),
Token('endtag', 'b', None),
Token('data', '\r\n ', None),
Token('starttag', 'br', []),
Token('data', '\r\n \n\r\n ', None),
Token('starttag', 'li', []),
Token('starttag', 'a', [('href', '/browse/163/')]),
Token('starttag', 'b', []),
Token('data', 'Ada', None),
Token('endtag', 'b', None),
Token('endtag', 'a', None),
Token('data', '\r\n ', None),
Token('starttag', 'small', []),
Token('data', '(57 projects)', None),
Token('endtag', 'small', None),
Token('data', '\r\n ', None),
Token('starttag', 'li', []),
Token('starttag', 'a', [('href', '/browse/161/')]),
"""
def get_n_project(s):
return int(re.compile(r'([\d]*) projects').findall(s)[0])
def sourceforge_get_language_statistics(parser):
'sourceforge_get_language_statistics(file_obj) -> [ (n_project, language), ... ]'
def is_language_link(token):
try:
if token.type == 'starttag' and token.data == 'a' \
and 'trove_list.php?form_cat=' in token.attrs[0][1]:
return True
except IndexError:
return False
return False
for t in parser.tags():
if is_language_link(t):
l = [
(parser.next(), 'img'),
(parser.next(), 'nbsp'),
(parser.next(), 'data(language)'),
(parser.next(), '/a'),
(parser.next(), 'data'),
(parser.next(), 'i'),
(parser.next(), 'data(n_project)'), ]
language = l[2][0].data.strip()
n = l[6][0].data.strip()
try:
n_project = get_n_project(n)
except IndexError:
continue
else:
yield (n_project, language)
def freshmeat_get_language_statistics(parser):
def is_language_link(token):
try:
if token.type == 'starttag' and token.data == 'a' \
and '/browse/' in token.attrs[0][1]:
return True
except IndexError:
return False
return False
for t in parser.tags():
if is_language_link(t):
l = [
(parser.next(), 'b'),
(parser.next(), 'data(language)'),
(parser.next(), '/b'),
(parser.next(), 'a'),
(parser.next(), 'data'),
(parser.next(), 'small'),
(parser.next(), 'data(n_project)'), ]
language = l[1][0].data.strip()
n = l[6][0].data
try:
n_project = get_n_project(n)
except IndexError:
continue
else:
yield (n_project, language)
for (name, url, get_statistics) in (
('sourceforge.net', SOURCEFORGE_URL,
sourceforge_get_language_statistics),
('freshmeat.net', FRESHMEAT_URL,
freshmeat_get_language_statistics),):
b = mechanize.Browser()
b.set_handle_robots(False)
b.open(url)
r = b.response()
r.seek(0)
p = pullparser.PullParser(r)
l = list(get_statistics(p))
l.sort()
l.reverse()
print name
for (n_project, language) in l:
print '%6d %s' % (n_project, language)
print
|
The mechanize module is available at http://wwwsearch.sf.net/mechanize/
Tags: web
useful example. - Thanks for this, good example of raw tag-pulling using mechanize. I wonder why you chose mechanize rather than, say, Beautiful soup, when even pullparser's author has recommended the latter
a couple of tag contents "100 projects" "57 projects" aren't labelled as to where they're hard-coded from, but i'm pretty sure it's from the ADA project
hard tabs?!