Welcome, guest | Sign In | My Account | Store | Cart
import sys
import re
import mechanize
import pullparser

SOURCEFORGE_URL = 'http://sourceforge.net/softwaremap/trove_list.php?form_cat=160'
# sourceforge.net: sample parser tokens for one entry of the
# projects-per-programming-language listing:
"""
Token('starttag', 'a', [('href', 'trove_list.php?form_cat=163')]),
Token('starttag', 'img', [('src', 'http://images.sourceforge.net..'), ...)
Token('entityref', 'nbsp', None),
Token('data', ' Ada', None),
Token('endtag', 'a', None),
Token('data', ' ', None)]
Token('starttag', 'i', []),
Token('data', '(100 projects)', None),
"""
FRESHMEAT_URL = 'http://freshmeat.net/browse/160/?topic_id=160'
# freshmeat.net: sample parser tokens from the
# projects-per-programming-language listing:
"""
Token('data', 'Programming Language', None),
Token('endtag', 'b', None),
Token('data', '\r\n              ', None),
Token('starttag', 'br', []),
Token('data', '\r\n      \n\r\n      ', None),
Token('starttag', 'li', []),
Token('starttag', 'a', [('href', '/browse/163/')]),
Token('starttag', 'b', []),
Token('data', 'Ada', None),
Token('endtag', 'b', None),
Token('endtag', 'a', None),
Token('data', '\r\n          ', None),
Token('starttag', 'small', []),
Token('data', '(57 projects)', None),
Token('endtag', 'small', None),
Token('data', '\r\n              ', None),
Token('starttag', 'li', []),
Token('starttag', 'a', [('href', '/browse/161/')]),
"""

def get_n_project(s):
	"""Return the project count embedded in text such as '(100 projects)'.

	s -- text expected to contain '<digits> projects'.

	Returns the first such count as an int.

	Raises IndexError when s contains no '<digits> projects' run, so
	that callers catching IndexError can skip text without a count.
	"""
	# Require at least one digit.  With a '*' quantifier a bare
	# ' projects' (e.g. in 'no projects') matched with an empty group and
	# int('') raised ValueError instead of the expected IndexError.
	counts = re.findall(r'(\d+) projects', s)
	return int(counts[0])
def sourceforge_get_language_statistics(parser):
	"""sourceforge_get_language_statistics(parser) -> (n_project, language), ...

	Generator.  Walks the tags produced by parser.tags(); for every <a>
	start tag whose first attribute value contains
	'trove_list.php?form_cat=', reads the next seven tokens with
	parser.next() and yields (n_project, language), where language is the
	stripped data of the third token and n_project is parsed from the
	seventh token by get_n_project().

	Links whose seventh following token carries no project count
	(get_n_project raises IndexError) are skipped.

	parser -- object providing tags() and next(); the script at the
	bottom of this file passes a pullparser.PullParser.
	"""
	def is_language_link(token):
		# Only <a> start tags can be language links.
		if token.type != 'starttag' or token.data != 'a':
			return False
		try:
			first_value = token.attrs[0][1]
		except IndexError:
			# No attributes at all (or a malformed pair): not a link we want.
			return False
		return 'trove_list.php?form_cat=' in first_value

	for tag in parser.tags():
		if not is_language_link(tag):
			continue
		# Expected tokens after the link, in order:
		#   <img>, &nbsp;, language name, </a>, whitespace, <i>, '(N projects)'
		following = [parser.next() for _ in range(7)]
		language = following[2].data.strip()
		count_text = following[6].data.strip()
		try:
			n_project = get_n_project(count_text)
		except IndexError:
			# Not followed by a project count; ignore this link.
			continue
		yield (n_project, language)

def freshmeat_get_language_statistics(parser):
	"""freshmeat_get_language_statistics(parser) -> (n_project, language), ...

	Generator.  Walks the tags produced by parser.tags(); for every <a>
	start tag whose first attribute value contains '/browse/', reads the
	next seven tokens with parser.next() and yields
	(n_project, language), where language is the stripped data of the
	second token and n_project is parsed from the seventh token by
	get_n_project().

	Links whose seventh following token carries no project count
	(get_n_project raises IndexError) are skipped.

	parser -- object providing tags() and next(); the script at the
	bottom of this file passes a pullparser.PullParser.
	"""
	def is_language_link(token):
		# Only <a> start tags can be language links.
		if token.type != 'starttag' or token.data != 'a':
			return False
		try:
			first_value = token.attrs[0][1]
		except IndexError:
			# No attributes at all (or a malformed pair): not a link we want.
			return False
		return '/browse/' in first_value

	for tag in parser.tags():
		if not is_language_link(tag):
			continue
		# Expected tokens after the link, in order:
		#   <b>, language name, </b>, </a>, whitespace, <small>, '(N projects)'
		following = [parser.next() for _ in range(7)]
		language = following[1].data.strip()
		count_text = following[6].data
		try:
			n_project = get_n_project(count_text)
		except IndexError:
			# Not followed by a project count; ignore this link.
			continue
		yield (n_project, language)

for (name, url, get_statistics) in (
		('sourceforge.net', SOURCEFORGE_URL,
			sourceforge_get_language_statistics),
		('freshmeat.net', FRESHMEAT_URL,
			freshmeat_get_language_statistics),):
	b = mechanize.Browser()
	b.set_handle_robots(False)
	b.open(url)
	r = b.response()
	r.seek(0)
	p = pullparser.PullParser(r)
	l = list(get_statistics(p))
	l.sort()
	l.reverse()
	print name
	for (n_project, language) in l:
		print '%6d %s' % (n_project, language)
	print

History