Advogato (http://www.advogato.org) exports members' diaries in a simple XML format. This script fetches the entries and stores them in a dictionary keyed by date. It should also work with other mod_virgule sites, such as http://www.badvogato.org.
#!/usr/bin/env python
import sgmllib, string, urllib
class DiaryParser(sgmllib.SGMLParser):
    """SGML parser that collects the <html> entry bodies and <date>
    stamps from an Advogato diary.xml feed.

    After parsing, ``self.entries`` holds the entry texts and
    ``self.dates`` the corresponding date strings, in document order.
    """

    # NOTE(review): the original defined end_html and start_date twice;
    # Python keeps only the last definition of each, so the duplicates
    # (including a dead start_date variant that called setliteral())
    # have been removed.  The methods below reproduce the definitions
    # that actually took effect.

    def __init__(self):
        sgmllib.SGMLParser.__init__(self)
        self.entries = []  # collected diary entry bodies
        self.dates = []    # collected date strings, parallel to entries
        self.inHtml = 0    # 1 while inside an <html> element
        self.inDate = 0    # 1 while inside a <date> element
        self.data = ""     # character data accumulated for current element

    def handle_data(self, data):
        # Accumulate character data until the enclosing end tag fires.
        self.data = self.data + data

    def unknown_starttag(self, tag, attrs):
        # Ignore any tags other than <html> and <date>.
        pass

    def unknown_endtag(self, tag):
        pass

    def start_html(self, attributes):
        self.inHtml = 1
        self.data = ""
        # Treat the entry body literally so embedded markup is kept as text.
        self.setliteral()

    def end_html(self):
        self.entries.append(self.data)
        self.inHtml = 0

    def start_date(self, attributes):
        self.data = ""
        self.inDate = 1

    def end_date(self):
        self.dates.append(self.data)
        self.inDate = 0
def getEntries(person):
    """ Fetch an Advogato member's diary and return a dictionary in the form
        { date : entry, ... }
    """
    parser = DiaryParser()
    # diary.xml is the member's machine-readable diary export.
    feed = urllib.urlopen("http://www.advogato.org/person/%s/diary.xml" % urllib.quote(person))
    # Feed the parser in 8 KB chunks so a large diary need not be
    # held in memory all at once.
    chunk = feed.read(8192)
    while chunk:
        parser.feed(chunk)
        chunk = feed.read(8192)
    parser.close()
    # map(None, a, b) pairs the two lists element-wise, padding the
    # shorter one with None (Python 2 idiom).
    return dict(map(None, parser.dates, parser.entries))
if __name__=='__main__':
import sys
print getEntries(sys.argv[1])
Tags: web