import sys, os
import codecs
import time
import datetime
import re
from urllib import urlopen
import yaml
from BeautifulSoup import BeautifulSoup
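# Incrementally back up a twitter account's public statuses to a YAML file by
# scraping http://twitter.com/<username>?page=<n> - no API access is needed.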
DELAY = 2  # seconds to wait between page fetches (be nice to twitter's servers)
RE_USER = re.compile(r'''(?x)  # verbose mode
    @               # start of twitter user link
    <a[^>]*href="/  # 'a' opening tag to the start of href url
    ([^"]*)"        # capture the user part of url to \1
    [^<]*           # any number of non-closing bracket chars to get to:
    </a>            # the 'a' closing tag
    ''')
# matches @scarpent
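# e.g. RE_USER.sub(r'@\1', ...) turns @<a href="/scarpent">scarpent</a>
# into plain @scarpent (see the substitutions in main below)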
RE_LINK = re.compile(r'''(?x)  # verbose mode
    <a[^>]*href="  # 'a' opening tag to the start of href url
    ([^"]*)"       # capture entire url to \1
    [^>]*          # any number of non-closing bracket chars to get to:
    >              # the end of the 'a' opening tag
    ([^<]*)        # capture the link text to \2
    </a>           # the 'a' closing tag
    ''')
# matches http://bit.ly/Xxlch
# strips leftover twitter hashtag search links, e.g. [/search?q=%23yaml]
RE_LINK_HASH = re.compile(r'''(?x)  # verbose mode
    \[
    \/search\?q\=\%23[^\]]*
    \]
    ''')
# capture numeric status id from "published timestamp" span
RE_STATUS_ID = re.compile(r'.*/status/([0-9]*).*')
# e.g. http://twitter.com/scarpent/status/1329714004
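# together these regexes flatten the rendered entry-content HTML into the
# plain text that gets written to the YAML backup (see main below)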
# ---------------
def escape(content):
    """Escape double quotes for embedding in a double-quoted YAML scalar."""
    return content.replace('"', r'\"')
def print_usage(s=""):
    if s:
        print s
    print """Usage: %s twitter-account filename
Incremental backup of twitter statuses in yaml format.
""" % (sys.argv[0],)
# ---------------
def main(args):
    if len(args) != 2:
        return print_usage()
    username, fname = args
    num_tweets = num_tweets_new = 0
    output = []
    output_orig = []
    by_status_id = {}
    if os.path.exists(fname):
        # load the existing backup and index it by status_id so that
        # already-saved tweets can be skipped below
        output_orig = [l.rstrip() for l in codecs.open(fname, "r", "utf-8").readlines()]
        content_before = yaml.load("\n".join(output_orig))
        if content_before:
            by_status_id = dict([(e["status_id"], e)
                                 for e in content_before if "status_id" in e])
    RE_STATUS_CLASS = re.compile(r'.*\bstatus\b.*')
    url_base = "http://twitter.com/%s" % (username,)

    # let's fetch some tweets ...
    print 'Reading %s, backup in %s, started at %s' % (
        url_base, fname, datetime.datetime.today(),)
    page_nr = 0
    while True:
        page_nr += 1
        print "%3d. page - read and parse" % (page_nr,)
        num_tweets_this_page = 0
        url = '%s?page=%s' % (url_base, page_nr)
        f = urlopen(url)
        # f = open('twitter-feed.htm', 'rb')
        soup = BeautifulSoup(f.read())
        f.close()
        # each tweet is an <li> whose class attribute contains "status"
        tweets = soup.findAll('li', {'class': RE_STATUS_CLASS})
        if len(tweets) == 0:
            break
        for tweet in tweets:
            num_tweets += 1
            current = []
            content = unicode(tweet.find('span', 'entry-content').renderContents(), 'utf8')
            content = RE_USER.sub(r'@\1', content)
            content = RE_LINK.sub(r'\2 [\1]', content)
            content = RE_LINK_HASH.sub(r'', content)
            current.append('- content : "%s"' % (escape(content),))
            date_time = tweet.find('span', 'timestamp').get("data", None)
            if date_time:
                # {date_time: time:'Sun Sep 04 20:05:28 +0000 2011'}
                # -> Sun Sep 04 20:05:28 +0000 2011
                date_time = date_time.replace("{", "").replace("}", "").replace(
                    "time:", "").strip().strip("'").strip()
                current.append('  date_time: "%s"' % (escape(date_time),))
            status_id = None
            m = RE_STATUS_ID.search(tweet.find('a', 'entry-date')['href'])
            if m:
                status_id = m.groups()[0]
                current.append('  status_id: "%s"' % (escape(status_id),))
                current.append('  url: "http://twitter.com/#!/%s/status/%s"' % (
                    username, escape(status_id)))
            current.append("")
            if status_id is not None and status_id in by_status_id:
                # already in the backup file - skip it
                continue
            num_tweets_new += 1
            num_tweets_this_page += 1
            output.extend(current)
        if num_tweets_this_page == 0:
            print 'No new tweets found, quit iteration ...'
            break
        print ' %3d/%3d tweets saved/processed. Waiting %d seconds before fetching next page...' % (
            num_tweets_new, num_tweets, DELAY)
        # be nice to twitter's servers
        time.sleep(DELAY)
    if num_tweets_new == 0:
        print 'No new tweets found in %d tweets analysed, file %s left untouched' % (
            num_tweets, fname)
        return False
    # new tweets come first; append the previous backup after them
    fout = codecs.open(fname, "w", "utf-8")
    output.extend(output_orig)
    fout.write("\n".join(output))
    fout.close()
    print '%d/%d tweets saved in %s' % (num_tweets_new, num_tweets, fout.name)
    return True

if __name__ == "__main__":
    main(sys.argv[1:])
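# example invocation (script name is whatever this file is saved as):
#   python twitter_backup.py scarpent tweets-scarpent.yaml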