import sys, os
import codecs
import time
import datetime
import re
from urllib import urlopen

import yaml
from BeautifulSoup import BeautifulSoup

DELAY = 2

RE_USER = re.compile(r'''(?x)      # verbose mode
    @                  # start of twitter user link
    <a[ ][^>]*href="/  # 'a' opening tag to the start of href url
    ([^"]*)"           # capture the user part of url to \1
    [^<]*              # any number of non-closing bracket chars to get to:
    </a>               # the 'a' closing tag''')
# matches @<a href="/scarpent">scarpent</a>

RE_LINK = re.compile(r'''(?x)      # verbose mode
    <a[ ][^>]*href="   # 'a' opening tag to the start of href url
    ([^"]*)"           # capture entire url to \1
    [^>]*              # any number of non-closing bracket chars to get to:
    >
    ([^<]*)            # any number of non-closing bracket chars to get to:
    </a>               # the 'a' closing tag''')
# matches <a href="http://bit.ly/Xxlch" rel="nofollow"
#   target="_blank">http://bit.ly/Xxlch</a>

# captures the hashtag-search link left behind by the RE_LINK substitution,
# e.g. [/search?q=%23sometag], so it can be stripped from the content
RE_LINK_HASH = re.compile(r'''(?x)  # verbose mode
    \[ \/search\?q\=\%23[^\]]* \]
    ''')

# capture numeric status id from "published timestamp" span
RE_STATUS_ID = re.compile(r'.*/status/([0-9]*).*')
# e.g. http://twitter.com/scarpent/status/1329714004

# ---------------

def escape(content):
    return content.replace('"', r'\"')


def print_usage(s=""):
    if s:
        print s
    print """Usage: %s twitter-account filename

Incremental backup of twitter statuses in yaml format.
""" % (sys.argv[0],)

# ---------------

def main(args):
    if len(args) != 2:
        return print_usage()
    username, fname = args

    num_tweets = num_tweets_new = 0
    output = []
    output_orig = []
    by_status_id = {}
    # load the existing backup (if any) and index its entries by status_id,
    # so already-saved tweets can be skipped
    if os.path.exists(fname):
        output_orig = [l.rstrip() for l in
                       codecs.open(fname, "r", "utf-8").readlines()]
        content_before = yaml.load("\n".join(output_orig))
        if content_before:
            by_status_id = dict([(e["status_id"], e)
                                 for e in content_before if "status_id" in e])

    RE_STATUS_CLASS = re.compile(r'.*\bstatus\b.*')
    url_base = "http://twitter.com/%s" % (username,)

    # let's fetch some tweets ...
    print 'Reading %s, backup in %s, started at %s' % (
        url_base, fname, datetime.datetime.today(),)

    page_nr = 0
    while True:
        page_nr += 1
        print "%3d. page - read and parse" % (page_nr,)
        num_tweets_this_page = 0
        url = '%s?page=%s' % (url_base, page_nr)
        f = urlopen(url)
        # f = open('twitter-feed.htm', 'rb')
        soup = BeautifulSoup(f.read())
        f.close()

        tweets = soup.findAll('li', {'class': RE_STATUS_CLASS})
        if len(tweets) == 0:
            break

        for tweet in tweets:
            num_tweets += 1
            current = []
            content = unicode(
                tweet.find('span', 'entry-content').renderContents(), 'utf8')
            content = RE_USER.sub(r'@\1', content)
            content = RE_LINK.sub(r'\2 [\1]', content)
            content = RE_LINK_HASH.sub(r'', content)
            # build one yaml entry, e.g.:
            # - content : "@scarpent some tweet text"
            #   date_time: "Sun Sep 04 20:05:28 +0000 2011"
            #   status_id: "1329714004"
            #   url: "http://twitter.com/#!/scarpent/status/1329714004"
            current.append('- content : "%s"' % (escape(content),))
            date_time = tweet.find('span', 'timestamp').get("data", None)
            if date_time:
                # {date_time: time:'Sun Sep 04 20:05:28 +0000 2011'}
                #   -> Sun Sep 04 20:05:28 +0000 2011
                date_time = date_time.replace("{", "").replace("}", "").replace(
                    "time:", "").strip().strip("'").strip()
                current.append('  date_time: "%s"' % (escape(date_time),))
            status_id = None
            m = RE_STATUS_ID.search(tweet.find('a', 'entry-date')['href'])
            if m:
                status_id = m.groups()[0]
                current.append('  status_id: "%s"' % (escape(status_id),))
                current.append('  url: "http://twitter.com/#!/%s/status/%s"' % (
                    username, escape(status_id)))
            current.append("")

            if status_id is not None and status_id in by_status_id:
                # skip it
                continue
            num_tweets_new += 1
            num_tweets_this_page += 1
            output.extend(current)

        if num_tweets_this_page == 0:
            print 'No new tweets found, quit iteration ...'
            break

        print '     %3d/%3d tweets saved/processed. Waiting %d seconds before fetching next page...' % (
            num_tweets_new, num_tweets, DELAY)
        # be nice to twitter's servers
        time.sleep(DELAY)

    if num_tweets_new == 0:
        print 'No new tweets found in %d tweets analysed, file %s left untouched' % (
            num_tweets, fname)
        return False

    # write the new tweets first, then re-append the previous backup
    fout = codecs.open(fname, "w", "utf-8")
    output.extend(output_orig)
    fout.write("\n".join(output))
    fout.close()
    print '%d/%d tweets saved in %s' % (num_tweets_new, num_tweets, fout.name)


if __name__ == "__main__":
    main(sys.argv[1:])
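
# A sample run, assuming the script is saved as twitter_backup.py (the
# filename is illustrative) and Python 2 with BeautifulSoup 3 and PyYAML
# installed:
#
#   python twitter_backup.py scarpent scarpent.yaml
#
# Repeated runs are incremental: only tweets whose status_id is not yet
# present in scarpent.yaml are prepended to the file.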