
Yet another script for backing up Twitter posts (statuses).

The script is based on http://code.activestate.com/recipes/576594/ and http://movingtofreedom.org/2009/03/18/python-script-for-backing-up-twitter-statuses/.

Python, 151 lines
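
To run it, pass the Twitter account name and the file to back up into; repeated runs only prepend statuses that are not already in the file. A typical invocation (the script name twitter_backup.py is just an example) looks like:

    python twitter_backup.py scarpent scarpent-tweets.yaml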
import sys, os
import codecs
import time
import datetime
import re
from urllib import urlopen

import yaml
from BeautifulSoup import BeautifulSoup

DELAY = 2  # seconds to pause between page fetches

RE_USER = re.compile(r'''(?x)   # verbose mode
    @                     # start of twitter user link
    <a[ ][^>]*href="/     # 'a' opening tag to the start of href url
    ([^"]*)"              # capture the user part of url to \1
    [^<]*                 # any number of non-closing bracket chars to get to:
    </a>                  # the 'a' closing tag''')
    # matches @<a href="/scarpent">scarpent</a>

RE_LINK = re.compile(r'''(?x)   # verbose mode
    <a[ ][^>]*href="      # 'a' opening tag to the start of href url
    ([^"]*)"              # capture entire url to \1
    [^>]*                 # any number of non-closing bracket chars to get to:
    >
    ([^<]*)               # any number of non-closing bracket chars to get to:
    </a>                  # the 'a' closing tag''')
    # matches <a href="http://bit.ly/Xxlch" rel="nofollow"
    #                            target="_blank">http://bit.ly/Xxlch</a>

# strip the bracketed hashtag-search links left over after RE_LINK,
# e.g. [/search?q=%23python]
RE_LINK_HASH = re.compile(r'''(?x)   # verbose mode
    \[
    \/search\?q\=\%23[^\]]*
    \]
    ''')

# capture numeric status id from "published timestamp" span
RE_STATUS_ID = re.compile(r'.*/status/([0-9]*).*')
    # e.g. http://twitter.com/scarpent/status/1329714004

# ---------------

def escape(content):
    # escape backslashes first, then double quotes, so the value is a
    # valid YAML double-quoted string
    return content.replace('\\', '\\\\').replace('"', r'\"')

def print_usage(s=""):
    if s:
        print s
    print """Usage: %s twitter-account filename
    Incremental backup of twitter statuses in yaml format.
    """ % (sys.argv[0],)

# ---------------

def main(args):
    if len(args) != 2:
        return print_usage()

    username, fname = args

    num_tweets = num_tweets_new = 0
    output = []
    output_orig = []
    by_status_id = {}

    if os.path.exists(fname):
        # read the existing backup so already-saved statuses can be skipped
        output_orig = [l.rstrip() for l in codecs.open(fname, "r", "utf-8").readlines()]
        content_before = yaml.safe_load("\n".join(output_orig))
        if content_before:
            by_status_id = dict([(e["status_id"], e)
                                 for e in content_before if "status_id" in e])

    RE_STATUS_CLASS = re.compile(r'.*\bstatus\b.*')
    url_base = "http://twitter.com/%s" % (username,)

    # let's fetch some tweets ...
    print 'Reading %s, backup in %s, started at %s' % (
           url_base, fname, datetime.datetime.today(),)
    page_nr = 0
    while True:
        page_nr += 1
        print "%3d. page - read and parse" % (page_nr,)

        num_tweets_this_page = 0

        url = '%s?page=%s' % (url_base, page_nr)
        f = urlopen(url)
        # f = open('twitter-feed.htm', 'rb')  # use a saved page for offline testing
        soup = BeautifulSoup(f.read())
        f.close()
        tweets = soup.findAll('li', {'class': RE_STATUS_CLASS})
        if len(tweets) == 0:
            break

        for tweet in tweets:
            num_tweets += 1
            current = []

            content = unicode(tweet.find('span', 'entry-content').renderContents(), 'utf8')

            content = RE_USER.sub(r'@\1', content)
            content = RE_LINK.sub(r'\2 [\1]', content)
            content = RE_LINK_HASH.sub(r'', content)

            current.append('- content : "%s"' % (escape(content),))

            date_time = tweet.find('span', 'timestamp').get("data", None)
            if date_time:
                # {date_time: time:'Sun Sep 04 20:05:28 +0000 2011'}
                # -> Sun Sep 04 20:05:28 +0000 2011
                date_time = date_time.replace("{", "").replace("}", "").replace("time:", "").strip().strip("'").strip()
                current.append('  date_time: "%s"' % (escape(date_time),))

            status_id = None
            m = RE_STATUS_ID.search(tweet.find('a', 'entry-date')['href'])
            if m:
                status_id = m.groups()[0]
                current.append('  status_id: "%s"' % (escape(status_id),))
                current.append('  url: "http://twitter.com/#!/%s/status/%s"' % (username, escape(status_id)))
            current.append("")

            if status_id is not None and status_id in by_status_id:
                # skip it
                continue

            num_tweets_new += 1
            num_tweets_this_page += 1
            output.extend(current)

        if num_tweets_this_page == 0:
            print 'No new tweets found, quit iteration ...'
            break

        print '     %3d/%3d tweets saved/processed. Waiting %d seconds before fetching next page...' % (
              num_tweets_new, num_tweets, DELAY)

        # be nice to twitter's servers
        time.sleep(DELAY)

    if num_tweets_new == 0:
        print 'No new tweets found in %d tweets analysed, file %s left untouched' % (num_tweets, fname)
        return False

    fout = codecs.open(fname, "w", "utf-8")
    output.extend(output_orig)  # newest tweets first, then the previous backup
    fout.write("\n".join(output))
    fout.close()
    print '%d/%d tweets saved in %s' % (num_tweets_new, num_tweets, fname)

if __name__ == "__main__":
    main(sys.argv[1:])
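
For reference, each saved status ends up as one YAML list entry shaped roughly like this (values are illustrative, taken from the examples in the regex comments above):

    - content : "@scarpent testing http://bit.ly/Xxlch [http://bit.ly/Xxlch]"
      date_time: "Sun Sep 04 20:05:28 +0000 2011"
      status_id: "1329714004"
      url: "http://twitter.com/#!/scarpent/status/1329714004"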