Yet another script for backing up Twitter posts (statuses): it scrapes a user's public timeline page by page and prepends any statuses it has not seen before to a YAML backup file.
The script is based on http://code.activestate.com/recipes/576594/ and http://movingtofreedom.org/2009/03/18/python-script-for-backing-up-twitter-statuses/.
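The script targets Python 2 (urllib.urlopen, print statements) and needs BeautifulSoup 3 and PyYAML installed. A typical run looks like this (the script filename twitter_backup.py is just an assumed name; re-running the same command later only prepends statuses that are not yet in the backup file):

    python twitter_backup.py scarpent scarpent-tweets.yaml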
import sys, os
import codecs
import time
import datetime
import re
from urllib import urlopen
import yaml
from BeautifulSoup import BeautifulSoup
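# pause between page fetches, in seconds (see time.sleep below)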
DELAY = 2
RE_USER = re.compile(r'''(?x)     # verbose mode
    @                             # start of twitter user link
    <a[ ][^>]*href="/             # 'a' opening tag to the start of href url
    ([^"]*)"                      # capture the user part of url to \1
    [^<]*                         # any number of non-closing bracket chars to get to:
    </a>                          # the 'a' closing tag''')
# matches @<a href="/scarpent">scarpent</a>
RE_LINK = re.compile(r'''(?x)     # verbose mode
    <a[ ][^>]*href="              # 'a' opening tag to the start of href url
    ([^"]*)"                      # capture entire url to \1
    [^>]*                         # any number of non-closing bracket chars to get to:
    >                             # the end of the opening tag
    ([^<]*)                       # capture the link text to \2
    </a>                          # the 'a' closing tag''')
# matches <a href="http://bit.ly/Xxlch" rel="nofollow"
#         target="_blank">http://bit.ly/Xxlch</a>
# strips leftover "[/search?q=%23...]" parts of twitter hashtag links
RE_LINK_HASH = re.compile(r'''(?x)  # verbose mode
    \[
    /search\?q=%23[^\]]*
    \]
    ''')
# capture numeric status id from "published timestamp" span
RE_STATUS_ID = re.compile(r'.*/status/([0-9]*).*')
# e.g. http://twitter.com/scarpent/status/1329714004
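# A quick illustration of how the substitutions chain together in main()
# (inputs taken from the comment examples above; the hashtag sample is an
# assumed input, shown here only to motivate RE_LINK_HASH):
#
#   RE_USER.sub(r'@\1', '@<a href="/scarpent">scarpent</a>')
#   -> '@scarpent'
#
#   RE_LINK.sub(r'\2 [\1]', '<a href="http://bit.ly/Xxlch" rel="nofollow"
#       target="_blank">http://bit.ly/Xxlch</a>')
#   -> 'http://bit.ly/Xxlch [http://bit.ly/Xxlch]'
#
#   a hashtag link such as <a href="/search?q=%23python">#python</a> is first
#   rewritten by RE_LINK to '#python [/search?q=%23python]', and RE_LINK_HASH
#   then strips the trailing '[/search?q=%23...]' part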
# ---------------
def escape(content):
    return content.replace('"', r'\"')

def print_usage(s=""):
    if s:
        print s
    print """Usage: %s twitter-account filename
Incremental backup of twitter statuses in yaml format.
""" % (sys.argv[0],)
# ---------------
def main(args):
    if len(args) != 2:
        return print_usage()
    username, fname = args
    num_tweets = num_tweets_new = 0
    output = []
    output_orig = []
    by_status_id = {}
    if os.path.exists(fname):
        # load the previous backup so already-saved statuses can be skipped
        output_orig = [l.rstrip() for l in codecs.open(fname, "r", "utf-8").readlines()]
        content_before = yaml.load("\n".join(output_orig))
        if content_before:
            by_status_id = dict([(e["status_id"], e)
                                 for e in content_before if "status_id" in e])
    RE_STATUS_CLASS = re.compile(r'.*\bstatus\b.*')
    url_base = "http://twitter.com/%s" % (username,)
    # let's fetch some tweets ...
    print 'Reading %s, backup in %s, started at %s' % (
        url_base, fname, datetime.datetime.today(),)
    page_nr = 0
    while True:
        page_nr += 1
        print "%3d. page - read and parse" % (page_nr,)
        num_tweets_this_page = 0
        url = '%s?page=%s' % (url_base, page_nr)
        f = urlopen(url)
        # f = open('twitter-feed.htm', 'rb')  # handy for offline testing
        soup = BeautifulSoup(f.read())
        f.close()
        tweets = soup.findAll('li', {'class': RE_STATUS_CLASS})
        if len(tweets) == 0:
            # no more pages - we are done
            break
        for tweet in tweets:
            num_tweets += 1
            current = []
            content = unicode(tweet.find('span', 'entry-content').renderContents(), 'utf8')
            content = RE_USER.sub(r'@\1', content)
            content = RE_LINK.sub(r'\2 [\1]', content)
            content = RE_LINK_HASH.sub(r'', content)
            current.append('- content : "%s"' % (escape(content),))
            date_time = tweet.find('span', 'timestamp').get("data", None)
            if date_time:
                # {date_time: time:'Sun Sep 04 20:05:28 +0000 2011'}
                # -> Sun Sep 04 20:05:28 +0000 2011
                date_time = date_time.replace("{", "").replace("}", "").replace(
                    "time:", "").strip().strip("'").strip()
                current.append('  date_time: "%s"' % (escape(date_time),))
            status_id = None
            m = RE_STATUS_ID.search(tweet.find('a', 'entry-date')['href'])
            if m:
                # only write status_id/url when the id could be extracted
                status_id = m.groups()[0]
                current.append('  status_id: "%s"' % (escape(status_id),))
                current.append('  url: "http://twitter.com/#!/%s/status/%s"' % (
                    username, escape(status_id)))
            current.append("")
            if status_id is not None and status_id in by_status_id:
                # already in the backup - skip it
                continue
            num_tweets_new += 1
            num_tweets_this_page += 1
            output.extend(current)
        if num_tweets_this_page == 0:
            print 'No new tweets found, quit iteration ...'
            break
        print ' %3d/%3d tweets saved/processed. Waiting %d seconds before fetching next page...' % (
            num_tweets_new, num_tweets, DELAY)
        # be nice to twitter's servers
        time.sleep(DELAY)
    if num_tweets_new == 0:
        print 'No new tweets found in %d tweets analysed, file %s left untouched' % (num_tweets, fname)
        return False
    # newest tweets first, followed by the lines of the previous backup
    fout = codecs.open(fname, "w", "utf-8")
    output.extend(output_orig)
    fout.write("\n".join(output))
    fout.close()
    print '%d/%d tweets saved in %s' % (num_tweets_new, num_tweets, fout.name)

if __name__ == "__main__":
    main(sys.argv[1:])
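For reference, each saved status ends up in the backup file as a small YAML fragment along these lines (the values are stitched together from the examples in the comments above, not real output):

- content : "Reading http://bit.ly/Xxlch [http://bit.ly/Xxlch]"
  date_time: "Sun Sep 04 20:05:28 +0000 2011"
  status_id: "1329714004"
  url: "http://twitter.com/#!/scarpent/status/1329714004"

Because new entries are written before the previous file contents, the most recent status always sits at the top of the backup file.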