I used this to retrieve, via the wayback machine (archive.org), a site I could no longer access directly. It might require some tweaking to get it to work...
#!/usr/bin/env python
import urlparse
import urllib2
import os
import HTMLParser
import re

class HTMLLinkScanner(HTMLParser.HTMLParser):
    # maps each tag of interest to the attribute(s) that can hold a link
    tags = {'a': 'href', 'img': 'src', 'frame': 'src', 'base': 'href'}

    def reset(self):
        self.links = {}
        self.replacements = []
        HTMLParser.HTMLParser.reset(self)

    def handle_starttag(self, tag, attrs):
        if tag in self.tags:
            checkattrs = self.tags[tag]
            if isinstance(checkattrs, (str, unicode)):
                checkattrs = [checkattrs]
            for attr, value in attrs:
                if attr in checkattrs:
                    if tag != 'base':
                        # record the link with any fragment stripped off
                        link = urlparse.urldefrag(value)[0]
                        self.links[link] = True
                    # base tags are recorded too so they can be stripped later
                    self.replacements.append((self.get_starttag_text(), attr, value))

class MirrorRetriever:
    def __init__(self, archivedir):
        self.archivedir = archivedir
        self.urlmap = {}

    def url2filename(self, url):
        """Map a URL to a filename under the archive directory."""
        scheme, location, path, query, fragment = urlparse.urlsplit(url)
        if not path or path.endswith('/'):
            path += 'index.html'
        path = os.path.join(*path.split('/'))
        if scheme.lower() != 'http':
            location = os.path.join(scheme, location)
        # ignore query for the meantime
        return os.path.join(self.archivedir, location, path)

    def testinclude(self, url):
        """Decide whether a URL should be mirrored at all."""
        scheme, location, path, query, fragment = urlparse.urlsplit(url)
        if scheme in ('mailto', 'javascript'):
            return False
        # TODO: add ability to specify site
        # return location.lower() == 'www.mcmillan-inc.com'
        return True

    def ensuredir(self, pathname):
        """Create pathname and any missing parent directories."""
        if pathname and not os.path.isdir(pathname):
            self.ensuredir(os.path.dirname(pathname))
            os.mkdir(pathname)

    def retrieveurl(self, url):
        return urllib2.urlopen(url).read()

    def mirror(self, url):
        if url in self.urlmap:
            return
        filename = self.url2filename(url)
        if not self.testinclude(url):
            return
        print url, '->', filename
        self.urlmap[url] = filename
        # TODO: add an option about re-reading stuff
        if os.path.isfile(filename):
            contents = open(filename, 'rb').read()
        else:
            try:
                contents = self.retrieveurl(url)
            except urllib2.URLError, e:
                print 'could not retrieve url %s: %s' % (url, e)
                return
        self.ensuredir(os.path.dirname(filename))
        linkscanner = HTMLLinkScanner()
        try:
            linkscanner.feed(contents)
        except Exception:
            # binary files such as images will not parse as html
            print 'could not parse %s as html' % url
        linkstomirror = []
        for link in linkscanner.links:
            linkurl = urlparse.urljoin(url, link)
            linkstomirror.append(linkurl)
        # strip wayback machine prefixes from any remaining absolute links
        contents = re.sub('http://web.archive.org/web/[0-9]{14}/', '', contents)
        for tagtext, attr, link in linkscanner.replacements:
            scheme, location, path, query, fragment = urlparse.urlsplit(link)
            newtext = None
            if tagtext.lower().startswith('<base'):
                # strip out base references
                newtext = ''
            elif scheme or location:
                # absolute link: point it at the local mirror copy
                if not self.testinclude(link):
                    continue
                linkfilename = self.url2filename(link)
                newtext = tagtext.replace(link, 'file://%s' % linkfilename)
            elif path.startswith('/'):
                # site-relative link: resolve it, then point at the local copy
                linkurl = urlparse.urljoin(url, link)
                linkfilename = self.url2filename(linkurl)
                newtext = tagtext.replace(link, 'file://%s' % linkfilename)
            if newtext is not None:
                contents = contents.replace(tagtext, newtext)
        contentsfile = open(filename, 'wb')
        contentsfile.write(contents)
        contentsfile.close()
        for linkurl in linkstomirror:
            self.mirror(linkurl)

class WaybackRetriever(MirrorRetriever):
    def __init__(self, archivedir, datestring):
        MirrorRetriever.__init__(self, archivedir)
        self.datestring = datestring

    def retrieveurl(self, url):
        """Fetch the version of the URL archived at the given date."""
        waybackurl = 'http://web.archive.org/web/%s/%s' % (self.datestring, url)
        contents = urllib2.urlopen(waybackurl).read()
        if contents.find("Sorry, we can't find the archived version of this page") != -1:
            raise urllib2.URLError("not in wayback archive")
        # remove the copyrighted javascript from the wayback machine,
        # keeping only the comment recording the archival and retrieval dates
        contents = re.sub('\\<SCRIPT language="Javascript"\\>(.|\r|\n)*(// FILE ARCHIVED ON [0-9]{14} AND RETRIEVED(.|\r|\n)* ON [0-9]{14}[.])(.|\r|\n)*\\</SCRIPT\\>', '\\2', contents)
        # replace the javascript-style comments indicating the retrieval with html comments
        contents = re.sub('// ((FILE|INTERNET).*)', '<!-- \\1 -->', contents)
        return contents

if __name__ == '__main__':
    import sys
    m = WaybackRetriever(os.path.abspath('.'), sys.argv[2])
    m.mirror(sys.argv[1])
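The script takes the start URL as its first command-line argument and a 14-digit wayback timestamp (YYYYMMDDHHMMSS) as its second, and mirrors into the current directory. A minimal invocation, assuming the listing is saved as wayback.py (the filename and URL here are just examples):

    python wayback.py http://www.example.com/ 20040601000000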
The Internet Archive keeps archives of a number of sites. Sometimes it is useful to retrieve not just a page at a time, but all the pages from a given site; this script should help to do that.
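Each retrieved page is stored under a local path derived from its URL by url2filename. A small sketch of the mapping, again assuming the listing above is saved as wayback.py (the URL is made up for illustration):

    import os
    from wayback import MirrorRetriever  # assuming the listing is saved as wayback.py

    m = MirrorRetriever(os.path.abspath('.'))
    # a trailing slash maps to index.html inside the corresponding directory
    print(m.url2filename('http://www.example.com/docs/'))
    # -> <current dir>/www.example.com/docs/index.html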
Issues that arise include the javascript that the wayback machine appends to each page to rewrite URLs, and parsing the pages to replace their links with local ones. There is also some logic to reuse pages that have already been downloaded, but I think this could be improved.
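For example, the wayback machine prefixes every link it serves with a 14-digit timestamp, which the script strips to recover the original URLs. A minimal sketch of that step, using the same regex as the script (the page fragment is invented for illustration):

    import re

    fragment = '<a href="http://web.archive.org/web/20040601000000/http://www.example.com/about.html">About</a>'
    # remove the wayback machine prefix, leaving the original URL behind
    print(re.sub('http://web.archive.org/web/[0-9]{14}/', '', fragment))
    # -> <a href="http://www.example.com/about.html">About</a>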