I used this to retrieve, via the wayback machine (archive.org), a site I could no longer access directly. It might require some tweaking to get it to work...
#!/usr/bin/env python
import urlparse
import urllib2
import os
import HTMLParser
import re

class HTMLLinkScanner(HTMLParser.HTMLParser):
    # maps each tag of interest to the attribute(s) that can hold a link
    tags = {'a': 'href', 'img': 'src', 'frame': 'src', 'base': 'href'}

    def reset(self):
        self.links = {}
        self.replacements = []
        HTMLParser.HTMLParser.reset(self)

    def handle_starttag(self, tag, attrs):
        if tag in self.tags:
            checkattrs = self.tags[tag]
            if isinstance(checkattrs, (str, unicode)):
                checkattrs = [checkattrs]
            for attr, value in attrs:
                if attr in checkattrs:
                    if tag != 'base':
                        # record the link with any fragment stripped off
                        link = urlparse.urldefrag(value)[0]
                        self.links[link] = True
                    # base tags are recorded too so they can be stripped later
                    self.replacements.append((self.get_starttag_text(), attr, value))

class MirrorRetriever:
    def __init__(self, archivedir):
        self.archivedir = archivedir
        self.urlmap = {}

    def url2filename(self, url):
        """Map a URL to a filename under the archive directory."""
        scheme, location, path, query, fragment = urlparse.urlsplit(url)
        if not path or path.endswith('/'):
            path += 'index.html'
        path = os.path.join(*path.split('/'))
        if scheme.lower() != 'http':
            location = os.path.join(scheme, location)
        # ignore query for the meantime
        return os.path.join(self.archivedir, location, path)

    def testinclude(self, url):
        """Decide whether a URL should be mirrored at all."""
        scheme, location, path, query, fragment = urlparse.urlsplit(url)
        if scheme in ('mailto', 'javascript'):
            return False
        # TODO: add ability to specify site
        # return location.lower() == 'www.mcmillan-inc.com'
        return True

    def ensuredir(self, pathname):
        """Create pathname and any missing parent directories."""
        if pathname and not os.path.isdir(pathname):
            self.ensuredir(os.path.dirname(pathname))
            os.mkdir(pathname)

    def retrieveurl(self, url):
        return urllib2.urlopen(url).read()

    def mirror(self, url):
        if url in self.urlmap:
            return
        filename = self.url2filename(url)
        if not self.testinclude(url):
            return
        print url, '->', filename
        self.urlmap[url] = filename
        # TODO: add an option about re-reading stuff
        if os.path.isfile(filename):
            contents = open(filename, 'rb').read()
        else:
            try:
                contents = self.retrieveurl(url)
            except urllib2.URLError, e:
                print 'could not retrieve url %s: %s' % (url, e)
                return
        self.ensuredir(os.path.dirname(filename))
        linkscanner = HTMLLinkScanner()
        try:
            linkscanner.feed(contents)
        except Exception:
            # binary files such as images will not parse as html
            print 'could not parse %s as html' % url
        linkstomirror = []
        for link in linkscanner.links:
            linkurl = urlparse.urljoin(url, link)
            linkstomirror.append(linkurl)
        # strip wayback machine prefixes from any remaining absolute links
        contents = re.sub('http://web.archive.org/web/[0-9]{14}/', '', contents)
        for tagtext, attr, link in linkscanner.replacements:
            scheme, location, path, query, fragment = urlparse.urlsplit(link)
            newtext = None
            if tagtext.lower().startswith('<base'):
                # strip out base references
                newtext = ''
            elif scheme or location:
                # absolute link: point it at the local mirror copy
                if not self.testinclude(link):
                    continue
                linkfilename = self.url2filename(link)
                newtext = tagtext.replace(link, 'file://%s' % linkfilename)
            elif path.startswith('/'):
                # site-relative link: resolve it, then point at the local copy
                linkurl = urlparse.urljoin(url, link)
                linkfilename = self.url2filename(linkurl)
                newtext = tagtext.replace(link, 'file://%s' % linkfilename)
            if newtext is not None:
                contents = contents.replace(tagtext, newtext)
        contentsfile = open(filename, 'wb')
        contentsfile.write(contents)
        contentsfile.close()
        for linkurl in linkstomirror:
            self.mirror(linkurl)

class WaybackRetriever(MirrorRetriever):
    def __init__(self, archivedir, datestring):
        MirrorRetriever.__init__(self, archivedir)
        self.datestring = datestring

    def retrieveurl(self, url):
        """Fetch the version of the URL archived at the given date."""
        waybackurl = 'http://web.archive.org/web/%s/%s' % (self.datestring, url)
        contents = urllib2.urlopen(waybackurl).read()
        if contents.find("Sorry, we can't find the archived version of this page") != -1:
            raise urllib2.URLError("not in wayback archive")
        # remove the copyrighted javascript from the wayback machine,
        # keeping only the comment recording the archival and retrieval dates
        contents = re.sub('\\<SCRIPT language="Javascript"\\>(.|\r|\n)*(// FILE ARCHIVED ON [0-9]{14} AND RETRIEVED(.|\r|\n)* ON [0-9]{14}[.])(.|\r|\n)*\\</SCRIPT\\>', '\\2', contents)
        # replace the javascript-style comments indicating the retrieval with html comments
        contents = re.sub('// ((FILE|INTERNET).*)', '<!-- \\1 -->', contents)
        return contents

if __name__ == '__main__':
    import sys
    m = WaybackRetriever(os.path.abspath('.'), sys.argv[2])
    m.mirror(sys.argv[1])
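The script takes the start URL as its first command-line argument and a 14-digit wayback timestamp (YYYYMMDDHHMMSS) as its second, and mirrors into the current directory. A minimal invocation, assuming the listing is saved as wayback.py (the filename and URL here are just examples):

    python wayback.py http://www.example.com/ 20040601000000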
The Internet Archive keeps archives of a number of sites. Sometimes it is useful to retrieve not just a page at a time, but all the pages from a given site; this script should help to do that.
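Each retrieved page is stored under a local path derived from its URL by url2filename. A small sketch of the mapping, again assuming the listing above is saved as wayback.py (the URL is made up for illustration):

    import os
    from wayback import MirrorRetriever  # assuming the listing is saved as wayback.py

    m = MirrorRetriever(os.path.abspath('.'))
    # a trailing slash maps to index.html inside the corresponding directory
    print(m.url2filename('http://www.example.com/docs/'))
    # -> <current dir>/www.example.com/docs/index.html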
Issues that arise include the javascript that the wayback machine appends to each page to rewrite URLs, and parsing the pages to replace their links with local ones. There is also some logic to reuse pages that have already been downloaded, but I think this could be improved.
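For example, the wayback machine prefixes every link it serves with a 14-digit timestamp, which the script strips to recover the original URLs. A minimal sketch of that step, using the same regex as the script (the page fragment is invented for illustration):

    import re

    fragment = '<a href="http://web.archive.org/web/20040601000000/http://www.example.com/about.html">About</a>'
    # remove the wayback machine prefix, leaving the original URL behind
    print(re.sub('http://web.archive.org/web/[0-9]{14}/', '', fragment))
    # -> <a href="http://www.example.com/about.html">About</a>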