# Recipe 286224 revision 1 « ActiveState Code

#!/usr/bin/env python

import urlparse
import urllib2
import os
import HTMLParser
import sre

class HTMLLinkScanner(HTMLParser.HTMLParser):
  """HTML parser that collects the link attributes of selected tags.

  After feed() has been called:

  links        -- dict whose keys are the link values found (with any
                  '#fragment' removed); every value is True.  Links from
                  'base' tags are not recorded here.
  replacements -- list of (start tag text, attribute name, attribute value)
                  tuples, one per matching attribute, in document order.
  """

  # tag name -> attribute name (or sequence of attribute names) holding a link
  tags = {'a':'href','img':'src','frame':'src','base':'href'}

  def reset(self):
    """Clear the collected links and replacements, then reset the parser."""
    self.links = {}
    self.replacements = []
    HTMLParser.HTMLParser.reset(self)

  def handle_starttag(self, tag, attrs):
    """Record the link-carrying attributes of a start tag listed in `tags`."""
    checkattrs = self.tags.get(tag)
    if checkattrs is None:
      return
    # Wrap a single attribute name in a list so that the `in` test below is
    # list membership rather than a substring match.
    if isinstance(checkattrs, (str, unicode)):
      checkattrs = [checkattrs]
    for attr, value in attrs:
      if attr not in checkattrs:
        continue
      if value is None:
        # The parser reports a valueless attribute (e.g. <a href>) as None;
        # there is no link to record and nothing to rewrite.
        continue
      if tag != 'base':
        link = urlparse.urldefrag(value)[0]
        self.links[link] = True
      self.replacements.append((self.get_starttag_text(), attr, value))

class MirrorRetriever:
  """Recursively save a URL, and the pages it links to, under a directory.

  archivedir -- root directory the mirrored files are written under.
  urlmap     -- dict mapping every URL handled so far to its local filename;
                it is also what stops a URL from being processed twice.
  """

  def __init__(self, archivedir):
    self.archivedir = archivedir
    self.urlmap = {}

  def url2filename(self, url):
    """Return the local filename under archivedir that stores `url`.

    A missing path, or one ending in '/', is stored as 'index.html'.  For
    any scheme other than http the scheme becomes an extra leading
    directory.  The query string and fragment are ignored.
    """
    scheme, location, path, query, fragment = urlparse.urlsplit(url)
    if not path or path.endswith('/'):
      path += 'index.html'
    # URLs come from remote HTML: drop empty, '.' and '..' segments so a
    # crafted link cannot name a file outside the archive directory.
    skipped = ('', os.curdir, os.pardir)
    segments = [segment for segment in path.split('/') if segment not in skipped]
    if not segments:
      segments = ['index.html']
    if location in (os.curdir, os.pardir):
      location = ''
    if scheme.lower() != 'http':
      location = os.path.join(scheme, location)
    # ignore query for the meantime
    return os.path.join(self.archivedir, location, *segments)

  def testinclude(self, url):
    """Return False for mailto: and javascript: URLs, True for all others."""
    scheme, location, path, query, fragment = urlparse.urlsplit(url)
    if scheme in ('mailto', 'javascript'):
      return False
    # TODO: add ability to specify site
    # return location.lower() == 'www.mcmillan-inc.com'
    return True

  def ensuredir(self, pathname):
    """Create directory `pathname`, and any missing parents, if needed.

    An empty pathname is a no-op (it is what os.path.dirname gives for a
    bare filename).  Raises OSError if the directory cannot be created.
    """
    if not pathname or os.path.isdir(pathname):
      return
    try:
      os.makedirs(pathname)
    except OSError:
      # Only an error if the directory still does not exist afterwards.
      if not os.path.isdir(pathname):
        raise

  def retrieveurl(self, url):
    """Fetch `url` and return the response body.

    Errors raised by urllib2.urlopen propagate to the caller.
    """
    response = urllib2.urlopen(url)
    try:
      return response.read()
    finally:
      response.close()

  def _rewritelinks(self, url, contents, replacements):
    """Return `contents` with the scanned tags pointed at the local mirror.

    `replacements` is the (tag text, attribute, link) list produced by
    HTMLLinkScanner for the page at `url`.  'base' tags are removed; links
    carrying a scheme or host, and host-absolute paths, become file:// URLs;
    other relative links are left untouched.
    """
    for tagtext, attr, link in replacements:
      newtext = None
      if tagtext.lower().startswith('<base'):
        # strip out base references
        newtext = ''
      elif not link:
        # nothing to rewrite for an empty or valueless attribute
        continue
      else:
        scheme, location, path, query, fragment = urlparse.urlsplit(link)
        if scheme or location:
          if not self.testinclude(link):
            continue
          linkfilename = self.url2filename(link)
          newtext = tagtext.replace(link, 'file://%s' % linkfilename)
        elif path.startswith('/'):
          linkurl = urlparse.urljoin(url, link)
          linkfilename = self.url2filename(linkurl)
          newtext = tagtext.replace(link, 'file://%s' % linkfilename)
      if newtext is not None:
        contents = contents.replace(tagtext, newtext)
    return contents

  def mirror(self, url):
    """Save `url` locally, rewrite its links, then mirror every linked URL.

    URLs already in urlmap, and URLs rejected by testinclude, are skipped.
    If the target file already exists its contents are used instead of
    fetching.  A retrieval failure (urllib2.URLError) is reported and the
    URL is abandoned; a failure to save the file is reported and the links
    found on the page are still followed.
    """
    if url in self.urlmap:
      return
    filename = self.url2filename(url)
    if not self.testinclude(url):
      return
    print('%s -> %s' % (url, filename))
    self.urlmap[url] = filename
    # TODO: add an option about re-reading stuff
    if os.path.isfile(filename):
      # binary mode: mirrored files include images, not only text
      with open(filename, 'rb') as cachedfile:
        contents = cachedfile.read()
    else:
      try:
        contents = self.retrieveurl(url)
      except urllib2.URLError as e:
        print('could not retrieve url %s: %s' % (url, e))
        return
    linkscanner = HTMLLinkScanner()
    try:
      linkscanner.feed(contents)
    except Exception:
      # Not everything fetched is HTML; keep whatever was found before the
      # parser gave up.
      print('could not parse %s as html' % url)
    linkstomirror = [urlparse.urljoin(url, link) for link in linkscanner.links]
    contents = sre.sub('http://web.archive.org/web/[0-9]{14}/', '', contents)
    contents = self._rewritelinks(url, contents, linkscanner.replacements)
    try:
      self.ensuredir(os.path.dirname(filename))
      with open(filename, 'wb') as contentsfile:
        contentsfile.write(contents)
    except EnvironmentError as e:
      # e.g. a directory is needed where an earlier URL was saved as a file;
      # do not let one unsaveable page abort the whole crawl.
      print('could not save %s to %s: %s' % (url, filename, e))
    for linkurl in linkstomirror:
      self.mirror(linkurl)

class WaybackRetriever(MirrorRetriever):
  """MirrorRetriever that fetches every page through web.archive.org.

  datestring is placed in the request URL as
  http://web.archive.org/web/<datestring>/<url>; presumably it is the
  timestamp of the wanted snapshot -- confirm against the archive's URL
  scheme.
  """

  def __init__(self, archivedir, datestring):
    MirrorRetriever.__init__(self, archivedir)
    self.datestring = datestring

  def retrieveurl(self, url):
    """Return the archived copy of `url` with the archive's script removed.

    Raises urllib2.URLError when the response contains the archive's
    "can't find the archived version" message.
    """
    notfound = "Sorry, we can't find the archived version of this page"
    snapshot = urllib2.urlopen(
        'http://web.archive.org/web/%s/%s' % (self.datestring, url))
    page = snapshot.read()
    if notfound in page:
      raise urllib2.URLError("not in wayback archive")
    # remove the copyrighted javascript from the wayback machine, keeping
    # only its "FILE ARCHIVED ON ... AND RETRIEVED ... ON ..." comment
    page = sre.sub(
        '\\<SCRIPT language="Javascript"\\>(.|\r|\n)*(// FILE ARCHIVED ON [0-9]{14} AND RETRIEVED(.|\r|\n)* ON [0-9]{14}[.])(.|\r|\n)*\\</SCRIPT\\>',
        '\\2',
        page)
    # replace the javascript-style comments indicating the retrieval with html comments
    return sre.sub('// ((FILE|INTERNET).*)', '<!-- \\1 -->', page)

# Command-line entry point: mirror <url> from the Wayback archive snapshot
# <datestring> into the current working directory.
if __name__ == '__main__':
  import sys
  if len(sys.argv) < 3:
    # Without this check a missing argument surfaces as a bare IndexError.
    sys.stderr.write('usage: %s <url> <datestring>\n' % sys.argv[0])
    sys.exit(2)
  m = WaybackRetriever(os.path.abspath('.'), sys.argv[2])
  m.mirror(sys.argv[1])
# Recipe 286224 revision 1

# History

# Accounts

# Code Recipes

# Feedback & Information

# ActiveState