The fetch_relative() function downloads a file, reproducing the server's directory structure locally. After downloading, an optional callback can be invoked on the file's contents. If a local copy already exists, the file is not re-fetched and the callback is run on the local copy instead; a usage sketch follows the listing.
#!/usr/bin/env python

import urllib
import urlparse
import os
import re
import logging

# Matches any character that is not safe in a local file name.
_sub = re.compile("([^\\-/a-zA-Z0-9\\.])").sub

def _sc(ch):
    # Escape an unsafe character as "_" plus its hex code, e.g. "?" -> "_3f".
    return "_%02x" % ord(ch.group(1))

def normalize_path(path, sub=_sub, sc=_sc):
    """Turn a URL path into a safe, relative local path."""
    if "." not in path:
        # No extension anywhere in the path: treat it as a directory.
        path += "/"
    if path.endswith("/"):
        path += "index.html"
    if path and path[0] == "/":
        path = path[1:]
    path = sub(sc, path)
    return path

def localize_path(url):
    """Extract the path (and query string, if any) from a URL."""
    splitd = urlparse.urlsplit(url)
    if splitd[3]:
        return "%s?%s" % splitd[2:4]
    else:
        return splitd[2]

def exists_relative(url):
    """Return True if a local copy of the URL already exists."""
    path = localize_path(url)
    path = normalize_path(path)
    return os.path.exists(path)

def fetch_relative(url, proxies=None, postfetch=None):
    """postfetch is a callback that receives the fetched data (as a string)"""
    path = localize_path(url)
    path = normalize_path(path)
    if os.path.exists(path):
        # Local copy exists: skip the network, but still run the callback.
        if postfetch:
            logging.debug("reprocessing file %s" % path)
            f = open(path, "rb")
            data = f.read()
            f.close()
            postfetch(data)
        return False
    logging.debug("fetching %s" % url)
    f = urllib.urlopen(url, proxies=proxies)
    data = f.read()
    f.close()
    # Recreate the server's directory structure locally.
    head, tail = os.path.split(path)
    if head and not os.path.exists(head):  # head is "" for top-level files
        os.makedirs(head)
    f = open(path, "wb")
    f.write(data)
    f.close()
    if postfetch:
        postfetch(data)
    return True
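A minimal usage sketch (the URL and callback below are illustrative examples, not part of the recipe):

import logging
logging.basicConfig(level=logging.DEBUG)

def report(data):
    # Hypothetical postfetch callback: just report the size.
    print "fetched %d bytes" % len(data)

# The first call downloads the file to ./docs/guide.html (the path
# relative to the server root), runs the callback, and returns True.
fetch_relative("http://example.com/docs/guide.html", postfetch=report)

# The second call finds the local copy, skips the network, runs the
# callback on the cached data, and returns False.
fetch_relative("http://example.com/docs/guide.html", postfetch=report)

Note that localize_path() keeps only the path and query of the URL, so the mirror tree does not include the host name.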
Implementation note: paths that don't have extensions (that is, no "." anywhere in the path) are treated as directories and mirrored as "path/index.html".
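For instance, here is how a few URLs map to local paths (example.com is just an illustrative host; the mapping depends only on the path and query):

print normalize_path(localize_path("http://example.com/docs/guide.html"))
# docs/guide.html
print normalize_path(localize_path("http://example.com/docs/api"))
# docs/api/index.html   (no "." in the path, so treated as a directory)
print normalize_path(localize_path("http://example.com/search.cgi?q=a+b"))
# search.cgi_3fq_3da_2bb   (unsafe characters escaped as _hex)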
See also Yuri Tkachenko's http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/302700 if you're looking for code to create a website spider.