The fetch_relative() function downloads a file, reproducing the server's directory structure locally. After downloading, an optional callback can be run on the file's contents. If a local copy already exists, the file is not re-fetched, and the callback is run on the local copy instead.

Python, 64 lines
#!/usr/bin/env python

import urllib
import urlparse
import os
import re
import logging

_sub = re.compile("([^\\-/a-zA-Z0-9\\.])").sub
def _sc(ch):
    return "_%02x" % ord(ch.group(1))

def normalize_path(path, sub=_sub, sc=_sc):
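    """Map a URL path to a filesystem-safe relative file path."""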
    if "." not in path:
        path += "/"
    if path.endswith("/"):
        path += "index.html"
    if path and path[0] == "/":
        path = path[1:]
    path = sub(sc, path)
    return path

def localize_path(url):
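    """Return the path (plus query string, if any) of a URL."""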
    splitd = urlparse.urlsplit(url)
    if splitd[3]:
        return "%s?%s" % splitd[2:4]
    else:
        return splitd[2]

def exists_relative(url):
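    """Return True if a local copy of url already exists."""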
    path = localize_path(url)
    path = normalize_path(path)
    return os.path.exists(path)

def fetch_relative(url, proxies=None, postfetch=None):
    """postfetch is a callback that receives fetched data (as string)"""
    path = localize_path(url)
    path = normalize_path(path)
    if os.path.exists(path):
        if postfetch:
            logging.debug("reprocessing file %s" % path)
            f = open(path, "rb")
            data = f.read()
            f.close()
            postfetch(data)
        return False
    logging.debug("fetching %s" % url)
    f = urllib.urlopen(url, proxies=proxies)
    data = f.read()
    f.close()
    head, tail = os.path.split(path)
    if head and not os.path.exists(head):  # head is empty for top-level files
        os.makedirs(head)
    f = open(path, "wb")
    f.write(data)
    f.close()
    if postfetch:
        postfetch(data)
    return True
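
Example usage (example.com and the count_bytes callback are illustrative, not part of the recipe):

import logging
logging.basicConfig(level=logging.DEBUG)

def count_bytes(data):
    # The callback receives the page contents as a string, whether
    # freshly fetched or read back from the existing local copy.
    print "got %d bytes" % len(data)

# The first call downloads the page into the relative path
# "docs/intro.html" (note that the host name is not part of the local
# path) and returns True; a second call finds the local copy, runs the
# callback on it, and returns False.
fetch_relative("http://example.com/docs/intro.html", postfetch=count_bytes)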

Implementation note: Paths without an extension (that is, no "." anywhere in the path) are treated as directories and mirrored as "file_name/index.html".
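
For instance, the combined mapping of localize_path() and normalize_path() works out as follows (the URLs are illustrative):

normalize_path(localize_path("http://example.com/a/b.html"))   # -> "a/b.html"
normalize_path(localize_path("http://example.com/a/"))         # -> "a/index.html"
normalize_path(localize_path("http://example.com/news"))       # -> "news/index.html"
normalize_path(localize_path("http://example.com/q.cgi?x=1"))  # -> "q.cgi_3fx_3d1"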

See also Yuri Tkachenko's http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/302700 if you're looking for code to create a website spider.
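
On Python 3 the same approach carries over: urllib.urlopen() becomes urllib.request.urlopen() and urlparse becomes urllib.parse. A minimal sketch of the fetch core, assuming normalize_path() from the recipe is available (proxy support, which Python 3 handles via urllib.request.ProxyHandler, is left out):

import os
import urllib.request
import urllib.parse

def fetch_relative3(url, postfetch=None):
    # Same logic as fetch_relative(); note the fetched data is bytes.
    parts = urllib.parse.urlsplit(url)
    path = parts.path + ("?" + parts.query if parts.query else "")
    path = normalize_path(path)
    if os.path.exists(path):
        if postfetch:
            with open(path, "rb") as f:
                postfetch(f.read())
        return False
    with urllib.request.urlopen(url) as f:
        data = f.read()
    head = os.path.dirname(path)
    if head:
        os.makedirs(head, exist_ok=True)
    with open(path, "wb") as f:
        f.write(data)
    if postfetch:
        postfetch(data)
    return True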