I improved urllib's URLopener.retrieve() method so that it can restart a download if it fails. Like wget does (with wget -c), it resumes where it stopped. The maximum number of tries can be configured.
import os
from urllib import quote_plus, FancyURLopener, URLopener, unwrap, \
     toBytes, splittype, splithost, splitquery, splitattr, \
     url2pathname, ContentTooShortError
def retrieve(self, url, filename=None, reporthook=None, data=None,
maxtries=5, r_range=None):
"""retrieve(url) returns (filename, headers) for a local object
or (tempfilename, headers) for a remote object.
If it fails, it relaunches itself until the dl is complete or
maxtries == 0 (maxtries == -1 for unlimited tries).
Range tuple(start, end) indicates the range of the remote object
we have to retrieve (ignored for local files)"""
if maxtries < -1:
raise ValueError, 'maxtries must be at least equal with -1'
url = unwrap(toBytes(url))
if self.tempcache and url in self.tempcache:
return self.tempcache[url]
type, url1 = splittype(url)
if filename is None and (not type or type == 'file'):
try:
fp = self.open_local_file(url1)
hdrs = fp.info()
fp.close()
return url2pathname(splithost(url1)[1]), hdrs
except IOError, msg:
pass
if not r_range is None:
try:
self.addheader(('Range', 'bytes=%d-%d' % r_range))
except TypeError:
raise ValueError, 'r_range argument must be a tuple of two int : (start, end)'
fp = self.open(url, data)
try:
headers = fp.info()
if filename:
tfp = open(filename, 'ab')
else:
import tempfile
garbage, path = splittype(url)
garbage, path = splithost(path or "")
path, garbage = splitquery(path or "")
path, garbage = splitattr(path or "")
suffix = os.path.splitext(path)[1]
(fd, filename) = tempfile.mkstemp(suffix)
self.__tempfiles.append(filename)
tfp = os.fdopen(fd, 'ab')
try:
result = filename, headers
if self.tempcache is not None:
self.tempcache[url] = result
bs = 1024*8
size = -1
read = 0
blocknum = 0
if "content-length" in headers:
size = int(headers["Content-Length"])
elif r_range is not None:
size = r_range[1]
if reporthook:
reporthook(blocknum, bs, size)
while 1:
block = fp.read(bs)
if block == "":
break
read += len(block)
tfp.write(block)
blocknum += 1
if reporthook:
reporthook(blocknum, bs, size)
finally:
tfp.close()
finally:
fp.close()
# raise exception if actual size does not match content-length
# header and if maxtries <= 0
if size >= 0 and read < size:
if maxtries > 0 or maxtries == -1:
self.retrieve(url, filename, reporthook, data,
maxtries if maxtries == -1 else maxtries-1,
r_range=(read, size))
else:
raise ContentTooShortError("retrieval incomplete: got only %i out "
"of %i bytes" % (read, size), result)
return result
#to use our function in the opener
URLopener.retrieve = retrieve
|