A simple recipe for downloading a web page using Twisted, created while going through the Twisted documentation. It supports the following: - basic authentication - checking whether the page has been updated - a progress bar
#!/usr/bin/env python
from twisted.web import client, error
import os.path
import ConfigParser
import getpass, base64
import webbrowser
class HTTPProgressDownloader(client.HTTPDownloader):
def __init__(self, url, outfile, headers=None):
client.HTTPDownloader.__init__(self, url, outfile, headers=headers)
self.status = None
def noPage(self, reason): # called for non-200 responses
if self.status == '304':
print reason.getErrorMessage()
client.HTTPDownloader.page(self, '')
else:
client.HTTPDownloader.noPage(self, reason)
def gotHeaders(self, headers):
# page data is on the way
if self.status == '200':
# initialize for progress bar
if headers.has_key('content-length'):
self.totallength = int(headers['content-length'][0])
else:
self.totallength = 0
self.currentlength = 0.0
print ''
# update headers metadata
oldheaders = {}
eTag = headers.get('etag','')
if eTag:
oldheaders['etag'] = eTag[0]
modified = headers.get('last-modified','')
if modified:
oldheaders['last-modified'] = modified[0]
config = ConfigParser.ConfigParser()
config.read('metadata.ini')
if config.has_section('headers'):
config.remove_section('headers')
config.add_section('headers')
for key, value in oldheaders.items():
config.set('headers', key, value)
config.write(open('metadata.ini','w'))
return client.HTTPDownloader.gotHeaders(self, headers)
def pagePart(self, data):
if self.status == '200':
self.currentlength += len(data)
if self.totallength:
percent = "%i%%" % (
(self.currentlength/self.totallength)*100)
else:
percent = '%dK' % (self.currentLength/1000)
print "\033[1FProgress: " + percent
return client.HTTPDownloader.pagePart(self, data)
def downloadWithProgress(url, outputfile, contextFactory=None, *args, **kwargs):
    """Start downloading ``url`` into ``outputfile`` with progress output.

    url            -- address to fetch; an 'https' scheme selects an SSL
                      connection, anything else a plain TCP connection.
    outputfile     -- passed to HTTPProgressDownloader as the output file.
    contextFactory -- SSL context factory for https; a default
                      ssl.ClientContextFactory is created when omitted.
    Remaining positional and keyword arguments (for example ``headers``) are
    forwarded to HTTPProgressDownloader.

    Returns the downloader factory's ``deferred``.
    """
    # Imported here so the function also works when this module is imported
    # rather than run as a script (the script block is the only other place
    # that brings ``reactor`` into scope).
    from twisted.internet import reactor

    scheme, host, port, path = client._parse(url)
    factory = HTTPProgressDownloader(url, outputfile, *args, **kwargs)
    if scheme == 'https':
        from twisted.internet import ssl
        if contextFactory is None:
            contextFactory = ssl.ClientContextFactory()
        reactor.connectSSL(host, port, factory, contextFactory)
    else:
        reactor.connectTCP(host, port, factory)
    return factory.deferred
def downloadPage(url, outputfile, RequestHeaders):
    """Download ``url`` to ``outputfile`` and stop the reactor when finished.

    url            -- address to fetch.
    outputfile     -- destination passed on to downloadWithProgress.
    RequestHeaders -- dict of request headers sent with the request.

    The basic-authentication errback is attached *before* the completion
    callback.  That way the result of a successful authenticated retry still
    flows into downloadComplete; with the callback attached first, a retried
    download finished without anything stopping the reactor.
    """
    deferred = downloadWithProgress(url, outputfile, headers=RequestHeaders)
    deferred.addErrback(
        handleBasicAuthentication, url, outputfile, RequestHeaders)
    deferred.addCallback(downloadComplete)
    deferred.addErrback(handleError)
def downloadComplete(result):
print "download Complete"
reactor.stop()
def handleBasicAuthentication(failure, url, outputfile, RequestHeaders):
    """Errback: on a '401' status, ask for credentials and retry.

    failure        -- the failure being handled; anything other than
                      error.Error is re-raised by ``failure.trap``.
    url, outputfile -- forwarded to downloadWithProgress for the retry.
    RequestHeaders -- request header dict; an ``Authorization`` entry is
                      added to it in place before retrying.

    Returns the deferred of the retried download for a '401' status, or the
    unchanged failure for any other status.
    """
    failure.trap(error.Error)
    if failure.value.status != '401':
        return failure

    username = raw_input("user name:")
    password = getpass.getpass("password: ")
    # b64encode never inserts line breaks.  encodestring wraps its output
    # every 76 characters, which put newlines inside the header value for
    # long credentials.
    basicAuth = base64.b64encode("%s:%s" % (username, password))
    RequestHeaders["Authorization"] = "Basic " + basicAuth
    return downloadWithProgress(url, outputfile, headers=RequestHeaders)
def handleError(failure):
print "Error: ", failure.getErrorMessage()
reactor.stop()
def getRequestHeaders(url, outputfile):
    """Build conditional-request headers from ``metadata.ini``.

    url        -- recorded in a newly created metadata file.
    outputfile -- recorded in a newly created metadata file.

    If ``metadata.ini`` does not exist it is created with a
    ``[download-metadata]`` section holding ``url`` and ``filename``.
    Otherwise it is read, and any non-empty ``etag`` / ``last-modified``
    options of its ``[headers]`` section become ``If-None-Match`` /
    ``If-Modified-Since`` request headers.

    Returns the dict of request headers (empty when nothing is stored).
    """
    # update metadata and generate request headers
    RequestHeaders = {}
    config = ConfigParser.ConfigParser()
    if not os.path.isfile('metadata.ini'):
        section = 'download-metadata'
        config.add_section(section)
        config.set(section, "url", url)
        config.set(section, "filename", outputfile)
        # Close the file deterministically instead of leaking the handle.
        with open('metadata.ini', 'w') as metadata:
            config.write(metadata)
    else:
        config.read('metadata.ini')

    # Stored option name -> conditional request header it feeds.
    conditional = (
        ('etag', 'If-None-Match'),
        ('last-modified', 'If-Modified-Since'),
    )
    for option, header in conditional:
        if config.has_option('headers', option):
            value = config.get('headers', option)
            if value:
                RequestHeaders[header] = value
    return RequestHeaders
if __name__ == '__main__':
    # Script entry point: download URL to OUTPUTFILE, then open the result
    # in the default web browser.
    import sys
    # Module-level functions such as downloadComplete and handleError use
    # this ``reactor`` name as a global.
    from twisted.internet import reactor

    # Exit with a usage message instead of an unpacking traceback when the
    # argument count is wrong.
    if len(sys.argv) != 3:
        sys.exit("usage: %s URL OUTPUTFILE" % sys.argv[0])
    url, outputfile = sys.argv[1:]

    RequestHeaders = getRequestHeaders(url, outputfile)
    downloadPage(url, outputfile, RequestHeaders)
    reactor.run()

    # webbrowser.open expects a URL; opening a bare file name is documented
    # as neither supported nor portable, so pass an absolute file URL.
    webbrowser.open('file://' + os.path.abspath(outputfile))
|
Tags: network
Line 65: currentLength should be currentlength -- all lower case
On OS X, I had to change the last line (to open the browser) to: