Welcome, guest | Sign In | My Account | Store | Cart

A simple recipe that can be used to download a web page using Twisted. I created it while going through the Twisted documentation. It supports the following: basic authentication; checking whether the page has been updated since the last download; and a progress bar.

Python, 147 lines
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#!/usr/bin/env python

from twisted.web import client, error
import os.path
import ConfigParser 
import getpass, base64
import webbrowser

class HTTPProgressDownloader(client.HTTPDownloader):    
    def __init__(self, url, outfile, headers=None):
        client.HTTPDownloader.__init__(self, url, outfile, headers=headers)
        self.status = None

    def noPage(self, reason): # called for non-200 responses
        if self.status == '304':
            print reason.getErrorMessage()
            client.HTTPDownloader.page(self, '')
        else:
            client.HTTPDownloader.noPage(self, reason)

    def gotHeaders(self, headers):
        # page data is on the way
        if self.status == '200':
            
            # initialize for progress bar
            if headers.has_key('content-length'):
                self.totallength = int(headers['content-length'][0])
            else:
                self.totallength = 0
            self.currentlength = 0.0
            print ''

            # update headers metadata 
            oldheaders = {}
            eTag = headers.get('etag','')
            if eTag:
                oldheaders['etag'] = eTag[0]
            modified = headers.get('last-modified','')
            if modified:
                oldheaders['last-modified'] = modified[0]
                
            config = ConfigParser.ConfigParser()
            config.read('metadata.ini')
                
            if config.has_section('headers'):
                config.remove_section('headers')    
                
            config.add_section('headers')
            for key, value in oldheaders.items():
                config.set('headers', key, value)
                
            config.write(open('metadata.ini','w'))
            

        return client.HTTPDownloader.gotHeaders(self, headers)

    def pagePart(self, data):
        if self.status == '200':
            self.currentlength += len(data)
            if self.totallength:
                percent = "%i%%" % (
                    (self.currentlength/self.totallength)*100)
                
            else:
                percent = '%dK' % (self.currentLength/1000)
            print "\033[1FProgress: " + percent
        return client.HTTPDownloader.pagePart(self, data)

def downloadWithProgress(url, outputfile, contextFactory=None, *args, **kwargs):
    """Start downloading url into outputfile with an HTTPProgressDownloader.

    url            -- address to fetch; its scheme selects TCP or SSL.
    outputfile     -- passed to HTTPProgressDownloader as the output file.
    contextFactory -- SSL context factory used for https URLs; a default
                      ssl.ClientContextFactory() is created when omitted.
    *args/**kwargs -- forwarded to HTTPProgressDownloader (e.g. headers=...).

    Returns the downloader's deferred.
    """
    # Imported here so the function also works when this module is imported
    # by other code instead of being run as a script.
    from twisted.internet import reactor

    # NOTE(review): client._parse is a private Twisted helper and may not
    # exist in other Twisted versions.
    scheme, host, port, path = client._parse(url)
    factory = HTTPProgressDownloader(url, outputfile, *args, **kwargs)
    if scheme == 'https':
        # Only needed (and only importable with SSL support) for https.
        from twisted.internet import ssl
        if contextFactory is None:
            contextFactory = ssl.ClientContextFactory()
        reactor.connectSSL(host, port, factory, contextFactory)
    else:
        reactor.connectTCP(host, port, factory)

    return factory.deferred

def downloadPage(url, outputfile, RequestHeaders):
    """Start the download and attach completion, login and error handlers.

    url            -- address to fetch.
    outputfile     -- file the page is saved to.
    RequestHeaders -- dict of request headers; handleBasicAuthentication
                      receives the same dict for its retry.

    Returns the Deferred so callers can chain further work (previously
    nothing was returned).
    """
    deferred = downloadWithProgress(url, outputfile, headers=RequestHeaders)
    # The authentication errback must come before the success callback.
    # When it returns the Deferred of the retried download, that result is
    # handed to the handlers added AFTER it.  With downloadComplete attached
    # first, a successful retry was never followed by downloadComplete and
    # the reactor kept running forever.
    deferred.addErrback(
        handleBasicAuthentication, url, outputfile, RequestHeaders)
    deferred.addCallback(downloadComplete)
    # Anything still failing at this point is reported and ends the run.
    deferred.addErrback(handleError)
    return deferred
        
def downloadComplete(result):
    print "download Complete"
    reactor.stop()

def handleBasicAuthentication(failure, url, outputfile, RequestHeaders):
    """Errback: on a 401, ask for credentials and retry with Basic auth.

    failure        -- the failure from the download; anything that is not
                      an error.Error is re-raised by trap().
    url, outputfile -- passed unchanged to the retried download.
    RequestHeaders -- dict of request headers; the Authorization header is
                      added to this dict in place.

    Returns the Deferred of the retried download for a 401, otherwise the
    original failure so the next errback can handle it.
    """
    failure.trap(error.Error)
    if failure.value.status != '401':
        return failure

    username = raw_input("user name:")
    password = getpass.getpass("password: ")
    # b64encode never inserts line breaks.  encodestring() wraps its output
    # every 76 characters, which put newlines inside the header value for
    # long user name / password combinations.
    credentials = base64.b64encode("%s:%s" % (username, password))
    RequestHeaders["Authorization"] = "Basic " + credentials
    return downloadWithProgress(url, outputfile, headers=RequestHeaders)

def handleError(failure):
    """Final errback: print the failure's message and stop the reactor."""
    # Imported here so the errback also works when this module is imported
    # by other code instead of being run as a script.
    from twisted.internet import reactor

    print "Error: ", failure.getErrorMessage()
    reactor.stop()

def getRequestHeaders(url, outputfile):
    """Return conditional-request headers based on metadata.ini.

    metadata.ini (in the current working directory) records which url was
    saved to which file in its [download-metadata] section, and may hold
    the 'etag' and 'last-modified' values of the last response in its
    [headers] section.

    If the file describes this same url/outputfile pair and outputfile
    still exists, the stored values are returned as 'If-None-Match' and
    'If-Modified-Since'.  Otherwise metadata.ini is rewritten for this
    download and an empty dict is returned, so that validators belonging
    to a different page, or to a copy that has been deleted, are never
    sent.

    url        -- address that is about to be downloaded.
    outputfile -- file the page will be saved to.
    """
    section = 'download-metadata'
    RequestHeaders = {}

    config = ConfigParser.ConfigParser()
    # read() ignores files it cannot open, so a missing metadata.ini just
    # leaves the parser empty.
    config.read('metadata.ini')

    # raw=True: the stored strings are data, not interpolation templates.
    sameDownload = (
        config.has_option(section, 'url')
        and config.has_option(section, 'filename')
        and config.get(section, 'url', raw=True) == url
        and config.get(section, 'filename', raw=True) == outputfile
        and os.path.isfile(outputfile))

    if not sameDownload:
        # Start over: record this download and drop any stale validators.
        config = ConfigParser.ConfigParser()
        config.add_section(section)
        config.set(section, "url", url)
        config.set(section, "filename", outputfile)
        # Close the file explicitly instead of relying on garbage collection.
        with open('metadata.ini', 'w') as metadata:
            config.write(metadata)
        return RequestHeaders

    conditionalHeaders = (
        ('etag', 'If-None-Match'),
        ('last-modified', 'If-Modified-Since'),
    )
    for option, headerName in conditionalHeaders:
        if config.has_option('headers', option):
            value = config.get('headers', option, raw=True)
            if value:
                RequestHeaders[headerName] = value

    return RequestHeaders

if __name__ == '__main__':
    # Script entry point: download <url> to <outputfile>, then show the
    # saved file in the web browser.
    import sys
    import urllib
    import urlparse
    # The functions above use this module-level name.
    from twisted.internet import reactor

    # Give a usage message instead of an unpacking traceback.
    if len(sys.argv) != 3:
        sys.exit("usage: %s <url> <outputfile>" % sys.argv[0])
    url, outputfile = sys.argv[1:]

    RequestHeaders = getRequestHeaders(url, outputfile)
    downloadPage(url, outputfile, RequestHeaders)

    # Returns once downloadComplete() or handleError() stops the reactor.
    reactor.run()

    # webbrowser.open() expects a URL; a bare relative file name is not
    # handled portably, so build an absolute file: URL for the saved page.
    savedPage = urlparse.urljoin(
        'file:', urllib.pathname2url(os.path.abspath(outputfile)))
    webbrowser.open(savedPage)

2 comments

steve steiner 15 years, 7 months ago  # | flag

Line 65: currentLength should be currentlength -- all lower case

steve steiner 15 years, 7 months ago  # | flag

On OS X, I had to change the last line (to open the browser) to:

webbrowser.open("file://" + os.getcwd() + "/" + outputfile)