
Fetches several pages over a single connection, without threads. It exploits HTTP pipelining by resetting the internal state of the HTTPConnection so it can be tricked into sending the next request before the previous response has been read.

More information about HTTP pipelining can be found on Wikipedia: http://en.wikipedia.org/wiki/HTTP_pipelining
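The essence of the trick, as a minimal sketch (Python 2; assumes the server keeps the connection alive and actually services pipelined requests, which not all servers do):

from httplib import HTTPConnection, _CS_IDLE

conn = HTTPConnection('en.wikipedia.org')
conn.request('GET', '/wiki/HTTP_pipelining')
# Pretend the connection is idle so httplib lets us queue a second
# request before the first response has been read.
conn._HTTPConnection__state = _CS_IDLE
conn.request('GET', '/wiki/HTTP')

# Read the responses back in the order the requests were sent,
# constructing them by hand because getresponse() enforces the
# one-request-at-a-time state machine.
for _ in range(2):
    resp = conn.response_class(conn.sock, method='GET')
    resp.begin()
    print resp.status, len(resp.read())

The full recipe below builds on this to juggle multiple outstanding requests, redirects, and dropped connections.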

Python
from httplib import HTTPConnection, _CS_IDLE
import urlparse

def pipeline(domain,pages,max_out_bound=4,debuglevel=0):
    pagecount = len(pages)
    conn = HTTPConnection(domain)
    conn.set_debuglevel(debuglevel)
    respobjs = [None]*pagecount
    finished = [False]*pagecount
    data = [None]*pagecount
    headers = {'Host':domain,'Content-Length':0,'Connection':'Keep-Alive'}
    out_bound = 0  # requests sent but not yet answered

    while not all(finished):
        # Send as many pending requests as the out-bound limit allows
        for i,page in enumerate(pages):
            if out_bound >= max_out_bound:
                break
            elif page and not finished[i] and respobjs[i] is None:
                if debuglevel > 0:
                    print 'Sending request for %r...' % (page,)
                # Reset the (private) connection state so httplib will
                # accept another request before the response is read
                conn._HTTPConnection__state = _CS_IDLE
                conn.request("GET", page, None, headers)
                respobjs[i] = conn.response_class(conn.sock, strict=conn.strict, method=conn._method)
                out_bound += 1
        # Try to read a response
        for i,resp in enumerate(respobjs):
            if resp is None:
                continue
            if debuglevel > 0:
                print 'Retrieving %r...' % (pages[i],)
            out_bound -= 1
            skip_read = False
            resp.begin()
            if debuglevel > 0:
                print '    %d %s' % (resp.status, resp.reason)
            if 200 <= resp.status < 300:
                # Ok
                data[i] = resp.read()
                cookie = resp.getheader('Set-Cookie')
                if cookie is not None:
                    headers['Cookie'] = cookie
                skip_read = True
                finished[i] = True
                respobjs[i] = None
            elif 300 <= resp.status < 400:
                # Redirect
                loc = resp.getheader('Location')
                respobjs[i] = None
                parsed = loc and urlparse.urlparse(loc)
                if not parsed:
                    # Missing or empty location header
                    data[i] = (resp.status, resp.reason)
                    finished[i] = True
                elif parsed.netloc != '' and parsed.netloc != domain:
                    # Redirect to another host
                    data[i] = (resp.status, resp.reason, loc)
                    finished[i] = True
                else:
                    path = urlparse.urlunparse(parsed._replace(scheme='',netloc='',fragment=''))
                    if debuglevel > 0:
                        print '  Updated %r to %r' % (pages[i],path)
                    pages[i] = path
            elif resp.status >= 400:
                # Failed
                data[i] = (resp.status, resp.reason)
                finished[i] = True
                respobjs[i] = None
            if resp.will_close:
                # Connection (will be) closed, need to resend
                conn.close()
                if debuglevel > 0:
                    print '  Connection closed'
                for j,f in enumerate(finished):
                    if not f and respobjs[j] is not None:
                        if debuglevel > 0:
                            print '  Discarding out-bound request for %r' % (pages[j],)
                        respobjs[j] = None
                out_bound = 0  # everything outstanding was just discarded
                break
            elif not skip_read:
                resp.read() # read any data
            if any(not f and respobjs[j] is None for j,f in enumerate(finished)):
                # Send another pending request
                break
        else:
            break # All respobjs are None?
    return data

if __name__ == '__main__':
    domain = 'en.wikipedia.org'
    pages = ['/wiki/HTTP_pipelining', '/wiki/HTTP', '/wiki/HTTP_persistent_connection']  # a list: pipeline() rewrites entries on redirect
    data = pipeline(domain,pages,max_out_bound=2,debuglevel=1)
    for i,page in enumerate(data):
        print
        print '==== Page %r ====' % (pages[i],)
        print page[:512]
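
Each entry in the returned list is either the page body (a str) or, on failure or a cross-host redirect, a (status, reason) or (status, reason, location) tuple, so a caller might distinguish the two like this (a sketch, reusing pipeline() from above):

results = pipeline(domain, list(pages))  # copy: pipeline() rewrites redirected paths
for page, result in zip(pages, results):
    if isinstance(result, tuple):
        print '%s -> %r' % (page, result)
    else:
        print '%s -> %d bytes' % (page, len(result))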

I am having trouble figuring out how to convert this into a class without requiring the user to read the responses in order. Anyone have any ideas?
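
One possible direction (a hypothetical, untested sketch, not part of the recipe): keep the wire order fixed, but buffer completed results in a dict so callers can collect them in any order:

class Pipeliner(object):
    """Hypothetical wrapper: queue pages, collect results in any order."""
    def __init__(self, domain):
        self.domain = domain
        self._queue = []    # pages requested but not yet fetched
        self._results = {}  # original page -> body or (status, reason) tuple

    def get(self, page):
        self._queue.append(page)

    def result(self, page):
        if self._queue:
            batch = list(self._queue)  # copy: pipeline() rewrites redirected paths
            data = pipeline(self.domain, batch)
            self._results.update(zip(self._queue, data))
            self._queue = []
        return self._results[page]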

Created by Markus J on Fri, 27 Feb 2009 (MIT)
