Fetches several pages in parallel over a single connection, without threads. It exploits HTTP pipelining by resetting the state of HTTPConnection to trick it into sending the next request before the previous response has been read.
More information about HTTP pipelining can be found on Wikipedia: http://en.wikipedia.org/wiki/HTTP_pipelining
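The core of the hack, in isolation, looks like this (a minimal sketch; it leans on _HTTPConnection__state, the name-mangled private state attribute of Python 2's httplib, so it is tied to that implementation):

from httplib import HTTPConnection, _CS_IDLE

conn = HTTPConnection('en.wikipedia.org')
conn.request('GET', '/wiki/HTTP_pipelining')  # first request goes out
# httplib would normally raise CannotSendRequest here, because the first
# response has not been read yet; forcing the connection state back to
# idle lets a second request be pipelined onto the same socket.
conn._HTTPConnection__state = _CS_IDLE
conn.request('GET', '/wiki/HTTP')  # second request, pipelined

The full recipe: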
from httplib import HTTPConnection, _CS_IDLE
import urlparse

def pipeline(domain, pages, max_out_bound=4, debuglevel=0):
    pages = list(pages)  # entries are rewritten in place on same-host redirects
    pagecount = len(pages)
    conn = HTTPConnection(domain)
    conn.set_debuglevel(debuglevel)
    respobjs = [None] * pagecount
    finished = [False] * pagecount
    data = [None] * pagecount
    headers = {'Host': domain, 'Content-Length': 0, 'Connection': 'Keep-Alive'}
    while not all(finished):
        # Send requests until max_out_bound of them are in flight
        out_bound = sum(r is not None for r in respobjs)
        for i, page in enumerate(pages):
            if out_bound >= max_out_bound:
                break
            elif page and not finished[i] and respobjs[i] is None:
                if debuglevel > 0:
                    print 'Sending request for %r...' % (page,)
                conn._HTTPConnection__state = _CS_IDLE  # private-variable hack!
                conn.request("GET", page, None, headers)
                respobjs[i] = conn.response_class(conn.sock, strict=conn.strict,
                                                  method=conn._method)
                out_bound += 1
        # Try to read a response
        for i, resp in enumerate(respobjs):
            if resp is None:
                continue
            if debuglevel > 0:
                print 'Retrieving %r...' % (pages[i],)
            out_bound -= 1
            skip_read = False
            resp.begin()
            if debuglevel > 0:
                print '    %d %s' % (resp.status, resp.reason)
            if 200 <= resp.status < 300:
                # Ok
                data[i] = resp.read()
                cookie = resp.getheader('Set-Cookie')
                if cookie is not None:
                    headers['Cookie'] = cookie
                skip_read = True
                finished[i] = True
                respobjs[i] = None
            elif 300 <= resp.status < 400:
                # Redirect
                loc = resp.getheader('Location')
                respobjs[i] = None
                parsed = loc and urlparse.urlparse(loc)
                if not parsed:
                    # Missing or empty Location header
                    data[i] = (resp.status, resp.reason)
                    finished[i] = True
                elif parsed.netloc != '' and parsed.netloc != domain:
                    # Redirect to another host; report it instead of following
                    data[i] = (resp.status, resp.reason, loc)
                    finished[i] = True
                else:
                    # Same-host redirect; re-queue the request with the new path
                    path = urlparse.urlunparse(parsed._replace(scheme='', netloc='', fragment=''))
                    if debuglevel > 0:
                        print '    Updated %r to %r' % (pages[i], path)
                    pages[i] = path
            elif resp.status >= 400:
                # Failed
                data[i] = (resp.status, resp.reason)
                finished[i] = True
                respobjs[i] = None
            if resp.will_close:
                # Connection (will be) closed; pipelined requests must be resent
                conn.close()
                if debuglevel > 0:
                    print '    Connection closed'
                for j, f in enumerate(finished):
                    if not f and respobjs[j] is not None:
                        if debuglevel > 0:
                            print '    Discarding out-bound request for %r' % (pages[j],)
                        respobjs[j] = None
                break
            elif not skip_read:
                resp.read()  # drain any remaining body data
            if any(not f and respobjs[j] is None for j, f in enumerate(finished)):
                # Send another pending request
                break
        else:
            break  # All respobjs are None?
    return data

if __name__ == '__main__':
    domain = 'en.wikipedia.org'
    pages = ('/wiki/HTTP_pipelining', '/wiki/HTTP', '/wiki/HTTP_persistent_connection')
    data = pipeline(domain, pages, max_out_bound=2, debuglevel=1)
    for i, page in enumerate(data):
        print
        print '==== Page %r ====' % (pages[i],)
        print page[:512]
I am having trouble figuring out how to convert this into a class that doesn't force the user to read the responses in order. Anyone have any ideas?
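For example, ideally it could be used something like this (purely hypothetical; Pipeline, add, and the handle's read are names I'm inventing to illustrate the goal, not working code):

pipe = Pipeline('en.wikipedia.org', max_out_bound=4)
a = pipe.add('/wiki/HTTP_pipelining')
b = pipe.add('/wiki/HTTP')
# Responses arrive on the wire in request order, but the caller should be
# able to consume them in any order:
print b.read()[:512]
print a.read()[:512]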