from httplib import HTTPConnection, _CS_IDLE
import urlparse


def pipeline(domain, pages, max_out_bound=4, debuglevel=0):
    """Fetch several pages from one host over a single pipelined HTTP connection.

    Returns a list with the page body for each successful request, or a tuple
    describing the failure/redirect for the others.
    """
    pages = list(pages)  # copy, so redirect handling can rewrite paths locally
    pagecount = len(pages)
    conn = HTTPConnection(domain)
    conn.set_debuglevel(debuglevel)
    respobjs = [None] * pagecount
    finished = [False] * pagecount
    data = [None] * pagecount
    headers = {'Host': domain, 'Content-Length': 0, 'Connection': 'Keep-Alive'}

    while not all(finished):
        # Send
        out_bound = 0
        for i, page in enumerate(pages):
            if out_bound >= max_out_bound:
                break
            elif page and not finished[i] and respobjs[i] is None:
                if debuglevel > 0:
                    print 'Sending request for %r...' % (page,)
                conn._HTTPConnection__state = _CS_IDLE  # FU private variable!
                conn.request("GET", page, None, headers)
                respobjs[i] = conn.response_class(conn.sock, strict=conn.strict,
                                                  method=conn._method)
                out_bound += 1
        # Try to read a response
        for i, resp in enumerate(respobjs):
            if resp is None:
                continue
            if debuglevel > 0:
                print 'Retrieving %r...' % (pages[i],)
            out_bound -= 1
            skip_read = False
            resp.begin()
            if debuglevel > 0:
                print '    %d %s' % (resp.status, resp.reason)
            if 200 <= resp.status < 300:
                # Ok
                data[i] = resp.read()
                cookie = resp.getheader('Set-Cookie')
                if cookie is not None:
                    headers['Cookie'] = cookie
                skip_read = True
                finished[i] = True
                respobjs[i] = None
            elif 300 <= resp.status < 400:
                # Redirect
                loc = resp.getheader('Location')
                respobjs[i] = None
                parsed = loc and urlparse.urlparse(loc)
                if not parsed:
                    # Missing or empty location header
                    data[i] = (resp.status, resp.reason)
                    finished[i] = True
                elif parsed.netloc != '' and parsed.netloc != domain:
                    # Redirect to another host
                    data[i] = (resp.status, resp.reason, loc)
                    finished[i] = True
                else:
                    path = urlparse.urlunparse(parsed._replace(scheme='', netloc='', fragment=''))
                    if debuglevel > 0:
                        print '    Updated %r to %r' % (pages[i], path)
                    pages[i] = path
            elif resp.status >= 400:
                # Failed
                data[i] = (resp.status, resp.reason)
                finished[i] = True
                respobjs[i] = None
            if resp.will_close:
                # Connection (will be) closed, need to resend
                conn.close()
                if debuglevel > 0:
                    print '    Connection closed'
                for j, f in enumerate(finished):
                    if not f and respobjs[j] is not None:
                        if debuglevel > 0:
                            print '    Discarding out-bound request for %r' % (pages[j],)
                        respobjs[j] = None
                break
            elif not skip_read:
                resp.read()  # read any data
            if any(not f and respobjs[j] is None for j, f in enumerate(finished)):
                # Send another pending request
                break
        else:
            break  # All respobjs are None?
    return data


if __name__ == '__main__':
    domain = 'en.wikipedia.org'
    pages = ('/wiki/HTTP_pipelining', '/wiki/HTTP', '/wiki/HTTP_persistent_connection')
    data = pipeline(domain, pages, max_out_bound=2, debuglevel=1)
    for i, page in enumerate(data):
        print
        print '==== Page %r ====' % (pages[i],)
        print page[:512]