from HTMLParser import HTMLParser
import urllib
import time
import Queue
import threading

MIRRORS_URL = 'http://fedora.redhat.com/download/mirrors.html'
MAX_THREADS = 50
HTTP_TIMEOUT = 60.0   # Max. seconds to wait for a response

class UrlFinder(HTMLParser):

    '''Subclass of HTMLParser.  Records the HREF attribute of each
    anchor tag whose scheme is 'http' and which occurs in the
    'content' section of the page.'''
    
    def __init__(self):
        HTMLParser.__init__(self)
        self.mirrorLinks = []  

        # True if we're currently in the 'content' section
        self.isInMirrors = False
        
    def handle_comment(self, data):

        # The comments have spaces before and after, but don't count
        # on that.
        data = data.strip()

        if 'content BEGIN' == data:
            self.isInMirrors = True
        elif 'content END' == data:
            self.isInMirrors = False

    def handle_starttag(self, tag, attrs):
        if self.isInMirrors:
            attrs = dict(attrs) # Convert the list of (name, value) tuples to a dict
            # Use .get() so anchors without an href don't raise a KeyError
            if 'a' == tag and 'http' == urllib.splittype(attrs.get('href', ''))[0]:
                self.mirrorLinks.append(attrs['href'])
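
# The parser above assumes the mirrors page brackets its link section with
# HTML comments.  Roughly (an illustrative sketch, not the actual Red Hat
# markup), the relevant part of the page looks like:
#
#   <!-- content BEGIN -->
#     <a href="http://some.mirror.example/fedora/">Some mirror</a>
#     ...
#   <!-- content END -->
#
# Only http links found between those two comments end up in mirrorLinks.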

# Record the start time, so we can print a nice message at the end
processStartTime = time.time()

# Create the parser, fetch the 'mirrors' page from Red Hat,
# and extract the URLs
print "Getting mirrors list...",
parser = UrlFinder()
parser.feed(urllib.urlopen(MIRRORS_URL).read())


print len(parser.mirrorLinks), "mirrors found."
numThreads = min(MAX_THREADS, len(parser.mirrorLinks))
print "Testing bandwidth with", numThreads, "threads..."

# Build a queue to feed the worker threads
workQueue = Queue.Queue()
for url in parser.mirrorLinks:
    workQueue.put(url)
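
# Queue.Queue handles its own locking, so multiple worker threads can pull
# URLs from workQueue (and push results into resultQueue below) without any
# extra synchronization.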

def TestUrl(workQueue, resultQueue):

    ''' Worker thread procedure.  Test how long it takes to return the
    mirror index page, and stuff the results into resultQueue.'''
    
    def SubthreadProc(url, result):

        ''' Subthread procedure.  Actually get the mirror index page
        in a subthread, so that we can time out using join rather than
        wait for a very slow server.  Passing in a list for result
        lets us simulate pass-by-reference, since callers cannot get
        the return code from a Python thread.'''
        
        startTime = time.time()
        try:
            data = urllib.urlopen(url).read()
        except Exception:
            # Could be a socket error or an HTTP error--either way, we
            # don't care--it's a failure to us.
            result.append(-1)
        else:
            elapsed = int((time.time() - startTime) * 1000)
            result.append(elapsed)

            
    while 1:
        # Continue pulling URLs from the work queue until it's empty
        try:
            url = workQueue.get_nowait()
        except Queue.Empty:
            # work queue is empty--exit the thread proc.
            return

        # Create a single subthread to do the actual work
        result = []
        subThread = threading.Thread(target=SubthreadProc, args=(url, result))

        # Daemonize the subthread so that even if a few are hanging
        # around when the process is done, the process will exit.
        subThread.setDaemon(True)

        # Run the subthread and wait for it to finish, or time out
        subThread.start()
        subThread.join(HTTP_TIMEOUT)
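
        # urllib.urlopen in Python 2 has no per-request timeout, which is
        # why the fetch runs in a daemonized subthread bounded by join().
        # A simpler sketch (assuming a process-wide default timeout is
        # acceptable) would set a global socket timeout and skip the
        # subthread entirely:
        #
        #   import socket
        #   socket.setdefaulttimeout(HTTP_TIMEOUT)
        #   data = urllib.urlopen(url).read()  # raises socket.timeout when slow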

        if [] == result:
            # Subthread hasn't given a result yet.  Consider it timed out.
            resultQueue.put((url, "TIMEOUT"))
        elif -1 == result[0]:
            # Subthread returned an error from urlopen.
            resultQueue.put((url, "FAILED"))
        else:
            # Subthread returned a time.  Store it.
            resultQueue.put((url, result[0]))
        

workers = []
resultQueue = Queue.Queue()

# Create worker threads to load-balance the retrieval
for threadNum in range(0, numThreads):
    workers.append(threading.Thread(target=TestUrl,
                                    args=(workQueue,resultQueue)))
    workers[-1].start()

# Wait for all the workers to finish
for w in workers:
    w.join()

# Separate the successes from failures
timings = []
failures = []
while not resultQueue.empty():
    url, result = resultQueue.get_nowait()
    if isinstance(result, str):
        failures.append((result, url))
    else:
        timings.append((result, url))

# Sort by increasing time or result string
timings.sort()
failures.sort()

# Print the results
print "\nMirrors (ordered fastest to slowest)"
for result, url in timings:
    print "%7d %s" % (result, url)
for result, url in failures:
    print "%7s %s" % (result, url)

print "\nProcess completed in ", time.time() - processStartTime, " seconds."
