# Welcome, guest | Sign In | My Account | Store | Cart
from HTMLParser import HTMLParser
import urllib
import time
import Queue
import threading
import urlparse

# Page listing the download mirrors to be tested
MIRRORS_URL = 'http://fedora.redhat.com/download/mirrors.html'
MAX_THREADS = 50      # Upper bound on the number of worker threads
HTTP_TIMEOUT = 60.0   # Max. seconds to wait for a response

class UrlFinder(HTMLParser):

    '''Subclass of the HTMLParser object.  Records the HREF attributes
    of anchor tags if the scheme is 'http' and the anchor occurs in
    the 'content' section of the page.

    The 'content' section is delimited by the HTML comments
    'content BEGIN' and 'content END'.  Matching HREFs are collected,
    in document order, in the mirrorLinks list.'''

    def __init__(self):
        # The base class initialises its parser state in its own
        # constructor; feed() cannot work without it.
        HTMLParser.__init__(self)

        # HREFs recorded so far
        self.mirrorLinks = []

        # True if we're currently in the 'content' section
        self.isInMirrors = False

    def handle_comment(self, data):
        '''Track entry to and exit from the 'content' section.'''

        # The comments have spaces before and after, but don't count
        # on that.
        data = data.strip()

        if 'content BEGIN' == data:
            self.isInMirrors = True
        elif 'content END' == data:
            self.isInMirrors = False

    def handle_starttag(self, tag, attrs):
        '''Record the HREF of an http anchor found inside the 'content'
        section.  All other tags are ignored.'''
        if not self.isInMirrors or 'a' != tag:
            return

        attrs = dict(attrs) # Convert from sequence of pairs to dict

        # Anchors without an HREF (e.g. <a name="...">) are skipped
        # rather than raising KeyError.
        href = attrs.get('href')
        if href and 'http' == urlparse.urlparse(href)[0]:
            self.mirrorLinks.append(href)

# Record the start time, so we can print a nice message at the end
processStartTime = time.time()

# Create the parser, get the 'mirrors' page from Redhat,
# and extract the URLs
print "Getting mirrors list...",
parser = UrlFinder()

print len(parser.mirrorLinks), "mirrors found."
numThreads = min(MAX_THREADS, len(parser.mirrorLinks))
print "Testing bandwidth with", numThreads, "threads..."

# Build a queue to feed the worker threads
workQueue = Queue.Queue()
for url in parser.mirrorLinks:

def TestUrl(workQueue, resultQueue):

    ''' Worker thread procedure.  Test how long it takes to return the
    mirror index page, and stuff the results into resultQueue.

    workQueue   -- Queue of mirror URLs still to be tested.
    resultQueue -- Queue that receives one (url, result) tuple per URL
                   taken from workQueue.  result is the elapsed time
                   in milliseconds, or the string "TIMEOUT" or
                   "FAILED".

    Returns once workQueue is empty.'''
    def SubthreadProc(url, result):

        ''' Subthread procedure.  Actually get the mirror index page
        in a subthread, so that we can time out using join rather than
        wait for a very slow server.  Passing in a list for result
        lets us simulate pass-by-reference, since callers cannot get
        the return code from a Python thread.

        Appends the elapsed time in milliseconds to result on success,
        or -1 on failure.'''
        startTime = time.time()
        try:
            urllib.urlopen(url).read()
        except Exception:
            # Could be a socket error or an HTTP error--either way, we
            # don't care--it's a failure to us.
            result.append(-1)
        else:
            elapsed = int((time.time() - startTime) * 1000)
            result.append(elapsed)

    while 1:
        # Continue pulling data from the work queue until it's empty
        try:
            url = workQueue.get(0)   # 0 == don't block
        except Queue.Empty:
            # work queue is empty--exit the thread proc.
            return

        # Create a single subthread to do the actual work
        result = []
        subThread = threading.Thread(target=SubthreadProc, args=(url, result))

        # Daemonize the subthread so that even if a few are hanging
        # around when the process is done, the process will exit.
        subThread.setDaemon(True)

        # Run the subthread and wait for it to finish, or time out
        subThread.start()
        subThread.join(HTTP_TIMEOUT)

        if not result:
            # Subthread hasn't given a result yet.  Consider it timed out.
            resultQueue.put((url, "TIMEOUT"))
        elif -1 == result[0]:
            # Subthread reported an error while fetching the URL.
            resultQueue.put((url, "FAILED"))
        else:
            # Subthread returned a time.  Store it.
            resultQueue.put((url, result[0]))

workers = []
resultQueue = Queue.Queue()

# Create worker threads to load-balance the retrieval
for threadNum in range(0, numThreads):

# Wait for all the workers to finish
for w in workers:

# Separate the successes from failures
timings = []
failures = []
while not resultQueue.empty():
    url, result = resultQueue.get(0)
    if isinstance(result, str):
        failures.append((result, url))
        timings.append((result, url))

# Sort by increasing time or result string

# Print the results
print "\nMirrors (ordered fastest to slowest)"
for result, url in timings:
    print "%7d %s" % (result, url)
for result, url in failures:
    print "%7s %s" % (result, url)

print "\nProcess completed in ", time.time() - processStartTime, " seconds."


# Revision 2 (19 years ago); previous revisions are not available.