A simple class that starts at a URL and follows links to a desired depth, counting how many times each link turns up along the way.
from html.parser import HTMLParser
from urllib.parse import urljoin
from urllib.request import urlopen


class Spider(HTMLParser):
    def __init__(self, starting_url, depth, max_span):
        super().__init__()
        self.url = starting_url
        self.db = {self.url: 1}   # link -> number of times it has been seen
        self.node = [self.url]    # frontier: links to visit at the next depth
        self.depth = depth        # maximum crawl depth
        self.max_span = max_span  # max new links collected per page
        self.links_found = 0

    def handle_starttag(self, tag, attrs):
        # Collect at most max_span new links per page from <a href="..."> tags.
        if self.links_found < self.max_span and tag == 'a':
            link = dict(attrs).get('href')
            if not link:
                return
            # Resolve relative links against the page currently being parsed.
            link = urljoin(self.url, link)
            if link not in self.db:
                print("new link ---> %s" % link)
                self.links_found += 1
                self.node.append(link)
            self.db[link] = self.db.get(link, 0) + 1

    def crawl(self):
        # Breadth-first: each pass visits every link found at the previous depth.
        for depth in range(self.depth):
            print("*" * 70 + ("\nScanning depth %d web\n" % (depth + 1)) + "*" * 70)
            context_node = self.node[:]
            self.node = []
            for self.url in context_node:  # rebinds self.url for handle_starttag
                self.links_found = 0
                try:
                    res = urlopen(self.url).read().decode('utf-8', errors='replace')
                    self.feed(res)
                except Exception:
                    # Unreachable or unparsable page: clear parser state, move on.
                    self.reset()
        print("*" * 40 + "\nRESULTS\n" + "*" * 40)
        # Sort by count so the most frequently seen links come first.
        zorted = [(v, k) for (k, v) in self.db.items()]
        zorted.sort(reverse=True)
        return zorted


if __name__ == "__main__":
    spidey = Spider(starting_url='http://www.7cerebros.com.ar', depth=5, max_span=10)
    result = spidey.crawl()
    for (n, link) in result:
        print("%s was found %d time%s." % (link, n, "s" if n != 1 else ""))
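The __main__ block above drives a fairly deep crawl. As a quicker smoke test, here is a minimal sketch using the same API with a shallow crawl; http://example.com is only a placeholder, and the assumption is that the target host is reachable and serves plain HTML:

# Hypothetical shallow crawl against a placeholder URL (an assumption,
# not part of the original snippet).
spider = Spider(starting_url='http://example.com', depth=2, max_span=5)
for count, link in spider.crawl():
    print('%s: seen %d time(s)' % (link, count))

One design point worth noting: crawl() rebinds self.url while walking each depth's frontier, so handle_starttag always resolves relative links against the page on which they were found.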