class Tag:
    """A node in the tag tree; 0 is used as the 'no link' sentinel."""
    name = ''
    text = ''
    first_child = 0
    parent = 0
    next_sibling = 0
    closed = 0
    depth = 0

    def get_tag_info_str(self):
        c, p, s = 'none', 'none', 'none'
        if self.first_child != 0:
            c = self.first_child.name
        if self.parent != 0:
            p = self.parent.name
        if self.next_sibling != 0:
            s = self.next_sibling.name
        return ("name = {}, text = {}\n"
                "Parent = {}, First Child = {}, Next Sibling = {}\n"
                "Closed = {}, Depth = {}\n").format(
                    self.name, self.text, p, c, s, self.closed, self.depth)
from HTMLParser import HTMLParser
from htmlentitydefs import name2codepoint
class MyHTMLParser(HTMLParser):
    depth = 0
    previous_tag = 'none'
    mode = 'silent'

    def __init__(self):
        HTMLParser.__init__(self)
        # per-instance list; a class-level list would be shared
        # by every parser instance
        self.tag_list = []
    def handle_starttag(self, tag, attrs):
        if self.mode != 'silent':
            print "Start tag:", tag
            for attr in attrs:
                print "     attr:", attr
        self.depth = self.depth + 1
        t = Tag()
        t.name = tag
        t.depth = self.depth
        if self.previous_tag == 'start':
            # current tag is a first child of the last tag
            t.parent = self.tag_list[-1]
            self.tag_list[-1].first_child = t
        elif self.previous_tag == 'end':
            # current tag is the next sibling of the last tag at this depth
            for x in reversed(self.tag_list):
                if x.depth == self.depth:
                    x.next_sibling = t
                    if t.parent == 0:
                        t.parent = x.parent
                    break
        elif self.previous_tag == 'startend':
            # current tag is the next sibling of the previous tag
            t.parent = self.tag_list[-1].parent
            self.tag_list[-1].next_sibling = t
        self.tag_list.append(t)
        self.previous_tag = 'start'
    def handle_endtag(self, tag):
        if self.mode != 'silent':
            print "End tag  :", tag
        # close the most recent still-open tag with a matching name
        for x in reversed(self.tag_list):
            if x.name == tag and x.closed == 0:
                x.closed = 1
                break
        self.depth = self.depth - 1
        self.previous_tag = 'end'
    def handle_startendtag(self, tag, attrs):
        if self.mode != 'silent':
            print "Start/End tag :", tag
            for attr in attrs:
                print "     attr:", attr
        t = Tag()
        self.depth = self.depth + 1
        t.name = tag
        t.depth = self.depth
        t.closed = 1    # self-closing tags are closed on creation
        if self.previous_tag == 'start':
            # current tag is the first child of the last tag
            t.parent = self.tag_list[-1]
            self.tag_list[-1].first_child = t
        elif self.previous_tag == 'startend':
            # current tag is the next sibling of the last tag
            t.parent = self.tag_list[-1].parent
            self.tag_list[-1].next_sibling = t
        elif self.previous_tag == 'end':
            # current tag is the next sibling of a previous tag at this depth
            for x in reversed(self.tag_list):
                if x.depth == self.depth:
                    x.next_sibling = t
                    if t.parent == 0:
                        t.parent = x.parent
                    break
        self.tag_list.append(t)
        self.depth = self.depth - 1
        self.previous_tag = 'startend'
    def handle_data(self, data):
        if self.mode != 'silent':
            print "Data     :", data
        self.depth = self.depth + 1
        # attach the text to the last tag whose depth is current depth - 1
        for x in reversed(self.tag_list):
            if x.depth == self.depth - 1:
                x.text = (x.text + ' ' + data.strip(' \n\t')).strip(' \n\t')
                break
        self.depth = self.depth - 1
    def handle_comment(self, data):
        if self.mode != 'silent':
            print "Comment  :", data

    def handle_entityref(self, name):
        if self.mode != 'silent':
            c = unichr(name2codepoint[name])
            print "Named ent:", c

    def handle_charref(self, name):
        if self.mode != 'silent':
            if name.startswith('x'):
                c = unichr(int(name[1:], 16))
            else:
                c = unichr(int(name))
            print "Num ent  :", c

    def handle_decl(self, data):
        if self.mode != 'silent':
            print "Decl     :", data
    def print_tag_list(self):
        for t in self.tag_list:
            print t.get_tag_info_str()

    def clear_tag_list(self):
        del self.tag_list[:]
    def pretty_print_tags(self):
        # print an indented outline of the tag tree
        for t in self.tag_list:
            print self.get_indent_str(t.depth - 1) + self.get_tag_str(t.name)

    def get_indent_str(self, n):
        return ' ' * n

    def get_tag_str(self, name):
        return '<{}>'.format(name)
    def find_first_tag(self, name):
        # returns the first Tag with the given name, or 0 if not found
        for t in self.tag_list:
            if t.name == name:
                return t
        return 0

    def print_first_tag_info(self, name):
        t = self.find_first_tag(name)
        if t == 0:
            print "Tag: {} not found".format(name)
        else:
            print t.get_tag_info_str()
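# A minimal usage sketch (my addition, assuming only the class above): feed
# a small document through the parser, then inspect the resulting tag tree.
def _demo_parser():
    p = MyHTMLParser()
    p.feed('<html><body><h1>Hello <b>world</b></h1></body></html>')
    p.pretty_print_tags()          # indented <tag> outline
    p.print_first_tag_info('h1')   # links, text and depth of the first <h1>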
import urllib
import socket
socket.setdefaulttimeout(10)   # give up on unresponsive servers after 10 seconds
import httplib
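# A hedged example (my assumption, not from the original source) of the kind
# of pre-compiled regex WebCrawler below expects as its URL filter; its
# check_filters() method simply calls .search() on it. The '.cnn.' pattern
# is hinted by the commented-out Filter line in WebCrawler.__init__.
import re

def _demo_url_filter():
    url_filter = re.compile(r'\.cnn\.')
    print url_filter.search('http://www.cnn.com/world') is not None   # True
    print url_filter.search('http://example.com/') is not None        # False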
class WebCrawler:
    """A simple web crawler"""
    initial_depth = 0
    #filter_list = []

    class PageInfo:
        """ i store info about a webpage here """
        has_been_scraped = 0

        def __init__(self):
            # per-instance dict; a class-level dict would be shared
            # by every PageInfo instance
            self.word_dict = {}

    def __init__(self, re_compiled_obj):
        #self.filter_list.append(self.Filter(1,'.cnn.'))
        self.link_dict = {}
        self.parser = MyHTMLParser()
        self.re_compiled_obj = re_compiled_obj
    def get_page(self, url):
        """ loads a webpage into a string """
        page = ''
        try:
            f = urllib.urlopen(url=url)
            page = f.read()
            f.close()
        except IOError:
            print "Error opening {}".format(url)
        except httplib.InvalidURL, e:
            print "{} caused an Invalid URL error.".format(url)
            if hasattr(e, 'reason'):
                print 'We failed to reach a server.'
                print 'Reason: ', e.reason
            elif hasattr(e, 'code'):
                print 'The server couldn\'t fulfill the request.'
                print 'Error code: ', e.code
        return page
    def check_filters(self, url):
        """ returns a match object if the url matches the
            compiled filter regex, None otherwise """
        match = self.re_compiled_obj.search(url)
        #print "match = {}".format(match)
        return match
    def find_h1_tag(self, s, pos):
        """ finds the first <h1> tag after pos """
        start = s.find('<h1', pos)
        end = s.find('</h1>', start)
        return start, end
    def save_tag_text(self, tag, d):
        """ stores a count of each word in the tag in a dictionary """
        if tag != 0:
            token_list = tag.text.split(' ')
            for token in token_list:
                #print 'token = {}'.format(token)
                if token in d:
                    d[token] = d[token] + 1
                else:
                    d[token] = 1
        return d
    def save_page_text(self, page_str):
        """ Save all important text on the page """
        offset = 0
        d = {}
        while offset != -1:
            start, end = self.find_h1_tag(page_str, offset)
            offset = end
            if start != -1 and end != -1:
                # include the closing '</h1>' (5 characters)
                h1_tag = page_str[start:end+5]
                #print h1_tag
                self.parser.clear_tag_list()
                # turn the fragment into a linked list of tags;
                # only feed this part of the page into the parser
                self.parser.feed(h1_tag)
                #self.parser.pretty_print_tags()
                tag = self.parser.find_first_tag('h1')
                # add words from the tag into the dictionary
                d = self.save_tag_text(tag, d)
        return d
    def save_all_links_on_page(self, page_str, limit=60):
        """ Stores all links found on the current page in a dictionary """
        d = {}
        offset = 0
        i = 0
        num_pages_filtered = 0
        num_duplicate_pages = 0
        while offset != -1:
            if i == limit:
                break
            # assumption: scan for the next anchor tag
            offset = page_str.find('<a href=', offset)