Recipe 578060 revision 1 « ActiveState Code

class Tag:
    name = '';
    text = '';
    first_child = 0;
    parent = 0;
    next_sibling = 0;
    closed = 0;
    depth = 0;
    def get_tag_info_str(self):
        c,p,s = 'none','none','none'
        if self.first_child != 0:
            c = self.first_child.name
        if self.parent != 0:
            p = self.parent.name
        if self.next_sibling != 0:
            s = self.next_sibling.name
        return "name = {}, text = {}\nParent = {}, First Child = {}, Next Sibling = {}\nClosed = {}, Depth = {}\n".format(self.name, self.text, p, c, s, self.closed, self.depth)
      
      
from HTMLParser import HTMLParser
from htmlentitydefs import name2codepoint

class MyHTMLParser(HTMLParser):
    tag_list = []
    depth = 0;
    previous_tag = 'none';
    mode = 'silent';
  
  
    def handle_starttag(self, tag, attrs):
        if self.mode != 'silent':
            print "Start tag:", tag
            for attr in attrs:
                print "     attr:", attr
        self.depth = self.depth + 1
        t = Tag()
        t.name = tag
        t.depth = self.depth
        if self.previous_tag == 'start':
            # current tag is a first child of the last tag
            t.parent = self.tag_list[len(self.tag_list)-1]
            self.tag_list[len(self.tag_list)-1].first_child = t
        elif self.previous_tag == 'end':
            # current tag is next sibling of the last tag
          
            for x in reversed(self.tag_list):
                if x.depth == self.depth:
                    x.next_sibling = t          
                    if t.parent == 0:
                        t.parent = x.parent
                    break
        elif self.previous_tag == 'startend':
            # current tag is the next sibling of the previous tag
            t.parent = self.tag_list[len(self.tag_list)-1].parent
            self.tag_list[len(self.tag_list)-1].next_sibling = t
          
      
        self.tag_list.append(t)  
        self.previous_tag = 'start'
    def handle_endtag(self, tag):
        if self.mode != 'silent':
            print "End tag  :", tag
        for x in reversed(self.tag_list):
            if x.name == tag and x.closed == 0:
                x.closed = 1
                break
        self.depth = self.depth - 1
        self.previous_tag = 'end'
      
    def handle_startendtag(self, tag, attrs):
        if self.mode != 'silent':
            print "Start/End tag  :", tag
            for attr in attrs:
                print "     attr:", attr
        t = Tag()
        self.depth = self.depth + 1
        t.name = tag
        t.depth = self.depth
        t.closed = 1
          
        if self.previous_tag == 'start':
            # current tag is first child of the last tag
            t.parent = self.tag_list[len(self.tag_list)-1]
            self.tag_list[len(self.tag_list)-1].first_child = t
        elif self.previous_tag == 'startend':          
            # current tag is next sibling of last tag
            t.parent = self.tag_list[len(self.tag_list)-1].parent
            self.tag_list[len(self.tag_list)-1].next_sibling = t
        elif self.previous_tag == 'end':          
            # current tag is next sibling of a previous tag of depth = self.depth
            for x in reversed(self.tag_list):
                if x.depth == self.depth:
                    x.next_sibling = t          
                    if t.parent == 0:
                        t.parent = x.parent
                    break
          
        self.tag_list.append(t)  
        self.depth = self.depth - 1
        self.previous_tag = 'startend'
      
      
    def handle_data(self, data):
        if self.mode != 'silent':
            print "Data     :", data
      
        self.depth = self.depth + 1
      
        # add data to last tag in list with depth = current depth - 1
        for x in reversed(self.tag_list):
            if x.depth == self.depth - 1:
                x.text = (x.text + ' ' + data.strip(' \n\t')).strip(' \n\t')
                break
              
        self.depth = self.depth - 1
      
    def handle_comment(self, data):
        if self.mode != 'silent':
            print "Comment  :", data
    def handle_entityref(self, name):
        if self.mode != 'silent':
            c = unichr(name2codepoint[name])
            print "Named ent:", c
    def handle_charref(self, name):
        if self.mode != 'silent':
            if name.startswith('x'):
                c = unichr(int(name[1:], 16))
            else:
                c = unichr(int(name))
            print "Num ent  :", c
    def handle_decl(self, data):
        if self.mode != 'silent':
            print "Decl     :", data
      
    def print_tag_list(self, u):
        for l in self.tag_list:
            print l.get_tag_info_str()
          
    def clear_tag_list(self):
        self.tag_list.__delslice__(0,len(self.tag_list))
    
    def pretty_print_tags(self):
        for t in self.tag_list:
            s = ''
            s = s + self.get_indent_str(t.depth-1)
            s = s + self.get_tag_str(t.name)
            print s

    def get_indent_str(self, n):
        s = ''
        while(n != 0):
            s = s + '    '
            n = n - 1          
        return s
          
    def get_tag_str(self, name):
        return '<{}>'.format(name)
      
    def find_first_tag(self, name):
        r = 0
        for t in self.tag_list:
            if t.name == name:
                r = t
                break
        return r
      
    def print_first_tag_info(self, name):
        t = self.find_first_tag(name)
        if t == 0:
            print "Tag: {} not found".format(name)
        else:
            print t.get_tag_info_str()











import urllib
import socket
socket.setdefaulttimeout(10)
import httplib

class WebCrawler:
    """A simple web crawler"""
      
    link_dict = {};
    initial_depth = 0;
    #filter_list = [];
    parser = 0;
    re_compiled_obj = 0;
    
    class PageInfo:
        """ i store info about a webpage here """
        has_been_scraped = 0;
        word_dict = {};
                 
          
    def __init__(self,re_compiled_obj):      
        #self.filter_list.append(self.Filter(1,'.cnn.'))
        self.parser = MyHTMLParser()
        self.re_compiled_obj = re_compiled_obj
          
    def get_page(self,url):
        """ loads a webpage into a string """
        page = ''
        try:
            f = urllib.urlopen(url=url)
            page = f.read()
            f.close()
        except IOError:
            print "Error opening {}".format(url)
        except httplib.InvalidURL, e:
            print "{} caused an Invalid URL error.".format(url)
            if hasattr(e, 'reason'):
                print 'We failed to reach a server.'
                print 'Reason: ', e.reason
            elif hasattr(e, 'code'):
                print 'The server couldn\'t fulfill the request.'
                print 'Error code: ', e.code        
                
        return page
  
    def check_filters(self,url):
        """ If the url_str matches any of the
        enabled filter strings
        then put the url in the dictionary """
        
        
        match = self.re_compiled_obj.search(url)
        #print "match = {}".format(match)
        return match
  
      
    def find_h1_tag(self,s,pos):
        """ finds the first <h1> tag """
        start = s.find('<h1>',pos)
        end = s.find('</h1>',start)
        return start, end

    def save_tag_text(self, tag, d):
        """ stores each word in the tag in a dictionary """
        if tag != 0:
            token_list = tag.text.split(' ')
            for token in token_list:
                #print 'token = {}'.format(token)
                if d.has_key(token):
                    d[token] = d[token] + 1
                else:
                    d[token] = 1
        return d
      
    def save_page_text(self,page_str):
        """ Save all important text on the page """
        offset = 0
        d = {}
      
        while offset != -1:
            start,end = self.find_h1_tag(page_str,offset)
            offset = end
      
            if start != -1 and end != -1:
                h1_tag = page_str[start:end+5]
                #print h1_tag
                self.parser.clear_tag_list()
                # turn text into linked list of tags
                # only feed part of the page into the parser
                self.parser.feed(h1_tag)                                 
                #self.parser.pretty_print_tags()
                tag = self.parser.find_first_tag('h1')
                # add words from tag into the dictionary
                d = self.save_tag_text(tag,d)
        return d
      
    def save_all_links_on_page(self,page_str,limit=60):
        """ Stores all links found on the current page in a dictionary """
        d = {}
        offset = 0
        i = 0
        num_pages_filtered = 0
        num_duplicate_pages = 0
        while offset != -1:
            if i == limit:
                break
            offset = page_str.find('<a href="http',offset)
            if offset != -1:
                start = page_str.find('"', offset)
                end = page_str.find('"',start+1)
                link = page_str[start+1:end]
                # don't just save all the links
                # filter the links that match specified criteria
                if self.check_filters(link):
                    if link not in self.link_dict:
                        # adding link to global dictionary
                        self.link_dict[link] = self.PageInfo()
                        # adding link to local dictionary
                        d[link] = self.PageInfo()
                    else:
                        num_duplicate_pages = num_duplicate_pages + 1
                else:
                    num_pages_filtered = num_pages_filtered + 1
                offset = offset + 1
            i = i + 1
        print "{} out of {} links were filtered".format(num_pages_filtered,i)
        print "{} out of {} links were duplicates".format(num_duplicate_pages,i)
        #print "{} links are being returned from save_all_links".format(len(d))
        return d
  
  
  
  
    def save_all_links_recursive(self,links,depth):
        """ Recursive function that
            1) converts each page (link) into a string
            2) stores all links found in a dictionary """
        d = {}
      
        print "We are {} levels deep".format(self.initial_depth - depth)
      
        if depth != 0:
            depth = depth - 1
            urls = links.viewkeys()
            #print "There are {} urls".format(len(urls))
            for url in urls:
                print "trying to get {} over the internet".format(url)
                page_str = self.get_page(url)
                print "done getting {} over the internet".format(url)
                self.link_dict[url].word_dict = self.save_page_text(page_str)
                d = self.save_all_links_on_page(page_str)
                self.link_dict[url].has_been_scraped = 1
                # d contains all the links found on the current page
                self.save_all_links_recursive(d,depth)

    def start_crawling(self,seed_pages,depth):
        """ User calls this function to start crawling the web """
        d = {}
        self.link_dict.clear()
       
        # initialize global dictionary variable to the seed page url's passed in
        for page in seed_pages:           
            self.link_dict[page] = self.PageInfo()
            d[page] = self.PageInfo()
        self.initial_depth = depth
        # start a recursive crawl
        # can't pass in self.link_dict because then i get a RuntimeError: dictionary changed size during iteration
        self.save_all_links_recursive(d,depth)
      
    def print_all_page_text(self):
        """ prints contents of all the word dictionaries """
        for i in range(len(self.link_dict)):
            page_info = self.link_dict.values()[i]
            url = self.link_dict.keys()[i]
            print 'url = {}, has_been_scraped = {}'.format(url,page_info.has_been_scraped)
            d = page_info.word_dict
            for j in range(len(d)):
                word = d.keys()[j]
                count = d.values()[j]
                print '{} was found {} times'.format(word,count)
      
import re
                
cnn_url_regex = re.compile('(?<=[.]cnn)[.]com')                


# (?<=[.]cnn)[.]com regular expression does the following:
# 1) match '.com' exactly
# 2) then looking backwards from where '.com' was found it attempts to find '.cnn'


                
w = WebCrawler(cnn_url_regex)
w.start_crawling(['http://www.cnn.com/2012/02/24/world/americas/haiti-pm-resigns/index.html?hpt=hp_t3'],1)
Recipe 578060 revision 1

History

Accounts

Code Recipes

Feedback & Information

ActiveState