Welcome, guest | Sign In | My Account | Store | Cart
# websiteTxtSearcher.py
# Searches a website recursively for any given string.
# FB - 201009105
import urllib2
from os.path import basename
import urlparse
from BeautifulSoup import BeautifulSoup # for HTML parsing

# URLs that have already been fetched; searchUrl appends each fetched URL
# here and skips any URL that is already present.
urlList = []

# recursively search starting from the root URL
def searchUrl(url, level, searchText): # the root URL is level 0
    """Print url if its body text contains searchText, then follow its links.

    url        -- URL of the page to fetch
    level      -- remaining link depth; 0 means search this page only
    searchText -- substring looked up (case-sensitively) in the text of the
                  page's <body>, with <script> elements removed

    Only URLs whose last two host labels, concatenated, equal the
    module-level `website` are fetched. Every fetched URL is appended to the
    module-level `urlList` so that it is fetched at most once.
    Returns None.
    """
    # do not go to other websites
    netloc = urlparse.urlsplit(url).netloc.split('.')
    # URLs without a host (mailto:, javascript:) or with a single-label host
    # cannot match the two-label website key, so reject them up front
    # instead of failing on netloc[-2]
    if len(netloc) < 2 or netloc[-2] + netloc[-1] != website:
        return

    if url in urlList: # prevent using the same URL again
        return

    try:
        response = urllib2.urlopen(url)
        try:
            urlContent = response.read()
        finally:
            response.close()
    except Exception:
        # unreachable or unreadable page: skip it, but let KeyboardInterrupt
        # and SystemExit propagate so the crawl can be stopped
        return
    urlList.append(url)

    soup = BeautifulSoup(urlContent)
    # remove script tags
    for scriptTag in soup.findAll('script'):
        scriptTag.extract()

    # get text content of the URL
    body = soup.body
    if body is None: # document without a <body>: nothing to search
        return
    text = ''.join(body(text=True))

    # search
    if searchText in text:
        # single-argument parenthesised form behaves exactly like the
        # Python 2 print statement
        print(url)
        print('')

    # if there are links on the webpage then recursively repeat
    if level > 0:
        for linkTag in soup.findAll('a'):
            href = linkTag.get('href')
            if not href: # anchor without a target
                continue
            # resolve relative links against the page they were found on
            linkUrl = urlparse.urljoin(url, href)
            try:
                searchUrl(linkUrl, level - 1, searchText)
            except Exception:
                # best effort: one bad page must not abort the whole crawl
                pass

# main
# The crawl starts here. `website` is the root host's last two labels joined
# without a separator; searchUrl compares every candidate URL against it.
rootUrl = 'http://www.yahoo.com'
netloc = urlparse.urlsplit(rootUrl).netloc.split('.')
website = ''.join(netloc[-2:])
searchUrl(rootUrl, 1, " computer ")

Diff to Previous Revision

--- revision 1 2010-09-10 23:25:39
+++ revision 2 2010-09-11 17:32:01
@@ -33,7 +33,10 @@
     for i in c:
         i.extract() 
     # get text content of the URL
-    body_texts = soup.body(text=True)
+    try:
+        body_texts = soup.body(text=True)
+    except:
+        return
     text = ''.join(body_texts) 
 
     # search

History