# websiteTxtSearcher.py
# Searches a website recursively for any given string.
# FB - 201009105
import urllib2
from os.path import basename
import urlparse
from BeautifulSoup import BeautifulSoup # for HTML parsing
# URLs already visited; shared (and appended to) by every recursive
# searchUrl call so the same page is never fetched twice.
# NOTE: a module-level `global urlList` statement previously preceded
# this -- `global` is a no-op at module scope, so it was removed.
urlList = []
# recursively search starting from the root URL
def searchUrl(url, level, searchText): # the root URL is level 0
# do not go to other websites
global website
netloc = urlparse.urlsplit(url).netloc.split('.')
if netloc[-2] + netloc[-1] != website:
return
global urlList
if url in urlList: # prevent using the same URL again
return
try:
urlContent = urllib2.urlopen(url).read()
urlList.append(url)
except:
return
soup = BeautifulSoup(''.join(urlContent))
# remove script tags
c=soup.findAll('script')
for i in c:
i.extract()
# get text content of the URL
try:
body_texts = soup.body(text=True)
except:
return
text = ''.join(body_texts)
# search
if text.find(searchText) > -1:
print url
print
# if there are links on the webpage then recursively repeat
if level > 0:
linkTags = soup.findAll('a')
if len(linkTags) > 0:
for linkTag in linkTags:
try:
linkUrl = linkTag['href']
searchUrl(linkUrl, level - 1, searchText)
except:
pass
# main: crawl one link-level deep from the root URL, printing every
# same-site page whose visible text contains " computer "
rootUrl = 'http://www.yahoo.com'
# record the site identity (last two domain labels, concatenated without
# a dot, e.g. 'yahoocom') that searchUrl uses to stay on this website.
# NOTE: a module-level `global website` statement previously preceded
# the assignment -- `global` is a no-op at module scope, so it was removed.
netloc = urlparse.urlsplit(rootUrl).netloc.split('.')
website = netloc[-2] + netloc[-1]
searchUrl(rootUrl, 1, " computer ")
# --- Revision history (diff to previous revision), preserved as comments
#     so the file remains valid Python ---
# --- revision 1 2010-09-10 23:25:39
# +++ revision 2 2010-09-11 17:32:01
# @@ -33,7 +33,10 @@
#      for i in c:
#          i.extract()
#      # get text content of the URL
# -    body_texts = soup.body(text=True)
# +    try:
# +        body_texts = soup.body(text=True)
# +    except:
# +        return
#      text = ''.join(body_texts)
#      # search