Searches a website recursively for the given text string and prints all URLs containing it.
# websiteTxtSearcher.py
# Searches a website recursively for any given string.
# FB - 201009105
import urllib2
import urlparse
from BeautifulSoup import BeautifulSoup # for HTML parsing

urlList = [] # URLs visited so far

# recursively search starting from the root URL
def searchUrl(url, level, searchText): # the root URL is level 0
    # do not go to other websites
    netloc = urlparse.urlsplit(url).netloc.split('.')
    if len(netloc) < 2 or netloc[-2] + netloc[-1] != website:
        return
    if url in urlList: # prevent visiting the same URL again
        return
    try:
        urlContent = urllib2.urlopen(url).read()
        urlList.append(url)
    except:
        return
    soup = BeautifulSoup(''.join(urlContent))
    # remove script tags so their contents are not searched as text
    for scriptTag in soup.findAll('script'):
        scriptTag.extract()
    # get the text content of the page
    try:
        body_texts = soup.body(text=True)
    except:
        return
    text = ''.join(body_texts)
    # search
    if text.find(searchText) > -1:
        print url
        print
    # if there are links on the page then recursively repeat
    if level > 0:
        for linkTag in soup.findAll('a'):
            try:
                # resolve relative links against the current URL
                linkUrl = urlparse.urljoin(url, linkTag['href'])
                searchUrl(linkUrl, level - 1, searchText)
            except:
                pass

# main
rootUrl = 'http://www.yahoo.com'
netloc = urlparse.urlsplit(rootUrl).netloc.split('.')
website = netloc[-2] + netloc[-1] # e.g. 'yahoocom'
searchUrl(rootUrl, 1, " computer ")
Traceback (most recent call last):
  File "./site_urlsearch.py", line 63, in <module>
    searchUrl(rootUrl, 1, " computer ")
  File "./site_urlsearch.py", line 39, in searchUrl
    body_texts = soup.body(text=True)
TypeError: 'NoneType' object is not callable
That error never happened to me, but I have put the line "body_texts = soup.body(text=True)" inside a try/except block; that should fix it. Thanks.
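If you would rather avoid the bare try/except, here is a minimal sketch of an explicit check, using the same soup object as in the listing above. The traceback means soup.body is None because the fetched document has no <body> tag, so testing for None directly avoids the TypeError:

    # Explicit guard: soup.body is None when the document has no <body> tag
    body = soup.body
    if body is None:
        return
    body_texts = body(text=True)
    text = ''.join(body_texts)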
This version always searches the whole website (there is no maximum search depth limit), and it also has some other improvements:
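The full improved listing is not reproduced here, but as a rough sketch (not the author's exact code; the function name searchUrlWholeSite is made up, and it reuses the website and urlList globals from the listing above), dropping the depth limit mainly means removing the level parameter and letting the duplicate-URL check terminate the crawl:

    # Sketch: same crawl as above, but with no depth limit; the urlList
    # duplicate check is what eventually stops the recursion.
    def searchUrlWholeSite(url, searchText):
        netloc = urlparse.urlsplit(url).netloc.split('.')
        if len(netloc) < 2 or netloc[-2] + netloc[-1] != website:
            return
        if url in urlList:
            return
        try:
            urlContent = urllib2.urlopen(url).read()
            urlList.append(url)
        except:
            return
        soup = BeautifulSoup(''.join(urlContent))
        for scriptTag in soup.findAll('script'):
            scriptTag.extract()
        if soup.body is None:
            return
        text = ''.join(soup.body(text=True))
        if text.find(searchText) > -1:
            print url
            print
        for linkTag in soup.findAll('a'):
            try:
                linkUrl = urlparse.urljoin(url, linkTag['href'])
                searchUrlWholeSite(linkUrl, searchText)
            except:
                pass

On a large site this recursive approach can still hit Python's recursion limit, so an iterative version with an explicit queue of URLs to visit would be the safer design.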
Hi there,
how can you make this code work with Python 3? I can't run it in Python 3.
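Not the original author, but here is a rough Python 3 port under these assumptions: urllib2 becomes urllib.request, urlparse becomes urllib.parse, the old BeautifulSoup 3 import becomes bs4 (installed with pip install beautifulsoup4), and print becomes a function:

    # Python 3 sketch of the same search (requires: pip install beautifulsoup4)
    import urllib.request
    import urllib.parse
    from bs4 import BeautifulSoup

    urlList = []  # URLs visited so far

    def searchUrl(url, level, searchText):  # the root URL is level 0
        # do not go to other websites
        netloc = urllib.parse.urlsplit(url).netloc.split('.')
        if len(netloc) < 2 or netloc[-2] + netloc[-1] != website:
            return
        if url in urlList:  # prevent visiting the same URL again
            return
        try:
            urlContent = urllib.request.urlopen(url).read()
            urlList.append(url)
        except Exception:
            return
        soup = BeautifulSoup(urlContent, 'html.parser')
        # remove script tags so their contents are not searched as text
        for scriptTag in soup.find_all('script'):
            scriptTag.extract()
        if soup.body is None:
            return
        text = ''.join(soup.body(text=True))
        if text.find(searchText) > -1:
            print(url)
            print()
        if level > 0:
            for linkTag in soup.find_all('a'):
                try:
                    linkUrl = urllib.parse.urljoin(url, linkTag['href'])
                    searchUrl(linkUrl, level - 1, searchText)
                except Exception:
                    pass

    rootUrl = 'http://www.yahoo.com'
    netloc = urllib.parse.urlsplit(rootUrl).netloc.split('.')
    website = netloc[-2] + netloc[-1]
    searchUrl(rootUrl, 1, " computer ")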