
Searches a website recursively for the given text string and prints all URLs containing it.

Python, 63 lines
# websiteTxtSearcher.py
# Searches a website recursively for any given string.
# FB - 201009105
import urllib2
from os.path import basename
import urlparse
from BeautifulSoup import BeautifulSoup # for HTML parsing

global urlList
urlList = []

# recursively search starting from the root URL
def searchUrl(url, level, searchText): # the root URL is level 0
    # do not go to other websites
    global website
    netloc = urlparse.urlsplit(url).netloc.split('.')
    if netloc[-2] + netloc[-1] != website:
        return

    global urlList
    if url in urlList: # prevent using the same URL again
        return

    try:
        urlContent = urllib2.urlopen(url).read()
        urlList.append(url)
    except:
        return

    soup = BeautifulSoup(urlContent)
    # remove script tags
    c = soup.findAll('script')
    for i in c:
        i.extract() 
    # get text content of the URL
    try:
        body_texts = soup.body(text=True)
    except:
        return
    text = ''.join(body_texts) 

    # search
    if text.find(searchText) > -1:
        print url
        print

    # if there are links on the webpage then recursively repeat
    if level > 0:
        linkTags = soup.findAll('a')
        if len(linkTags) > 0:
            for linkTag in linkTags:
                try:
                    linkUrl = linkTag['href']
                    searchUrl(linkUrl, level - 1, searchText)
                except:
                    pass

# main
rootUrl = 'http://www.yahoo.com'
netloc = urlparse.urlsplit(rootUrl).netloc.split('.')
global website
website = netloc[-2] + netloc[-1]
searchUrl(rootUrl, 1, " computer ")

4 comments

Jerry Rocteur 13 years, 7 months ago

Traceback (most recent call last):
  File "./site_urlsearch.py", line 63, in <module>
    searchUrl(rootUrl, 1, " computer ")
  File "./site_urlsearch.py", line 39, in searchUrl
    body_texts = soup.body(text=True)
TypeError: 'NoneType' object is not callable

FB36 (author) 13 years, 7 months ago

That error never happened to me, but I put the line "body_texts = soup.body(text=True)" inside a try-except. That should fix it. Thanks.
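
An explicit guard is an alternative to the try-except: soup.body is None when the fetched page has no <body> tag (a frameset page, for example), so the function can simply return in that case. A minimal sketch against the same BeautifulSoup 3 API:

# get text content of the URL; bail out if the page has no <body>
body = soup.body
if body is None:
    return
body_texts = body(text=True) # calling a tag is BeautifulSoup 3 shorthand for findAll
text = ''.join(body_texts)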

FB36 (author) 13 years, 7 months ago

This version always searches the whole website (no maximum search-depth limit), and it also has some improvements:

# websiteTxtSearcher2.py
# Searches a website recursively for any given string.
# FB - 201009105
import time
import urllib2
from os.path import basename
import urlparse
from BeautifulSoup import BeautifulSoup # for HTML parsing

global urlList
urlList = []

# recursively search starting from the root URL
def searchUrl(url, searchText, caseSensitive):

    # do not go to other websites
    global website
    netloc = urlparse.urlsplit(url).netloc
    netlocSplit = netloc.split('.')
    if netlocSplit[-2] + netlocSplit[-1] != website:
        return

    global urlList
    if url in urlList: # prevent using the same URL again
        return

    try:
        urlContent = urllib2.urlopen(url).read()
        urlList.append(url)
    except:
        return

    # if not an HTML file then return
    if urlContent.find('<html') == -1 and urlContent.find('<HTML') == -1:
        return

    soup = BeautifulSoup(urlContent)
    # remove script tags
    c = soup.findAll('script')
    for i in c:
        i.extract() 
    # get text content of the URL
    try:
        body_texts = soup.body(text=True)
    except:
        return
    text = ''.join(body_texts) 

    # search
    if caseSensitive:
        if text.find(searchText) > -1:
            print url
            print
    else:
        if text.lower().find(searchText.lower()) > -1:
            print url
            print

    # if there are links on the webpage then recursively repeat
    linkTags = soup.findAll('a')

    for linkTag in linkTags:
        try:
            linkUrl = linkTag['href']
            # if relative URL then convert to absolute
            # (a naive splice anchored at the site root; urlparse.urljoin
            # would also resolve links like '../index.html' correctly)
            if urlparse.urlsplit(linkUrl).scheme == '':
                linkUrl = urlparse.urlsplit(url).scheme + '://' + netloc + '/' + linkUrl

            searchUrl(linkUrl, searchText, caseSensitive)
        except:
            pass

# main
rootUrl = 'http://www.bloodshed.net/index.html'
netloc = urlparse.urlsplit(rootUrl).netloc.split('.')
global website
website = netloc[-2] + netloc[-1]
t = time.time()
searchUrl(rootUrl, 'printf', True)
print "Search duration in seconds: "
print time.time() - t
print
Yo 6 years, 11 months ago

Hi there,

How can you make this code work with Python 3? I can't run it in Python 3.
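
A minimal Python 3 port of the second version might look like the sketch below. This is not the author's code: it swaps urllib2 for urllib.request and urlparse for urllib.parse, imports BeautifulSoup from the bs4 package (pip install beautifulsoup4), guards soup.body explicitly, and uses urljoin to resolve relative links:

# websiteTxtSearcher3.py
# A hypothetical Python 3 port of the recipe above; requires beautifulsoup4.
import time
import urllib.parse
import urllib.request
from bs4 import BeautifulSoup # for HTML parsing

urlList = []

# recursively search starting from the root URL
def searchUrl(url, searchText, caseSensitive):
    # do not go to other websites
    netloc = urllib.parse.urlsplit(url).netloc
    netlocSplit = netloc.split('.')
    if len(netlocSplit) < 2 or netlocSplit[-2] + netlocSplit[-1] != website:
        return

    if url in urlList: # prevent using the same URL again
        return

    try:
        urlContent = urllib.request.urlopen(url).read().decode('utf-8', 'replace')
        urlList.append(url)
    except Exception:
        return

    # if not an HTML file then return
    if '<html' not in urlContent.lower():
        return

    soup = BeautifulSoup(urlContent, 'html.parser')
    # remove script tags
    for scriptTag in soup.find_all('script'):
        scriptTag.extract()

    # get text content of the URL; bail out if the page has no <body>
    if soup.body is None:
        return
    text = soup.body.get_text()

    # search
    if caseSensitive:
        found = searchText in text
    else:
        found = searchText.lower() in text.lower()
    if found:
        print(url)
        print()

    # if there are links on the webpage then recursively repeat
    for linkTag in soup.find_all('a'):
        try:
            # urljoin resolves relative links such as '../index.html'
            linkUrl = urllib.parse.urljoin(url, linkTag['href'])
            searchUrl(linkUrl, searchText, caseSensitive)
        except KeyError: # <a> tag without an href attribute
            pass

# main
rootUrl = 'http://www.bloodshed.net/index.html'
netloc = urllib.parse.urlsplit(rootUrl).netloc.split('.')
website = netloc[-2] + netloc[-1]
t = time.time()
searchUrl(rootUrl, 'printf', True)
print("Search duration in seconds:")
print(time.time() - t)

As with the original, the recursion depth grows with the size of the site, so a large site can exceed Python's default recursion limit; keeping an explicit queue of URLs to visit instead of recursing would avoid that.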