Prints a tree graph of the given website, starting from its root URL.
# WebsiteMapper.py
# Prints a tree graph of any website.
# FB - 201009223
import urllib2
import urlparse
from BeautifulSoup import BeautifulSoup # for HTML parsing

urlList = [] # URLs visited so far

def printWebsiteMap(url, level=0):
    # do not go to other websites
    global website
    parsedUrl = urlparse.urlsplit(url)
    scheme = parsedUrl.scheme
    netloc = parsedUrl.netloc
    netlocSplit = netloc.split('.')
    # compare the last two domain parts (e.g. 'bloodshed' + 'net')
    if netlocSplit[-2] + netlocSplit[-1] != website:
        return

    global urlList
    if url in urlList: # prevent visiting the same URL again
        return

    try:
        urlContent = urllib2.urlopen(url).read()
        soup = BeautifulSoup(''.join(urlContent))
        urlList.append(url)
    except:
        return

    if level == 0:
        print url
    else:
        print ' ' * (level - 1) + '|'
        print ' ' * (level - 1) + '|' + '__' * level + url

    global maxLevel
    if level < maxLevel:
        # if there are links on the webpage then recursively repeat
        linkTags = soup.findAll('a')
        for linkTag in linkTags:
            try:
                linkUrl = linkTag['href']
                urlOk = True
                # skip if URL is a section on the same webpage
                if linkUrl.startswith('#'):
                    urlOk = False
                # skip if URL is an email address
                if linkUrl.find('@') > -1:
                    urlOk = False
                # skip if not an HTML URL
                parsedUrl = urlparse.urlsplit(linkUrl)
                if parsedUrl.path.find('.') > -1: # is there a file name?
                    pathLower = parsedUrl.path.lower()
                    if not (pathLower.endswith('.html') or pathLower.endswith('.htm')):
                        urlOk = False
                if urlOk:
                    # if relative URL then convert to absolute
                    if parsedUrl.scheme == '':
                        linkUrl = scheme + '://' + netloc + '/' + linkUrl
                    # remove trailing '/' if present
                    if linkUrl.endswith('/'):
                        linkUrl = linkUrl.rstrip('/')
                    printWebsiteMap(linkUrl, level + 1)
            except:
                pass

# MAIN
rootUrl = 'http://www.bloodshed.net'
netloc = urlparse.urlsplit(rootUrl).netloc.split('.')
website = netloc[-2] + netloc[-1]
maxLevel = 9
printWebsiteMap(rootUrl)
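For reference, the print statements above produce output roughly shaped like this (the URLs here are just illustrative):

    http://www.bloodshed.net
    |
    |__http://www.bloodshed.net/devcpp.html
     |
     |____http://www.bloodshed.net/faq.html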
I've copied the code and tested it, but it returned an error:
It will be very useful! Thanks...
That error never happened in my tests. I moved the "soup = BeautifulSoup(''.join(urlContent))" line up into the try block. That should fix it.
The code is certainly far from perfect. For example, it first downloads zip, pdf, and video files before realizing they don't contain HTML. I don't know if there is a better way; just checking whether the URL ends with ".html" certainly wouldn't be enough.
Also, the graph it prints needs improvement. The DOS "tree" command does it better (but it only graphs directory structures, not websites).
"The code certainly far from perfect. It first downloads zip, pdf, video files etc before realizing they don't contain HTML code for example. I don't know if there is a better way. Just checking if the URL ends w/ ".html" certainly wouldn't be enough."
it's easy to fix:
I made some improvements. Now it would not download non-html files to check.
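For anyone curious, the usual trick is to check the Content-Type header before reading the response body. A minimal sketch of the idea (not the exact patch; the function name is mine):

    import urllib2

    def isHtmlUrl(url):
        # open the connection but do not read() yet; the headers are
        # enough to tell whether the resource is an HTML page
        try:
            connection = urllib2.urlopen(url)
        except:
            return False
        contentType = connection.info().gettype() # e.g. 'text/html'
        connection.close()
        return contentType == 'text/html'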
You might be interested in recipe 577091 if you want a format like the "tree" command in DOS. By rewriting the "listdir" function in that code, you could drop in a function that deals with URLs instead. Thanks for the code!
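For illustration only (my own sketch, not recipe 577091's actual code), a printer that draws DOS "tree"-style connectors from a precomputed link map could look like this:

    def printUrlTree(url, childrenOf, prefix=''):
        # childrenOf maps each URL to the list of URLs found on that page
        if prefix == '':
            print url # the root is printed bare, like the DOS tree command
        children = childrenOf.get(url, [])
        for i, child in enumerate(children):
            last = (i == len(children) - 1)
            print prefix + ('\\---' if last else '+---') + child
            printUrlTree(child, childrenOf, prefix + ('    ' if last else '|   '))

    # example with made-up URLs:
    childrenOf = {'http://www.bloodshed.net':
                      ['http://www.bloodshed.net/devcpp.html',
                       'http://www.bloodshed.net/download.html']}
    printUrlTree('http://www.bloodshed.net', childrenOf)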