# WebsiteMapper.py
# Prints a tree graph of any website.
# FB - 201009223
import urllib2
from os.path import basename
import urlparse
from BeautifulSoup import BeautifulSoup # for HTML parsing
# Registry of already-visited URLs, shared by all printWebsiteMap calls.
# (A module-level 'global' statement is a no-op, so it is omitted here.)
urlList = []
def printWebsiteMap(url, level = 0):
# do not go to other websites
global website
parsedUrl = urlparse.urlsplit(url)
scheme = parsedUrl.scheme
netloc = parsedUrl.netloc
netlocSplit = netloc.split('.')
if netlocSplit[-2] + netlocSplit[-1] != website:
return
global urlList
if url in urlList: # prevent using the same URL again
return
try:
urlContent = urllib2.urlopen(url).read()
soup = BeautifulSoup(''.join(urlContent))
urlList.append(url)
except:
return
if level == 0:
print url
else:
print ' ' * (level - 1) + '|'
print ' ' * (level - 1) + '|' +'__' * level + url
global maxLevel
if level < maxLevel:
# if there are links on the webpage then recursively repeat
linkTags = soup.findAll('a')
for linkTag in linkTags:
try:
linkUrl = linkTag['href']
urlOk = True
# skip if URL is a section on the same webpage
if linkUrl.startswith('#'):
urlOk = False
# skip if URL is an email
# if linkUrl.lower().startswith('mailto:'):
if linkUrl.find('@') > -1:
urlOk = False
# skip if not an HTML URL
parsedUrl = urlparse.urlsplit(linkUrl)
if parsedUrl.path.find('.') > -1: # is there a file name?
pathLower = parsedUrl.path.lower()
if not (pathLower.endswith('.html') or pathLower.endswith('.htm')):
urlOk = False
if urlOk:
# if relative URL then convert to absolute
if parsedUrl.scheme == '':
linkUrl = scheme + '://' + netloc + '/' + linkUrl
# remove '/' in the end if exists
if linkUrl.endswith('/'):
linkUrl = linkUrl.strip('/')
printWebsiteMap(linkUrl, level + 1)
except:
pass
# MAIN
rootUrl = 'http://www.bloodshed.net'
# Concatenate the last two domain labels (e.g. 'bloodshednet') to form the
# same-site key that printWebsiteMap compares against. Module-level 'global'
# statements were removed: they are no-ops at module scope.
rootNetlocSplit = urlparse.urlsplit(rootUrl).netloc.split('.')
website = rootNetlocSplit[-2] + rootNetlocSplit[-1]
maxLevel = 9 # maximum depth of the printed tree
printWebsiteMap(rootUrl)