# WebsiteMapper.py
# Prints a tree graph of any website.
# FB - 201009164
import urllib2
from os.path import basename
import urlparse
from BeautifulSoup import BeautifulSoup # for HTML parsing
# URLs that have already been fetched; printWebsiteMap appends to this list
# and checks it so the same page is never processed twice.
urlList = list()
def printWebsiteMap(url, level = 0):
    """Print url and, recursively, the same-site pages it links to as a tree.

    Parameters:
        url   -- absolute URL of the page to fetch and print.
        level -- depth of url in the tree (0 is the root). Links found on the
                 page are only followed while level is below the global
                 maxLevel.

    Reads the globals website (the last two host labels of the root URL,
    joined without a dot) and maxLevel, and appends every successfully
    fetched URL to the global urlList so that no page is printed twice.

    Returns None. Pages that cannot be fetched or parsed, pages on another
    website and pages that are not HTML are silently left out of the tree.
    """
    # do not go to other websites
    netloc = urlparse.urlsplit(url).netloc
    hostLabels = netloc.split('.')
    # hosts without a dot (or links without a host at all, e.g. mailto:)
    # can never match the two-label website key
    if len(hostLabels) < 2 or hostLabels[-2] + hostLabels[-1] != website:
        return

    if url in urlList: # prevent using the same URL again
        return

    try:
        response = urllib2.urlopen(url)
        try:
            urlContent = response.read()
            # URL actually served (after redirects); relative links on the
            # page have to be resolved against it
            baseUrl = response.geturl()
        finally:
            response.close() # do not leak the connection
        soup = BeautifulSoup(urlContent)
        urlList.append(url)
    except Exception:
        # best effort: an unreachable or unparsable page is skipped, while
        # KeyboardInterrupt / SystemExit still stop the crawl
        return

    # if not an HTML file then return (tag case does not matter)
    if '<html' not in urlContent.lower():
        return

    if level == 0:
        print(url)
    else:
        indent = ' ' * (level - 1)
        print(indent + '|')
        print(indent + '|' + '__' * level + url)

    if level >= maxLevel:
        return

    # if there are links on the webpage then recursively repeat
    for linkTag in soup.findAll('a'):
        linkUrl = linkTag.get('href')
        # skip anchors without a target and sections on the same webpage;
        # keep going with the remaining links of the page
        if not linkUrl or linkUrl.startswith('#'):
            continue
        try:
            # if relative URL then convert to absolute
            linkUrl = urlparse.urljoin(baseUrl, linkUrl)
            # remove '/' in the end if exists
            linkUrl = linkUrl.rstrip('/')
            printWebsiteMap(linkUrl, level + 1)
        except Exception:
            # a malformed link must not abort the rest of the page
            continue
# MAIN
# Site to map and how many link levels below the root page are followed.
rootUrl = 'http://www.bloodshed.net'
maxLevel = 4
# Same-site key compared by printWebsiteMap: the last two host labels of the
# root URL joined without a dot.
netloc = urlparse.urlsplit(rootUrl).netloc.split('.')
website = ''.join((netloc[-2], netloc[-1]))
printWebsiteMap(rootUrl)
# Diff to Previous Revision
# --- revision 1 2010-09-17 04:52:51
# +++ revision 2 2010-09-17 21:48:47
# @@ -24,6 +24,7 @@
# try:
# urlContent = urllib2.urlopen(url).read()
# + soup = BeautifulSoup(''.join(urlContent))
# urlList.append(url)
# except:
# return
# @@ -41,7 +42,6 @@
# global maxLevel
# if level < maxLevel:
# # if there are links on the webpage then recursively repeat
# - soup = BeautifulSoup(''.join(urlContent))
# linkTags = soup.findAll('a')
# for linkTag in linkTags: