Prints a tree graph of the given website, starting from its root URL.
# WebsiteMapper.py
# Prints a tree graph of any website.
# FB - 201009223
import urllib2
import urlparse
from BeautifulSoup import BeautifulSoup # for HTML parsing

urlList = [] # URLs visited so far

def printWebsiteMap(url, level=0):
    # do not go to other websites
    global website
    parsedUrl = urlparse.urlsplit(url)
    scheme = parsedUrl.scheme
    netloc = parsedUrl.netloc
    netlocSplit = netloc.split('.')
    # compare the last two domain parts (e.g. 'bloodshed' + 'net')
    if netlocSplit[-2] + netlocSplit[-1] != website:
        return

    global urlList
    if url in urlList: # prevent visiting the same URL again
        return

    try:
        urlContent = urllib2.urlopen(url).read()
        soup = BeautifulSoup(''.join(urlContent))
        urlList.append(url)
    except:
        return

    if level == 0:
        print url
    else:
        print ' ' * (level - 1) + '|'
        print ' ' * (level - 1) + '|' + '__' * level + url

    global maxLevel
    if level < maxLevel:
        # if there are links on the webpage then recursively repeat
        linkTags = soup.findAll('a')
        for linkTag in linkTags:
            try:
                linkUrl = linkTag['href']
                urlOk = True
                # skip if URL is a section on the same webpage
                if linkUrl.startswith('#'):
                    urlOk = False
                # skip if URL is an email address
                if linkUrl.find('@') > -1:
                    urlOk = False
                # skip if not an HTML URL
                parsedUrl = urlparse.urlsplit(linkUrl)
                if parsedUrl.path.find('.') > -1: # is there a file name?
                    pathLower = parsedUrl.path.lower()
                    if not (pathLower.endswith('.html') or pathLower.endswith('.htm')):
                        urlOk = False
                if urlOk:
                    # if relative URL then convert to absolute
                    if parsedUrl.scheme == '':
                        linkUrl = scheme + '://' + netloc + '/' + linkUrl
                    # remove trailing '/' if present
                    if linkUrl.endswith('/'):
                        linkUrl = linkUrl.rstrip('/')
                    printWebsiteMap(linkUrl, level + 1)
            except:
                pass

# MAIN
rootUrl = 'http://www.bloodshed.net'
netloc = urlparse.urlsplit(rootUrl).netloc.split('.')
website = netloc[-2] + netloc[-1]
maxLevel = 9
printWebsiteMap(rootUrl)
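For reference, the print statements above produce output roughly shaped like this (the URLs here are just illustrative):

    http://www.bloodshed.net
    |
    |__http://www.bloodshed.net/devcpp.html
     |
     |____http://www.bloodshed.net/faq.html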
I've copied the code and tested it, but it returned an error:
It will be very useful! Thanks...
That error never happened in my tests. I moved the "soup = BeautifulSoup(''.join(urlContent))" line up into the try block. That should fix it.
The code is certainly far from perfect. For example, it first downloads zip, pdf, and video files before realizing they don't contain HTML. I don't know if there is a better way; just checking whether the URL ends with ".html" certainly wouldn't be enough.
Also, the graph it prints needs improvement. The DOS "tree" command does it better (but it only graphs directory structures, not websites).
"The code certainly far from perfect. It first downloads zip, pdf, video files etc before realizing they don't contain HTML code for example. I don't know if there is a better way. Just checking if the URL ends w/ ".html" certainly wouldn't be enough."
it's easy to fix:
I made some improvements. Now it would not download non-html files to check.
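For anyone curious, the usual trick is to check the Content-Type header before reading the response body. A minimal sketch of the idea (not the exact patch; the function name is mine):

    import urllib2

    def isHtmlUrl(url):
        # open the connection but do not read() yet; the headers are
        # enough to tell whether the resource is an HTML page
        try:
            connection = urllib2.urlopen(url)
        except:
            return False
        contentType = connection.info().gettype() # e.g. 'text/html'
        connection.close()
        return contentType == 'text/html'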
You might be interested in recipe 577091 if you want a format like the "tree" command in DOS. By rewriting the "listdir" function in that code, you could drop in a function that deals with URLs instead. Thanks for the code!
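For illustration only (my own sketch, not recipe 577091's actual code), a printer that draws DOS "tree"-style connectors from a precomputed link map could look like this:

    def printUrlTree(url, childrenOf, prefix=''):
        # childrenOf maps each URL to the list of URLs found on that page
        if prefix == '':
            print url # the root is printed bare, like the DOS tree command
        children = childrenOf.get(url, [])
        for i, child in enumerate(children):
            last = (i == len(children) - 1)
            print prefix + ('\\---' if last else '+---') + child
            printUrlTree(child, childrenOf, prefix + ('    ' if last else '|   '))

    # example with made-up URLs:
    childrenOf = {'http://www.bloodshed.net':
                      ['http://www.bloodshed.net/devcpp.html',
                       'http://www.bloodshed.net/download.html']}
    printUrlTree('http://www.bloodshed.net', childrenOf)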