Recipe 577392 revision 3 « ActiveState Code

# WebsiteMapper.py
# Prints a tree graph of any website.
# FB - 201009223
import urllib2
from os.path import basename
import urlparse
from BeautifulSoup import BeautifulSoup # for HTML parsing

# URLs that printWebsiteMap has already recorded; it consults this list so
# that the same page is not processed twice.
urlList = list()

def _resolveLink(pageUrl, linkUrl):
    """Return the absolute URL to crawl for a link found on pageUrl.

    pageUrl is the address of the page the link was found on and linkUrl is
    the raw href value.  Returns None when the link must not be followed:
    same-page sections ('#...'), anything containing '@' (treated as an
    email address) and paths that name a file which is not .html / .htm.
    """
    # skip if URL is a section on the same webpage
    if linkUrl.startswith('#'):
        return None

    # skip if URL is an email
    if '@' in linkUrl:
        return None

    # skip if not an HTML URL (a '.' in the path is taken as a file name)
    pathLower = urlparse.urlsplit(linkUrl).path.lower()
    if '.' in pathLower and not pathLower.endswith(('.html', '.htm')):
        return None

    # urljoin resolves the link against the page it appears on, so links
    # relative to the current directory, root-relative links ('/x') and
    # absolute links are all handled; gluing the href onto
    # scheme://netloc/ was only right for pages in the site root.
    absoluteUrl = urlparse.urljoin(pageUrl, linkUrl)

    # remove '/' in the end if exists (rstrip: only the end is touched)
    return absoluteUrl.rstrip('/')


def printWebsiteMap(url, level = 0):
    """Print a tree graph of the website reachable from url.

    url is the page to fetch and level is its depth in the tree (0 for the
    root page).  The page URL is printed, indented according to level, and
    every followable link on it is processed recursively with level + 1
    until maxLevel is reached.

    Reads the module-level names website (last two host-name labels of the
    root URL joined without a dot), maxLevel (deepest level whose links are
    not followed) and urlList (URLs already handled, appended to here).
    Returns None.  Pages that cannot be fetched or parsed are skipped
    silently.
    """
    # do not go to other websites; hosts without two labels (no dot, or an
    # empty netloc such as in 'javascript:' links) can never match
    netlocSplit = urlparse.urlsplit(url).netloc.split('.')
    if len(netlocSplit) < 2 or netlocSplit[-2] + netlocSplit[-1] != website:
        return

    if url in urlList: # prevent using the same URL again
        return
    # Record the URL before fetching so that a page that fails to load is
    # not requested again every time another page links to it.
    urlList.append(url)

    try:
        response = urllib2.urlopen(url)
        try:
            urlContent = response.read()
            # Address actually served (after redirects); relative links on
            # the page must be resolved against it.
            pageUrl = response.geturl()
        finally:
            response.close()
        soup = BeautifulSoup(urlContent)
    except Exception:
        # Unreachable or unparsable page: leave it out of the map.
        # KeyboardInterrupt / SystemExit are not Exception subclasses and
        # still stop the crawl.
        return

    # Single-argument print(...) gives the same output on Python 2 and 3.
    if level == 0:
        print(url)
    else:
        indent = '  ' * (level - 1)
        print(indent + '|')
        print(indent + '|' + '__' * level + url)

    if level >= maxLevel:
        return

    # if there are links on the webpage then recursively repeat
    for linkTag in soup.findAll('a'):
        try:
            linkUrl = linkTag['href']
        except KeyError:
            # <a> without an href (e.g. a named anchor): nothing to follow
            continue

        try:
            nextUrl = _resolveLink(pageUrl, linkUrl)
            if nextUrl is not None:
                printWebsiteMap(nextUrl, level + 1)
        except Exception:
            # Best effort: one malformed or unprintable link must not
            # abort the rest of the crawl.
            continue

# MAIN
# Crawl settings: the page to start from and the level at which links stop
# being followed.
rootUrl = 'http://www.bloodshed.net'
maxLevel = 9

# printWebsiteMap only visits URLs whose host name ends with the same two
# labels as the root URL; they are kept joined without a dot.
netloc = urlparse.urlsplit(rootUrl).netloc.split('.')
website = ''.join([netloc[-2], netloc[-1]])

printWebsiteMap(rootUrl)

Diff to Previous Revision

--- revision 2 2010-09-17 21:48:47
+++ revision 3 2010-09-23 01:23:04
@@ -1,6 +1,6 @@
 # WebsiteMapper.py
 # Prints a tree graph of any website.
-# FB - 201009164
+# FB - 201009223
 import urllib2
 from os.path import basename
 import urlparse
@@ -13,7 +13,9 @@
 
     # do not go to other websites
     global website
-    netloc = urlparse.urlsplit(url).netloc
+    parsedUrl = urlparse.urlsplit(url)
+    scheme = parsedUrl.scheme
+    netloc = parsedUrl.netloc
     netlocSplit = netloc.split('.')
     if netlocSplit[-2] + netlocSplit[-1] != website:
         return
@@ -27,10 +29,6 @@
         soup = BeautifulSoup(''.join(urlContent))
         urlList.append(url)
     except:
-        return
-
-    # if not an HTML file then return
-    if urlContent.find('<html') == -1 and urlContent.find('<HTML') == -1:
         return
 
     if level == 0:
@@ -47,20 +45,34 @@
         for linkTag in linkTags:
             try:
                 linkUrl = linkTag['href']
-
+                urlOk = True
+                
                 # skip if URL is a section on the same webpage
                 if linkUrl.startswith('#'):
-                    return
+                    urlOk = False
 
-                # if relative URL then convert to absolute
-                if urlparse.urlsplit(linkUrl).scheme == '':
-                    linkUrl = urlparse.urlsplit(url).scheme + '://' + netloc + '/' + linkUrl
+                # skip if URL is an email
+                # if linkUrl.lower().startswith('mailto:'):
+                if linkUrl.find('@') > -1:
+                    urlOk = False
 
-                # remove '/' in the end if exists
-                if linkUrl.endswith('/'):
-                    linkUrl = linkUrl.strip('/')
+                # skip if not an HTML URL 
+                parsedUrl = urlparse.urlsplit(linkUrl)
+                if parsedUrl.path.find('.') > -1: # is there a file name?
+                    pathLower  = parsedUrl.path.lower()
+                    if not (pathLower.endswith('.html') or pathLower.endswith('.htm')):
+                        urlOk = False
 
-                printWebsiteMap(linkUrl, level + 1)
+                if urlOk:
+                    # if relative URL then convert to absolute
+                    if parsedUrl.scheme == '':
+                        linkUrl = scheme + '://' + netloc + '/' + linkUrl
+
+                    # remove '/' in the end if exists
+                    if linkUrl.endswith('/'):
+                        linkUrl = linkUrl.strip('/')
+
+                    printWebsiteMap(linkUrl, level + 1)
             except:
                 pass
 
@@ -70,5 +82,5 @@
 global website
 website = netloc[-2] + netloc[-1]
 global maxLevel
-maxLevel = 4
+maxLevel = 9
 printWebsiteMap(rootUrl)

Recipe 577392 revision 3

Diff to Previous Revision

History

Accounts

Code Recipes

Feedback & Information

ActiveState