# ImageDownloader.py
# Finds and downloads all images from any given URL recursively.
# FB - 20140223
import sys
import os
import urllib2
from os.path import basename
import urlparse
from BeautifulSoup import BeautifulSoup # for HTML parsing
urlList = []
# recursively download images starting from the root URL
def downloadImages(url, level): # the root URL is level 0
    # do not go to other websites
    global website
    netloc = urlparse.urlsplit(url).netloc.split('.')
    if netloc[-2] + netloc[-1] != website:
        return

    global urlList
    if url in urlList: # prevent using the same URL again
        return

    try:
        urlContent = urllib2.urlopen(url).read()
        urlList.append(url)
        print url
    except:
        return

    soup = BeautifulSoup(''.join(urlContent))
    # find and download all images
    imgTags = soup.findAll('img')
    for imgTag in imgTags:
        imgUrl = imgTag['src']
        # resolve relative 'src' values against the page URL (works for any domain)
        imgUrl = urlparse.urljoin(url, imgUrl)
        # download only the proper image files
        if imgUrl.lower().endswith('.jpeg') or \
           imgUrl.lower().endswith('.jpg') or \
           imgUrl.lower().endswith('.gif') or \
           imgUrl.lower().endswith('.png') or \
           imgUrl.lower().endswith('.bmp'):
            try:
                imgData = urllib2.urlopen(imgUrl).read()
                global minImageFileSize
                if len(imgData) >= minImageFileSize:
                    print " " + imgUrl
                    fileName = basename(urlparse.urlsplit(imgUrl)[2])
                    output = open(os.path.join(downloadLocationPath, fileName), 'wb')
                    output.write(imgData)
                    output.close()
            except Exception, e:
                print str(e)
    print
    print

    # if there are links on the webpage then recursively repeat
    if level > 0:
        linkTags = soup.findAll('a')
        if len(linkTags) > 0:
            for linkTag in linkTags:
                try:
                    # resolve relative links so same-site pages are followed too
                    linkUrl = urlparse.urljoin(url, linkTag['href'])
                    downloadImages(linkUrl, level - 1)
                except Exception, e:
                    print str(e)
# MAIN
cla = sys.argv # command line arguments
if len(cla) != 5:
    print "USAGE:"
    print "[python] ImageDownloader.py URL MaxRecursionDepth DownloadLocationPath MinImageFileSize"
    os._exit(1)

rootUrl = cla[1]
maxRecursionDepth = int(cla[2])
downloadLocationPath = cla[3] # absolute path
if not os.path.isdir(downloadLocationPath):
    print downloadLocationPath + " is not an existing directory!"
    os._exit(2)

minImageFileSize = long(cla[4]) # in bytes
netloc = urlparse.urlsplit(rootUrl).netloc.split('.')
website = netloc[-2] + netloc[-1]
downloadImages(rootUrl, maxRecursionDepth)
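Example invocation (the URL, target directory, and size threshold below are placeholders, not values from the recipe):

    python ImageDownloader.py http://www.example.com 2 /tmp/images 10000

This starts at http://www.example.com, follows links up to two levels deep within the same site, and saves every linked .jpeg/.jpg/.gif/.png/.bmp image of at least 10000 bytes into /tmp/images.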
Diff to Previous Revision
--- revision 3 2010-09-08 18:47:57
+++ revision 4 2014-02-24 03:49:51
@@ -1,23 +1,88 @@
-# imageDownloader.py
-# Finds and downloads all images from any given URL.
-# FB - 201009072
+# ImageDownloader.py
+# Finds and downloads all images from any given URL recursively.
+# FB - 20140223
+import sys
+import os
import urllib2
-import re
from os.path import basename
-from urlparse import urlsplit
+import urlparse
+from BeautifulSoup import BeautifulSoup # for HTML parsing
-url = "http://www.yahoo.com"
-urlContent = urllib2.urlopen(url).read()
-# HTML image tag: <img src="url" alt="some_text"/>
-imgUrls = re.findall('img .*?src="(.*?)"', urlContent)
+urlList = []
-# download all images
-for imgUrl in imgUrls:
+# recursively download images starting from the root URL
+def downloadImages(url, level): # the root URL is level 0
+ # do not go to other websites
+ global website
+ netloc = urlparse.urlsplit(url).netloc.split('.')
+ if netloc[-2] + netloc[-1] != website:
+ return
+
+ global urlList
+ if url in urlList: # prevent using the same URL again
+ return
+
try:
- imgData = urllib2.urlopen(imgUrl).read()
- fileName = basename(urlsplit(imgUrl)[2])
- output = open(fileName,'wb')
- output.write(imgData)
- output.close()
+ urlContent = urllib2.urlopen(url).read()
+ urlList.append(url)
+ print url
except:
- pass
+ return
+
+ soup = BeautifulSoup(''.join(urlContent))
+ # find and download all images
+ imgTags = soup.findAll('img')
+ for imgTag in imgTags:
+ imgUrl = imgTag['src']
+ imgUrl = url[ : url.find(".com") + 4] + imgUrl if (imgUrl[ : 4] != "http") else imgUrl
+ # download only the proper image files
+ if imgUrl.lower().endswith('.jpeg') or \
+ imgUrl.lower().endswith('.jpg') or \
+ imgUrl.lower().endswith('.gif') or \
+ imgUrl.lower().endswith('.png') or \
+ imgUrl.lower().endswith('.bmp'):
+ try:
+ imgData = urllib2.urlopen(imgUrl).read()
+ global minImageFileSize
+ if len(imgData) >= minImageFileSize:
+ print " " + imgUrl
+ fileName = basename(urlparse.urlsplit(imgUrl)[2])
+ output = open(os.path.join(downloadLocationPath, fileName),'wb')
+ output.write(imgData)
+ output.close()
+ except Exception, e:
+ print str(e)
+ # pass
+ print
+ print
+
+ # if there are links on the webpage then recursively repeat
+ if level > 0:
+ linkTags = soup.findAll('a')
+ if len(linkTags) > 0:
+ for linkTag in linkTags:
+ try:
+ linkUrl = linkTag['href']
+ downloadImages(linkUrl, level - 1)
+ except Exception, e:
+ print str(e)
+ # pass
+
+# MAIN
+cla = sys.argv # command line arguments
+if len(cla) != 5:
+ print "USAGE:"
+ print "[python] ImageDownloader.py URL MaxRecursionDepth DownloadLocationPath MinImageFileSize"
+ os._exit(1)
+
+rootUrl = cla[1]
+maxRecursionDepth = int(cla[2])
+downloadLocationPath = cla[3] # absolute path
+if not os.path.isdir(downloadLocationPath):
+ print downloadLocationPath + " is not an existing directory!"
+ os._exit(2)
+
+minImageFileSize = long(cla[4]) # in bytes
+netloc = urlparse.urlsplit(rootUrl).netloc.split('.')
+website = netloc[-2] + netloc[-1]
+downloadImages(rootUrl, maxRecursionDepth)
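A note on the relative-URL handling: urlparse.urljoin, used in the listing above, resolves an img 'src' or anchor 'href' against the page URL for any domain, whereas splicing the page URL at ".com" (the approach visible in the diff) only works for .com sites. A minimal illustration, using made-up URLs:

import urlparse

page = "http://www.example.org/photos/index.html"
print urlparse.urljoin(page, "thumbs/cat.jpg")                   # http://www.example.org/photos/thumbs/cat.jpg
print urlparse.urljoin(page, "/banner.png")                      # http://www.example.org/banner.png
print urlparse.urljoin(page, "http://cdn.example.org/logo.gif")  # absolute URLs pass through unchanged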