
Finds and downloads all images from any given URL, recursively following links within the same site to a given depth.

Important note:

If your download location path has spaces, put quotes around it on the command line!
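For example, quoting a folder name that contains a space (the URL and path below are placeholders):

python ImageDownloader.py "http://www.example.com" 2 "c:\new folder" 50000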

Python, 84 lines
# ImageDownloader.py
# Finds and downloads all images from any given URL recursively.
# FB - 20140223
import sys
import os
import urllib2
from os.path import basename
import urlparse
from BeautifulSoup import BeautifulSoup # for HTML parsing

urlList = []

# recursively download images starting from the root URL
def downloadImages(url, level): # the root URL is level 0
    # do not go to other websites
    global website
    netloc = urlparse.urlsplit(url).netloc.split('.')
    if len(netloc) < 2 or netloc[-2] + netloc[-1] != website:
        return

    global urlList
    if url in urlList: # prevent using the same URL again
        return

    try:
        urlContent = urllib2.urlopen(url).read()
        urlList.append(url)
        print url
    except:
        return

    soup = BeautifulSoup(urlContent)
    # find and download all images
    imgTags = soup.findAll('img')
    for imgTag in imgTags:
        imgUrl = imgTag.get('src')
        if not imgUrl: # skip img tags without a src attribute
            continue
        # resolve relative image URLs against the page URL
        imgUrl = urlparse.urljoin(url, imgUrl)
        # download only the proper image files
        if imgUrl.lower().endswith(('.jpeg', '.jpg', '.gif', '.png', '.bmp')):
            try:
                imgData = urllib2.urlopen(imgUrl).read()
                if len(imgData) >= minImageFileSize:
                    print "    " + imgUrl
                    fileName = basename(urlparse.urlsplit(imgUrl)[2])
                    output = open(os.path.join(downloadLocationPath, fileName),'wb')
                    output.write(imgData)
                    output.close()
            except Exception, e:
                print str(e)
    print
    print

    # if there are links on the webpage then recursively repeat
    if level > 0:
        linkTags = soup.findAll('a')
        for linkTag in linkTags:
            try:
                # resolve relative links so same-site pages are followed
                linkUrl = urlparse.urljoin(url, linkTag['href'])
                downloadImages(linkUrl, level - 1)
            except Exception, e:
                print str(e)

# MAIN
cla = sys.argv # command line arguments
if len(cla) != 5:
    print "USAGE:"
    print "[python] ImageDownloader.py URL MaxRecursionDepth DownloadLocationPath MinImageFileSize"
    sys.exit(1)

rootUrl = cla[1]
maxRecursionDepth = int(cla[2])
downloadLocationPath = cla[3] # absolute path
if not os.path.isdir(downloadLocationPath):
    print downloadLocationPath + " is not an existing directory!"
    sys.exit(2)

minImageFileSize = long(cla[4]) # in bytes
netloc = urlparse.urlsplit(rootUrl).netloc.split('.')
website = netloc[-2] + netloc[-1]
downloadImages(rootUrl, maxRecursionDepth)

18 comments

nicobo 13 years, 6 months ago

And what about <img height="18" width="90" src="/static/activestyle/img/activestate.png" /> for instance?

Or what if I put several spaces between "<img" and "src", like: <img     src="activestate.png" />?

Sébastien Volle 13 years, 6 months ago

Yes, the img regex is not too solid. It would be better to use one of the many available libraries to parse the page and properly extract <img> elements, but things would get a little more complicated...
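For instance, a minimal sketch of that approach, using the BeautifulSoup package the later versions in this thread adopt (both problem cases from the previous comment parse fine):

from BeautifulSoup import BeautifulSoup

html = '<img height="18" width="90" src="/static/activestyle/img/activestate.png" /><img     src="activestate.png" />'
soup = BeautifulSoup(html)
# attribute order and extra whitespace make no difference to the parser
print [tag['src'] for tag in soup.findAll('img')]
# prints the two src values, regardless of attribute order or spacing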

FB36 (author) 13 years, 6 months ago

Thanks for the comments. I made a small fix.

Alan Plum 13 years, 6 months ago

Using regexes for HTML is always a bit tricky (at best). If you can afford the performance hit of parsing the entire document, using beautifulsoup to get all image tags might be a better idea.

FB36 (author) 13 years, 6 months ago

Here is the version that uses Beautiful Soup for HTML parsing:

# imageDownloader.py
# Finds and downloads all images from any given URL.
# FB - 201009083
import urllib2
from os.path import basename
from urlparse import urlsplit
from BeautifulSoup import BeautifulSoup # for HTML parsing

url = "http://www.yahoo.com"
urlContent = urllib2.urlopen(url).read()
soup = BeautifulSoup(''.join(urlContent))
imgTags = soup.findAll('img') # find all image tags

# download all images
for imgTag in imgTags:
    imgUrl = imgTag['src']
    try:
        imgData = urllib2.urlopen(imgUrl).read()
        fileName = basename(urlsplit(imgUrl)[2])
        output = open(fileName,'wb')
        output.write(imgData)
        output.close()
    except:
        pass
FB36 (author) 13 years, 6 months ago

And this is a recursive version which goes N levels deep into the target website:

# imageDownloader.py
# Finds and downloads all images from any given URL recursively.
# FB - 201009083
import urllib2
import re
from os.path import basename
from urlparse import urlsplit

global urlList
urlList = []

# recursively download images starting from the root URL
def downloadImages(url, level): # the root URL is level 0
    print url
    global urlList
    if url in urlList: # prevent using the same URL again
        return
    urlList.append(url)
    try:
        urlContent = urllib2.urlopen(url).read()
    except:
        return

    # find and download all images
    imgUrls = re.findall('<img .*?src="(.*?)"', urlContent)
    for imgUrl in imgUrls:
        try:
            imgData = urllib2.urlopen(imgUrl).read()
            fileName = basename(urlsplit(imgUrl)[2])
            output = open(fileName,'wb')
            output.write(imgData)
            output.close()
        except:
            pass

    # if there are links on the webpage then recursively repeat
    if level > 0:
        linkUrls = re.findall('<a .*?href="(.*?)"', urlContent)
        if len(linkUrls) > 0:
            for linkUrl in linkUrls:
                downloadImages(linkUrl, level - 1)

# main
downloadImages('http://www.yahoo.com', 1)
FB36 (author) 13 years, 6 months ago

This is also a recursive version, but it uses the Beautiful Soup library to parse the HTML:

# imageDownloader.py
# Finds and downloads all images from any given URL recursively.
# FB - 201009083
import urllib2
from os.path import basename
from urlparse import urlsplit
from BeautifulSoup import BeautifulSoup # for HTML parsing

global urlList
urlList = []

# recursively download images starting from the root URL
def downloadImages(url, level): # the root URL is level 0
    print url
    global urlList
    if url in urlList: # prevent using the same URL again
        return
    urlList.append(url)
    try:
        urlContent = urllib2.urlopen(url).read()
    except:
        return

    soup = BeautifulSoup(''.join(urlContent))
    # find and download all images
    imgTags = soup.findAll('img')
    for imgTag in imgTags:
        imgUrl = imgTag['src']
        try:
            imgData = urllib2.urlopen(imgUrl).read()
            fileName = basename(urlsplit(imgUrl)[2])
            output = open(fileName,'wb')
            output.write(imgData)
            output.close()
        except:
            pass

    # if there are links on the webpage then recursively repeat
    if level > 0:
        linkTags = soup.findAll('a')
        if len(linkTags) > 0:
            for linkTag in linkTags:
                try:
                    linkUrl = linkTag['href']
                    downloadImages(linkUrl, level - 1)
                except:
                    pass

# main
downloadImages('http://www.yahoo.com', 1)
FB36 (author) 13 years, 6 months ago

This version always stays within the same website, and the user can set a minimum file size for the image files (if 0, all images are downloaded):

# imageDownloader.py
# Finds and downloads all images from any given URL recursively.
# FB - 201009094
import urllib2
from os.path import basename
import urlparse
from BeautifulSoup import BeautifulSoup # for HTML parsing

global urlList
urlList = []

# recursively download images starting from the root URL
def downloadImages(url, level, minFileSize): # the root URL is level 0
    # do not go to other websites
    global website
    netloc = urlparse.urlsplit(url).netloc.split('.')
    if netloc[-2] + netloc[-1] != website:
        return

    global urlList
    if url in urlList: # prevent using the same URL again
        return

    try:
        urlContent = urllib2.urlopen(url).read()
        urlList.append(url)
        print url
    except:
        return

    soup = BeautifulSoup(''.join(urlContent))
    # find and download all images
    imgTags = soup.findAll('img')
    for imgTag in imgTags:
        imgUrl = imgTag['src']
        # download only the proper image files
        if imgUrl.lower().endswith('.jpeg') or \
            imgUrl.lower().endswith('.jpg') or \
            imgUrl.lower().endswith('.gif') or \
            imgUrl.lower().endswith('.png') or \
            imgUrl.lower().endswith('.bmp'):
            try:
                imgData = urllib2.urlopen(imgUrl).read()
                if len(imgData) >= minFileSize:
                    print "    " + imgUrl
                    fileName = basename(urlparse.urlsplit(imgUrl)[2])
                    output = open(fileName,'wb')
                    output.write(imgData)
                    output.close()
            except:
                pass
    print
    print

    # if there are links on the webpage then recursively repeat
    if level > 0:
        linkTags = soup.findAll('a')
        if len(linkTags) > 0:
            for linkTag in linkTags:
                try:
                    linkUrl = linkTag['href']
                    downloadImages(linkUrl, level - 1, minFileSize)
                except:
                    pass

# main
rootUrl = 'http://www.yahoo.com'
netloc = urlparse.urlsplit(rootUrl).netloc.split('.')
global website
website = netloc[-2] + netloc[-1]
downloadImages(rootUrl, 1, 50000)
closedthedoor 12 years, 8 months ago

I added this to line 8 for input.

bang = raw_input("http://")     
url = "http://"+bang

I'm just beginning Python and programming, so I've been trying to get as much experience reading code as possible. Thanks for the downloaders; I've been looking for an image downloader.

Deepak 11 years, 6 months ago

The above scripts miss grabbing images from relative URLs. The change below fixes it!

>>> img_url=url[:url.find(".com")+4]+imgUrl if (imgUrl[:4]!="http") else imgUrl
>>> imgData = urllib2.urlopen(img_url).read()
>>> fileName = basename(urlsplit(img_url)[2])

:)
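That string surgery only covers ".com" hosts, though. The standard library's urlparse.urljoin resolves relative URLs against the page URL for any domain; a minimal sketch, with placeholder URLs:

import urlparse

base = "http://www.example.org/gallery/index.html"
print urlparse.urljoin(base, "/img/logo.png")  # http://www.example.org/img/logo.png
print urlparse.urljoin(base, "thumb.jpg")      # http://www.example.org/gallery/thumb.jpg
print urlparse.urljoin(base, "http://cdn.example.net/x.png")  # absolute URLs pass through unchanged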

Goran Nikolic 10 years, 1 month ago

Hello

How do I set the download location? I'm using Windows 7 and XBMC, so when I run the .py through XBMC all images go to C:\Program Files\XBMC. How can I change the download location?

Best Regards

FB36 (author) 10 years, 1 month ago

I have updated the script.

Goran Nikolic 10 years, 1 month ago

Hello FB36, I'm glad for the new update, but now I have a problem with the script. When I run it with XBMC, XBMC turns off, and honestly I have no idea where to write the url and downloadLocationPath. If you have time, please answer me. Best Regards

FB36 (author) 10 years, 1 month ago

The updated script gets all parameters from the command-line:

[python] ImageDownloader.py URL MaxRecursionDepth DownloadLocationPath MinImageFileSize

For example:

(First open a command-line window to where the script is located.)

[python] ImageDownloader.py "http://www.yahoo.com" 2 "c:\new folder" 50000

Also make sure the download location path (directory/folder) exists before doing this.

As for XBMC, I have never used it before.
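If the folder might not exist yet, a short pre-step can create it first (a hypothetical addition, not part of the posted script):

import os
downloadLocationPath = r"c:\new folder"  # placeholder path
if not os.path.isdir(downloadLocationPath):
    os.makedirs(downloadLocationPath)  # creates intermediate folders too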

Goran Nikolic 10 years, 1 month ago

Ok, thanks a lot for your reply, I will try it and see how it works. I'm a total amateur when it comes to Python, but I have to admit that it is a very powerful thing. Best Regards

Goran Nikolic 10 years, 1 month ago

I use this version. Where do I need to type C:\New folder? Please answer me.

# imageDownloader.py
# Finds and downloads all images from any given URL.
# FB - 201009072
import urllib2
import re
from os.path import basename
from urlparse import urlsplit

url = "http://www.yahoo.com"
urlContent = urllib2.urlopen(url).read()

# HTML image tag: <img src="url" alt="some_text"/>
imgUrls = re.findall('<img .*?src="(.*?)"', urlContent)

# download all images
for imgUrl in imgUrls:
    try:
        imgData = urllib2.urlopen(imgUrl).read()
        fileName = basename(urlsplit(imgUrl)[2])
        output = open(fileName,'wb')
        output.write(imgData)
        output.close()
    except:
        pass

Dylan Sampson 9 years, 11 months ago

I'm sorry if this is a bad question, but how do you modify the code to allow it to download images that are hosted on another website (e.g. tumblr, imgur, etc)?
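For what it's worth, the image files themselves are already fetched from any host; the netloc check at the top of downloadImages only restricts which pages get crawled. Loosening that guard lets the crawler follow off-site links, at the cost of a potentially unbounded crawl. A minimal sketch (allowedHosts and sameSiteOk are hypothetical names, not in the script):

import urlparse

allowedHosts = ['tumblr', 'imgur']  # hypothetical whitelist of second-level domains

def sameSiteOk(url):
    # hypothetical helper: allow a URL if its second-level domain is whitelisted
    netloc = urlparse.urlsplit(url).netloc.split('.')
    return len(netloc) >= 2 and netloc[-2] in allowedHosts

# inside downloadImages(), the same-site guard would then become:
# if not sameSiteOk(url):
#     return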

juan 9 years, 1 month ago

Does this work for instagram users/hashtags?

If not, can anyone recommend one that does :)