Welcome, guest | Sign In | My Account | Store | Cart

This recipe downloads a web page, saves it together with a selected set of its images in a single local directory, and rewrites the image links in the saved page to point to the local copies.

Python, 83 lines
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import urllib2, re
from os import path

# Compiled regular expressions shared at module level.  Both start out
# unset and are filled in by grab() before any matching is done:
#   imre  - compiled from grab()'s wgif argument (remote image URLs)
#   gifre - compiled from grab()'s lgif argument; gifsub() reads it
imre = gifre = None

def download(url,fname):
    try:
        print "Downloading "+url+" ... ",
        furl = urllib2.urlopen(url)
        f = file(fname,'wb')
        f.write(furl.read())
        f.close()
        print "OK"
        return 1
    except:
        print "Failed"
        return 0

def gifsub(matchobj):
    """Substitution callback: map a matched image URL to its local name.

    Returns the first result of the module-level gifre pattern found
    inside the text matched by matchobj.  Raises IndexError when gifre
    finds nothing in that text.
    """
    matched_url = matchobj.group(0)
    local_names = gifre.findall(matched_url)
    return local_names[0]

# Main procedure
def grab(wurl, outdir, wfile, wgif, lgif, cachedir = 'cache',
         tmpfile = 'tmp.htm'):
    global imre, gifre
    imre = re.compile(wgif)
    gifre = re.compile(lgif)
    # path to temporary file
    tmpf = path.join(cachedir,tmpfile)
    print "Retrieving page..."
    download(wurl, tmpf)
    f = file(tmpf,'r')
    s = f.read()
    f.close()
    all = imre.findall(s)
    res = []
    res2 = []
    # Fill up result list
    for i in all:
        if i not in res:
            res.append(i)
            res2.append(gifre.findall(i)[0])
    result = zip(res, res2)

    # Replace web links with local links
    ns = re.sub(wgif,
                gifsub, s)
    f = file(path.join(outdir,wfile),'wb')
    f.write(ns)
    f.close()

    # Download images
    for i in result:
        if not path.exists(path.join(outdir,i[1])):
            download(i[0], path.join(outdir,i[1]))

    print "Done."

if __name__ == '__main__':
    # Example configuration: adjust the values below before running.

    # Document URL
    wurl = 'http://www.somesiteaddress.net/page.html'
    # Path to the local directory to save the document.
    # expanduser() is needed because neither path.join() nor the file
    # functions resolve '~'; without it the path would point at a
    # directory literally named '~' under the current directory.
    outdir = path.expanduser('~/downloads/somesite')
    # Filename for saved page in the local directory
    wfile = 'index.html'
    # Patterns for images (raw strings, so the backslashes reach the
    # regular expression engine unchanged):
    # - process all gif images from <http://img.anothersiteaddress.net/images>
    #   i.e. <http://img.anothersiteaddress.net/images/image.gif>
    wgif = r'http://img\.anothersiteaddress\.net/images/[^+]*?\.gif'
    # - replace the original image URL with the simple filename
    #   i.e. <http://img.anothersiteaddress.net/images/image.gif>
    #   will be <image.gif>
    lgif = r'[_a-zA-Z0-9]+\.gif'

    # Directory for storing temporary files
    cachedir = path.expanduser('~/downloads/temp')
    # Temporary filename
    tmpfile = 'temp.htm'

    # Call the main procedure
    grab(wurl, outdir, wfile, wgif, lgif, cachedir, tmpfile)