ActiveState Code

Recipe 576467: Downloading a numbered series of pictures from the web


Give this program the URL of one picture and it will try to download the whole series of existing pictures. Example: downloader.py http://www.example.com/picture01.jpg

Python
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
"""
Download a numbered series of pictures from the web, starting from the url
of one picture of the series.

By bussiere bussiere @at gmail.com
"""

# Recipe metadata kept as plain module-level strings (author, contact, title,
# description, discussion and search tags). The code shown in this file does
# not read these names; they only describe the recipe.
__Author__ ="bussiere"
__Email__ = "bussiere @at gmail.com"
__Titre__ = "downloading a serial list of picture from the web"
__Description__ = "give this program an url from a pciture and it will try to download all the list of existing picture exemple : downloader.py http://www.example.com/picture01.jpg"
__Discussion__ = "some times i've got a link like http://www.example.com/picture01.jpg and i guess that there are more picture of it this program will try to find them all and download them all"
__Tags__ ="download picture pictures jpg JPG url list series serie"
import re
import sys
import time,datetime
import urllib2

def _number_candidates(number, width):
    """Return the spellings of *number* to try, widest zero padding first."""
    candidates = []
    for pad in range(width, 0, -1):
        text = str(number).zfill(pad)
        # zfill() stops padding once the number is wide enough, so narrower
        # widths can repeat the same text: request each spelling only once
        if text not in candidates:
            candidates.append(text)
    return candidates


def _fetch(picture_url):
    """Return the raw body of *picture_url*, always closing the connection."""
    remote_file = urllib2.urlopen(picture_url)
    try:
        return remote_file.read()
    finally:
        remote_file.close()


def download_pictures(url, start=1, limit=999, trial=3):
    """Download the numbered pictures of a series guessed from one sample url.

    The last run of digits placed directly before a dot in *url* (the "01"
    of http://www.example.com/picture01.jpg) is taken as the picture number.
    It is replaced by every number from *start* up to, but not including,
    *limit*. Each number is first requested zero padded to the width found
    in *url*, then with less and less padding, so 0001.jpg, 01.jpg and 1.jpg
    style series are all found.

    Every picture is written to the current directory as
    "<epoch>_<number><rest of the url after the number>", where <epoch> is
    the time, in whole seconds, at which the call started. The number keeps
    the names of one run distinct even when several pictures are saved
    within the same second.

    Parameters:
        url   -- url of one picture of the series
        start -- first picture number to request
        limit -- number at which to stop (exclusive)
        trial -- how many unreachable numbers are tolerated; the download
                 stops as soon as more than *trial* numbers have failed

    The download also stops at the first answer whose body contains the
    marker "HTML", because some servers answer a missing picture with an
    HTML error page instead of an HTTP error.

    Raises ValueError when *url* is empty or holds no number followed by a
    dot. Errors raised while writing a picture to disk are not swallowed.
    """
    if not url:
        raise ValueError("a picture url is required")
    # the last match is the one in the file name; earlier digit runs may
    # belong to the host name or to a directory
    numbers = list(re.finditer(r"[0-9]+(?=\.)", url))
    if not numbers:
        raise ValueError("no number followed by a '.' in url %r" % (url,))
    begin, end = numbers[-1].span()
    width = end - begin
    prefix = url[:begin]
    suffix = url[end:]

    # command line callers may hand the numbers over as strings
    start = int(start)
    limit = int(limit)
    trial = int(trial)

    # one stamp for the whole run, used as prefix of every saved file
    stamp = int(time.mktime(datetime.datetime.now().timetuple()))
    failures = 0
    number = start
    while number < limit:
        data = None
        found = None
        for text in _number_candidates(number, width):
            try:
                data = _fetch("%s%s%s" % (prefix, text, suffix))
            except Exception:
                # best effort: the network layer raises many unrelated error
                # types; unlike a bare except this still lets
                # KeyboardInterrupt and SystemExit through
                continue
            found = text
            break
        if data is None:
            # no spelling of this number could be fetched
            failures += 1
            if failures > trial:
                break
        elif b"HTML" in data:
            # an HTML page instead of a picture: the series is over
            break
        else:
            name = "%s_%s%s" % (stamp, found, suffix)
            with open(name, "wb") as local_file:
                local_file.write(data)
        number += 1
        

        
    


def main(argv=None):
    """Parse the command line and start the download.

    Expected arguments: url [start [limit [trial]]]

        url   -- url of one picture of the series (required)
        start -- first picture number to request, 1 when omitted
        limit -- number at which to stop, start + 999 when omitted
        trial -- how many failed numbers are tolerated, 3 when omitted

    *argv* is the full argument list including the program name; sys.argv
    is used when it is None. Arguments after the fourth option are ignored.

    Returns 0 once download_pictures has finished and 2 when the arguments
    are unusable, so the value can be passed straight to sys.exit().
    """
    if argv is None:
        argv = sys.argv
    options = list(argv[1:5])
    if not options:
        sys.stderr.write("usage: downloader.py url [start [limit [trial]]]\n")
        return 2
    url = options[0]
    # command line values are strings; the downloader counts with integers
    try:
        numbers = [int(option) for option in options[1:]]
    except ValueError:
        sys.stderr.write("start, limit and trial must be integers\n")
        return 2
    # pad with None so every omitted option falls back to its default
    numbers.extend([None] * (3 - len(numbers)))
    start, limit, trial = numbers
    if start is None:
        start = 1
    if limit is None:
        limit = 999 + start
    if trial is None:
        trial = 3
    try:
        download_pictures(url, start, limit, trial)
    except ValueError as error:
        # a ValueError coming out of the download is reported as bad input
        # instead of ending the program with a traceback
        sys.stderr.write("%s\n" % (error,))
        return 2
    return 0


if __name__ == "__main__":
    # script entry point: sys is made available at module level here, then
    # whatever main() returns becomes the exit status of the process
    import sys
    raise SystemExit(main())


    
    

Discussion

Sometimes I have a link like http://www.example.com/picture01.jpg and I guess that there are more pictures in the same series. This program tries to find them all and download them all.

Sign in to comment