ActiveState Code

Recipe 114217: Multi-segment Fast File Downloader


A fast multi-part file downloader à la FlashGet, GetRight, Gozilla, etc. It currently supports only HTTP, but adding FTP wouldn't be hard. Use this instead of wget. ;-)

Python
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
# mget.py
#
# by Nelson Rush
#
# This code has been released to the Public Domain.
#
# A simple program to download files in segments.
#
import sys
import os
import asyncore
import socket
from string import *
from math import *
from time import *
from mmap import *

# Seek-origin style constants. They are not referenced by any code visible in
# this file.
# NOTE(review): these values do not match the standard os.SEEK_SET (0) /
# os.SEEK_CUR (1) / os.SEEK_END (2) numbering -- confirm before passing them
# to file.seek().
SEEK_BEG = 0
SEEK_SET = 1
SEEK_END = 2

class http_client(asyncore.dispatcher):
    """Asynchronous HTTP client that downloads a file in byte-range segments.

    Targets Python 2 (asyncore, raw_input, str-based sockets).

    An instance created with ``parts > 0`` is the *parent*: it creates and
    memory-maps the output file, issues a HEAD request to learn the file
    size, then spawns ``parts - 1`` child instances and downloads the first
    segment itself.  An instance created with ``parts == 0`` is a *child*:
    it shares the parent's mmap and downloads bytes ``pbegin``..``pend``.

    Parameters:
        host   -- server host name; the connection is always made to port 80.
        path   -- request path; the last path component names the local file.
        parts  -- number of segments (parent) or 0 (child).
        pbegin -- first byte offset of the child's segment.
        pend   -- last byte offset of the child's segment.
        m      -- mmap object shared by the parent (children only).

    Attributes read by callers: ``m`` (mmap, or None when the user declined
    to overwrite an existing file), ``f`` (file object or None), ``bytes``
    (current write offset) and ``length``.

    ``recvhead`` is the receive state: 1 = waiting for the HEAD response,
    2 = waiting for the header of the GET response, 0 = receiving body data.
    """

    # Blank line that terminates an HTTP header block.
    HEADER_END = "\r\n\r\n"

    def __init__ (self,host,path,parts,pbegin=0,pend=0,m=None):
        asyncore.dispatcher.__init__(self)
        # Initialize class member variables.
        self.done = 0
        self.h = [self]
        self.recvhead = 1
        self.bytes = 0
        self.ack = 0
        self.begin = time()
        # A bare host name yields an empty path; the request line needs "/".
        self.path = path or "/"
        self.parts = parts
        self.host = host
        self.buffer = ""
        self.pbegin = pbegin
        self.pend = pend
        self.length = 8192
        self.f = None
        # Defined up front so the attribute exists even if we abort below.
        self.m = m
        # Text of the HEAD response, accumulated until its blank line arrives.
        self.head = ""
        # Header bytes of the GET response seen so far (state 2 only).
        self._reply = ""
        # Grab the filename from the end of the URL; a path ending in "/"
        # has no final component, so fall back to a conventional name.
        self.filename = self.path.split("/")[-1] or "index.html"
        # Check if file exists and if so ask if overwrite necessary.
        # os.F_OK is the access() mode for "exists"; os.O_RDWR is an open() flag.
        if self.parts > 0 and os.access(self.filename,os.F_OK):
            u = raw_input("File already exists, overwrite? [y/N] ")
            if u == 'y' or u == 'Y':
                print("Overwriting...")
            else:
                print("Aborting...")
                # No socket was created, so asyncore.loop() has nothing to do.
                return None
        # Create a TCP/IP socket and connect to the host with it on port 80.
        print("Connecting...")
        self.create_socket(socket.AF_INET, socket.SOCK_STREAM)
        self.connect((host, 80))
        # Parts are greater than 0 so we are the parent, open file, get header.
        if self.parts > 0:
            # Open and memory map the file.
            self.f = open(self.filename,'wb+')
            self.f.write("\0")
            # mmap cannot map an empty file, so the placeholder byte has to
            # reach the file before the mapping is created.
            self.f.flush()
            self.m = mmap(self.f.fileno(),0)
            # Download the header.  HTTP/1.1 requires a populated Host field.
            self.buffer = "HEAD %s HTTP/1.1\r\nHost: %s\r\n\r\n" % (self.path,self.host)
            print("Downloading http://%s%s" % (self.host,self.path))
        # Otherwise, we are a child, skip the header and download our segment.
        elif self.parts == 0:
            # Prepare ourselves to download the segment.
            self.bytes = self.pbegin
            self.length = self.pend
            self.recvhead = 2
            self.buffer = self._get_request(self.pbegin,self.pend)
            print(self.buffer)

    def _get_request(self,first=None,last=None):
        """Return a GET request for self.path, limited to bytes first..last if given."""
        lines = ["GET %s HTTP/1.1" % self.path,
                 "Host: %s" % self.host,
                 # Ask the server to close after the response so that
                 # handle_close() fires and asyncore.loop() can finish.
                 "Connection: close"]
        if first is not None:
            lines.append("Range: bytes=%lu-%lu" % (first,last))
        return "\r\n".join(lines) + self.HEADER_END

    def _content_length(self,head):
        """Return the Content-Length value found in head, or None if absent or malformed."""
        for line in head.split("\r\n")[1:]:
            name, sep, value = line.partition(":")
            # Header field names are case-insensitive.
            if sep and name.strip().lower() == "content-length":
                try:
                    return int(value)
                except ValueError:
                    return None
        return None

    def _abort(self,message):
        """Print message, then close the socket, the mmap and the file."""
        print(message)
        self.close()
        if self.m is not None:
            self.m.close()
        if self.f is not None:
            self.f.close()

    def _report(self):
        """Print this jet's byte range and progress."""
        print("Jet %lu-%lu   %lu of %lu bytes recieved" % (self.pbegin,self.pend,self.bytes,self.length))

    def _store(self,chunk):
        """Copy chunk into the mmap at the current offset and advance the offset."""
        stop = self.bytes + len(chunk)
        self.m[self.bytes:stop] = chunk
        self.bytes = stop

    def handle_connect(self):
        pass

    def handle_read(self):
        """Receive a packet and process it according to the current state."""
        data = self.recv(8192)
        if not data:
            # asyncore's recv() has already called handle_close() here.
            if self.recvhead == 1:
                # The connection ended before a usable HEAD response arrived.
                self._abort("Unable to continue download.")
            return None
        if self.recvhead == 1:
            self._read_head_response(data)
        elif self.recvhead == 2:
            self._read_get_header(data)
        else:
            self._read_body(data)

    def _read_head_response(self,data):
        """State 1: collect the HEAD response, size the file and plan the segments."""
        self.head += data
        # The response may span several packets; wait for the blank line.
        if self.HEADER_END not in self.head:
            return None
        print(self.head)
        # If the file was not found, exit.
        if "404 Not Found" in self.head:
            self._abort("404 Not Found")
            return None
        # Accept "302 Found" or "200 OK"; anything else ends the download.
        if "302 Found" not in self.head and "200 OK" not in self.head:
            self._abort("Unable to continue download.")
            return None
        # If we cannot determine the length of the file, exit.
        length = self._content_length(self.head)
        if length is None:
            self._abort("Cannot determine size.")
            return None
        self.length = length
        self.m.resize(self.length)
        self.recvhead = 2
        # If the number of parts is 1, only get the file.
        if self.parts == 1:
            self.buffer = self._get_request()
            print(self.buffer)
            self.pbegin = 0
            self.pend = self.length
        # If the parts is greater than 1, split into segments.
        elif self.parts > 1:
            l = self.length // self.parts
            print("Segment size = %s" % l)
            # Download the other segments in separate instances: the middle
            # ones cover exactly l bytes, the last one runs to the file end.
            for i in range(1,self.parts-1):
                self.h.append(http_client(self.host,self.path,0,(i * l) + 1,(i+1) * l,self.m))
            self.h.append(http_client(self.host,self.path,0,((self.parts-1) * l) + 1,self.length,self.m))
            # Set up the parent download, from beginning of file to segment size.
            self.buffer = self._get_request(0,l)
            self.length = l
            self.pbegin = 0
            self.pend = self.length
            print(self.buffer)

    def _read_get_header(self,data):
        """State 2: skip the GET response header and store any body bytes after it."""
        self._reply += data
        end = self._reply.find(self.HEADER_END)
        if end == -1:
            # Header not complete yet; slicing now would write header bytes
            # into the file.
            return None
        body = self._reply[end + len(self.HEADER_END):]
        self._reply = ""
        self.recvhead = 0
        size = len(body)
        if size > 0:
            # Write what we have to the file.
            self._store(body)
            # Keep track of position and inform the user.
            if size // 1024 == 0:
                self.ack = size
            else:
                self._report()

    def _read_body(self,data):
        """State 0: store body bytes and report progress roughly once per KiB."""
        size = len(data)
        self._store(data)
        # Packets under 1 KiB are tallied in self.ack; larger ones are
        # reported immediately.
        if size // 1024 == 0:
            self.ack += size
        else:
            self._report()
        if self.ack // 1024 > 0:
            self._report()
            # Keep only the remainder below 1 KiB.
            self.ack %= 1024

    # Check to see if the buffer is clear.
    def writable(self):
        return(len(self.buffer) > 0)

    # Handle transmission of the data.
    def handle_write(self):
        sent = self.send(self.buffer)
        self.buffer = self.buffer[sent:]

    # Handle closing of the connection.
    def handle_close(self):
        self.complete = time()
        # Ranges are inclusive, so a finished segment ends one byte past
        # self.length; trim the counter for the final report.
        if self.bytes > self.length: self.bytes = self.bytes - 1
        self._report()
        self.close()

# Main
if __name__ == '__main__':
    # Usage: mget.py url [parts]
    # Downloads url in `parts` parallel segments (default 3) into the
    # current directory.
    from urlparse import urlparse
    if len(sys.argv) < 2:
        print('usage: %s url [parts]' % sys.argv[0])
    else:
        url = sys.argv[1]
        # Only a leading scheme counts; "http://" appearing later in the
        # string must not stop the prefix from being added.
        if not url.startswith("http://"):
            url = "http://" + url
        url = urlparse(url)
        parts = 3
        if len(sys.argv) > 2:
            # The parent needs at least one segment to open the file.
            parts = max(1, int(sys.argv[2]))
        client = http_client(url[1],url[2],parts)
        asyncore.loop()
        # No mapping exists when the user declined to overwrite an existing
        # file, so there is nothing to close or report in that case.
        mapping = getattr(client,'m',None)
        if mapping is not None:
            mapping.close()
            client.f.close()
            print("Client jet 1 finished at %s of %s" % (client.bytes,client.length))

Discussion

Handy for downloading files. Could be extended to support new features: a user-friendly GUI, helper jets (which help out other jets once they have finished their own segment), password support, archiving and tracking of downloads in progress or already completed, etc. Use your imagination.

The first version I made of this used regular old files but I decided to go with mmap'd files instead because they're more enjoyable to work with.

Comments

  1. 1. At 10:40 a.m. on 26 apr 2002, Steven Yong said:

    can this be done in ActiveX Control. Hi,

    I am not familiar with Python but I want to build an activeX control in which the IE can use this object to speed up the download of a resource e.g. Flash movie. If you can guide me how to write it in ActiveX Control..that will be great...but if not, I hope you can explain the concept behind this in details possible. I am sorry to bug you here but I have been searching high and low for something that can guide me and apparently I only found yours which is the one and only I can find so far.

    I would really appreciate your help and guidance if you can generously help me. Thanks in advance and I really mean it.

    from, Steven

  2. 2. At 9:55 p.m. on 14 jan 2003, Santosh Pillai said:

    Help. Hi! there Mr.Steven and Mr.Nelson, I am Santosh.M.Pillai a software engineer from india. Well friends problem is also as similar to the ones stated by Mr.Steven. I too want to make an Win32 running application and I am not familiat with Python so I am unable to understand the logic and convert the pythyon code in Win32 application. I too searched the Web for information on File segmentation and mutiple connection download, logic details but couldn't find any. If u people have any(any) information eg.URL or documentation regarding these topics please help me. I hope u people will help to solve this problem. Thanksxx in advance. Expecting a reply soon.{ pillai_santosh@indiatimes.com ]. Thankyou.

  3. 3. At 10:41 a.m. on 10 dec 2007, David Loaiza said:

    For Linux and Python 2.5. This is the code that works on Linux ...

    <pre> import sys import os import asyncore import socket from string import * from math import * from time import * from mmap import *

    SEEK_BEG = 0 SEEK_SET = 1 SEEK_END = 2

    class http_client(asyncore.dispatcher): def __init__ (self,host,path,parts,pbegin=0,pend=0,m=None): asyncore.dispatcher.__init__(self) # Initialize class member variables. self.done = 0 self.h = [self] self.recvhead = 1 self.bytes = 0 self.ack = 0 self.begin = time() self.path = path self.parts = parts self.host = host self.buffer = "" self.pbegin = pbegin self.pend = pend self.length = 8192 self.f = None # Grab the filename from the end of the URL. self.filename = split(path,"/")[-1] # Check if file exists and if so ask if overwrite necessary. if os.access(self.filename,os.O_RDWR) and self.parts > 0: u = raw_input("File already exists, overwrite? [y/N] ") if u == 'y' or u == 'Y': print "Overwriting..." else: print "Aborting..." return None # Create a TCP/IP socket and connect to the host with it on port 80. print "Connecting..." self.create_socket(socket.AF_INET, socket.SOCK_STREAM) self.connect((host, 80)) # Parts are greater than 0 so we are the parent, open file, get header. if self.parts > 0: # Open and memory map the file. self.f = open(self.filename,'wb+') self.f.write("\0") self.f.tell() self.m = mmap(self.f.fileno(),os.fstat(self.f.fileno()).st_size) # Download the header. self.buffer = "HEAD %s HTTP/1.1\r\nHost: %s\r\n\r\n" %(self.path,self.host) print "Downloading http://%s%s" % (self.host,self.path) # Otherwise, we are a child, skip the header and download our segment. elif self.parts == 0: # Set our own mmap to the one given to us by the parent. self.m = m # Prepare ourselves to download the segment. self.bytes = self.pbegin self.length = self.pend self.recvhead = 2 self.buffer = "GET %s HTTP/1.1\r\nHost: %s\r\nRange: bytes=%lu-%lu\r\n\r\n" % (self.path,self.host,self.pbegin,self.pend) print self.buffer def handle_connect(self): pass def handle_read(self): # Recieve incoming data. data = self.recv(8192) # Handle recieving the header, stage 1. if self.recvhead == 1: self.head = data print self.head # If the file was not found, exit. 
if find(data,"404 Not Found") > -1: print "404 Not Found"

    (comment continued...)

  4. 4. At 10:41 a.m. on 10 dec 2007, David Loaiza said:

    (...continued from previous comment)

                self.close()
                self.m.close()
                self.f.close()
                return None
            # Was it found, if not just check if OK.
            if find(data,"302 Found") == -1:
                # If we did not recieve the OK, exit.
                if find(data,"200 OK") == -1:
                    print "Unable to continue download."
                    self.close()
                    self.m.close()
                    self.f.close()
                    return None
            # If we cannot determine the length of the file, exit.
            if find(data,"Content-Length") == -1:
                print "Cannot determine size."
                self.close()
                self.m.close()
                self.f.close()
                return None
            # Determine the length of the file.
            line = self.head[find(self.head,"Content-Length"):]
            line = line[:find(line,"\r\n")]
            line = line[find(line,":")+1:]
            self.length = atoi(line)
            print self.length
            self.m.resize(self.length)
            self.recvhead = 2
            # If the number of parts is 1, only get the file.
            if self.parts == 1:
                self.buffer = "GET %s HTTP/1.1\r\nHost: %s\r\n\r\n" %(self.path,self.host)
                print self.buffer
                self.pbegin = 0
                self.pend = self.length
            # If the parts is greater than 1, split into segments.
            elif self.parts > 1:
                l = self.length / self.parts
                print "Segment size =",l
                # Download the other segments in separate instances.
                if self.parts == 2:
                    self.h.append(http_client(self.host,self.path,0,l + 1,self.length,self.m))
                if self.parts > 2:
                    for i in range(1,self.parts-1):
                        self.h.append(http_client(self.host,self.path,0,(i * l) + 1,(i+1) * l,self.m))
                    self.h.append(http_client(self.host,self.path,0,((i+1) * l) + 1,self.length,self.m))
                # Set up the parent download, from beginning of file to segment size.
                self.buffer = "GET %s HTTP/1.1\r\nHost: %s\r\nRange: bytes=0-%lu\r\n\r\n" % (self.path,self.host,l)
                self.length = l
                self.pbegin = 0
                self.pend = self.length
                print self.buffer
        # Stage 2, clip the second incoming header and start grabbing the file itself.
        elif self.recvhead == 2:
            # A blank line specifies the end of the header.
            body = data[find(data,"\r\n\r\n")+4:]
            size = len(body)
            if size > 0:
                # Write what we have to the file.
                self.m[self.bytes:self.bytes+size] = body
                self.bytes += size
    

    (comment continued...)

  5. 5. At 10:41 a.m. on 10 dec 2007, David Loaiza said:

    (...continued from previous comment)

                # Keep track of position and inform the user.
                if len(xrange(size / 1024)) == 0:
                    self.ack = size
                else: pass
                    #print "Jet %lu-%lu   %lu of %lu bytes recieved" % (self.pbegin,self.pend,self.bytes,self.length)
            self.recvhead = 0
        # Just download the rest of the file.
        else:
            size = len(data)
            dataend = self.bytes + size
            self.m[self.bytes:dataend] = data
            self.bytes += size
            # Keep track of position and inform the user.
            if len(xrange(size / 1024)) == 0:
                self.ack += size
            else: pass
                #print "Jet %lu-%lu   %lu of %lu bytes recieved" % (self.pbegin,self.pend,self.bytes,self.length)
            if len(range(self.ack / 1024)) > 0:
                #print "Jet %lu-%lu   %lu of %lu bytes recieved" % (self.pbegin,self.pend,self.bytes,self.length)
                self.ack -= (1024 * len(xrange(self.ack / 1024)))
    # Check to see if the buffer is clear.
    def writable(self):
        return(len(self.buffer) > 0)
    # Handle transmission of the data.
    def handle_write(self):
        sent = self.send(self.buffer)
        self.buffer = self.buffer[sent:]
    # Handle closing of the connection.
    def handle_close(self):
        self.complete = time()
        if self.bytes > self.length: self.bytes = self.bytes - 1
        #print "Jet %lu-%lu   %lu of %lu bytes recieved" % (self.pbegin,self.pend,self.bytes,self.length)
        self.close()
    

    Main

    if __name__ == '__main__': from urlparse import * if len(sys.argv) This is the the code for work on linux ...

    <pre> import sys import os import asyncore import socket from string import * from math import * from time import * from mmap import *

    SEEK_BEG = 0 SEEK_SET = 1 SEEK_END = 2

    class http_client(asyncore.dispatcher): def __init__ (self,host,path,parts,pbegin=0,pend=0,m=None): asyncore.dispatcher.__init__(self) # Initialize class member variables

Sign in to comment