Welcome, guest | Sign In | My Account | Store | Cart

A fast multi-part file downloader à la FlashGet, GetRight, Gozilla, etc. It currently supports only HTTP, but adding FTP wouldn't be hard. Use this instead of wget. ;-)

Python, 213 lines
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
# mget.py
#
# by Nelson Rush
#
# MIT/X license.
#
# A simple program to download files in segments.
#
# - Fixes by David Loaiza for Python 2.5 added.
# - Nelson added fixes to bring the code to 2.7 and add portability between Windows/Linux.
# - The output of segment information has been corrected and cleaned up.
# - In outside cases where the client instances were not being closed, they are now closed.
#
import sys
import os
import asyncore
import socket
import platform
from string import *
from math import *
from time import *
from mmap import *

platformName = platform.system()
SEEK_BEG = 0
SEEK_SET = 1
SEEK_END = 2

class http_client(asyncore.dispatcher):
    def __init__ (self,host,path,parts,pbegin=0,pend=0,m=None):
        asyncore.dispatcher.__init__(self)
        # Initialize class member variables.
        self.keepalive = False
        self.done = 0
        self.h = [self]
        self.recvhead = 1
        self.bytes = 0
        self.ack = 0
        self.begin = time()
        self.path = path
        self.parts = parts
        self.host = host
        self.buffer = ""
        self.pbegin = pbegin
        self.pend = pend
        self.length = 8192
        self.f = None
        # Grab the filename from the end of the URL.
        self.filename = split(path,"/")[-1]
        # Check if file exists and if so ask if overwrite necessary.
        if os.access(self.filename,os.O_RDWR) and self.parts > 0:
            u = raw_input("File already exists, overwrite? [y/N] ")
            if u == 'y' or u == 'Y':
                print "Overwriting..."
            else:
                print "Aborting..."
                return None
        # Connect to the host with it on port 80.
        print "Connecting..."
        self.create_socket(socket.AF_INET, socket.SOCK_STREAM)
        self.connect((host, 80))
        # Parts are greater than 0 so we are the parent, open file, get header.
        if self.parts > 0:
            # Open and memory map the file.
            self.f = open(self.filename,'wb+')
            self.f.write("\0")
            self.f.flush() # We have to flush to make the file buffer ready for mmap.
            # Windows uses 0 for second parameter to auto-size to current file size.
            # Whereas, on Linux and other platforms, mmap requires a size.
            if platformName == "Windows":
                self.m = mmap(self.f.fileno(), 0)
            else:
                self.m = mmap(self.f.fileno(), os.fstat(self.f.fileno()).st_size)
            # Download the header.
            self.buffer = "HEAD %s HTTP/1.1\r\nHost: %s\r\n\r\n" % (self.path,self.host)
            print "Downloading http://%s%s" % (self.host,self.path)
        # Otherwise, we are a child, skip the header and download our segment.
        elif self.parts == 0:
            # Set our own mmap to the one given to us by the parent.
            self.m = m
            # Prepare ourselves to download the segment.
            self.bytes = self.pbegin
            self.length = self.pend
            self.recvhead = 2
            self.buffer = "GET %s HTTP/1.1\r\nHost: %s\r\nRange: bytes=%lu-%lu\r\n\r\n" % (self.path,self.host,self.pbegin,self.pend)
            print self.buffer
    def handle_connect(self):
        pass
    def handle_read(self):
        # Recieve incoming data.
        data = self.recv(8192)
        # Handle recieving the header, stage 1.
        if self.recvhead == 1:
            self.head = data
            print self.head
            # If the file was not found, exit.
            if find(data,"404 Not Found") > -1:
                print "404 Not Found"
                self.close()
                self.m.close()
                self.f.close()
                return None
            # Was it found, if not just check if OK.
            if find(data,"302 Found") == -1:
                # If we did not recieve the OK, exit.
                if find(data,"200 OK") == -1:
                    print "Unable to continue download."
                    self.close()
                    self.m.close()
                    self.f.close()
                    return None
            # If we cannot determine the length of the file, exit.
            if find(data,"Content-Length") == -1:
                print "Cannot determine size."
                self.close()
                self.m.close()
                self.f.close()
                return None
            # Determine the length of the file.
            line = self.head[find(self.head,"Content-Length"):]
            line = line[:find(line,"\r\n")]
            line = line[find(line,":")+1:]
            self.length = int(line)
            self.m.resize(self.length)
            self.recvhead = 2
            # If the number of parts is 1, only get the file.
            if self.parts == 1:
                self.buffer = "GET %s HTTP/1.1\r\nHost: %s\r\n\r\n" % (self.path,self.host)
                print self.buffer
                self.pbegin = 0
                self.pend = self.length
            # If the parts is greater than 1, split into segments.
            elif self.parts > 1:
                l = self.length / self.parts
                print "Segment size =",l
                # Download the other segments in separate instances.
                if self.parts == 2:
                    self.h.append(http_client(self.host,self.path,0,l + 1,self.length,self.m))
                if self.parts > 2:
                    for i in range(1,self.parts-1):
                        self.h.append(http_client(self.host,self.path,0,(i * l) + 1,(i+1) * l,self.m))
                    self.h.append(http_client(self.host,self.path,0,((i+1) * l) + 1,self.length,self.m))
                # Set up the parent download, from beginning of file to segment size.
                self.buffer = "GET %s HTTP/1.1\r\nHost: %s\r\nRange: bytes=0-%lu\r\n\r\n" % (self.path,self.host,l)
                self.length = l
                self.pbegin = 0
                self.pend = self.length
                print self.buffer
        # Stage 2, clip the second incoming header and start grabbing the file itself.
        elif self.recvhead == 2:
            # A blank line specifies the end of the header.
            body = data[find(data,"\r\n\r\n")+4:]
            size = len(body)
            if size > 0:
                # Write what we have to the file.
                self.m[self.bytes:self.bytes+size] = body
                self.bytes += size
                # Keep track of position and inform the user every 1k downloaded.
                if len(xrange(size / 1024)) == 0:
                    self.ack = size
                else:
                    print "Segment %7lu-%7lu\t\ts%7lu to %7lu bytes recieved" % (self.pbegin,self.pend,self.bytes-size,self.bytes-1)
                if self.bytes >= self.length:
                    self.complete = time()
                    self.close()
            self.recvhead = 0
        # Just download the rest of the file.
        else:
            size = len(data)
            dataend = self.bytes + size
            self.m[self.bytes:dataend] = data
            self.bytes += size
            # Keep track of position and inform the user every 1k downloaded.
            if len(xrange(size / 1024)) == 0:
                self.ack += size
            else:
                print "Segment %7lu-%7lu\t\t%7lu to %7lu bytes recieved" % (self.pbegin,self.pend,self.bytes-size,self.bytes-1)
            if len(range(self.ack / 1024)) > 0:
                print "Segment %7lu-%7lu\t\t%7lu to %7lu bytes recieved" % (self.pbegin,self.pend,self.bytes-size,self.bytes-1)
                self.ack -= (1024 * len(xrange(self.ack / 1024)))
            if self.bytes >= self.length:
                self.complete = time()
                self.close()
    # Check to see if the buffer is clear.
    def writable(self):
        return(len(self.buffer) > 0)
    # Handle transmission of the data.
    def handle_write(self):
        sent = self.send(self.buffer)
        self.buffer = self.buffer[sent:]
    # Handle closing of the connection.
    def handle_close(self):
        self.complete = time()
        if self.bytes > self.length:
            self.bytes = self.bytes - 1
        print "Segment %7lu-%7lu\t\t%7lu to %7lu bytes recieved" % (self.pbegin,self.pend,self.bytes,self.length)
        self.close()

# Main
if __name__ == '__main__':
    from urlparse import *
    if len(sys.argv) < 2:
        print 'usage: %s host' % sys.argv[0]
    else:
       url = sys.argv[1]
       if find(url,"http://") == -1:
           url = "http://" + url
       url = urlparse(url)
       client = http_client(url[1],url[2],3)
       asyncore.loop()
       client.m.close()
       client.f.close()
       print "Client download finished."

Handy for downloading files. Could be extended to support new features: a user-friendly GUI, helper segments (which help out other segments once they have finished their own), password support, archival and tracking of downloads in progress or already completed, etc. Use your imagination. The first version I made of this used regular old files, but I decided to go with mmap'd files instead because they're more enjoyable to work with.

This has been updated for Python 2.7.

7 comments

Steven Yong 21 years, 12 months ago  # | flag

can this be done in ActiveX Control. Hi,

I am not familiar with Python but I want to build an activeX control in which the IE can use this object to speed up the download of a resource e.g. Flash movie. If you can guide me how to write it in ActiveX Control..that will be great...but if not, I hope you can explain the concept behind this in details possible. I am sorry to bug you here but I have been searching high and low for something that can guide me and apparently I only found yours which is the one and only I can find so far.

I would really appreciate your help and guidance if you can generously help me. Thanks in advance and I really mean it.

from, Steven

Santosh Pillai 21 years, 3 months ago  # | flag

Help. Hi! there Mr.Steven and Mr.Nelson, I am Santosh.M.Pillai a software engineer from india. Well friends problem is also as similar to the ones stated by Mr.Steven. I too want to make an Win32 running application and I am not familiat with Python so I am unable to understand the logic and convert the pythyon code in Win32 application. I too searched the Web for information on File segmentation and mutiple connection download, logic details but couldn't find any. If u people have any(any) information eg.URL or documentation regarding these topics please help me. I hope u people will help to solve this problem. Thanksxx in advance. Expecting a reply soon.{ pillai_santosh@indiatimes.com ]. Thankyou.

David Loaiza 16 years, 4 months ago  # | flag

For Linux and Python 2.5. This is the code that works on Linux ...

<pre> import sys import os import asyncore import socket from string import * from math import * from time import * from mmap import *

SEEK_BEG = 0 SEEK_SET = 1 SEEK_END = 2

class http_client(asyncore.dispatcher): def __init__ (self,host,path,parts,pbegin=0,pend=0,m=None): asyncore.dispatcher.__init__(self) # Initialize class member variables. self.done = 0 self.h = [self] self.recvhead = 1 self.bytes = 0 self.ack = 0 self.begin = time() self.path = path self.parts = parts self.host = host self.buffer = "" self.pbegin = pbegin self.pend = pend self.length = 8192 self.f = None # Grab the filename from the end of the URL. self.filename = split(path,"/")[-1] # Check if file exists and if so ask if overwrite necessary. if os.access(self.filename,os.O_RDWR) and self.parts > 0: u = raw_input("File already exists, overwrite? [y/N] ") if u == 'y' or u == 'Y': print "Overwriting..." else: print "Aborting..." return None # Create a TCP/IP socket and connect to the host with it on port 80. print "Connecting..." self.create_socket(socket.AF_INET, socket.SOCK_STREAM) self.connect((host, 80)) # Parts are greater than 0 so we are the parent, open file, get header. if self.parts > 0: # Open and memory map the file. self.f = open(self.filename,'wb+') self.f.write("\0") self.f.tell() self.m = mmap(self.f.fileno(),os.fstat(self.f.fileno()).st_size) # Download the header. self.buffer = "HEAD %s HTTP/1.1\r\nHost: %s\r\n\r\n" %(self.path,self.host) print "Downloading http://%s%s" % (self.host,self.path) # Otherwise, we are a child, skip the header and download our segment. elif self.parts == 0: # Set our own mmap to the one given to us by the parent. self.m = m # Prepare ourselves to download the segment. self.bytes = self.pbegin self.length = self.pend self.recvhead = 2 self.buffer = "GET %s HTTP/1.1\r\nHost: %s\r\nRange: bytes=%lu-%lu\r\n\r\n" % (self.path,self.host,self.pbegin,self.pend) print self.buffer def handle_connect(self): pass def handle_read(self): # Recieve incoming data. data = self.recv(8192) # Handle recieving the header, stage 1. if self.recvhead == 1: self.head = data print self.head # If the file was not found, exit. 
if find(data,"404 Not Found") > -1: print "404 Not Found"

(comment continued...)

David Loaiza 16 years, 4 months ago  # | flag

(...continued from previous comment)

            self.close()
            self.m.close()
            self.f.close()
            return None
        # Was it found, if not just check if OK.
        if find(data,"302 Found") == -1:
            # If we did not recieve the OK, exit.
            if find(data,"200 OK") == -1:
                print "Unable to continue download."
                self.close()
                self.m.close()
                self.f.close()
                return None
        # If we cannot determine the length of the file, exit.
        if find(data,"Content-Length") == -1:
            print "Cannot determine size."
            self.close()
            self.m.close()
            self.f.close()
            return None
        # Determine the length of the file.
        line = self.head[find(self.head,"Content-Length"):]
        line = line[:find(line,"\r\n")]
        line = line[find(line,":")+1:]
        self.length = atoi(line)
        print self.length
        self.m.resize(self.length)
        self.recvhead = 2
        # If the number of parts is 1, only get the file.
        if self.parts == 1:
            self.buffer = "GET %s HTTP/1.1\r\nHost: %s\r\n\r\n" %(self.path,self.host)
            print self.buffer
            self.pbegin = 0
            self.pend = self.length
        # If the parts is greater than 1, split into segments.
        elif self.parts > 1:
            l = self.length / self.parts
            print "Segment size =",l
            # Download the other segments in separate instances.
            if self.parts == 2:
                self.h.append(http_client(self.host,self.path,0,l + 1,self.length,self.m))
            if self.parts > 2:
                for i in range(1,self.parts-1):
                    self.h.append(http_client(self.host,self.path,0,(i * l) + 1,(i+1) * l,self.m))
                self.h.append(http_client(self.host,self.path,0,((i+1) * l) + 1,self.length,self.m))
            # Set up the parent download, from beginning of file to segment size.
            self.buffer = "GET %s HTTP/1.1\r\nHost: %s\r\nRange: bytes=0-%lu\r\n\r\n" % (self.path,self.host,l)
            self.length = l
            self.pbegin = 0
            self.pend = self.length
            print self.buffer
    # Stage 2, clip the second incoming header and start grabbing the file itself.
    elif self.recvhead == 2:
        # A blank line specifies the end of the header.
        body = data[find(data,"\r\n\r\n")+4:]
        size = len(body)
        if size > 0:
            # Write what we have to the file.
            self.m[self.bytes:self.bytes+size] = body
            self.bytes += size

(comment continued...)

David Loaiza 16 years, 4 months ago  # | flag

(...continued from previous comment)

            # Keep track of position and inform the user.
            if len(xrange(size / 1024)) == 0:
                self.ack = size
            else: pass
                #print "Jet %lu-%lu   %lu of %lu bytes recieved" % (self.pbegin,self.pend,self.bytes,self.length)
        self.recvhead = 0
    # Just download the rest of the file.
    else:
        size = len(data)
        dataend = self.bytes + size
        self.m[self.bytes:dataend] = data
        self.bytes += size
        # Keep track of position and inform the user.
        if len(xrange(size / 1024)) == 0:
            self.ack += size
        else: pass
            #print "Jet %lu-%lu   %lu of %lu bytes recieved" % (self.pbegin,self.pend,self.bytes,self.length)
        if len(range(self.ack / 1024)) > 0:
            #print "Jet %lu-%lu   %lu of %lu bytes recieved" % (self.pbegin,self.pend,self.bytes,self.length)
            self.ack -= (1024 * len(xrange(self.ack / 1024)))
# Check to see if the buffer is clear.
def writable(self):
    """Report whether any outgoing data is still queued in ``self.buffer``."""
    pending = len(self.buffer)
    return pending > 0
# Handle transmission of the data.
def handle_write(self):
    """Send the queued data and keep whatever part was not accepted."""
    count = self.send(self.buffer)
    remaining = self.buffer[count:]
    self.buffer = remaining
# Handle closing of the connection.
def handle_close(self):
    """Stamp the completion time, trim a one-byte overshoot and close."""
    self.complete = time()
    overshoot = self.bytes > self.length
    if overshoot:
        self.bytes -= 1
    # Progress output is intentionally silent in this variant.
    self.close()

Main

if __name__ == '__main__': from urlparse import * if len(sys.argv) This is the the code for work on linux ...

<pre> import sys import os import asyncore import socket from string import * from math import * from time import * from mmap import *

SEEK_BEG = 0 SEEK_SET = 1 SEEK_END = 2

class http_client(asyncore.dispatcher): def __init__ (self,host,path,parts,pbegin=0,pend=0,m=None): asyncore.dispatcher.__init__(self) # Initialize class member variables

a 13 years, 12 months ago  # | flag

atoi() is deprecated. you should use int() instead

jose 11 years ago  # | flag

How can I download files from servers that make me wait a few seconds before the download starts?