A small effort to store the Python recipes locally.
Similar efforts by other people: recipe 543267-i-will-download-all-of-the-recipes-from-the-python and recipe 535162-i-download-all-the-python-cookbook-recipes.
# Just a try at using the threading module.
import urllib as ul
import bs4 as bs
import urlparse as up
import re as re
import os.path as op
import Queue as que
import time
import threading
# Matches any string that contains a run of at least four consecutive digits.
# Used to filter the link paths found on a listing page (presumably the digits
# are the numeric recipe id -- verify against the site's URL scheme).
# Raw string: "\d" is not a valid string escape, so the non-raw form triggers
# an invalid-escape warning on newer Python versions.
pat = re.compile(r'.*[\d]{4,7}.*')
# Progress counter of downloaded files; updated by the download threads.
count = 0
class dldfile(threading.Thread):
    """Worker thread that downloads exactly one job taken from a queue.

    A job is a ``(url, filename)`` tuple. The thread appends ``self.ad`` to
    the URL, saves the response to ``filename`` with ``ul.urlretrieve`` and
    marks the job done on the queue.

    Parameters:
        qu1: queue the ``(url, filename)`` job is read from.
        max_attempts: how many times the download is tried before giving up
            (values below 1 are treated as 1).
    """

    # Serialises updates of the module-level ``count``: many download threads
    # run at once and ``count = count + 1`` is not atomic.
    _count_lock = threading.Lock()

    def __init__(self, qu1, max_attempts=3):
        threading.Thread.__init__(self)
        self.qu1 = qu1
        self.ad = 'download/1/'
        self.max_attempts = max(1, max_attempts)

    def run(self):
        """Fetch one job from the queue and download it, retrying on failure."""
        global count
        job = self.qu1.get()
        try:
            url, filename = job
            # Use ``target = url`` instead to download the whole web page
            # rather than the recipe ONLY.
            # The suffix is applied to a separate name so a retry never sees
            # an already-suffixed URL (re-queueing the modified URL used to
            # produce ".../download/1/download/1/").
            target = url + self.ad
            for attempt in range(1, self.max_attempts + 1):
                try:
                    ul.urlretrieve(target, filename)
                except Exception as err:
                    # Bounded retry loop instead of recursing into run().
                    if attempt < self.max_attempts:
                        # Trailing comma: Python 2 print stays on the same line.
                        print(" RE-TRYING "),
                    else:
                        print(" GIVING UP on %s: %s" % (target, err))
                    continue
                with self._count_lock:
                    count = count + 1
                    done = count
                print("%s(%s) %s" % (done, threading.activeCount(), filename))
                break
        finally:
            # Exactly one task_done() per get(), whatever happened above, so
            # that the producer's join() cannot hang.
            self.qu1.task_done()
class dload(threading.Thread):
    """Thread that processes one listing page and downloads the recipes on it.

    The page number is read from ``qu``; the page URL is ``url`` followed by
    that number. Every element with class ``recipe-title`` whose link path
    matches ``pat`` is queued as a ``(recipe_url, local_filename)`` job and
    then downloaded by ``dldfile`` threads.

    Parameters:
        qu: queue the page number is read from; ``task_done()`` is called on
            it once the page has been handled (successfully or not).
        url: listing URL prefix to which the page number is appended.
    """

    def __init__(self, qu, url="http://code.activestate.com/recipes/langs/python/?page="):
        threading.Thread.__init__(self)
        self.url = url
        self.q = que.Queue()   # (recipe_url, filename) jobs found on the page
        self.qu = qu

    def run(self):
        """Scrape one listing page, queue its recipes and download them."""
        ind = self.qu.get()
        try:
            url = self.url + str(ind)
            page_tag = str(url).split('/')[-1]
            response = ul.urlopen(url)
            try:
                soup = bs.BeautifulSoup(response.read())
            finally:
                response.close()
            bu = up.urlsplit(self.url)
            # Trailing comma: Python 2 print stays on the same line.
            print('started with the ' + ' ' + page_tag),
            for title in soup.find_all(attrs={"class": "recipe-title"}):
                link = title.a
                if link is None or not link.get('href'):
                    # Title element without a usable link: nothing to fetch.
                    continue
                path = up.urlsplit(link.get('href')).path
                print(path)
                if pat.search(path):
                    path = bu.scheme + '://' + bu.netloc + path
                    # Second-to-last path segment is used as the file name.
                    filename = str(path).split('/')[-2]
                    # The recipe is stored in the current directory; use
                    # '.html' instead of '.py' when downloading the web page
                    # of the recipe.
                    filename = op.join(op.abspath(op.curdir), filename + '.py')
                    print(path)
                    self.q.put((path, filename))
            self.fetch_data()
            time.sleep(1)
        finally:
            # Always release the page task, otherwise a failure on one page
            # leaves the caller's join() on ``qu`` blocked forever.
            self.qu.task_done()
        self.q.join()
        print('done with the ' + ' ' + page_tag),

    def fetch_data(self):
        """Download every queued job, at most ``minitask`` at a time.

        One ``dldfile`` thread is started per job in the current batch; the
        method waits for the batch to finish before starting the next one.
        """
        Que1 = que.Queue()
        minitask = 10
        while True:
            # Take what is actually available (non-blocking): a blocking
            # get() for a fixed count hangs when fewer jobs remain.
            batch = []
            while len(batch) < minitask:
                try:
                    batch.append(self.q.get_nowait())
                except que.Empty:
                    break
            if not batch:
                break
            for _ in batch:
                worker = dldfile(Que1)
                worker.daemon = True
                worker.start()
            for job in batch:
                Que1.put(job)
            Que1.join()
            # Mark the jobs finished on self.q so that self.q.join() in
            # run() can return.
            for _ in batch:
                self.q.task_done()
if __name__ == '__main__':
    # Number of listing pages processed concurrently in each round.
    task = 5
    Que = que.Queue()
    # No. of pages included under the python tag: 188 was the count when this
    # was written (3700+ python recipes).
    for k in range(1, 190, task):
        print("\n PAGE # : {0} \t \nDeploying Fresh threads\n".format(k))
        for i in range(task):
            t = dload(Que)
            # Daemon threads: a page thread that stays blocked must not keep
            # the interpreter alive after the final message.
            t.daemon = True
            t.start()
        for j in range(task):
            Que.put(k + j)
        # Wait until every page of this round has been reported done; the
        # queue is empty afterwards, so no manual clearing is needed.
        Que.join()
        print("DONE\n")
        time.sleep(2)
    print("Our buisness finished")
|