Popular recipes by Alireza Hosseini | http://code.activestate.com/recipes/users/4185286/ | 2013-02-17T12:54:44-08:00 | ActiveState Code Recipes | Ali (Java)
2013-02-17T12:54:44-08:00 | Alireza Hosseini | http://code.activestate.com/recipes/users/4185286/ | http://code.activestate.com/recipes/578461-ali/
<p style="color: grey">
Java
recipe 578461
by <a href="/recipes/users/4185286/">Alireza Hosseini</a>
.
</p>
<h5 id="httpcodeactivestatecomrecipes578439-r1">{{{ <a href="http://code.activestate.com/recipes/578439/" rel="nofollow">http://code.activestate.com/recipes/578439/</a> (r1)</h5>
<h4 id="just-a-try-using-the-thread-modules">Just a try using the thread modules.</h4>
<p>import urllib as ul
import bs4 as bs
import urlparse as up
import re as re
import os.path as op
import Queue as que
import time
import threading</p>
# Matches any URL path that contains a run of 4 to 7 digits; used to pick
# recipe links out of a listing page.  (The HTML rendering of this script had
# turned the two ``*`` quantifiers into emphasis markup, breaking the pattern.)
pat = re.compile(r'.*[\d]{4,7}.*')

# Number of files downloaded so far; updated by the download worker threads.
count = 0
class dldfile(threading.Thread):
    """Worker thread that downloads one ``(url, filename)`` job from a queue.

    The job's URL gets ``self.ad`` appended before it is retrieved with
    ``ul.urlretrieve`` into ``filename``.  A failed retrieval is retried a
    bounded number of times.  The queue's ``task_done()`` is always called
    exactly once for the job, so a ``join()`` on the queue cannot hang on a
    failing download.

    Parameters:
        qu1: queue holding ``(url, filename)`` tuples.
        retries: maximum number of download attempts (at least one is made).
    """

    def __init__(self, qu1, retries=3):
        threading.Thread.__init__(self)
        self.qu1 = qu1
        # Suffix appended to the recipe URL; set it to '' to download the
        # whole web page instead of the recipe only.
        self.ad = 'download/1/'
        self.retries = retries

    def run(self):
        """Take one job from the queue and download it, retrying on I/O errors."""
        global count
        url, filename = self.qu1.get()
        try:
            # Build the target once from the untouched job URL, so a retry
            # never appends the suffix a second time.
            target = url + self.ad
            attempts = max(1, self.retries)
            for attempt in range(1, attempts + 1):
                try:
                    ul.urlretrieve(target, filename)
                except IOError as err:
                    if attempt == attempts:
                        print(" GIVING UP  %s (%s)" % (filename, err))
                    else:
                        print(" RE-TRYING  %s (%s)" % (filename, err))
                else:
                    # Only successful downloads are counted.
                    count = count + 1
                    print("%d(%d) %s"
                          % (count, threading.active_count(), filename))
                    break
        finally:
            # Balance the get() above whatever happened.
            self.qu1.task_done()
class dload(threading.Thread):
    """Thread that scrapes one listing page and downloads the recipes on it.

    A page number is taken from ``qu`` and appended to ``url``.  The page is
    parsed with BeautifulSoup, and every element with class ``recipe-title``
    whose link path matches the module-level ``pat`` is queued on ``self.q``
    as a ``(recipe_url, local_filename)`` job.  The jobs are then downloaded
    in batches by ``dldfile`` workers.

    Parameters:
        qu: queue of page numbers, shared with the main thread.
        url: listing URL prefix to which the page number is appended.
    """

    def __init__(self, qu, url="http://code.activestate.com/recipes/langs/python/?page="):
        threading.Thread.__init__(self)
        self.url = url
        self.q = que.Queue()   # download jobs found on this page
        self.qu = qu

    def run(self):
        """Process a single page number taken from ``self.qu``."""
        ind = self.qu.get()
        try:
            url = self.url + str(ind)
            page_label = str(url).split('/')[-1]

            handle = ul.urlopen(url)
            try:
                soup = bs.BeautifulSoup(handle.read())
            finally:
                handle.close()

            bu = up.urlsplit(self.url)
            print(' '.join(('started with the ', page_label)))
            for title in soup.find_all(attrs={"class": "recipe-title"}):
                link = title.a
                href = link.get('href') if link is not None else None
                if not href:
                    # Title element without a usable link: nothing to fetch.
                    continue
                path = up.urlsplit(href).path
                print(path)
                if not pat.search(path):
                    continue
                path = bu.scheme + '://' + bu.netloc + path
                filename = str(path).split('/')[-2]
                # The recipe is stored in the current directory.  Use the
                # '.html' suffix instead of '.py' when downloading the web
                # page for the recipe rather than the recipe source.
                filename = op.join(op.abspath(op.curdir), filename + '.py')
                print(path)
                self.q.put((path, filename))

            self.fetch_data()
            time.sleep(1)
        finally:
            # Always release the main thread's Que.join(), even on failure.
            self.qu.task_done()
        # fetch_data() marks every job done, so this returns immediately.
        self.q.join()
        print(' '.join(('done with the ', page_label)))

    def fetch_data(self):
        """Download every job queued on ``self.q`` in batches of up to ten.

        One ``dldfile`` worker is started per job in the batch, and the
        method waits for the batch to finish before taking the next one.
        """
        minitask = 10
        jobs = que.Queue()
        while not self.q.empty():
            # Only this thread consumes self.q, so a non-blocking get is
            # enough and a short final batch cannot block.
            batch = []
            for _ in range(minitask):
                try:
                    batch.append(self.q.get_nowait())
                except que.Empty:
                    break

            for job in batch:
                jobs.put(job)
            for _ in batch:
                worker = dldfile(jobs)
                worker.daemon = True
                worker.start()
            jobs.join()

            # Account for the jobs taken from self.q so self.q.join() works.
            for _ in batch:
                self.q.task_done()
if __name__ == '__main__':
    # Walk the listing pages in rounds of `task` pages, one dload thread per
    # page, waiting for each round to finish before starting the next.
    task = 5
    # Highest page number requested.  The original note says 188 pages
    # (3700+ Python recipes) was the count at the time of writing.
    last_page = 189
    Que = que.Queue()
    for k in range(1, last_page + 1, task):
        print("\n PAGE # : {0} \t \nDeploying Fresh threads\n".format(k))
        # Clamp the final round so no page past last_page is requested.
        pages = list(range(k, min(k + task, last_page + 1)))
        for _ in pages:
            dload(Que).start()
        for page in pages:
            Que.put(page)
        # Returns once every page of this round has been marked done.
        Que.join()
        print("DONE\n")
        time.sleep(2)
    print("Our buisness finished")
<h5 id="end-of-httpcodeactivestatecomrecipes578439">end of <a href="http://code.activestate.com/recipes/578439/" rel="nofollow">http://code.activestate.com/recipes/578439/</a> }}}</h5>