class url_spider(object):
" it is like a spider aplication go through addresses and collect urls "
    def __init__(self, limit):
        self.limit = limit  # upper bound on how many pages we crawl
def run(self):
import sqlite3
import urllib
from re import findall
# ----------------------- #-
conn=sqlite3.connect('url_spider.db')
cur=conn.cursor()
cur.execute('CREATE TABLE IF NOT EXISTS urlbank(url TEXT UNIQUE,retv INTEGER,v INTEGER,times INTEGER)')
#cur.execute('CREATE TABLE IF NOT EXISTS tags (url TEXT UNIQUE,tags TEXT)')
        # --url:   the web address that was collected
        # --retv:  1 if we have already crawled through this url, else 0
        # --v:     how many other pages link to this url
        # --times: how many times this url has been picked for crawling
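        # An illustrative row (made-up values, not real output):
        #   ('http://www.example.com/about', 1, 3, 2)
        # i.e. already crawled, linked to from 3 other pages, picked twice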
        c = 1  # --loop counter, checked against the limit
# ---------------------- #-
        while True:  # first pass: prompt for a start address; later passes
                     # (up to the limit) pick an unretrieved url from the db
            if c > self.limit:
                break  # leave the loop so conn.close() below still runs
            if c == 1:
                host = raw_input('enter a url: ')  # where we start collecting urls
            if c > 1 and c <= self.limit:  # ---- later passes
                try:
                    cur.execute('SELECT url,times FROM urlbank WHERE retv=0 LIMIT 1')
                    (host,t)=cur.fetchone()
                    t+=1
                    cur.execute('UPDATE urlbank SET times=? WHERE url=?',(t,host))
                except:
                    print 'no unretrieved url is left to crawl'
                    break  # ---- end of later passes
            else:  # ---- first pass: store the start address
                try:
                    cur.execute('INSERT INTO urlbank (url,retv,v,times) VALUES (?,0,0,1)',(host,))
                except sqlite3.IntegrityError:  # already stored (INSERT OR IGNORE would hide this), so bump times
                    cur.execute('SELECT times FROM urlbank WHERE url=?',(host,))
                    t=cur.fetchone()[0]
                    cur.execute('UPDATE urlbank SET times=? WHERE url=?',(t+1,host))  # ---- end of first pass
c+=1
            cur.execute('UPDATE urlbank SET retv=1 WHERE url=?',(host,))  # retv=1 because we are crawling through it now
# --------------------- #-
            if 'w3.org' in host:  # --we would run into a problem once we
                continue          # --hit this address, so we skip it
# --------------------- #-
            try:
                doc=urllib.urlopen(host)  # ---- fetch the page the url points to
            except:
                continue  # unreachable or malformed address: skip it
            for line in doc:  # ---- start extracting
                # ---- pull every http:// url out of the line (the old .* wrappers
                # ---- kept only one match per line); see the sketch after the class
                for link in findall(r'(http://\S+[.]\S+[.][a-zA-Z]{2,4}[^\s"<>.]+)/',line):
                    try:
                        cur.execute('SELECT v FROM urlbank WHERE url=?',(link,))
                        vis=cur.fetchone()[0]
                        cur.execute('UPDATE urlbank SET v=? WHERE url=?',(vis+1,link))
                    except TypeError:  # fetchone() gave None: a url we have not seen yet
                        cur.execute('INSERT OR IGNORE INTO urlbank (url,retv,v,times) VALUES (?,0,1,0)',(link,))
                    try:  # ---- commit after every link so data survives interruptions ----#-
                        conn.commit()
                    except:
                        pass
# ---------------------- - - -
# ----------- END OF LOOP IS HERE --------- #-
conn.close()
# ----------------------------------END OF class-----------------------------
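# ---- A quick sketch of what the extraction pattern above matches (the
# ---- sample line is made up; doctest-style, try it by hand if you like):
#
#   >>> from re import findall
#   >>> line = 'visit http://www.example.com/about/ today'
#   >>> findall(r'(http://\S+[.]\S+[.][a-zA-Z]{2,4}[^\s"<>.]+)/', line)
#   ['http://www.example.com/about']
#
# ---- note: the pattern needs two dots and a trailing slash, so a bare
# ---- 'http://example.com' would not be captured.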
# -----------------------------RUNNING:
###########################
if __name__=="__main__": ##
t=url_spider(10) ##
t.run() ##
###########################
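# ----------------------------- INSPECTING RESULTS:
# A minimal helper sketch, assuming the urlbank layout above and that
# url_spider.db sits in the working directory; 'top' is an illustrative
# parameter name, not part of the original recipe. Call it by hand after
# a crawl, e.g. show_top_urls(5) from an interactive session.
def show_top_urls(top=5):
    " print the <top> most linked-to urls collected so far "
    import sqlite3
    conn = sqlite3.connect('url_spider.db')
    cur = conn.cursor()
    cur.execute('SELECT url,v FROM urlbank ORDER BY v DESC LIMIT ?', (top,))
    for (url, v) in cur.fetchall():
        print url, v  # the url followed by its inbound-link count
    conn.close()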