
A simple URL spider that goes through web pages and collects URLs.

Python, 104 lines
class url_spider(object):
    " a spider-like application that goes through addresses and collects urls "
    def __init__(self,limit):
        self.limit=limit    # limits how many urls are retrieved

    def run(self):
        import sqlite3
        import urllib
        from re import findall

        # ----------------------- #
        conn=sqlite3.connect('url_spider.db')
        cur=conn.cursor()
        
        cur.execute('CREATE TABLE IF NOT EXISTS urlbank(url TEXT UNIQUE,retv INTEGER,v INTEGER,times INTEGER)')
        #cur.execute('CREATE TABLE IF NOT EXISTS tags (url TEXT UNIQUE,tags TEXT)')
        # --url: the web address that was collected  --retv: whether we have already searched through the link
        # --v: how many other pages link to this url  --times: how many times this url has been retrieved
        
        
        c=1  # --limit counter

        # ---------------------- #
        while True:            # the first time, the program prompts for an address; from then on, up to the limit,
            if c>self.limit:   # it selects from the database a url that has not been retrieved yet
                break
            if c==1:
                host=raw_input('enter a url: ')    # where we start collecting urls

            if c>1 and c<=self.limit:       # ----second-time branch
                try:
                    cur.execute('SELECT url,times FROM urlbank WHERE retv=0 LIMIT 1')
                    (host,t)=cur.fetchone()
                    t+=1
                    cur.execute('UPDATE urlbank SET times=? WHERE url=?',(t,host))
                except:
                    print 'there is a problem'
                    #return                # ----second time loop
                    
            else:        # ---continuation of the first-time branch
                try:
                    cur.execute('INSERT OR IGNORE INTO urlbank (url,retv,v,times) VALUES (?,0,0,1)',(host,))
                except:
                    cur.execute('SELECT times FROM urlbank WHERE url=?',(host,))
                    t=cur.fetchone()[0]
                    cur.execute('UPDATE urlbank SET times=? WHERE url=?',(t+1,host)) # ----end of first time
                                                                                    
            c+=1
            cur.execute('UPDATE urlbank SET retv=1 WHERE url=?',(host,))    # retv=1 because we are searching through it
                        # --------------------- #-
                        
            try:
                if findall('.*(w3.org).*',host)[0]=='w3.org':  # --we would encounter a problem once we hit
                # ----this address, so we ignore it.
                    continue
            except:
                pass
                       # --------------------- #-

                       
            try:
                doc=urllib.urlopen(host)   #---loading the url's destination
            except:                      
                continue


            for line in doc:    # ----- start extracting
                for link in findall('.*(http://\S+[.]{1}\S+[.]{1}[a-zA-Z]{2,4}[^\s"\<>.]+)/.*',line): # ---extracting urls
                    try:
                        cur.execute('SELECT v FROM urlbank WHERE url=?',(link,))
                        vis=cur.fetchone()[0]
                        cur.execute('UPDATE urlbank SET v=? WHERE url=?',(vis+1,link))
                    except:
                        cur.execute('INSERT OR IGNORE INTO urlbank (url,retv,v,times) VALUES (?,0,1,0)',(link,))
                        
                        

                        
                
                try:       # ----writing the collected data to the database ----#
                    conn.commit()
                except:
                    pass
                # ----------------------

                
                
            # ----------- END OF LOOP IS HERE --------- #-
            
        conn.close()
    
# ----------------------------------END OF class-----------------------------

# -----------------------------RUNNING:
###########################
if __name__=="__main__": ##
    t=url_spider(10)     ##
    t.run()              ##
###########################
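
The extraction step is driven entirely by the findall pattern in the nested loop. Here is a minimal sketch of what that pattern returns for a single line of HTML; the sample line is made up purely for illustration. Note that the pattern requires a slash right after the captured part, so a bare link such as http://www.example.com with nothing following it appears not to be picked up.

from re import findall

# the same pattern the recipe uses for extraction
pattern = '.*(http://\S+[.]{1}\S+[.]{1}[a-zA-Z]{2,4}[^\s"\<>.]+)/.*'

# a made-up line of HTML, only for illustration
line = '<a href="http://www.example.com/index.html">link</a>'

print findall(pattern, line)   # prints ['http://www.example.com']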

            
                
        

6 comments

amir naghavi (author) 13 years, 1 month ago

If you run and test it, please comment.

Sunjay Varma 13 years, 1 month ago

Will we not need the actual database file in order to run this?

amir naghavi (author) 13 years, 1 month ago

Sunjay Varma:

Will we not need the actual database file in order to run this?

I didn't understand your question exactly, but you can run this program with standard Python, and the program creates its database if it does not exist.
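
As a minimal sketch of what happens on a first run (assuming an empty working directory): sqlite3.connect creates url_spider.db on disk if it is missing, and the CREATE TABLE IF NOT EXISTS statement then creates the urlbank table.

import os
import sqlite3

conn = sqlite3.connect('url_spider.db')   # creates the file if it does not already exist
conn.execute('CREATE TABLE IF NOT EXISTS urlbank(url TEXT UNIQUE,retv INTEGER,v INTEGER,times INTEGER)')
conn.close()

print os.path.exists('url_spider.db')     # True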

Sunjay Varma 13 years, 1 month ago

Okay, thanks. You answered my question. I thought there was a pre-compiled database.

amir naghavi (author) 13 years, 1 month ago

You're welcome, Sunjay.

dongzh 12 years ago

root@dongzh-laptop:~/python/exercise# cat showsqlite.py

#!/usr/bin/env python

import sqlite3

conn = sqlite3.connect('url_spider.db')
c = conn.cursor()

c.execute('select * from urlbank')

for row in c:
    print row

Use the above script to see the result.
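
If you want the most frequently linked addresses rather than a raw dump, a small variation of the same script should work (an untested sketch, assuming the urlbank schema from the recipe, where v counts incoming links):

import sqlite3

conn = sqlite3.connect('url_spider.db')
c = conn.cursor()

# v counts how many pages linked to each url
c.execute('SELECT url, v FROM urlbank ORDER BY v DESC LIMIT 10')

for row in c:
    print row

conn.close()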