A simple URL spider that goes through web pages and collects URLs.
class url_spider(object):
    """A simple spider application: it goes through addresses and collects URLs."""

    def __init__(self, limit):
        self.limit = limit  # limit on how many pages are crawled

    def run(self):
        import sqlite3
        import urllib
        from re import findall

        conn = sqlite3.connect('url_spider.db')
        cur = conn.cursor()
        cur.execute('CREATE TABLE IF NOT EXISTS urlbank '
                    '(url TEXT UNIQUE, retv INTEGER, v INTEGER, times INTEGER)')
        # url:   a web address that has been collected
        # retv:  whether we have already crawled this link (0 or 1)
        # v:     how many other pages link to this URL
        # times: how many times this URL has been picked for crawling
        c = 1  # limit control

        # On the first pass the program prompts for a seed address; on
        # later passes it selects a not-yet-crawled URL from the database.
        while c <= self.limit:
            if c == 1:
                host = raw_input('enter a url: ')  # where we start collecting URLs
                cur.execute('INSERT OR IGNORE INTO urlbank (url, retv, v, times) '
                            'VALUES (?, 0, 0, 1)', (host,))
            else:
                row = cur.execute('SELECT url, times FROM urlbank '
                                  'WHERE retv=0 LIMIT 1').fetchone()
                if row is None:
                    print 'there are no unvisited urls left'
                    break
                host, t = row
                cur.execute('UPDATE urlbank SET times=? WHERE url=?', (t + 1, host))
            c += 1
            cur.execute('UPDATE urlbank SET retv=1 WHERE url=?', (host,))  # retv=1 because we are searching through it

            if 'w3.org' in host:
                continue  # we would run into a problem with this address, so we skip it

            try:
                doc = urllib.urlopen(host)  # load the URL's destination
            except IOError:
                continue

            for line in doc:  # start extracting URLs
                for link in findall(r'.*(http://\S+[.]\S+[.][a-zA-Z]{2,4}[^\s"<>.]+)/.*', line):
                    row = cur.execute('SELECT v FROM urlbank WHERE url=?', (link,)).fetchone()
                    if row is None:
                        cur.execute('INSERT OR IGNORE INTO urlbank (url, retv, v, times) '
                                    'VALUES (?, 0, 1, 0)', (link,))
                    else:
                        cur.execute('UPDATE urlbank SET v=? WHERE url=?', (row[0] + 1, link))
            conn.commit()  # save the collected links after every page
        conn.close()
# ---------------------------- END OF class ----------------------------

# ----------------------------- RUNNING -----------------------------
if __name__ == "__main__":
    t = url_spider(10)
    t.run()
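To see what the extraction pattern actually keeps, here is a minimal sketch run on a single made-up HTML line (the sample line is my own illustration, not output from the spider):

from re import findall

# the same pattern run() uses; the sample line below is invented
line = '<a href="http://www.example.com/page.html">example</a>'
print findall(r'.*(http://\S+[.]\S+[.][a-zA-Z]{2,4}[^\s"<>.]+)/.*', line)
# prints ['http://www.example.com'] -- the capture has to be followed by
# a slash, so only the part of the address before the path is stored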
If you run and test it, please leave a comment.
Will we not need the actual database file in order to run this?
Sunjay Varma:
Will we not need the actual database file in order to run this?
I didn't understand your question exactly, but you can run this program with standard Python; it creates its database if it does not exist.
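In other words, no database file ships with the spider. A minimal sketch of that behaviour, using only the standard library (url_spider.db is the same file name the spider uses):

import os
import sqlite3

conn = sqlite3.connect('url_spider.db')  # creates the file if it is missing
conn.execute('CREATE TABLE IF NOT EXISTS urlbank '
             '(url TEXT UNIQUE, retv INTEGER, v INTEGER, times INTEGER)')
conn.close()
print os.path.exists('url_spider.db')    # True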
Okay, thanks. You answered my question. I thought there was a pre-compiled database.
You're welcome, Sunjay.
root@dongzh-laptop:~/python/exercise# cat showsqlite.py
#!/usr/bin/env python
import sqlite3

conn = sqlite3.connect('url_spider.db')
c = conn.cursor()
c.execute('select * from urlbank')
print c.fetchone()
for row in c:
    print row

Use the above script to see the result.
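As a small variation on that script (my own addition, assuming the urlbank schema from the spider), you can rank the collected URLs by how many pages linked to them:

import sqlite3

conn = sqlite3.connect('url_spider.db')
c = conn.cursor()
# v counts how many pages linked to each URL, so this lists the top ten
c.execute('SELECT url, v FROM urlbank ORDER BY v DESC LIMIT 10')
for url, v in c:
    print v, url
conn.close()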