This is a class that makes it easy to develop robots that parse results from websites with paging — for example Google, Yahoo, Bing, or any other page with a paging system.
PagerEngine is the main class. I've developed three more classes implementing GoogleSearch, YahooSearch and BingSearch as examples.
Inheriting from PagerEngine (and having regular-expression knowledge) you can easily develop other robots for other websites.
#Covered by GPL V2.0
#Coded by Carlos del Ojo Elias (deepbit@gmail.com)
import re
from urllib import unquote
from urllib2 import Request,build_opener
class PagerEngine:
    """Generic scraper for search engines (or any site) with paged results.

    Subclasses configure the engine by setting, in their ``__init__``:
      * ``url``        -- search URL template containing the ``{query}`` and
                          ``{startvar}`` placeholders,
      * ``urlRegexp``  -- regexp whose group(s) capture the wanted data,
      * ``nextRegexp`` -- regexp that matches only when a "next page" link
                          is present,
      * optionally ``startIndex`` / ``increment`` for the paging counter.

    Instances are iterable: each iteration yields one deduplicated result,
    fetching further pages on demand.
    """

    # Compiled once at class level: a single HTML tag, and a run of two or
    # more adjacent tags (replaced by a space so words don't get glued).
    retag = re.compile("<[^>]+>")
    remultag = re.compile("<[^>]+>(<[^>]+>)+")

    def __init__(self, query):
        # Percent-encode the characters that would break the query URL.
        query = query.replace(" ", "%20")
        query = query.replace("+", "%2b")
        # BUGFIX: a double quote is %22 in URL encoding; %27 is the
        # apostrophe, so exact-phrase searches were silently corrupted.
        query = query.replace("\"", "%22")
        self.query = query
        self.results = []        # fetched but not yet yielded results
        self.diccres = {}        # set of str(result) seen, for deduplication
        self.startIndex = 0      # first page index (override per website)
        self.increment = 10      # index increment per page
        self.lastResult = ""
        self.start = None        # current page index; None = not started yet
        self.MoreResults = None  # None=unknown, True/False after a fetch
        ########### Overload variables, must be modified per website #############
        self.url = None          # search URL template
        self.queryvar = None
        self.startvar = None
        self.urlRegexp = None    # Regexp of desired information
        self.nextRegexp = None   # Regex to know if there are more pages to follow

    def __iter__(self):
        # Restart iteration from the first page.
        self.start = None
        self.MoreResults = None
        return self

    def addResult(self, res):
        """Post-process ``res`` and queue every result not seen before."""
        res = self.processResult(res)
        if not isinstance(res, list):
            res = [res]
        for i in res:
            if not str(i) in self.diccres:
                self.diccres[str(i)] = True
                self.results.append(i)

    def next(self):
        """Return the next result, fetching new pages as needed.

        Raises StopIteration when no more pages/results are available.
        """
        while not self.results:
            self.getNewPage()
            if not self.results:
                raise StopIteration
        self.lastResult = self.results.pop()
        if not self.lastResult:
            # processResult may return None/"" to discard a hit; skip it.
            return self.next()
        return self.lastResult

    # Python 3 spells the iterator step __next__; keep both names working.
    __next__ = next

    def cleanString(self, res):
        """Strip HTML tags from ``res`` and decode the common entities."""
        res = PagerEngine.remultag.sub(" ", res)
        res = PagerEngine.retag.sub("", res)
        # BUGFIX: the entity names were lost when this recipe was rendered
        # as HTML (the replaces had become no-ops); restore the intended
        # decoding of non-breaking spaces and ampersands.
        res = res.replace("&nbsp;", " ")
        res = res.replace("&amp;", "&")
        res = res.strip()
        return res

    def getNewPage(self):
        """Fetch the next page and feed every regexp match to addResult().

        Raises StopIteration when the previously fetched page had no
        "next page" link.
        """
        if self.MoreResults is False:
            raise StopIteration
        if self.start is None:
            self.start = self.startIndex
        else:
            self.start += self.increment
        url = self.url.replace("{query}", str(self.query))
        url = url.replace("{startvar}", str(self.start))
        request = Request(url)
        opener = build_opener()
        # Pretend to be a browser: some engines refuse unknown user agents.
        request.add_header('User-Agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.14) Gecko/20080418 Ubuntu/7.10 (gutsy) Firefox/2.0.0.14')
        rawResponse = self.preProcess(opener.open(request).read())
        for i in re.findall(self.urlRegexp, rawResponse):
            self.addResult(i)
        if re.findall(self.nextRegexp, rawResponse):
            self.MoreResults = True
        else:
            self.MoreResults = False

    def getResult(self):
        """Return one result, or None when the search is exhausted/fails."""
        try:
            return self.next()
        except Exception:  # narrowed from bare except: don't trap Ctrl-C
            return None

    def getNResults(self, n):
        """Return a list with at most ``n`` results (fewer if exhausted)."""
        l = []
        for i in range(n):
            try:
                l.append(self.next())
            except Exception:  # narrowed from bare except: don't trap Ctrl-C
                break
        return l

    # Virtual functions, you can preprocess (html) and postprocess (each result)
    def preProcess(self, raw):
        return raw

    def processResult(self, res):
        return self.cleanString(res)
########################################################
#Class Examples
######################################################
##GOOGLE##
class GoogleSearch(PagerEngine):
    """Pager for Google web search, requesting 100 hits per page."""

    def __init__(self, query):
        PagerEngine.__init__(self, query)
        # Google serves 100 hits per request (num=100), so page by 100.
        self.increment = 100
        # {query}/{startvar} are substituted by PagerEngine.getNewPage().
        self.url = "http://www.google.com/search?q={query}&start={startvar}&num=100"
        # Result links carry class=l in Google's markup; capture the href.
        self.urlRegexp = "\"([^\"]+)\" class=l "
        # A literal ">Next<" appears only while further pages exist.
        self.nextRegexp = ">Next<"
## BING ##
class BingSearch(PagerEngine):
    """Pager for Bing web search (pages of 10, first index is 1)."""

    def __init__(self, query):
        PagerEngine.__init__(self, query)
        # Bing numbers hits starting at 1 and serves 10 per page.
        self.startIndex = 1
        self.increment = 10
        # {query}/{startvar} are substituted by PagerEngine.getNewPage().
        self.url = "http://www.bing.com/search?q={query}&first={startvar}"
        # Capture the href of each result title inside the sb_tlst block.
        self.urlRegexp = "sb_tlst\"><h3><a href=\"([^\"]+)\" onmousedown"
        # Matches only when a "Next" paging link is present.
        self.nextRegexp = "\)\">Next</a></li>"
## YAHOO ##
class YahooSearch(PagerEngine):
    """Pager for Yahoo web search, with yahoo-internal links filtered out."""

    def __init__(self, query):
        PagerEngine.__init__(self, query)
        # {query}/{startvar} are substituted by PagerEngine.getNewPage().
        self.url = "http://search.yahoo.com/search?p={query}&b={startvar}&ei=UTF-8&y=Search&xargs=0&pstart=0"
        # Capture the displayed URL span of each hit.
        self.urlRegexp = "class=url>((?:[^<]|<[^/]|</[^s])+)</span>"
        # Matches only while a "Next >" paging link exists.
        self.nextRegexp = ">Next >"

    def processResult(self, res):
        """Clean a raw match; drop yahoo-internal links (None is skipped)."""
        cleaned = self.cleanString(res)
        return None if "yahoo" in cleaned else unquote(cleaned)
############################################################
# Usage
#################################################
# Getting all google results for a search
# Run the examples only when executed directly, so merely importing this
# module does not trigger live network requests.
if __name__ == "__main__":
    # Getting all google results for a search
    for i in GoogleSearch("cooking recipes"):
        print(i)  # print(x) with one argument works in Python 2 and 3
    # Getting first 5 results in a yahoo search
    a = YahooSearch("cooking recipes")
    print(a.getNResults(5))
You could use this code to automate parsing of a website whose results are divided into several pages.
To develop a robot pager you only need to inherit from PagerEngine and set the following variables (if needed):
# Url
# {query} is the keyword that will be the replaced by the search string (passed to constructor)
# {startvar} is the keyword where "indexpage" will be replaced in the URL for every page
self.url="http://www.bing.com/search?q={query}&first={startvar}"
# urlRegexp is the regular expression for the data you are interested in; put between parenthesis what you want (py regexps)
self.urlRegexp="sb_tlst\"><h3><a href=\"([^\"]+)\" onmousedown" # Regular expression to test if there is a "Next page" or not... self.nextRegexp="\)\">Next</a></li>"
# Page parameters
self.startIndex=1   # First page index
self.increment=10   # Increment value per page (sometimes 1, 10, 50, etc.)
This is very good, but I see you have used old-style iterators and lists instead of generators. I think the code could be enhanced by using generators. I will try to post a version of this recipe using them.