This is a class that simplifies the development of robots that parse results from websites with paging — for example Google, Yahoo, Bing, or any other page with a paging system.
PagerEngine is the main class. I've developed three more classes implementing GoogleSearch, YahooSearch and BingSearch as examples.
By inheriting from PagerEngine (and with some RegExp knowledge) you can easily develop robots for other websites.
#Covered by GPL V2.0
#Coded by Carlos del Ojo Elias (deepbit@gmail.com)
import re
from urllib import unquote
from urllib2 import Request,build_opener
class PagerEngine:
    '''Iterate over the results of a paginated website (Google, Yahoo, Bing, ...).

    Subclasses must set:
      self.url        -- URL template containing the {query} and {startvar} placeholders
      self.urlRegexp  -- regexp whose group(s) capture the desired data on each page
      self.nextRegexp -- regexp that matches only when another page of results exists
    and may tune self.startIndex / self.increment (page-index arithmetic).
    '''
    retag=re.compile("<[^>]+>")               # a single HTML tag
    remultag=re.compile("<[^>]+>(<[^>]+>)+")  # a run of adjacent HTML tags

    def __init__(self,query):
        # Minimal URL-encoding of the search string.
        query=query.replace(" ","%20")
        query=query.replace("+","%2b")
        query=query.replace("\"","%22")  # %22 is the double quote (%27 was the apostrophe)
        self.query=query
        self.results=[]                 # fetched but not yet yielded results
        self.diccres={}                 # already-seen results, used for deduplication
        self.startIndex=0               ## Start index
        self.increment=10               ## Index increment
        self.lastResult=""
        self.start=None                 ## First index for the search (None = not started)
        self.MoreResults=None           ## None = unknown, True/False after a fetch
        ########### Overload variables, must be modified per website #############
        self.url=None
        self.queryvar=None
        self.startvar=None
        self.urlRegexp=None            ## Regexp of desired information
        self.nextRegexp=None           ## Regexp to know if there are more pages to follow

    def __iter__(self):
        # Restart iteration from the first page.
        self.start=None
        self.MoreResults=None
        return self

    def addResult(self,res):
        '''Post-process a raw regexp match and queue it, skipping duplicates.'''
        res=self.processResult(res)
        if not isinstance(res,list):
            res=[res]
        for i in res:
            if not str(i) in self.diccres:
                self.diccres[str(i)]=True
                self.results.append(i)

    def next(self):
        '''Return the next result, fetching new pages as needed.

        Raises StopIteration (propagated from getNewPage) when exhausted.
        Falsy results (e.g. entries filtered out by processResult) are skipped.'''
        while True:
            while not self.results:
                self.getNewPage()   # raises StopIteration when there are no more pages
            self.lastResult=self.results.pop(0)   # FIFO pop preserves page order
            if self.lastResult:
                return self.lastResult

    __next__=next   # Python 3 iterator-protocol alias

    def cleanString(self,res):
        '''Strip HTML tags from a result and decode the common entities.'''
        res=PagerEngine.remultag.sub(" ",res)
        res=PagerEngine.retag.sub("",res)
        res=res.replace("&nbsp;"," ")
        res=res.replace("&amp;","&")
        res=res.strip()
        return res

    def getNewPage(self):
        '''Fetch the next results page, collect matches and update MoreResults.

        Raises StopIteration when the previous page reported no "next" link.'''
        if self.MoreResults==False:
            raise StopIteration
        if self.start==None:
            self.start=self.startIndex
        else:
            self.start+=self.increment
        url=self.url.replace("{query}",str(self.query))
        url=url.replace("{startvar}",str(self.start))
        request = Request(url)
        opener = build_opener()
        # Some engines refuse requests without a browser-like User-Agent.
        request.add_header('User-Agent','Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.14) Gecko/20080418 Ubuntu/7.10 (gutsy) Firefox/2.0.0.14')
        rawResponse=self.preProcess(opener.open(request).read())
        for i in re.findall(self.urlRegexp,rawResponse):
            self.addResult(i)
        self.MoreResults=bool(re.findall(self.nextRegexp,rawResponse))

    def getResult(self):
        '''Return the next result, or None when the results are exhausted.'''
        try:
            return self.next()
        except StopIteration:
            return None

    def getNResults(self,n):
        '''Return a list with up to n further results (fewer if exhausted).'''
        l=[]
        for i in range(n):
            try:
                l.append(self.next())
            except StopIteration:
                break
        return l

    # Virtual functions, you can preprocess (html) and postprocess (each result)
    def preProcess(self,raw):
        return raw

    def processResult(self,res):
        return self.cleanString(res)
########################################################
#Class Examples
######################################################
##GOOGLE##
class GoogleSearch(PagerEngine):
    # Google web search pager: asks for 100 results per page and pages in
    # steps of 100 via the "start" URL parameter.
    def __init__(self,query):
        PagerEngine.__init__(self,query)
        self.increment=100
        self.url="http://www.google.com/search?q={query}&start={startvar}&num=100"
        self.nextRegexp=">Next<"
        self.urlRegexp=r'"([^"]+)" class=l '
## BING ##
class BingSearch(PagerEngine):
    # Bing web search pager: pages start at index 1 and advance 10 results
    # at a time through the "first" URL parameter.
    def __init__(self,query):
        PagerEngine.__init__(self,query)
        self.startIndex=1
        self.increment=10
        self.url="http://www.bing.com/search?q={query}&first={startvar}"
        self.nextRegexp=r'\)">Next</a></li>'
        self.urlRegexp=r'sb_tlst"><h3><a href="([^"]+)" onmousedown'
## YAHOO ##
class YahooSearch(PagerEngine):
    # Yahoo web search pager. Result URLs arrive percent-encoded and mixed
    # with Yahoo's own links, so processResult is overridden to filter/decode.
    def __init__(self,query):
        PagerEngine.__init__(self,query)
        self.nextRegexp=">Next >"
        self.urlRegexp="class=url>((?:[^<]|<[^/]|</[^s])+)</span>"
        self.url="http://search.yahoo.com/search?p={query}&b={startvar}&ei=UTF-8&y=Search&xargs=0&pstart=0"

    def processResult(self,res):
        # Discard Yahoo's internal links, then decode %-escapes in the rest.
        cleaned=self.cleanString(res)
        if "yahoo" in cleaned:
            return None
        return unquote(cleaned)
############################################################
# Usage
#################################################
# Getting all google results for a search
# Iterate over every Google result; pages are fetched lazily until no
# "Next" link is found (performs live HTTP requests).
for i in GoogleSearch("cooking recipes"):
    print i
# Getting the first 5 results of a Yahoo search via getNResults()
a=YahooSearch("cooking recipes")
print a.getNResults(5)
You can use this code to automate the parsing of a website divided into several pages.
To develop a robot pager you only need to inherit from PagerEngine and set the following variables (if needed):
# Url
# {query} is the keyword that will be the replaced by the search string (passed to constructor)
# {startvar} is the keyword where "indexpage" will be replaced in the URL for every page
self.url="http://www.bing.com/search?q={query}&first={startvar}"
# urlRegexp is the regular expression for the data you are interested in; put between parenthesis what you want (py regexps)
self.urlRegexp="sb_tlst\"><h3><a href=\"([^\"]+)\" onmousedown"
# nextRegexp is the regular expression that tests whether a "Next page" link exists:
self.nextRegexp="\)\">Next</a></li>"
# Page parameters
self.startIndex=1 # First page index
self.increment=10 # Increment value per page (sometimes 1, 10, 50, etc.)

 Download
Download Copy to clipboard
Copy to clipboard
This is very good, but I see you have used old-style iterators and lists instead of generators. I think the code can be enhanced by using generators. I will try to post a version of this recipe using them.