Welcome, guest | Sign In | My Account | Store | Cart

This is a class that makes it easy to develop robots that parse results from a website whose results are split across several pages, for example Google, Yahoo, Bing, or any other site with a paging system.

PagerEngine is the main class. I've developed three more classes, GoogleSearch, YahooSearch and BingSearch, as example implementations.

By inheriting from PagerEngine (and with some RegExp knowledge) you can easily develop robots for other websites.

Python, 189 lines
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
#Covered by GPL V2.0
#Coded by Carlos del Ojo Elias (deepbit@gmail.com)

import re
from urllib import unquote
from urllib2 import Request,build_opener


class PagerEngine:
    """Base class for robots that walk the result pages of a paged website.

    A subclass sets ``url`` (with the ``{query}`` and ``{startvar}``
    placeholders), ``urlRegexp`` and ``nextRegexp``, and optionally
    ``startIndex`` and ``increment``.  Instances are iterators: every call
    to ``next()`` returns one processed result, downloading a new page
    whenever the buffered results run out.
    """

    retag=re.compile("<[^>]+>")                  ## A single HTML tag
    remultag=re.compile("<[^>]+>(<[^>]+>)+")     ## A run of two or more adjacent tags

    def __init__(self,query):
        """Store the (minimally URL-escaped) search string and reset state.

        query -- search string; spaces, '+' and '"' are percent-escaped so
                 the string can be substituted into ``self.url``.
        """
        query=query.replace(" ","%20")
        query=query.replace("+","%2b")
        # %22 is the double quote; the previous %27 is the apostrophe, which
        # silently turned phrase searches ("...") into '...'.
        query=query.replace("\"","%22")
        self.query=query

        self.results=[]                 ## Results fetched but not yet returned
        self.diccres={}                 ## str(result) -> True, used to drop duplicates

        self.startIndex=0               ## Start index
        self.increment=10               ## Index increment
        self.lastResult=""              ## Last value taken from self.results
        self.start=None                 ## Index of the current page (None: nothing fetched yet)

        self.MoreResults=None           ## None: unknown, True/False after a page was parsed


        ########### Overload variables, must be modified per website #############
        self.url=None
        self.queryvar=None              ## Not read anywhere in this class
        self.startvar=None              ## Not read anywhere in this class

        self.urlRegexp=None            ## Regexp of desired information
        self.nextRegexp=None           ## Regex to know if there are more pages to follow


    def __iter__(self):
        """Restart paging from the first page and return self as the iterator.

        Only the page position is reset; buffered results and the set of
        already seen results are kept.
        """
        self.start=None
        self.MoreResults=None
        return self

    def addResult(self,res):
        """Post-process one raw match and buffer every value not seen before.

        ``processResult`` may return a single value or a list of values.
        """
        res=self.processResult(res)
        if not isinstance(res,list):
            res=[res]
        for item in res:
            key=str(item)
            if key not in self.diccres:
                self.diccres[key]=True
                self.results.append(item)


    def next(self):
        """Return the next non-empty result, fetching pages as needed.

        Results are returned in the order they appear on the page.
        Raises StopIteration (from ``getNewPage``) when the last page has
        been consumed.
        """
        while True:
            # getNewPage() raises StopIteration once no further page exists,
            # which is what ends this loop.
            while not self.results:
                self.getNewPage()

            # Take from the front so that results keep their page order.
            self.lastResult=self.results.pop(0)

            # Falsy values (e.g. None from a processResult override) are
            # skipped with a loop rather than one recursive call per value.
            if self.lastResult:
                return self.lastResult

    def __next__(self):
        """Python 3 spelling of ``next()``; delegates so overrides are honoured."""
        return self.next()


    def cleanString(self,res):
        """Strip HTML tags and decode ``&nbsp;`` / ``&amp;`` in *res*.

        A run of adjacent tags becomes one space, remaining single tags are
        removed, and the result is stripped of surrounding whitespace.
        """
        res=PagerEngine.remultag.sub(" ",res)
        res=PagerEngine.retag.sub("",res)
        res=res.replace("&nbsp;"," ")
        res=res.replace("&amp;","&")
        return res.strip()


    def getNewPage(self):
        """Download the next page and buffer the results found on it.

        Raises StopIteration when the previously parsed page had no match
        for ``nextRegexp``.  Errors raised while opening or reading the URL
        are propagated to the caller.
        """
        if self.MoreResults is not None and not self.MoreResults:
            raise StopIteration

        if self.start is None:
            self.start=self.startIndex
        else:
            self.start+=self.increment

        url=self.url.replace("{query}",str(self.query))
        url=url.replace("{startvar}",str(self.start))

        request = Request(url)
        request.add_header('User-Agent','Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.14) Gecko/20080418 Ubuntu/7.10 (gutsy) Firefox/2.0.0.14')

        response = build_opener().open(request)
        try:
            rawResponse=self.preProcess(response.read())
        finally:
            # Always release the connection, even if reading/preprocessing fails.
            response.close()

        for match in re.findall(self.urlRegexp,rawResponse):
            self.addResult(match)

        self.MoreResults=re.search(self.nextRegexp,rawResponse) is not None

    def getResult(self):
        """Return the next result, or None when none can be obtained.

        Best effort: exhaustion and any error raised while fetching are both
        reported as None.  KeyboardInterrupt/SystemExit are not swallowed.
        """
        try:
            return self.next()
        except Exception:
            return None

    def getNResults(self,n):
        """Return a list with up to *n* results.

        The list is shorter than *n* when the results run out or fetching
        fails (best effort, see ``getResult``).
        """
        found=[]
        for _ in range(n):
            try:
                found.append(self.next())
            except Exception:
                break

        return found

    # Virtual functions, you can preprocess (html) and postprocess (each result)
    def preProcess(self,raw):
        """Hook: transform the downloaded page before the regexps are applied."""
        return raw

    def processResult (self,res):
        """Hook: transform one regexp match; by default strips the HTML markup."""
        return self.cleanString(res)

########################################################
#Class Examples
######################################################

##GOOGLE##

class GoogleSearch(PagerEngine):
    """PagerEngine configured for Google: 100 results per page, starting at index 0."""

    def __init__(self, query):
        PagerEngine.__init__(self, query)

        # Paging: the request asks for num=100, so the index advances by 100.
        self.increment = 100

        # {query} and {startvar} are substituted by PagerEngine.getNewPage().
        self.url = "http://www.google.com/search?q={query}&start={startvar}&num=100"

        # Captures the quoted attribute value that precedes ' class=l '.
        self.urlRegexp = '"([^"]+)" class=l '
        # A ">Next<" in the page means another page can be requested.
        self.nextRegexp = ">Next<"

## BING ##

class BingSearch(PagerEngine):
    """PagerEngine configured for Bing: pages of 10 results, first index is 1."""

    def __init__(self, query):
        PagerEngine.__init__(self, query)

        # Paging parameters substituted for {startvar}.
        self.startIndex = 1
        self.increment = 10

        # {query} and {startvar} are substituted by PagerEngine.getNewPage().
        self.url = "http://www.bing.com/search?q={query}&first={startvar}"

        # Captures the href of the link inside the 'sb_tlst' heading.
        self.urlRegexp = 'sb_tlst"><h3><a href="([^"]+)" onmousedown'
        # Matches the "Next" pager link; raw string keeps the backslash literal.
        self.nextRegexp = r'\)">Next</a></li>'

## YAHOO ##

class YahooSearch(PagerEngine):
    """PagerEngine configured for Yahoo, using the base class paging defaults."""

    def __init__(self, query):
        PagerEngine.__init__(self, query)

        # A "Next >" link (HTML-escaped) means another page can be requested.
        self.nextRegexp = ">Next &gt;"
        # Captures the content that follows 'class=url>' up to its closing </span>.
        self.urlRegexp = "class=url>((?:[^<]|<[^/]|</[^s])+)</span>"

        # {query} and {startvar} are substituted by PagerEngine.getNewPage().
        self.url = "http://search.yahoo.com/search?p={query}&b={startvar}&ei=UTF-8&y=Search&xargs=0&pstart=0"

    def processResult(self, res):
        """Clean one match; return None for anything containing "yahoo", else the unquoted text."""
        cleaned = self.cleanString(res)
        # Results mentioning "yahoo" are dropped; the rest is percent-decoded.
        return None if "yahoo" in cleaned else unquote(cleaned)

############################################################
# Usage
#################################################

# Walk through every Google result for a search, one per line
for hit in GoogleSearch("cooking recipes"):
    print(hit)

# Fetch only the first 5 results of a Yahoo search, printed as a list
yahoo = YahooSearch("cooking recipes")
print(yahoo.getNResults(5))

You can use this code to automate the parsing of a website whose content is divided into several pages.

To develop a pager robot you only need to inherit from PagerEngine and set the following variables (if needed):

# Url

# {query} is the placeholder that will be replaced by the search string (passed to the constructor)

# {startvar} is the placeholder that will be replaced in the URL by the page index of every requested page

self.url="http://www.bing.com/search?q={query}&first={startvar}"

# urlRegexp is the regular expression for the data you are interested in; put the part you want to extract between parentheses (Python regexp syntax)

self.urlRegexp="sb_tlst\"><h3><a href=\"([^\"]+)\" onmousedown"

# Regular expression to test if there is a "Next page" or not...

self.nextRegexp="\)\">Next</a></li>"

# Page parameters

self.startIndex=1 # First page

self.increment=10 # Increment value per page (sometimes 1, 10, 50, etc...)

1 comment

Anand B Pillai 13 years, 4 months ago  # | flag

This is very good, but I see you have used old style iterators and lists instead of using generators. I think, the code can be enhanced by using generators. I will try to post a version of this recipe using them.