Welcome, guest | Sign In | My Account | Store | Cart

Semi-lazy interrogation of searchhippo.com returns a virtual array of the search engine's records.

Python, 98 lines
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
class hippoFault ( Exception ): pass

class searchHippo:
    """
Python class able to return results from the XML interface at www.searchhippo.com

>>> from searchHippo import searchHippo
>>> hippo = searchHippo ( searchTerms = "bliss", requestSize = 5 )
>>> hippo.records ( 8 )
('GameDev.net - all your game development needs', 'Features Resources Community Directories More News Columns Contests Articles & Resources Books & Software For Beginners Forums GD Showcase IRC Chat Network Job Offers Member Search Hosted S...', 'http://208.148.122.26/r.php?i=7&u=http://www.gamedev.net/&q=bliss', 'http://www.gamedev.net/')
>>> hippo.records ( 9 )
('TOMPAINE.com - A Public Interest Journal', "Home | About Us | Contact Us | Submissions Search The Bush Administration's Dirty Little Secret Bush Policies Don't Protect Nuclear Material From Terrorist Theft by William Hartung The dirt...", 'http://208.148.122.26/r.php?i=8&u=http://www.tompaine.com/&q=bliss', 'http://www.tompaine.com/')

Each hippo record is returned as a four-tuple whose elements are a page title, a description of the page, a click-through URL
for the page and the URL to be displayed with the click-through URL.

The class uses a form of "semi-lazy" execution. That is, if record 'k' is already available then it is returned immediately 
without further action being taken. However, if record 'k' is unavailable then the class attempts to obtain the
records up to 'requestSize' records away from record 'k' from searchHippo, as well as record 'k' itself, if these records 
are not already available.
    """
    def __init__ ( self, searchTerms = None, requestSize = 50 ):
        if searchTerms == None:
            raise hippoFault, "searchHippo needs 'searchTerms'"
        self . _searchTerms = searchTerms
        self . requestSize = requestSize
        self . _records = { }
        self . parser = None
    def records ( self, num ):
        num = num - 1
        if num < 0:
            raise hippoFault, "searchHippo.records expects integer num > 0"
        if ( num + 1 ) in self . _records:
            return self . _records [ num + 1 ]
        else:
            half = self . requestSize / 2
            kLow = num
            while kLow > 0 and not ( kLow - 1 ) in self . _records and ( num - kLow ) < half:
                kLow -= 1
            half = ( self . requestSize + 1 ) / 2
            kHigh = num
            while not ( kHigh + 1 ) in self . _records and ( kHigh + 1 - num ) <= half:
                kHigh += 1
            reqsize = kHigh + 1 - kLow + self . requestSize
            if reqsize > self . requestSize: reqsize = self . requestSize
            self . _get ( kLow, reqsize )
            if ( num + 1 ) in self . _records:
                return self . _records [ num + 1 ]
            else:
                return None
    def _get ( self, startNumber, reqsize ):
        import pyRXP
        from urllib import urlopen

        hippoXML = urlopen ( "http://www.searchhippo.com/qxml.php?q=%s&c=%i&i=%i" % ( self . _searchTerms, self . requestSize, startNumber, ) ) . read ( )
        if self . parser == None: self. parser = pyRXP.Parser()
        tree = self . parser . parse ( hippoXML )
        self . _descend ( tree )
    def _descend ( self, fourTuple ):
        tagName, tagAttrs, content, unused = fourTuple
        if tagName == "RESULTS":
            pass
        elif tagName == "RECORD":
            global NUM
            NUM = tagAttrs [ 'NUM' ]
        elif tagName == "TITLE":
            global TITLE
            TITLE = content [ 0 ]
            return
        elif tagName == "DESCR":
            global DESCR
            DESCR = content [ 0 ]
            return
        elif tagName == "URL":
            global URL
            URL = content [ 0 ]
            return
        elif tagName == "DISPURL":
            global DISPURL
            DISPURL = content [ 0 ]
            return
        for item in content:
            try:
                self . _descend ( item )
            except:
                pass
        if tagName == "RECORD":
            self. _records [ int ( NUM ) ] = ( TITLE, DESCR, URL, DISPURL )
    def _get_searchTerms ( self ):
        return _searchTerms
    searchTerms = property ( None, _get_searchTerms, None, '' )

if __name__ == "__main__":
    hippo = searchHippo ( searchTerms = "snowman", requestSize = 5 )

    for i in xrange ( 10 ):
        item = hippo . records ( i + 1 )
        print "records item", i + 1, " is", item

Easy to obtain results from SearchHippo.

Uses pyRXP to tranform the XML from SearchHippo into a Python tree.