# (Web page navigation residue: Welcome, guest | Sign In | My Account | Store | Cart)
# Google Hits 1.01
# Date: 2008/06/08
# License: As-is; public domain
# Prerequisites: Python 2.5.2

# Description:
# The getHits function in this file takes as input a search term string, and returns as output a dict containing the number of web hits returned for that search term. The Google search engine and a SQLite database are used.
# The optional input arguments are:
#  - cond_enc (bool) (default is False): enclose the search term in quotes if it contains more than one word
#  - db_name (default is 'Hits.sqlite'): the name of the database to use for caching hits
#  - max_age (default is 30): the maximum acceptable age in days of cached hits

# Usage:
# from getHits import getHits
# hits=getHits('sample search term')['hits']

# Keywords:
# hits, count, popularity
# web search hits, web hits, Google hits

# Import needed modules
import datetime, os.path, re, sqlite3, urllib2

def getHits(term,cond_enc=False,db_name='Hits.sqlite',max_age=30):
    """Get web search hits for a given term.

    Cached hits are looked up in the SQLite database first. If there is no
    cached entry for the term, or the entry is older than max_age days, a
    Google web search is executed and its result is stored in the database.

    Arguments:
     - term (str or unicode): the search term
     - cond_enc (bool): enclose the term in double quotes if it contains a space
     - db_name: the name of the database to use for caching hits
     - max_age: the maximum acceptable age in days of cached hits

    Returns a dict with the keys 'term' (the term as searched, including any
    added quotes), 'inDb' (True if the hits came from the database) and 'hits'
    (the number of hits). After a web search the dict also contains
    'web search term' and 'datetimeutc'.

    Raises AssertionError if term is not a string.
    """

    # Validate term
    assert (isinstance(term,str) or isinstance(term,unicode)), 'Term must be a string'

    def getHitsDb(hits,con,max_age):
        """Get web search hits for a given term from a database if available"""

        # Get hits from database if available
        row=con.execute('SELECT Hits,DateTimeUTC FROM Hits WHERE Term=?', (hits['term'],)).fetchone()

        # No cached entry for this term
        if row is None:
            hits['inDb']=False
            return hits

        # Determine age of cached entry in (fractional) days
        cached_at=datetime.datetime.strptime(row['DateTimeUTC'],'%Y-%m-%d %H:%M:%S')
        age=datetime.datetime.utcnow()-cached_at
        age_days=age.days+age.seconds/86400.0 # 86400 seconds per day

        # Conditionally determine hits based on age of hits
        if age_days>max_age:
            # Entry is stale; remove it so that a fresh one can be inserted
            con.execute('DELETE FROM Hits WHERE Term=?', (hits['term'],))
            hits['inDb']=False
        else:
            hits['inDb']=True
            hits['hits']=row['Hits']

        # Return updated hits
        return hits

    def getHitsWeb(hits):
        """Get web search hits for a given term from a web search"""

        # Set parameters
        url='http://google.com/search?' # Set web search URL

        # Generate web search term; a unicode term is encoded as UTF-8 first
        # because quoting a non-ASCII unicode string fails
        query=hits['term']
        if not isinstance(query,str): query=query.encode('utf-8')
        hits['web search term']=urllib2.quote(query)

        # Execute web search
        request=urllib2.Request('%sq=%s'%(url,hits['web search term']))
        request.add_header('User-Agent','')
        response=urllib2.urlopen(request)
        try:
            page=response.read()
        finally:
            response.close()

        # Store date and time of web search
        hits['datetimeutc']=datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')

        # Parse web search results to determine hits
        # NOTE(review): the pattern depends on the exact markup of the results
        # page; when it does not match, zero hits are reported (and cached)
        match=re.search('Results <b>1</b> - <b>10</b> of about <b>(?P<hits>.+?)</b> for <b>',page)
        if match is not None:
            hits['hits']=int(match.group('hits').replace(',',''))
        else:
            hits['hits']=0

        # Return updated hits
        return hits

    def setHitsDb(con,hits):
        """Store web search hits for a given term in the database"""
        con.execute('INSERT INTO Hits (Term,Hits,DateTimeUTC) VALUES (?,?,?)', (hits['term'],hits['hits'],hits['datetimeutc']))

    def createDb(con):
        """Create the table used to store web search hits"""
        con.execute("""CREATE TABLE `Hits` (`Term` CHAR PRIMARY KEY  NOT NULL , `Hits` INTEGER NOT NULL , `DateTimeUTC` DATETIME NOT NULL )""")

    # Determine whether the database must be created (checked before connecting)
    is_new=not os.path.isfile(db_name)

    # Initialize database (isolation_level=None selects autocommit mode)
    con=sqlite3.connect(db_name,isolation_level=None)
    try:
        con.row_factory=sqlite3.Row

        # Create database if not created
        if is_new: createDb(con)

        # Create dict to store relevant info
        hits={'term':term}

        # Conditionally enclose term in quotes
        if cond_enc and ' ' in hits['term']: hits['term']='"%s"'%hits['term']

        # Get hits
        hits=getHitsDb(hits,con,max_age) # Get hits from database if available
        if not hits['inDb']:
            hits=getHitsWeb(hits) # Get hits from web
            setHitsDb(con,hits) # Store hits in database
    finally:
        # Always release the database connection, even if the search fails
        con.close()

    # Return hits
    return hits

def cleanDb(db_name='Hits.sqlite',max_age=30):
    """Delete entries from database that are older than the specified maximum age in days.

    Arguments:
     - db_name: the name of the database that caches the hits
     - max_age: entries older than this many days are deleted
    """

    # Determine threshold for date and time for old entries
    min_datetime=datetime.datetime.utcnow()-datetime.timedelta(days=max_age)
    min_datetime=min_datetime.strftime('%Y-%m-%d %H:%M:%S')

    # Delete old entries from database; the zero-padded timestamp format sorts
    # chronologically, so a plain comparison against the threshold works
    con=sqlite3.connect(db_name,isolation_level=None)
    try:
        con.execute('DELETE FROM Hits WHERE DateTimeUTC<?', (min_datetime,))
    finally:
        con.close()

# History
#
#   • revision 2 (15 years ago)
#   • previous revisions are not available