ActiveState Code

Recipe 523047: Search Google scholar


This code allows you to search Google Scholar from Python code. The results are returned as a list of dictionaries, one per publication, with each field addressed by its key.

Python
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import httplib
import urllib
from BeautifulSoup import BeautifulSoup
import re

class GoogleScholarSearch:
    """
    @brief This class searches Google Scholar (http://scholar.google.com)

    Search for articles and publications containing terms of interest.
    The result page is screen-scraped, so the parsing depends on the
    markup Google Scholar served when this recipe was written.

    Usage example:
        > from google_search import *
        > searcher = GoogleScholarSearch()
        > searcher.search(['breast cancer', 'gene'])
    """
    def __init__(self):
        """
        @brief Set the host and base path used to build search requests.
        """
        self.SEARCH_HOST = "scholar.google.com"
        self.SEARCH_BASE_URL = "/scholar"

    def search(self, terms, limit=10):
        """
        @brief This function searches Google Scholar using the specified terms.

        Returns a list of dictionaries. Each dictionary contains the
        information related to one article:
            "URL"         : link to the article ('' when no link is found)
            "Title"       : title of the publication
            "Authors"     : authors (example: DF Easton, DT Bishop, D Ford)
            "JournalYear" : journal name & year (example: Nature, 2001)
            "JournalURL"  : link to the journal main website (example: www.nature.com)
            "Abstract"    : abstract snippet of the publication
            "NumCited"    : number of times the publication is cited
            "Terms"       : list of search terms used in the query

        If the server answers with anything other than HTTP 200, the status
        and reason are printed and an empty list is returned.

        @param terms List of search terms
        @param limit Maximum number of results to be returned (default=10)
        @return List of results, this is the empty list if nothing is found
        """
        # urlencode() turns spaces into '+' itself; joining the terms with a
        # literal '+' would be escaped to '%2B' and sent as part of the query.
        params = urllib.urlencode({'q': " ".join(terms), 'num': limit})
        headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
        url = self.SEARCH_BASE_URL + "?" + params

        conn = httplib.HTTPConnection(self.SEARCH_HOST)
        try:
            # A GET request carries no body, so pass None rather than a dict.
            conn.request("GET", url, None, headers)
            resp = conn.getresponse()
            if resp.status != 200:
                # Single-argument form parses as a print statement too; the
                # double space reproduces the original two-statement output.
                print("ERROR:  %s %s" % (resp.status, resp.reason))
                return []
            html = resp.read()
        finally:
            # Release the socket on success, on error status and on exceptions.
            conn.close()

        # Drop every non-ASCII byte before parsing.
        html = html.decode('ascii', 'ignore')

        # Screen-scrape the result to obtain the publication information
        soup = BeautifulSoup(html)
        return [self._parse_record(record, terms)
                for record in soup('p', {'class': 'g'})]

    def _parse_record(self, record, terms):
        """
        @brief Extract the fields of a single search hit.

        @param record Parsed element of one result (a 'p' tag of class 'g')
        @param terms List of search terms, stored unchanged under "Terms"
        @return Dictionary with the keys documented on search()
        """
        record_html = str(record)

        top_part = record.first('span', {'class': 'w'})
        pub_url = top_part.a['href']
        if self._is_missing(pub_url):
            pub_url = ''
        else:
            # Clean up the URL, make sure it does not contain '\' but '/' instead
            pub_url = pub_url.replace('\\', '/')

        pub_title = "".join([str(part) for part in top_part.a.contents])
        if pub_title == "":
            # No linked title: fall back to the text after the [CITATION]
            # marker. The regex runs on this record only, so the first match
            # is the one wanted (a page-wide counter would index out of range).
            matches = re.findall(r'<b>\[CITATION\]<\/b><\/font>(.*)- <a', record_html)
            if matches:
                title_html = re.split('- <a', matches[0])[0]
                pub_title = re.sub(r'<\/?(\S)+>', "", title_html)

        byline = record.first('font', {'color': 'green'}).string
        if self._is_missing(byline):
            byline = ''
            # The green font element may not expose a single string (for
            # example when it has nested tags), so fall back to a regex.
            found = re.findall('<font color="green">(.*)</font>', record_html)
            if found:
                byline = found[0]
        pub_authors, pub_journal_year, pub_journal_url = self._split_byline(byline)

        # Look for the '...' marker inside this record only; searching the
        # whole page would give every result the first hit's abstract. If
        # the abstract fits without any '...', only the trailing marker is
        # produced.
        node = record.firstText("...").parent
        pub_abstract = ""
        while not self._is_missing(node) and (
                str(node) != '<b>...</b>' or pub_abstract == ""):
            pub_abstract += str(node)
            node = node.nextSibling
        pub_abstract += '<b>...</b>'

        cited = re.search("Cited by ([^<]*)", record_html)
        pub_citation = ''
        if cited is not None:
            pub_citation = cited.group(1)

        return {
            "URL": pub_url,
            "Title": pub_title,
            "Authors": pub_authors,
            "JournalYear": pub_journal_year,
            "JournalURL": pub_journal_url,
            "Abstract": pub_abstract,
            "NumCited": pub_citation,
            "Terms": terms,
        }

    @staticmethod
    def _split_byline(byline):
        """
        @brief Split an "authors - journal, year - site" line into its parts.

        Assumes the fields are delimited by ' - ': the first entry is the
        list of authors, the last entry is the journal URL and anything in
        between is the journal year.

        @param byline Text of the green author line
        @return Tuple (authors, journal_year, journal_url)
        """
        first = byline.find(' - ')
        if first == -1:
            # No delimiter at all: the whole line is the author list. Slicing
            # with the -1 from find() would drop the last character instead.
            return byline, '', ''
        last = byline.rfind(' - ')
        authors = byline[:first]
        journal_year = byline[first + 3:last]
        journal_url = byline[last + 3:]
        # If only one ' - ' is found and the end bit contains four digits,
        # then the last bit is the journal year instead of the journal URL.
        if journal_year == '' and re.search(r'\d\d\d\d', journal_url) is not None:
            journal_year = journal_url
            journal_url = ''
        return authors, journal_year, journal_url

    @staticmethod
    def _is_missing(node):
        """
        @brief Tell whether a parser lookup came back empty.

        The name of the parser's "not found" object is not imported in this
        file, so it is recognised by its string form 'Null'; None is treated
        as missing as well.

        @param node Value returned by a parser lookup
        @return True if nothing was found
        """
        return node is None or str(node) == 'Null'

if __name__ == '__main__':
    # Demo run: query Google Scholar for two sample terms and print a few
    # fields of every hit, one per line, followed by a separator rule.
    scholar = GoogleScholarSearch()
    shown_fields = ('Title', 'Authors', 'JournalYear', 'Terms')
    for hit in scholar.search(["breast cancer", "gene"], 10):
        for field in shown_fields:
            # A parenthesised single value prints exactly like the bare
            # print statement does.
            print(hit[field])
        print("======================================")

Discussion

As far as I know, this is the only way to retrieve Google Scholar search results from Python, since Google does not provide an API for Google Scholar. Note that you will need an older version of BeautifulSoup (v2.1.1), which can be downloaded from http://www.physics.ox.ac.uk/users/santoso/BeautifulSoup.py.

Comments

  1. At 5:20 a.m. on 29 Nov 2007, Taoufik En-Najjary said:

    Mistake. It did not work. I think you forgot to define authorPart's attributes.

Sign in to comment