Skip to content

Ability to get related articles and specify page number #63

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 78 additions & 12 deletions scholar.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,11 +264,12 @@ def __init__(self):
'num_citations': [0, 'Citations', 3],
'num_versions': [0, 'Versions', 4],
'cluster_id': [None, 'Cluster ID', 5],
'url_pdf': [None, 'PDF link', 6],
'url_citations': [None, 'Citations list', 7],
'url_versions': [None, 'Versions list', 8],
'url_citation': [None, 'Citation link', 9],
'excerpt': [None, 'Excerpt', 10],
'article_id': [None, 'Article ID', 6],
'url_pdf': [None, 'PDF link', 7],
'url_citations': [None, 'Citations list', 8],
'url_versions': [None, 'Versions list', 9],
'url_citation': [None, 'Citation link', 10],
'excerpt': [None, 'Excerpt', 11],
}

# The citation data in one of the standard export formats,
Expand Down Expand Up @@ -453,6 +454,9 @@ def _parse_links(self, span):
self.article['url_versions'] = \
self._strip_url_arg('num', self._path2url(tag.get('href')))

if tag.get('href').startswith('/scholar?q=related'):
self.article['article_id'] = tag.get('href').split(':')[1]

if tag.getText().startswith('Import'):
self.article['url_citation'] = self._path2url(tag.get('href'))

Expand Down Expand Up @@ -601,6 +605,9 @@ class ScholarQuery(object):
def __init__(self):
self.url = None

# Page number, used to calculate start index
self.page = 1

# The number of results requested from Scholar -- not the
# total number of results it reports (the latter gets stored
# in attrs, see below).
Expand All @@ -616,6 +623,13 @@ def set_num_page_results(self, num_page_results):
msg = 'maximum number of results on page must be numeric'
self.num_results = ScholarUtils.ensure_int(num_page_results, msg)

def set_page(self, page):
    """
    Selects which page of results this query should request.

    The given value is handed to ScholarUtils.ensure_int together
    with an error message, and whatever that returns is stored in
    self.page.
    """
    self.page = ScholarUtils.ensure_int(page, 'page number must be numeric')

def get_start_index(self):
    """
    Returns the offset of the first result on the selected page,
    i.e. the value for the 'start' URL argument.

    The page size is self.num_results when that is set; otherwise it
    falls back to ScholarConf.MAX_PAGE_RESULTS, the same fallback the
    get_url() implementations use for the 'num' argument, so that
    'start' and 'num' always agree. Without the fallback an unset
    num_results would make the multiplication raise a TypeError.

    The first page, and any page number below 1, maps to offset 0 so
    that the URL never carries a negative start index.
    """
    pages_to_skip = self.page - 1
    if pages_to_skip <= 0:
        return 0

    page_size = self.num_results or ScholarConf.MAX_PAGE_RESULTS
    return page_size * pages_to_skip

def get_url(self):
"""
Returns a complete, submittable URL string for this particular
Expand Down Expand Up @@ -679,6 +693,7 @@ class ClusterScholarQuery(ScholarQuery):
"""
SCHOLAR_CLUSTER_URL = ScholarConf.SCHOLAR_SITE + '/scholar?' \
+ 'cluster=%(cluster)s' \
+ '&start=%(start)s' \
+ '&num=%(num)s'

def __init__(self, cluster=None):
Expand All @@ -699,13 +714,46 @@ def get_url(self):
raise QueryArgumentError('cluster query needs cluster ID')

urlargs = {'cluster': self.cluster,
'start': self.get_start_index(),
'num': self.num_results or ScholarConf.MAX_PAGE_RESULTS}

for key, val in urlargs.items():
urlargs[key] = quote(encode(val))

return self.SCHOLAR_CLUSTER_URL % urlargs

class RelatedScholarQuery(ScholarQuery):
    """
    Query that lists the articles related to a single article,
    identified by an article ID that is already known.
    """
    # URL template; the placeholders are filled in by get_url().
    SCHOLAR_RELATED_URL = (ScholarConf.SCHOLAR_SITE + '/scholar?'
                           'q=related:%(article_id)s:scholar.google.com'
                           '&start=%(start)s'
                           '&num=%(num)s')

    def __init__(self, article_id=None):
        """
        Sets up the query, optionally with the ID of the article whose
        related articles should be fetched.
        """
        ScholarQuery.__init__(self)
        self._add_attribute_type('num_results', 'Results', 0)
        self.article_id = None
        self.set_article_id(article_id)

    def set_article_id(self, article_id):
        """
        Stores the ID of the article whose related articles are wanted.
        """
        self.article_id = article_id

    def get_url(self):
        """
        Returns the complete URL for this related-articles query.

        Raises QueryArgumentError when no article ID has been set.
        """
        if self.article_id is None:
            raise QueryArgumentError('related articles query needs article ID')

        start_index = self.get_start_index()
        page_size = self.num_results or ScholarConf.MAX_PAGE_RESULTS
        raw_args = {'article_id': self.article_id,
                    'start': start_index,
                    'num': page_size}

        # Every value is encoded and URL-quoted before substitution.
        quoted_args = {name: quote(encode(value))
                       for name, value in raw_args.items()}

        return self.SCHOLAR_RELATED_URL % quoted_args


class SearchScholarQuery(ScholarQuery):
"""
Expand All @@ -725,6 +773,7 @@ class SearchScholarQuery(ScholarQuery):
+ '&as_sdt=%(patents)s%%2C5' \
+ '&as_vis=%(citations)s' \
+ '&btnG=&hl=en' \
+ '&start=%(start)s' \
+ '&num=%(num)s'

def __init__(self):
Expand All @@ -735,7 +784,7 @@ def __init__(self):
self.words_none = None # None of these words
self.phrase = None
self.scope_title = False # If True, search in title only
self.author = None
self.author = None
self.pub = None
self.timeframe = [None, None]
self.include_patents = True
Expand Down Expand Up @@ -820,6 +869,7 @@ def get_url(self):
'yhi': self.timeframe[1] or '',
'patents': '0' if self.include_patents else '1',
'citations': '0' if self.include_citations else '1',
'start': self.get_start_index(),
'num': self.num_results or ScholarConf.MAX_PAGE_RESULTS}

for key, val in urlargs.items():
Expand Down Expand Up @@ -1154,8 +1204,12 @@ def main():
help='Do not include citations in results')
group.add_option('-C', '--cluster-id', metavar='CLUSTER_ID', default=None,
help='Do not search, just use articles in given cluster ID')
group.add_option('-r', '--related', metavar='ARTICLE_ID', default=None,
help='Get related articles of a given article ID')
group.add_option('-c', '--count', type='int', default=None,
help='Maximum number of results')
group.add_option('--page', type='int', default=None,
help='Page number, default to 1')
parser.add_option_group(group)

group = optparse.OptionGroup(parser, 'Output format',
Expand Down Expand Up @@ -1200,14 +1254,21 @@ def main():
if options.cookie_file:
ScholarConf.COOKIE_JAR_FILE = options.cookie_file

have_search_arguments = options.author or options.allw or options.some or options.none \
or options.phrase or options.title_only or options.pub \
or options.after or options.before

# Sanity-check the options: if they include a cluster ID query, it
# makes no sense to have search arguments:
if options.cluster_id is not None:
if options.author or options.allw or options.some or options.none \
or options.phrase or options.title_only or options.pub \
or options.after or options.before:
print('Cluster ID queries do not allow additional search arguments.')
return 1
if options.cluster_id is not None and have_search_arguments:
print('Cluster ID queries do not allow additional search arguments.')
return 1

# Sanity-check the options: if they include a related articles query, it
# makes no sense to have search arguments:
if options.related is not None and have_search_arguments:
print('Related articles queries do not allow additional search arguments.')
return 1

querier = ScholarQuerier()
settings = ScholarSettings()
Expand All @@ -1228,6 +1289,8 @@ def main():

if options.cluster_id:
query = ClusterScholarQuery(cluster=options.cluster_id)
elif options.related:
query = RelatedScholarQuery(article_id=options.related)
else:
query = SearchScholarQuery()
if options.author:
Expand Down Expand Up @@ -1255,6 +1318,9 @@ def main():
options.count = min(options.count, ScholarConf.MAX_PAGE_RESULTS)
query.set_num_page_results(options.count)

if options.page is not None:
query.set_page(options.page)

querier.send_query(query)

if options.csv:
Expand Down