Diffstat (limited to 'scraper/s2-citation-report.py')
-rw-r--r--  scraper/s2-citation-report.py | 91
1 file changed, 88 insertions(+), 3 deletions(-)
diff --git a/scraper/s2-citation-report.py b/scraper/s2-citation-report.py
index 5c5fae9a..d70a378a 100644
--- a/scraper/s2-citation-report.py
+++ b/scraper/s2-citation-report.py
@@ -5,9 +5,11 @@ import simplejson as json
import math
import operator
import click
-#import builder
+import subprocess
from util import *
+DIR_PUBLIC_CITATIONS = "../site/datasets/citations"
+
@click.command()
def s2_citation_report():
    addresses = AddressBook()
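
Note: write_master_report() in the hunk below writes into DIR_PUBLIC_CITATIONS before uploading, and the patch assumes that directory already exists. A minimal guard, as a sketch (not part of the patch), would create it at startup:

    import os

    # Create the public citations directory on a fresh checkout;
    # exist_ok avoids raising if it is already there.
    os.makedirs(DIR_PUBLIC_CITATIONS, exist_ok=True)
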
@@ -30,6 +32,82 @@ def s2_citation_report():
print("citations: {}".format(paper_count))
print("geocoded: {} ({}%)".format(geocode_count, percent(geocode_count, paper_count)))
+ write_master_report('{}/{}'.format(DIR_PUBLIC_CITATIONS, "datasets.csv"), papers)
+
+ sts = subprocess.call([
+ "s3cmd", "put", "-P", "--recursive",
+ DIR_PUBLIC_CITATIONS + '/',
+ "s3://megapixels/v1/citations/",
+ ])
+
+def write_master_report(fn, papers):
+    # first make a lookup of the keys that have papers
+    paper_key_lookup = {}
+    for paper in papers:
+        if paper['key'] not in paper_key_lookup:
+            paper_key_lookup[paper['key']] = paper
+
+    # then fetch the statistics sheet, which has fields like "year"
+    fields, rows = fetch_google_sheet('statistics')
+    master_papers = []
+    statistics = {}
+
+    def clean(n):
+        if type(n) is int:
+            return n
+        if type(n) is str and n:
+            s = str(n).replace(',', '').replace('.', '').replace('?', '').strip()
+            try:
+                return int(s)
+            except ValueError:
+                return s
+        if n:
+            return n
+        return None
+
+    for row in rows:
+        key = row[0]
+        if key not in paper_key_lookup:
+            continue
+        paper = paper_key_lookup[key]
+        stats = {}
+        for index, field in enumerate(fields):
+            stats[field] = row[index]
+        report_fn = '../site/content/datasets/{}/index.md'.format(key)
+        has_report = os.path.exists(report_fn)
+        statistics[key] = stats
+        search_result = read_json('./datasets/s2/entries/{}.json'.format(paper['paperId']))
+
+        image_count = stats['images']
+        if type(image_count) is str:
+            if len(image_count):
+                image_count = clean(image_count)
+            else:
+                image_count = None
+        master_papers.append([
+            stats['key'],
+            stats['name'],
+            '/datasets/{}/'.format(key) if has_report else '',
+            image_count,
+            clean(stats['faces_unique']) or None,
+            stats['year_published'],
+            clean(paper['citation_count']) or 0,
+            clean(search_result['citationStats']['numKeyCitations']) or 0,
+            # origin
+        ])
+    master_paper_keys = [
+        'key',
+        'title',
+        'link',
+        'images',
+        'people',
+        'year',
+        'citations',
+        'influenced',
+        # 'origin'
+    ]
+    write_csv(fn, keys=master_paper_keys, rows=master_papers)
+
def write_papers_report(fn, title, papers, key, reverse=False):
    sorted_papers = []
    for paper in sorted(papers, key=lambda x: x[key], reverse=reverse):
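
The s3cmd upload in the hunk above stores the exit status in sts but never checks it. A sketch of a stricter variant, assuming a failed upload should abort the run (the check is an assumption added here, not what the patch does):

    import subprocess

    # subprocess.run(..., check=True) raises CalledProcessError on a
    # non-zero s3cmd exit status instead of continuing silently.
    subprocess.run([
        "s3cmd", "put", "-P", "--recursive",
        DIR_PUBLIC_CITATIONS + '/',
        "s3://megapixels/v1/citations/",
    ], check=True)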
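
clean() normalizes spreadsheet cells that mix integers and formatted strings. It is defined inside write_master_report, but if it were hoisted to module level it would behave like this sketch (example inputs are illustrative):

    assert clean("3,425,806") == 3425806  # separators stripped, parsed as int
    assert clean("unknown") == "unknown"  # non-numeric strings pass through
    assert clean(42) == 42                # ints are returned unchanged
    assert clean("") is None              # empty cells become None
    # Caveat: '.' is stripped as well, so clean("1.5") == 15, not 1.5.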
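
The column order in master_papers matches master_paper_keys, so the published datasets.csv can be read by column name. A sketch of a downstream consumer, assuming util's write_csv emits those keys as a header row:

    import csv

    # Each row describes one dataset: key, title, link, images, people,
    # year, citations, influenced.
    with open('../site/datasets/citations/datasets.csv') as f:
        for row in csv.DictReader(f):
            print(row['key'], row['title'], row['citations'])
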
@@ -105,7 +183,7 @@ def process_paper(row, addresses, success):
    with open(fn, 'r') as f:
        data = json.load(f)
-    print('>> {}'.format(data['paperId']))
+    print('>> {} {}'.format(data['paperId'], row['key']))
    paper = load_paper(data['paperId'])
    if paper is None:
        print("Paper missing! {}".format(data['paperId']))
@@ -145,7 +223,7 @@ def process_paper(row, addresses, success):
            pdf_count += 1
        if has_doi:
            doi_count += 1
-        if citation.data is None:
+        if citation is None or citation.data is None:
            print("Citation missing! {}".format(cite['paperId']))
            continue
        institutions = load_institutions(citationId)
@@ -262,6 +340,13 @@ def process_paper(row, addresses, success):
        f.write('<script src="../map.js"></script>')
        f.write("</html>")
    # template = env.get_template('paper.html')
+    with open('{}/{}.json'.format(DIR_PUBLIC_CITATIONS, row['key']), 'w') as f:
+        json.dump({
+            'id': paper.paper_id,
+            'paper': res,
+            'address': paper_address,
+            'citations': geocoded_citations,
+        }, f)
    return res
def load_megapixels_queries():
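
Alongside datasets.csv, each dataset now gets a <key>.json under the same public prefix. A sketch of reading one back, assuming the structure written by the json.dump above ('duke_mtmc' is a hypothetical key used only for illustration):

    import json

    # The file holds the paper id, the report dict, the paper's geocoded
    # address, and the list of geocoded citations.
    with open('../site/datasets/citations/duke_mtmc.json') as f:
        data = json.load(f)
    print(data['id'], len(data['citations']))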