author    Jules Laplace <julescarbon@gmail.com>  2018-12-16 16:29:04 +0100
committer Jules Laplace <julescarbon@gmail.com>  2018-12-16 16:29:04 +0100
commit    05fc975a313aa38483d904cb9ad07a029641d086 (patch)
tree      85f3d4fdf3688c2779d3ca3ba9c59910a48d1df9 /scraper/s2-citation-report.py
parent    110f3a34f1f36d0ea999d4aa34bbe66d5f2a01da (diff)
rebuild
Diffstat (limited to 'scraper/s2-citation-report.py')
-rw-r--r--  scraper/s2-citation-report.py | 67
1 file changed, 62 insertions(+), 5 deletions(-)
diff --git a/scraper/s2-citation-report.py b/scraper/s2-citation-report.py
index e0d812d7..b5849329 100644
--- a/scraper/s2-citation-report.py
+++ b/scraper/s2-citation-report.py
@@ -32,16 +32,73 @@ def s2_citation_report():
     print("citations: {}".format(paper_count))
     print("geocoded: {} ({}%)".format(geocode_count, percent(geocode_count, paper_count)))
 
-    # fetch_google_sheet
+    write_master_report('{}/{}'.format(DIR_PUBLIC_CITATIONS, "datasets.csv"), papers)
 
     sts = subprocess.call([
-        "s3cmd", "sync",
+        "s3cmd", "put", "-P", "--recursive",
        DIR_PUBLIC_CITATIONS + '/',
        "s3://megapixels/v1/citations/",
     ])
-def write_master_report(fn, title, papers, key):
-    keys, rows = fetch_google_sheet('statistics')
+def write_master_report(fn, papers):
+    # first make a lookup of the keys that have papers
+    paper_key_lookup = {}
+    for paper in papers:
+        if paper['key'] not in paper_key_lookup:
+            paper_key_lookup[paper['key']] = paper
+
+    # then fetch the statistics csv which has fields like "year"
+    fields, rows = fetch_google_sheet('statistics')
+    master_papers = []
+    statistics = {}
+
+    # strip thousands separators and stray punctuation from a numeric cell
+    def clean(n):
+        if n:
+            n = n.replace(',', '').replace('.', '').replace('?', '').strip()
+            return int(n) if n else None
+        return None
+
+    for row in rows:
+        key = row[0]
+        if key not in paper_key_lookup:
+            continue
+        paper = paper_key_lookup[key]
+        stats = {}
+        for index, field in enumerate(fields):
+            stats[field] = row[index]
+        report_fn = '../site/content/datasets/{}/index.md'.format(key)
+        has_report = os.path.exists(report_fn)
+        statistics[key] = stats
+        search_result = read_json('./datasets/s2/entries/{}.json'.format(paper['paperId']))
+
+        image_count = stats['images']
+        if type(image_count) is str:
+            if len(image_count):
+                image_count = clean(image_count)
+            else:
+                image_count = None
+        master_papers.append([
+            stats['key'],
+            stats['name'],
+            '/datasets/{}/'.format(key) if has_report else '',
+            image_count,
+            clean(stats['faces_unique']) or None,
+            stats['year_published'],
+            clean(paper['citation_count']) or 0,
+            clean(search_result['citationStats']['numKeyCitations']) or 0,
+            # origin
+        ])
+    master_paper_keys = [
+        'key',
+        'title',
+        'link',
+        'images',
+        'people',
+        'year',
+        'citations',
+        'influenced',
+        # 'origin'
+    ]
+    write_csv(fn, keys=master_paper_keys, rows=master_papers)
 
 def write_papers_report(fn, title, papers, key, reverse=False):
     sorted_papers = []
@@ -275,7 +332,7 @@ def process_paper(row, addresses, success):
         f.write('<script src="../map.js"></script>')
         f.write("</html>")
     # template = env.get_template('paper.html')
-    with open('{}/{}.json'.format(DIR_PUBLIC_CITATIONS, paper.paper_id), 'w') as f:
+    with open('{}/{}.json'.format(DIR_PUBLIC_CITATIONS, paper.key), 'w') as f:
         json.dump({
             'id': paper.paper_id,
             'paper': res,