diff options
| author | Jules Laplace <julescarbon@gmail.com> | 2018-12-16 16:29:04 +0100 |
|---|---|---|
| committer | Jules Laplace <julescarbon@gmail.com> | 2018-12-16 16:29:04 +0100 |
| commit | 05fc975a313aa38483d904cb9ad07a029641d086 (patch) | |
| tree | 85f3d4fdf3688c2779d3ca3ba9c59910a48d1df9 /scraper/s2-citation-report.py | |
| parent | 110f3a34f1f36d0ea999d4aa34bbe66d5f2a01da (diff) | |
rebuild
Diffstat (limited to 'scraper/s2-citation-report.py')
| -rw-r--r-- | scraper/s2-citation-report.py | 67 |
1 files changed, 62 insertions, 5 deletions
```diff
diff --git a/scraper/s2-citation-report.py b/scraper/s2-citation-report.py
index e0d812d7..b5849329 100644
--- a/scraper/s2-citation-report.py
+++ b/scraper/s2-citation-report.py
@@ -32,16 +32,73 @@ def s2_citation_report():
   print("citations: {}".format(paper_count))
   print("geocoded: {} ({}%)".format(geocode_count, percent(geocode_count, paper_count)))
 
-  # fetch_google_sheet
+  write_master_report('{}/{}'.format(DIR_PUBLIC_CITATIONS, "datasets.csv"), papers)
 
   sts = subprocess.call([
-    "s3cmd", "sync",
+    "s3cmd", "put", "-P", "--recursive",
     DIR_PUBLIC_CITATIONS + '/',
     "s3://megapixels/v1/citations/",
   ])
 
-def write_master_report(fn, title, papers, key):
-  keys, rows = fetch_google_sheet('statistics')
+def write_master_report(fn, papers):
+  # first make a lookup of the keys that have papers
+  paper_key_lookup = {}
+  for paper in papers:
+    if paper['key'] not in paper_key_lookup:
+      paper_key_lookup[paper['key']] = paper
+
+  # then fetch the statistics csv which has things like "year"
+  fields, rows = fetch_google_sheet('statistics')
+  master_papers = []
+  statistics = {}
+
+  def clean(n):
+    if n:
+      return int(n.replace(',','').replace('.','').replace('?','').strip())
+    return None
+
+  for row in rows:
+    key = row[0]
+    if key not in paper_key_lookup:
+      continue
+    paper = paper_key_lookup[key]
+    stats = {}
+    for index, field in enumerate(fields):
+      stats[field] = row[index]
+    report_fn = '../site/content/datasets/{}/index.md'.format(key)
+    has_report = os.path.exists(report_fn)
+    statistics[key] = stats
+    search_result = read_json('./datasets/s2/entries/{}.json'.format(paper['paperId']))
+
+    image_count = stats['images']
+    if type(image_count) is str:
+      if len(image_count):
+        image_count = clean(image_count)
+      else:
+        image_count = None,
+    master_papers.append([
+      stats['key'],
+      stats['name'],
+      '/datasets/{}/'.format(key) if has_report else '',
+      image_count,
+      clean(stats['faces_unique']) or None,
+      stats['year_published'],
+      clean(paper['citation_count']) or 0,
+      clean(search_result['citationStats']['numKeyCitations']) or 0,
+      # origin
+    ])
+  master_paper_keys = [
+    'key',
+    'title',
+    'link',
+    'images',
+    'people',
+    'year',
+    'citations',
+    'influenced',
+    # 'origin'
+  ]
+  write_csv(fn, keys=master_paper_keys, rows=master_papers)
 
 def write_papers_report(fn, title, papers, key, reverse=False):
   sorted_papers = []
@@ -275,7 +332,7 @@ def process_paper(row, addresses, success):
     f.write('<script src="../map.js"></script>')
     f.write("</html>")
   # template = env.get_template('paper.html')
-  with open('{}/{}.json'.format(DIR_PUBLIC_CITATIONS, paper.paper_id), 'w') as f:
+  with open('{}/{}.json'.format(DIR_PUBLIC_CITATIONS, paper.key), 'w') as f:
     json.dump({
       'id': paper.paper_id,
       'paper': res,
```
