diff options
| author | adamhrv <adam@ahprojects.com> | 2018-12-16 19:38:54 +0100 |
|---|---|---|
| committer | adamhrv <adam@ahprojects.com> | 2018-12-16 19:38:54 +0100 |
| commit | 23e9fef5dce8b0b15dd94713816b9d7d45f12356 (patch) | |
| tree | 3ca9ffe3adce76318450991bfc613073470b604c /scraper/s2-citation-report.py | |
| parent | 759027d5fbfd6665082f72a3ceaeef68c2d2142e (diff) | |
| parent | 6431d06048791763f3644b3a0457cc9c4f1df6d3 (diff) | |
Merge branch 'master' of github.com:adamhrv/megapixels_dev
Diffstat (limited to 'scraper/s2-citation-report.py')
| -rw-r--r-- | scraper/s2-citation-report.py | 91 |
1 file changed, 88 insertions, 3 deletions
diff --git a/scraper/s2-citation-report.py b/scraper/s2-citation-report.py index 5c5fae9a..d70a378a 100644 --- a/scraper/s2-citation-report.py +++ b/scraper/s2-citation-report.py @@ -5,9 +5,11 @@ import simplejson as json import math import operator import click -#import builder +import subprocess from util import * +DIR_PUBLIC_CITATIONS = "../site/datasets/citations" + @click.command() def s2_citation_report(): addresses = AddressBook() @@ -30,6 +32,82 @@ def s2_citation_report(): print("citations: {}".format(paper_count)) print("geocoded: {} ({}%)".format(geocode_count, percent(geocode_count, paper_count))) + write_master_report('{}/{}'.format(DIR_PUBLIC_CITATIONS, "datasets.csv"), papers) + + sts = subprocess.call([ + "s3cmd", "put", "-P", "--recursive", + DIR_PUBLIC_CITATIONS + '/', + "s3://megapixels/v1/citations/", + ]) + +def write_master_report(fn, papers): + # first make a lookup of the keys that have papers + paper_key_lookup = {} + for paper in papers: + if paper['key'] not in paper_key_lookup: + paper_key_lookup[paper['key']] = paper + + # then fetch the statistics csv which has things like "year" + fields, rows = fetch_google_sheet('statistics') + master_papers = [] + statistics = {} + + def clean(n): + if type(n) is int: + return n + if type(n) is str and n: + s = str(n).replace(',','').replace('.','').replace('?','').strip() + try: + return int(s) + except e: + return s + if n: + return n + return None + + for row in rows: + key = row[0] + if key not in paper_key_lookup: + continue + paper = paper_key_lookup[key] + stats = {} + for index, field in enumerate(fields): + stats[field] = row[index] + report_fn = '../site/content/datasets/{}/index.md'.format(key) + has_report = os.path.exists(report_fn) + statistics[key] = stats + search_result = read_json('./datasets/s2/entries/{}.json'.format(paper['paperId'])) + + image_count = stats['images'] + if type(image_count) is str: + if len(image_count): + image_count = clean(image_count) + else: + image_count 
= None, + master_papers.append([ + stats['key'], + stats['name'], + '/datasets/{}/'.format(key) if has_report else '', + image_count, + clean(stats['faces_unique']) or None, + stats['year_published'], + clean(paper['citation_count']) or 0, + clean(search_result['citationStats']['numKeyCitations']) or 0, + # origin + ]) + master_paper_keys = [ + 'key', + 'title', + 'link', + 'images', + 'people', + 'year', + 'citations', + 'influenced', + # 'origin' + ] + write_csv(fn, keys=master_paper_keys, rows=master_papers) + def write_papers_report(fn, title, papers, key, reverse=False): sorted_papers = [] for paper in sorted(papers, key=lambda x: x[key], reverse=reverse): @@ -105,7 +183,7 @@ def process_paper(row, addresses, success): with open(fn, 'r') as f: data = json.load(f) - print('>> {}'.format(data['paperId'])) + print('>> {} {}'.format(data['paperId'], row['key'])) paper = load_paper(data['paperId']) if paper is None: print("Paper missing! {}".format(data['paperId'])) @@ -145,7 +223,7 @@ def process_paper(row, addresses, success): pdf_count += 1 if has_doi: doi_count += 1 - if citation.data is None: + if citation is None or citation.data is None: print("Citation missing! {}".format(cite['paperId'])) continue institutions = load_institutions(citationId) @@ -262,6 +340,13 @@ def process_paper(row, addresses, success): f.write('<script src="../map.js"></script>') f.write("</html>") # template = env.get_template('paper.html') + with open('{}/{}.json'.format(DIR_PUBLIC_CITATIONS, row['key']), 'w') as f: + json.dump({ + 'id': paper.paper_id, + 'paper': res, + 'address': paper_address, + 'citations': geocoded_citations, + }, f) return res def load_megapixels_queries(): |
