diff options
Diffstat (limited to 'scraper/s2-final-report.py')
| -rw-r--r-- | scraper/s2-final-report.py | 62 |
1 files changed, 33 insertions, 29 deletions
diff --git a/scraper/s2-final-report.py b/scraper/s2-final-report.py index 489e43df..3cceff43 100644 --- a/scraper/s2-final-report.py +++ b/scraper/s2-final-report.py @@ -9,6 +9,7 @@ import subprocess from util import * DIR_PUBLIC_CITATIONS = "../site/datasets/final" +DIR_UNKNOWN_CITATIONS = "../site/datasets/unknown" @click.command() def s2_final_report(): @@ -23,11 +24,12 @@ def s2_final_report(): def process_paper(row, addresses): aggregate_citations = {} + unknown_citations = {} address = None papers = [] print(row['paper_ids']) for paper_id in row['paper_ids']: - res = process_single_paper(row, paper_id, addresses, aggregate_citations) + res = process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_citations) if res: papers.append(res) if res['address']: @@ -42,8 +44,16 @@ def process_paper(row, addresses): 'additional_papers': papers[1:], 'citations': [aggregate_citations[key] for key in aggregate_citations.keys()], }, f) + with open('{}/{}.json'.format(DIR_UNKNOWN_CITATIONS, row['key']), 'w') as f: + json.dump({ + 'id': papers[0]['paper_id'], + 'paper': papers[0], + 'address': address, + 'additional_papers': papers[1:], + 'citations': [aggregate_citations[key] for key in aggregate_citations.keys()], + }, f) -def process_single_paper(row, paper_id, addresses, aggregate_citations): +def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_citations): res = { 'paper_id': '', 'key': '', @@ -60,13 +70,6 @@ def process_single_paper(row, paper_id, addresses, aggregate_citations): # 'citations_doi': 0, } - geocoded_citations = [] - unknown_citations = [] - empty_citations = [] - pdf_count = 0 - doi_count = 0 - address_count = 0 - fn = file_path('papers', paper_id, 'paper.json') with open(fn, 'r') as f: @@ -103,14 +106,16 @@ def process_single_paper(row, paper_id, addresses, aggregate_citations): citationId = cite['paperId'] if citationId in aggregate_citations: continue + elif citationId in unknown_citations: + continue seen_here = {} citation = load_paper(citationId) has_pdf = os.path.exists(file_path('pdf', citationId, 'paper.txt')) has_doi = os.path.exists(file_path('doi', citationId, 'paper.doi')) - if has_pdf: - pdf_count += 1 - if has_doi: - doi_count += 1 + # if has_pdf: + # pdf_count += 1 + # if has_doi: + # doi_count += 1 if citation is None or citation.data is None: print("Citation missing! {}".format(cite['paperId'])) continue @@ -120,7 +125,7 @@ def process_single_paper(row, paper_id, addresses, aggregate_citations): institution = '' address = None for inst in sorted(institutions, key=operator.itemgetter(1)): - address_count += 1 + # address_count += 1 institution = inst[1] next_address = addresses.findObject(institution) if next_address and next_address['address'] not in seen_here: @@ -142,21 +147,20 @@ def process_single_paper(row, paper_id, addresses, aggregate_citations): address = next_address geocoded_addresses.append(next_address) if address: - if citationId not in aggregate_citations: - aggregate_citations[citationId] = { - 'id': citationId, - 'title': citation.title, - 'addresses': geocoded_addresses, - 'year': citation.year, - 'pdf': citation.pdf_link, - } - - # res['citation_count'] = len(data['citations']) - # res['citations_geocoded'] = len(geocoded_citations) - # res['citations_unknown'] = len(unknown_citations) - # res['citations_empty'] = len(empty_citations) - # res['citations_pdf'] = pdf_count - # res['citations_doi'] = doi_count + aggregate_citations[citationId] = { + 'id': citationId, + 'title': citation.title, + 'addresses': geocoded_addresses, + 'year': citation.year, + 'pdf': citation.pdf_link, + } + else: + unknown_citations[citationId] = { + 'id': citationId, + 'title': citation.title, + 'year': citation.year, + 'pdf': citation.pdf_link, + } return res def load_ft_lookup(): |
