summary | refs | log | tree | commit | diff
path: root/scraper/s2-final-report.py
diff options
context:
space:
mode:
Diffstat (limited to 'scraper/s2-final-report.py')
-rw-r--r-- scraper/s2-final-report.py 62
1 files changed, 33 insertions, 29 deletions
diff --git a/scraper/s2-final-report.py b/scraper/s2-final-report.py
index 489e43df..3cceff43 100644
--- a/scraper/s2-final-report.py
+++ b/scraper/s2-final-report.py
@@ -9,6 +9,7 @@ import subprocess
from util import *
DIR_PUBLIC_CITATIONS = "../site/datasets/final"
+DIR_UNKNOWN_CITATIONS = "../site/datasets/unknown"
@click.command()
def s2_final_report():
@@ -23,11 +24,12 @@ def s2_final_report():
def process_paper(row, addresses):
aggregate_citations = {}
+ unknown_citations = {}
address = None
papers = []
print(row['paper_ids'])
for paper_id in row['paper_ids']:
- res = process_single_paper(row, paper_id, addresses, aggregate_citations)
+ res = process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_citations)
if res:
papers.append(res)
if res['address']:
@@ -42,8 +44,16 @@ def process_paper(row, addresses):
'additional_papers': papers[1:],
'citations': [aggregate_citations[key] for key in aggregate_citations.keys()],
}, f)
+ with open('{}/{}.json'.format(DIR_UNKNOWN_CITATIONS, row['key']), 'w') as f:
+ json.dump({
+ 'id': papers[0]['paper_id'],
+ 'paper': papers[0],
+ 'address': address,
+ 'additional_papers': papers[1:],
+ 'citations': [aggregate_citations[key] for key in aggregate_citations.keys()],
+ }, f)
-def process_single_paper(row, paper_id, addresses, aggregate_citations):
+def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_citations):
res = {
'paper_id': '',
'key': '',
@@ -60,13 +70,6 @@ def process_single_paper(row, paper_id, addresses, aggregate_citations):
# 'citations_doi': 0,
}
- geocoded_citations = []
- unknown_citations = []
- empty_citations = []
- pdf_count = 0
- doi_count = 0
- address_count = 0
-
fn = file_path('papers', paper_id, 'paper.json')
with open(fn, 'r') as f:
@@ -103,14 +106,16 @@ def process_single_paper(row, paper_id, addresses, aggregate_citations):
citationId = cite['paperId']
if citationId in aggregate_citations:
continue
+ elif citationId in unknown_citations:
+ continue
seen_here = {}
citation = load_paper(citationId)
has_pdf = os.path.exists(file_path('pdf', citationId, 'paper.txt'))
has_doi = os.path.exists(file_path('doi', citationId, 'paper.doi'))
- if has_pdf:
- pdf_count += 1
- if has_doi:
- doi_count += 1
+ # if has_pdf:
+ # pdf_count += 1
+ # if has_doi:
+ # doi_count += 1
if citation is None or citation.data is None:
print("Citation missing! {}".format(cite['paperId']))
continue
@@ -120,7 +125,7 @@ def process_single_paper(row, paper_id, addresses, aggregate_citations):
institution = ''
address = None
for inst in sorted(institutions, key=operator.itemgetter(1)):
- address_count += 1
+ # address_count += 1
institution = inst[1]
next_address = addresses.findObject(institution)
if next_address and next_address['address'] not in seen_here:
@@ -142,21 +147,20 @@ def process_single_paper(row, paper_id, addresses, aggregate_citations):
address = next_address
geocoded_addresses.append(next_address)
if address:
- if citationId not in aggregate_citations:
- aggregate_citations[citationId] = {
- 'id': citationId,
- 'title': citation.title,
- 'addresses': geocoded_addresses,
- 'year': citation.year,
- 'pdf': citation.pdf_link,
- }
-
- # res['citation_count'] = len(data['citations'])
- # res['citations_geocoded'] = len(geocoded_citations)
- # res['citations_unknown'] = len(unknown_citations)
- # res['citations_empty'] = len(empty_citations)
- # res['citations_pdf'] = pdf_count
- # res['citations_doi'] = doi_count
+ aggregate_citations[citationId] = {
+ 'id': citationId,
+ 'title': citation.title,
+ 'addresses': geocoded_addresses,
+ 'year': citation.year,
+ 'pdf': citation.pdf_link,
+ }
+ else:
+ unknown_citations[citationId] = {
+ 'id': citationId,
+ 'title': citation.title,
+ 'year': citation.year,
+ 'pdf': citation.pdf_link,
+ }
return res
def load_ft_lookup():