import os
import re
import glob
import math
import operator
import subprocess

import simplejson as json
import click

from util import *

DIR_PUBLIC_CITATIONS = "../site/datasets/citations"
DIR_FINAL_CITATIONS = "../site/datasets/final"
DIR_UNKNOWN_CITATIONS = "../site/datasets/unknown"
DIR_VERIFIED_CITATIONS = "../site/datasets/verified"

addresses = AddressBook()
paper_location_lookup = fetch_google_lookup('paper_locations', item_key='paper_id')


@click.command()
def s2_final_report():
    """Build citation reports for the shareable datasets and upload the verified reports to S3."""
    megapixels = load_megapixels_lookup()
    verified_lookup = fetch_verified_paper_lookup()
    items = []
    for key, item in megapixels.items():
        ft_share = 'ft_share' in item['dataset'] and item['dataset']['ft_share'] == 'Y'
        nyt_share = 'nyt_share' in item['dataset'] and item['dataset']['nyt_share'] == 'Y'
        if ft_share or nyt_share:
            if key in verified_lookup:
                lookup = verified_lookup[key]
            else:
                lookup = {}
            items.append((item, lookup,))
    parallelize(process_paper, items)

    # dataset sheet columns, for reference:
    # key name_short name_full purpose url
    # wild indoor outdoor campus cyberspace parent
    # child source usernames names year_start year_end year_published
    # ongoing images videos identities img_per_person num_cameras
    # faces_persons female male landmarks width height color gray
    # derivative_of tags size_gb agreement
    # citations_count

    # subprocess.call([
    #     "s3cmd", "put", "-P", "--recursive",
    #     DIR_PUBLIC_CITATIONS + '/',
    #     "s3://megapixels/v1/citations/",
    # ])
    subprocess.call([
        "s3cmd", "put", "-P", "--recursive",
        DIR_VERIFIED_CITATIONS + '/',
        "s3://megapixels/v1/citations/verified/",
    ])


def process_paper(row, verified_lookup):
    """Process all papers for one dataset row and write its citation report files."""
    aggregate_citations = {}
    unknown_citations = {}
    address_list = []
    papers = []
    # print(row['paper_ids'])
    for paper_id in row['paper_ids']:
        res = process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_citations)
        if res:
            papers.append(res)
            if res['address']:
                address_list.append(res['address'])
    process_single_paper(row, 'search', addresses, aggregate_citations, unknown_citations)

    if not len(papers):
        return

    paper = papers[0]

    # final citations - a report of all geocoded citations
    with open('{}/{}.json'.format(DIR_FINAL_CITATIONS, row['key']), 'w') as f:
        json.dump({
            'id': paper['paper_id'],
            'dataset': row['dataset'],
            'paper': paper,
            'addresses': address_list,
            'additional_papers': papers[1:],
            'citations': [aggregate_citations[key] for key in aggregate_citations.keys()],
        }, f)

    # unknown citations - a report of all non-geocoded citations
    with open('{}/{}.json'.format(DIR_UNKNOWN_CITATIONS, row['key']), 'w') as f:
        json.dump({
            'id': papers[0]['paper_id'],
            'citations': [unknown_citations[key] for key in unknown_citations.keys()],
        }, f)

    # "public" citations - initial citation reports digested by the geocoding frontend (bad name, I know)
    # this might not need to get built...
    with open('{}/{}.json'.format(DIR_PUBLIC_CITATIONS, row['key']), 'w') as f:
        json.dump({
            'id': paper['paper_id'],
            'paper': {
                'key': row['key'],
                'name': row['name'],
                'title': paper['title'],
                'year': paper['year'],
                'addresses': address_list,
            },
            'citations': [aggregate_citations[key] for key in aggregate_citations.keys()],
        }, f)

    # verified citations - the final public reports
    with open('{}/{}.json'.format(DIR_VERIFIED_CITATIONS, row['key']), 'w') as f:
        json.dump({
            'id': paper['paper_id'],
            'paper': {
                'key': row['key'],
                'name': row['name'],
                'title': paper['title'],
                'year': paper['year'],
                'addresses': address_list,
            },
            'citations': [aggregate_citations[key] for key in verified_lookup.keys() if key in aggregate_citations],
        }, f)
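# For reference, the approximate shape of one verified report written above
# (a sketch; the field values are placeholders, not taken from real data):
#
#   {
#     "id": "<paper_id>",
#     "paper": {"key": "<dataset key>", "name": "...", "title": "...",
#               "year": "...", "addresses": [...]},
#     "citations": [
#       {"id": "<citation id>", "title": "...", "addresses": [...],
#        "year": "...", "pdf": [...], "doi": [...]}
#     ]
#   }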
def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_citations):
    """Geocode one paper's citations, filling aggregate_citations (geocoded) and unknown_citations."""
    res = {
        'paper_id': '',
        'key': '',
        'title': '',
        # 'journal': '',
        'year': '',
        'pdf': '',
        'address': '',
        # 'citation_count': 0,
        # 'citations_geocoded': 0,
        # 'citations_unknown': 0,
        # 'citations_empty': 0,
        # 'citations_pdf': 0,
        # 'citations_doi': 0,
    }

    if paper_id == 'search':
        dataset = row['key']
        fn = 'datasets/s2/search_papers/{}.json'.format(dataset)
        if not os.path.exists(fn):
            return
        with open(fn, 'r') as f:
            citations = json.load(f)
        # The search-results file is a plain list of paper ids and carries no paperId
        # of its own; fall back to the row's primary paper id so the shared code path
        # below can still resolve a paper (an assumption about the original intent).
        data = {
            'paperId': row['paper_ids'][0],
            'citations': [{'paperId': paperId} for paperId in citations],
        }
        print('>> {} {}'.format(data['paperId'], 'search results'))
    else:
        fn = file_path('papers', paper_id, 'paper.json')
        with open(fn, 'r') as f:
            data = json.load(f)
        print('>> {} {}'.format(data['paperId'], row['key']))

    paper = load_paper(data['paperId'])
    if paper is None:
        print("Paper missing! {}".format(data['paperId']))
        return

    res['key'] = row['key']
    res['name'] = row['name']
    res['paper_id'] = paper.paper_id
    res['title'] = paper.title
    # res['journal'] = paper.journal
    res['year'] = paper.year
    res['pdf'] = paper.pdf_links()
    res['doi'] = paper.doi_links()
    # res['authors'] = ', '.join(paper.authors)
    # res['citations'] = []

    # geocode the paper itself via its institutions
    paper_institutions = load_institutions(paper.paper_id, paper_location_lookup)
    paper_address = None
    for inst in sorted(paper_institutions, key=operator.itemgetter(1)):
        # print(inst[1])
        institution = inst[1]
        if paper_address is None:
            paper_address = addresses.findObject(institution)
    if paper_address:
        # print(paper_address)
        res['address'] = paper_address

    # geocode each citing paper
    for cite in data['citations']:
        citationId = cite['paperId']
        if citationId in aggregate_citations:
            continue
        elif citationId in unknown_citations:
            continue
        seen_here = {}
        citation = load_paper(citationId)
        has_pdf = os.path.exists(file_path('pdf', citationId, 'paper.txt'))
        has_doi = os.path.exists(file_path('doi', citationId, 'paper.doi'))
        # if has_pdf:
        #     pdf_count += 1
        # if has_doi:
        #     doi_count += 1
        if citation is None or citation.data is None:
            print("Citation missing! {}".format(citationId))
            continue

        institutions = load_institutions(citationId, paper_location_lookup)
        geocoded_addresses = []
        geocoded_institutions = []
        institution = ''
        address = None
        for inst in sorted(institutions, key=operator.itemgetter(1)):
            # address_count += 1
            institution = inst[1]
            next_address = addresses.findObject(institution)
            if next_address and next_address['name'] not in seen_here:
                seen_here[next_address['name']] = True
                address = next_address
                geocoded_addresses.append(next_address)

        # fall back to scanning the extracted PDF headings when no institution geocoded
        if not address:
            if has_pdf:
                headings, found_abstract = read_headings(file_path('pdf', citationId, 'paper.txt'), citation)
                heading_string = '\n'.join(headings[0:20])
                found_addresses = []
                if len(headings):
                    for heading in headings:
                        possible_address = heading.lower().strip()
                        if possible_address:
                            next_address = addresses.findObject(possible_address)
                            if next_address and next_address['name'] not in seen_here:
                                seen_here[next_address['name']] = True
                                address = next_address
                                geocoded_addresses.append(next_address)

        if address:
            aggregate_citations[citationId] = {
                'id': citationId,
                'title': citation.title,
                'addresses': geocoded_addresses,
                'year': citation.year,
                'pdf': citation.pdf_links(),
                'doi': citation.doi_links(),
            }
        else:
            unknown_citations[citationId] = {
                'id': citationId,
                'title': citation.title,
                'year': citation.year,
                'pdf': citation.pdf_links(),
                'doi': citation.doi_links(),
            }

    return res
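# Note on the address objects used above: the results of addresses.findObject()
# are treated as dicts with at least a 'name' key (used for de-duplication via
# seen_here) and are embedded verbatim in the report JSON; their full schema
# lives in util.AddressBook and is assumed here, not defined in this module.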
def load_megapixels_lookup():
    keys, rows = fetch_google_sheet('citation_lookup')
    dataset_lookup = fetch_google_lookup('datasets')
    lookup = {}
    for row in rows:
        rec = {}
        for index, key in enumerate(keys):
            rec[key] = row[index]
        if rec['paper_id'] == "" or (rec['verified'] != 1 and rec['verified'] != '1'):
            continue
        paper_key = rec['key']
        if paper_key not in lookup:
            rec['paper_ids'] = []
            lookup[paper_key] = rec
        lookup[paper_key]['paper_ids'].append(rec['paper_id'])
        if paper_key in dataset_lookup:
            lookup[paper_key]['dataset'] = dataset_lookup[paper_key]
        else:
            print("not in datasets lookup:", paper_key)
            lookup[paper_key]['dataset'] = {}
    return lookup


if __name__ == '__main__':
    s2_final_report()
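# Usage sketch (assumes the util helpers -- AddressBook, fetch_google_sheet,
# fetch_google_lookup, load_paper, load_institutions, parallelize, etc. -- plus
# the Google Sheets credentials and s3cmd are configured; the filename below is
# hypothetical):
#
#   python s2_final_report.py
#
# The command writes per-dataset JSON reports into ../site/datasets/{final,unknown,
# citations,verified} and syncs the verified reports to
# s3://megapixels/v1/citations/verified/.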