From 10c38b6b5916b2c7f84ca65fa471dda963dd9b5d Mon Sep 17 00:00:00 2001
From: Jules Laplace
Date: Thu, 30 May 2019 14:03:34 +0200
Subject: s2 fetch missing verified papers

---
 scraper/s2-dump-missing-paper-ids.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'scraper/s2-dump-missing-paper-ids.py')

diff --git a/scraper/s2-dump-missing-paper-ids.py b/scraper/s2-dump-missing-paper-ids.py
index b30fe167..47dd4238 100644
--- a/scraper/s2-dump-missing-paper-ids.py
+++ b/scraper/s2-dump-missing-paper-ids.py
@@ -4,7 +4,7 @@ import glob
 import click
 from util import *
 
-DB_PAPER_DIR = './datasets/s2/db_papers'
+# DB_PAPER_DIR = './datasets/s2/db_papers'
 RAW_PAPER_DIR = './datasets/s2/raw_papers'
 
 @click.command()
@@ -20,15 +20,25 @@ def load_missing_ids(fn):
     found_count = 0
     missing_count = 0
     for paper_id in ids:
-        db_paper_path = make_db_paper_path(paper_id)
+        # db_paper_path = make_db_paper_path(paper_id)
         raw_paper_path = make_raw_paper_path(paper_id)
         # if os.path.exists(db_paper_path) or os.path.exists(raw_paper_path):
         if os.path.exists(raw_paper_path):
             lookup[paper_id] = True
             found_count += 1
         else:
+            print(">> {} {}".format(dataset paper_id))
             missing_lookup[paper_id] = True
             missing_count += 1
+
+    verified_lookup, verified_totals = fetch_verified_paper_lookup()
+    rows = []
+    for dataset, lookup in verified_lookup.items():
+        for paper_id in lookup.keys():
+            paper_path = data_path('raw_papers', paper_id):
+            if not os.path.exists(paper_path):
+                print(">> {} {}".format(dataset paper_id))
+
 
     print("{} papers found, {} must be fetched".format(found_count, missing_count))
     return missing_lookup.keys()
--
cgit v1.2.3-70-g09d2

From 4f1d44719221bb8195e32b8f1e97feb4c3e14991 Mon Sep 17 00:00:00 2001
From: "jules@lens"
Date: Thu, 30 May 2019 14:30:39 +0200
Subject: fetching verified papers

---
 scraper/s2-doi-report.py             | 14 +++++++-------
 scraper/s2-dump-ids.py               |  2 +-
 scraper/s2-dump-missing-paper-ids.py | 12 ++++++++----
 scraper/s2-fetch-pdf.py              | 10 +++++++---
 scraper/s2-final-report.py           | 14 +++++++-------
 5 files changed, 30 insertions(+), 22 deletions(-)

(limited to 'scraper/s2-dump-missing-paper-ids.py')

diff --git a/scraper/s2-doi-report.py b/scraper/s2-doi-report.py
index ea708de2..c715b647 100644
--- a/scraper/s2-doi-report.py
+++ b/scraper/s2-doi-report.py
@@ -129,7 +129,7 @@ def load_ieee(paper, fn):
             write_json(fn.replace('paper.doi', 'ieee.json'), data)
             # print(data)
         except:
-            print('ieee: could not read data')
+            #print('ieee: could not read data')
             return None
     if 'authors' in data:
         affiliations = [ author['affiliation'] for author in data['authors'] if 'affiliation' in author ]
@@ -145,7 +145,7 @@ def load_springer(paper, fn):
         try:
             soup = BeautifulSoup(f.read(), 'html.parser')
         except:
-            print('springer: could not read data')
+            # print('springer: could not read data')
             return None
         items = soup.find_all(class_='affiliation__item')
         affiliations = [ ', '.join(item.strings) for item in items ]
@@ -159,7 +159,7 @@ def load_sciencedirect(paper, fn):
         try:
             soup = BeautifulSoup(f.read(), 'html.parser')
         except:
-            print('sciencedirect: could not read data')
+            # print('sciencedirect: could not read data')
             return None
 
         items = soup.find_all("script", type='application/json', limit=1)
@@ -171,7 +171,7 @@ def load_sciencedirect(paper, fn):
             write_json(fn.replace('paper.doi', 'sciencedirect.json'), data)
             # print(data)
         except:
-            print('sciencedirect: json error')
+            # print('sciencedirect: json error')
             return None
 
         affiliations = [value['$$'][0]['_'] for value in data['authors']['affiliations'].values()]
@@ -186,7 +186,7 @@ def load_acm(paper, fn):
         try:
             soup = BeautifulSoup(f.read(), 'html.parser')
         except:
-            print('acm: could not read data')
+            #print('acm: could not read data')
             return None
         items = soup.find_all("a", title='Institutional Profile Page')
         affiliations = [ item.string for item in items ]
@@ -213,13 +213,13 @@ def load_computerorg(paper, fn):
 #     return affiliations
 
 def load_elsevier(paper, fn):
-    print('elsevier: {}'.format(paper.paper_id))
+    #print('elsevier: {}'.format(paper.paper_id))
     if not os.path.exists(doi.old_doi_fn(paper.paper_id)):
         with open(fn, 'r') as f:
             try:
                 soup = BeautifulSoup(f.read(), 'html.parser')
             except:
-                print('elsevier: could not read data')
+                #print('elsevier: could not read data')
                 return None
             item = soup.find_all("input", attrs={"name": 'redirectURL'})[0]
             #new_url = unquote(item['value'])
diff --git a/scraper/s2-dump-ids.py b/scraper/s2-dump-ids.py
index bddc8040..4c9846b1 100644
--- a/scraper/s2-dump-ids.py
+++ b/scraper/s2-dump-ids.py
@@ -19,7 +19,7 @@ def s2_dump_ids():
 def process_paper(fn, ids):
     with open(fn, 'r') as f:
         data = json.load(f)
-        print(data['paperId'])
+        # print(data['paperId'])
         ids[data['paperId']] = True
         for cite in data['citations']:
             ids[cite['paperId']] = True
diff --git a/scraper/s2-dump-missing-paper-ids.py b/scraper/s2-dump-missing-paper-ids.py
index 47dd4238..6f7eb8ba 100644
--- a/scraper/s2-dump-missing-paper-ids.py
+++ b/scraper/s2-dump-missing-paper-ids.py
@@ -27,7 +27,7 @@ def load_missing_ids(fn):
             lookup[paper_id] = True
             found_count += 1
         else:
-            print(">> {} {}".format(dataset paper_id))
+            # print(">> {} {}".format(dataset, paper_id))
             missing_lookup[paper_id] = True
             missing_count += 1
 
@@ -35,9 +35,13 @@ def load_missing_ids(fn):
     rows = []
     for dataset, lookup in verified_lookup.items():
         for paper_id in lookup.keys():
-            paper_path = data_path('raw_papers', paper_id):
-            if not os.path.exists(paper_path):
-                print(">> {} {}".format(dataset paper_id))
+            if dataset == 'brainwash':
+                print('>> {} {}'.format(dataset, paper_id))
+            paper_path = make_raw_paper_path(paper_id)
+            if not os.path.exists(paper_path) and paper_id not in missing_lookup:
+                print(">> {} {}".format(dataset, paper_id))
+                missing_count += 1
+                missing_lookup[paper_id] = True
 
     print("{} papers found, {} must be fetched".format(found_count, missing_count))
     return missing_lookup.keys()
diff --git a/scraper/s2-fetch-pdf.py b/scraper/s2-fetch-pdf.py
index 61574b90..c1b767b0 100644
--- a/scraper/s2-fetch-pdf.py
+++ b/scraper/s2-fetch-pdf.py
@@ -24,12 +24,14 @@ def fetch_pdf(paper_id, url):
     os.makedirs(make_pdf_path(paper_id), exist_ok=True)
     pdf_fn = make_pdf_fn(paper_id)
     txt_fn = make_txt_fn(paper_id)
-    if os.path.exists(pdf_fn) or os.path.exists(txt_fn):
-        return None
+    empty_fn = make_empty_fn(paper_id)
+    if os.path.exists(pdf_fn) or os.path.exists(txt_fn) or os.path.exists(empty_fn):
+        return
     size = s2.fetch_file(url, pdf_fn)
     if size is None:
         print("{} empty?".format(paper_id))
-        return None
+        write_json(empty_fn, { 'paper_id': paper_id, 'url': url })
+        return
     print("{} {} kb {}".format(paper_id, int(size / 1024), url))
 
 def make_pdf_path(paper_id):
@@ -38,6 +40,8 @@ def make_pdf_fn(paper_id):
     return './datasets/s2/pdf/{}/{}/paper.pdf'.format(paper_id[0:2], paper_id)
 def make_txt_fn(paper_id):
     return './datasets/s2/pdf/{}/{}/paper.txt'.format(paper_id[0:2], paper_id)
+def make_empty_fn(paper_id):
+    return './datasets/s2/pdf/{}/{}/pdf.empty'.format(paper_id[0:2], paper_id)
 
 if __name__ == '__main__':
     fetch_pdfs()
diff --git a/scraper/s2-final-report.py b/scraper/s2-final-report.py
index 16d70f12..c9795680 100644
--- a/scraper/s2-final-report.py
+++ b/scraper/s2-final-report.py
@@ -22,8 +22,8 @@ def s2_final_report():
     verified_lookup, verified_totals = fetch_verified_paper_lookup()
     items = []
     for key, item in megapixels.items():
-        #if key != 'brainwash':
-        #    continue
+        if key != 'brainwash':
+            continue
         ft_share = 'ft_share' in item['dataset'] and item['dataset']['ft_share'] == 'Y'
         nyt_share = 'nyt_share' in item['dataset'] and item['dataset']['nyt_share'] == 'Y'
         if ft_share or nyt_share:
@@ -47,11 +47,11 @@ def s2_final_report():
     #     DIR_PUBLIC_CITATIONS + '/',
     #     "s3://megapixels/v1/citations/",
     # ])
-    subprocess.call([
-        "s3cmd", "put", "-P", "--recursive",
-        DIR_VERIFIED_CITATIONS + '/',
-        "s3://megapixels/v1/citations/verified/",
-    ])
+    #subprocess.call([
+    #    "s3cmd", "put", "-P", "--recursive",
+    #    DIR_VERIFIED_CITATIONS + '/',
+    #    "s3://megapixels/v1/citations/verified/",
+    #])
 
 def process_paper(row, verified_lookup, verified_totals):
     aggregate_citations = {}
--
cgit v1.2.3-70-g09d2
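
For readers skimming the patches above, here is a minimal standalone sketch of the missing-paper cross-check that the second commit settles on in scraper/s2-dump-missing-paper-ids.py: mark every expected paper id whose raw JSON is absent on disk, then fold in ids from the verified-paper lookup without double-counting. The helpers fetch_verified_paper_lookup() and make_raw_paper_path() come from the project's util module (presumably scraper/util.py), which is not part of this diff, so the path layout and function bodies below are assumptions for illustration only.

# Illustrative sketch only; the real helpers live in the project's util module and may differ.
import os

RAW_PAPER_DIR = './datasets/s2/raw_papers'

def make_raw_paper_path(paper_id):
    # Assumed layout: raw paper JSON sharded by the first two characters of the S2 paper id.
    return os.path.join(RAW_PAPER_DIR, paper_id[0:2], '{}.json'.format(paper_id))

def find_missing(ids, verified_lookup):
    # ids: iterable of paper ids already known to the scraper
    # verified_lookup: {dataset_name: {paper_id: ...}}, as returned by
    # fetch_verified_paper_lookup() in the patches above
    missing = {}
    for paper_id in ids:
        if not os.path.exists(make_raw_paper_path(paper_id)):
            missing[paper_id] = True
    # Also queue verified papers that were never fetched, skipping ids the
    # first pass already marked as missing so nothing is counted twice.
    for dataset, lookup in verified_lookup.items():
        for paper_id in lookup:
            if not os.path.exists(make_raw_paper_path(paper_id)) and paper_id not in missing:
                print(">> {} {}".format(dataset, paper_id))
                missing[paper_id] = True
    return list(missing.keys())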