diff options
| -rw-r--r-- | scraper/s2-dump-missing-paper-ids.py | 14 | ||||
| -rw-r--r-- | scraper/s2-fetch-pdf.py | 5 |
2 files changed, 14 insertions, 5 deletions
diff --git a/scraper/s2-dump-missing-paper-ids.py b/scraper/s2-dump-missing-paper-ids.py index b30fe167..47dd4238 100644 --- a/scraper/s2-dump-missing-paper-ids.py +++ b/scraper/s2-dump-missing-paper-ids.py @@ -4,7 +4,7 @@ import glob import click from util import * -DB_PAPER_DIR = './datasets/s2/db_papers' +# DB_PAPER_DIR = './datasets/s2/db_papers' RAW_PAPER_DIR = './datasets/s2/raw_papers' @click.command() @@ -20,15 +20,25 @@ def load_missing_ids(fn): found_count = 0 missing_count = 0 for paper_id in ids: - db_paper_path = make_db_paper_path(paper_id) + # db_paper_path = make_db_paper_path(paper_id) raw_paper_path = make_raw_paper_path(paper_id) # if os.path.exists(db_paper_path) or os.path.exists(raw_paper_path): if os.path.exists(raw_paper_path): lookup[paper_id] = True found_count += 1 else: + print(">> {} {}".format(dataset paper_id)) missing_lookup[paper_id] = True missing_count += 1 + + verified_lookup, verified_totals = fetch_verified_paper_lookup() + rows = [] + for dataset, lookup in verified_lookup.items(): + for paper_id in lookup.keys(): + paper_path = data_path('raw_papers', paper_id): + if not os.path.exists(paper_path): + print(">> {} {}".format(dataset paper_id)) + print("{} papers found, {} must be fetched".format(found_count, missing_count)) return missing_lookup.keys() diff --git a/scraper/s2-fetch-pdf.py b/scraper/s2-fetch-pdf.py index 72ca4ca8..61574b90 100644 --- a/scraper/s2-fetch-pdf.py +++ b/scraper/s2-fetch-pdf.py @@ -25,13 +25,12 @@ def fetch_pdf(paper_id, url): pdf_fn = make_pdf_fn(paper_id) txt_fn = make_txt_fn(paper_id) if os.path.exists(pdf_fn) or os.path.exists(txt_fn): - # return read_json(pdf_fn) - return + return None size = s2.fetch_file(url, pdf_fn) if size is None: print("{} empty?".format(paper_id)) return None - print("{} {} kb".format(paper_id, int(size / 1024))) + print("{} {} kb {}".format(paper_id, int(size / 1024), url)) def make_pdf_path(paper_id): return './datasets/s2/pdf/{}/{}'.format(paper_id[0:2], paper_id) |
