scripts to fetch papers from main s2 api

author: Jules Laplace <julescarbon@gmail.com> 2018-11-03 18:10:21 +0100
committer: Jules Laplace <julescarbon@gmail.com> 2018-11-03 18:10:21 +0100
commit: 753e41d819030a62418705fc4484d9303e3e1a00 (patch)
tree: 40a13336278e330de882de3cca90134286d8c952 /s2-dump-missing-paper-ids.py
parent: aa0470a3076f5ac65a0311c76e58254547f3eae0 (diff)
1 files changed, 40 insertions, 0 deletions
diff --git a/s2-dump-missing-paper-ids.py b/s2-dump-missing-paper-ids.py
new file mode 100644
index 00000000..72ff1c44
--- /dev/null
+++ b/s2-dump-missing-paper-ids.py
@@ -0,0 +1,40 @@
+import os
+import gzip
+import glob
+import click
+from util import *
+
+# Base directories probed for an existing copy of each paper. Both are laid
+# out as <dir>/<first two characters of the paper id>/<paper id>.
+DB_PAPER_DIR = './datasets/s2/db_papers'
+RAW_PAPER_DIR = './datasets/s2/raw_papers'
+
+# CLI entry point: collect the ids from --fn that are not on disk yet and
+# hand them to write_csv as single-column rows destined for ./missing.csv.
+@click.command()
+@click.option('--fn', '-f', default='ids.json', help='List of IDs to extract from the big dataset.')
+def fetch_missing_entries(fn):
+  # NOTE: intentionally no docstring; click would surface it as --help text.
+  rows = [[paper_id] for paper_id in load_missing_ids(fn)]
+  write_csv('./missing.csv', rows)
+
+def load_missing_ids(fn):
+  """Return the ids listed in `fn` that exist in neither paper directory.
+
+  `fn` is passed to read_json and the result is iterated as paper ids. An id
+  counts as found when either its db path or its raw path exists on disk.
+  Repeated ids are classified and counted only once, so the printed totals
+  agree with the number of ids returned.
+
+  Returns a dict keys view of the missing ids (first-seen order on
+  Python 3.7+, where dicts preserve insertion order).
+  """
+  found = set()
+  missing_lookup = {}
+  for paper_id in read_json(fn):
+    if paper_id in found or paper_id in missing_lookup:
+      continue  # duplicate id: already classified, skip the filesystem checks
+    if os.path.exists(make_db_paper_path(paper_id)) or os.path.exists(make_raw_paper_path(paper_id)):
+      found.add(paper_id)
+    else:
+      missing_lookup[paper_id] = True
+  print("{} papers found, {} must be fetched".format(len(found), len(missing_lookup)))
+  return missing_lookup.keys()
+
+def make_db_paper_path(paper_id):
+  """Build the path under DB_PAPER_DIR for a paper, sharded by the id's first two characters."""
+  shard = paper_id[:2]
+  return '{}/{}/{}'.format(DB_PAPER_DIR, shard, paper_id)
+def make_raw_paper_path(paper_id):
+  """Build the path under RAW_PAPER_DIR for a paper, sharded by the id's first two characters."""
+  shard = paper_id[:2]
+  return '{}/{}/{}'.format(RAW_PAPER_DIR, shard, paper_id)
+  
+# Invoke the click command only when this file is executed as a script.
+if __name__ == '__main__':
+  fetch_missing_entries()
author	Jules Laplace <julescarbon@gmail.com>	2018-11-03 18:10:21 +0100
committer	Jules Laplace <julescarbon@gmail.com>	2018-11-03 18:10:21 +0100
commit	753e41d819030a62418705fc4484d9303e3e1a00 (patch)
tree	40a13336278e330de882de3cca90134286d8c952 /s2-dump-missing-paper-ids.py
parent	aa0470a3076f5ac65a0311c76e58254547f3eae0 (diff)