moving stuff

author: Jules Laplace <julescarbon@gmail.com> 2018-11-25 22:19:15 +0100
committer: Jules Laplace <julescarbon@gmail.com> 2018-11-25 22:19:15 +0100
commit: ee3d0d98e19f1d8177d85af1866fd0ee431fe9ea (patch)
tree: 41372528e78d4328bc2a47bbbabac7e809c58894 /s2-dump-missing-paper-ids.py
parent: 255b8178af1e25a71fd23703d30c0d1f74911f47 (diff)
1 files changed, 0 insertions, 40 deletions
diff --git a/s2-dump-missing-paper-ids.py b/s2-dump-missing-paper-ids.py
deleted file mode 100644
index bf0b7e50..00000000
--- a/s2-dump-missing-paper-ids.py
+++ /dev/null
@@ -1,40 +0,0 @@
-import os
-import gzip
-import glob
-import click
-from util import *
-
-DB_PAPER_DIR = './datasets/s2/db_papers'
-RAW_PAPER_DIR = './datasets/s2/raw_papers'
-
-@click.command()
-@click.option('--fn', '-f', default='ids.json', help='List of IDs to extract from the big dataset.')
-def fetch_missing_entries(fn):
-  missing_ids = load_missing_ids(fn)
-  write_csv('./missing.csv', keys=None, rows=[[id] for id in missing_ids])
-
-def load_missing_ids(fn):
-  lookup = {}
-  missing_lookup = {}
-  ids = read_json(fn)
-  found_count = 0
-  missing_count = 0
-  for paper_id in ids:
-    db_paper_path = make_db_paper_path(paper_id)
-    raw_paper_path = make_raw_paper_path(paper_id)
-    if os.path.exists(db_paper_path) or os.path.exists(raw_paper_path):
-      lookup[paper_id] = True
-      found_count += 1
-    else:
-      missing_lookup[paper_id] = True
-      missing_count += 1
-  print("{} papers found, {} must be fetched".format(found_count, missing_count))
-  return missing_lookup.keys()
-
-def make_db_paper_path(paper_id):
-  return '{}/{}/{}'.format(DB_PAPER_DIR, paper_id[0:2], paper_id)
-def make_raw_paper_path(paper_id):
-  return '{}/{}/{}'.format(RAW_PAPER_DIR, paper_id[0:2], paper_id)
-  
-if __name__ == '__main__':
-  fetch_missing_entries()
author	Jules Laplace <julescarbon@gmail.com>	2018-11-25 22:19:15 +0100
committer	Jules Laplace <julescarbon@gmail.com>	2018-11-25 22:19:15 +0100
commit	ee3d0d98e19f1d8177d85af1866fd0ee431fe9ea (patch)
tree	41372528e78d4328bc2a47bbbabac7e809c58894 /s2-dump-missing-paper-ids.py
parent	255b8178af1e25a71fd23703d30c0d1f74911f47 (diff)