diff options
| author | Jules Laplace <julescarbon@gmail.com> | 2018-11-25 22:19:15 +0100 |
|---|---|---|
| committer | Jules Laplace <julescarbon@gmail.com> | 2018-11-25 22:19:15 +0100 |
| commit | ee3d0d98e19f1d8177d85af1866fd0ee431fe9ea (patch) | |
| tree | 41372528e78d4328bc2a47bbbabac7e809c58894 /s2-dump-ids.py | |
| parent | 255b8178af1e25a71fd23703d30c0d1f74911f47 (diff) | |
moving stuff
Diffstat (limited to 's2-dump-ids.py')
| -rw-r--r-- | s2-dump-ids.py | 31 |
1 files changed, 0 insertions, 31 deletions
```diff
diff --git a/s2-dump-ids.py b/s2-dump-ids.py
deleted file mode 100644
index 2ce41399..00000000
--- a/s2-dump-ids.py
+++ /dev/null
@@ -1,31 +0,0 @@
-import os
-import gzip
-import glob
-import json
-import click
-from util import *
-
-PAPER_JSON_DIR = 'datasets/s2/papers'
-
-@click.command()
-def s2_dump_ids():
-  ids = {}
-  for fn in glob.iglob('{}/**/*.json'.format(PAPER_JSON_DIR), recursive=True):
-    process_paper(fn, ids)
-  id_list = list(ids.keys())
-  print("Wrote {} ids".format(len(id_list)))
-  write_json('ids.json', id_list)
-
-def process_paper(fn, ids):
-  with open(fn, 'r') as f:
-    data = json.load(f)
-    print(data['paperId'])
-    ids[data['paperId']] = True
-    for cite in data['citations']:
-      ids[cite['paperId']] = True
-
-def paper_path(paper_id):
-  return '{}/{}/{}'.format(DATA_DIR, paper_id[0:2], paper_id)
-
-if __name__ == '__main__':
-  s2_dump_ids()
```
