diff options
| author | Adam Harvey <adam@ahprojects.com> | 2018-12-23 01:37:03 +0100 |
|---|---|---|
| committer | Adam Harvey <adam@ahprojects.com> | 2018-12-23 01:37:03 +0100 |
| commit | 4452e02e8b04f3476273574a875bb60cfbb4568b (patch) | |
| tree | 3ffa44f9621b736250a8b94da14a187dc785c2fe /scraper/s2-dump-ids.py | |
| parent | 2a65f7a157bd4bace970cef73529867b0e0a374d (diff) | |
| parent | 5340bee951c18910fd764241945f1f136b5a22b4 (diff) | |
.
Diffstat (limited to 'scraper/s2-dump-ids.py')
| -rw-r--r-- | scraper/s2-dump-ids.py | 31 |
1 files changed, 31 insertions, 0 deletions
# Contents of scraper/s2-dump-ids.py (introduced as a new file, mode 100644),
# laid out one statement per line instead of as a single collapsed diff line.
"""Dump the ids of every scraped s2 paper, and of the papers citing them.

Walks PAPER_JSON_DIR recursively for ``*.json`` paper records, collects the
``paperId`` of each record and of every entry in its ``citations`` list, and
writes the de-duplicated ids to ``ids.json``.
"""
import os
import gzip
import glob
import simplejson as json
import click
from util import *

# Root directory that is searched recursively for paper JSON files.
PAPER_JSON_DIR = 'datasets/s2/papers'


@click.command()
def s2_dump_ids():
    """Collect all paper and citation ids and write them to ids.json."""
    # A dict is used as an insertion-ordered set: keys are ids, values unused.
    ids = {}
    pattern = '{}/**/*.json'.format(PAPER_JSON_DIR)
    for fn in glob.iglob(pattern, recursive=True):
        process_paper(fn, ids)
    id_list = list(ids)
    # write_json is not defined in this file; presumably it is provided by
    # `from util import *` -- confirm against util.
    write_json('ids.json', id_list)
    # Report only once the write has returned, so the message is never shown
    # for an output file that failed to be written.
    print("Wrote {} ids".format(len(id_list)))


def process_paper(fn, ids):
    """Record the id of the paper stored in *fn* and the ids of its citations.

    Args:
        fn: Path of a JSON file holding one paper record. The record must
            have a 'paperId' key; a 'citations' list of dicts is optional.
        ids: Dict used as a set. It is updated in place: every id found is
            added as a key mapped to True.

    Raises:
        KeyError: If the record has no 'paperId' key.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(fn, 'r', encoding='utf-8') as f:
        data = json.load(f)
    print(data['paperId'])
    ids[data['paperId']] = True
    # Tolerate records with a missing or null citation list.
    for cite in data.get('citations') or []:
        cite_id = cite.get('paperId')
        # Skip citations that carry no usable id instead of aborting the
        # whole dump or emitting null/empty entries into ids.json.
        if cite_id:
            ids[cite_id] = True


def paper_path(paper_id):
    """Return the path for *paper_id*, sharded by the id's first two characters."""
    # NOTE(review): DATA_DIR is not defined in this file. Unless
    # `from util import *` supplies it, calling this raises NameError;
    # PAPER_JSON_DIR may have been intended -- confirm before changing.
    return '{}/{}/{}'.format(DATA_DIR, paper_id[0:2], paper_id)


if __name__ == '__main__':
    s2_dump_ids()
