diff options
Diffstat (limited to 'scraper/s2-dump-db-pdf-urls.py')
| -rw-r--r-- | scraper/s2-dump-db-pdf-urls.py | 34 |
1 file changed, 8 insertions, 26 deletions
diff --git a/scraper/s2-dump-db-pdf-urls.py b/scraper/s2-dump-db-pdf-urls.py
index bc702e09..b82ac6dd 100644
--- a/scraper/s2-dump-db-pdf-urls.py
+++ b/scraper/s2-dump-db-pdf-urls.py
@@ -22,9 +22,9 @@ def s2_dump_pdf_urls():
   pdf = []
   doi = []
   for fn in glob.iglob('./datasets/s2/*_papers/**/paper.json', recursive=True):
-    if 'db_paper' in fn:
-      row = process_db_paper(fn)
-    elif 'raw_paper' in fn:
+    # if 'db_paper' in fn:
+    # row = process_db_paper(fn)
+    if 'raw_paper' in fn:
       row = process_raw_paper(fn)
     if row is not None:
       rows.append(row)
@@ -88,34 +88,16 @@ def process_db_paper(fn):
 
 def process_raw_paper(fn):
   # print(fn)
-  data = read_json(fn)
-  if 'paper' not in data:
-    print(data)
-    return
-  paper = data['paper']
+  # 0 1 2 3 4 5 6 :)
+  # ./datasets/s2/raw_papers/00/00b13d00b13.../paper.json
+  paper_id = fn.split('/')[5]
+  paper = RawPaper(paper_id)
   if paper is None:
     return None
-  paper_id = paper['id']
-  pdf_url = None
+  pdf_url = paper.pdf_link
   ieee_url = None
   doi_url = None
   extra_url = None
-  if 'primaryPaperLink' in paper and 'url' in paper['primaryPaperLink']:
-    primary_url = paper['primaryPaperLink']['url']
-    if 'pdf' in primary_url:
-      pdf_url = primary_url
-    elif 'doi' in primary_url:
-      doi_url = primary_url
-  for link in paper['links']:
-    url = link['url']
-    if 'ieeexplore.ieee.org' in url:
-      ieee_url = url
-    elif 'doi.org' in url:
-      doi_url = url
-    elif pdf_url is None and 'pdf' in url:
-      pdf_url = url
-    else:
-      extra_url = url
   return [paper_id, pdf_url, ieee_url, doi_url, extra_url]
 
 if __name__ == '__main__':
|
