summary | refs | log | tree | commit | diff
path: root/scraper/s2-dump-db-pdf-urls.py
diff options
context:
space:
mode:
Diffstat (limited to 'scraper/s2-dump-db-pdf-urls.py')
-rw-r--r-- scraper/s2-dump-db-pdf-urls.py | 34
1 files changed, 8 insertions, 26 deletions
diff --git a/scraper/s2-dump-db-pdf-urls.py b/scraper/s2-dump-db-pdf-urls.py
index bc702e09..b82ac6dd 100644
--- a/scraper/s2-dump-db-pdf-urls.py
+++ b/scraper/s2-dump-db-pdf-urls.py
@@ -22,9 +22,9 @@ def s2_dump_pdf_urls():
pdf = []
doi = []
for fn in glob.iglob('./datasets/s2/*_papers/**/paper.json', recursive=True):
- if 'db_paper' in fn:
- row = process_db_paper(fn)
- elif 'raw_paper' in fn:
+ # if 'db_paper' in fn:
+ # row = process_db_paper(fn)
+ if 'raw_paper' in fn:
row = process_raw_paper(fn)
if row is not None:
rows.append(row)
@@ -88,34 +88,16 @@ def process_db_paper(fn):
def process_raw_paper(fn):
# print(fn)
- data = read_json(fn)
- if 'paper' not in data:
- print(data)
- return
- paper = data['paper']
+ # 0 1 2 3 4 5 6 :)
+ # ./datasets/s2/raw_papers/00/00b13d00b13.../paper.json
+ paper_id = fn.split('/')[5]
+ paper = RawPaper(paper_id)
if paper is None:
return None
- paper_id = paper['id']
- pdf_url = None
+ pdf_url = paper.pdf_link
ieee_url = None
doi_url = None
extra_url = None
- if 'primaryPaperLink' in paper and 'url' in paper['primaryPaperLink']:
- primary_url = paper['primaryPaperLink']['url']
- if 'pdf' in primary_url:
- pdf_url = primary_url
- elif 'doi' in primary_url:
- doi_url = primary_url
- for link in paper['links']:
- url = link['url']
- if 'ieeexplore.ieee.org' in url:
- ieee_url = url
- elif 'doi.org' in url:
- doi_url = url
- elif pdf_url is None and 'pdf' in url:
- pdf_url = url
- else:
- extra_url = url
return [paper_id, pdf_url, ieee_url, doi_url, extra_url]
if __name__ == '__main__':