summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- pdf_dump_first_page.sh 8
-rw-r--r-- pdf_report_first_pages.py 36
-rw-r--r-- s2.py 2
-rw-r--r-- util.py 25
4 files changed, 70 insertions, 1 deletions
diff --git a/pdf_dump_first_page.sh b/pdf_dump_first_page.sh
new file mode 100644
index 00000000..dd67c87d
--- /dev/null
+++ b/pdf_dump_first_page.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+# Dump page 1 of every downloaded PDF to a sibling .txt file, echoing each output path.
+for i in datasets/s2/pdf/*/*/*.pdf; do
+    [ -e "$i" ] || continue  # an unmatched glob stays literal; skip it
+    OUTPUT="${i%.*}.txt"
+    pdf2txt.py -p 1 "$i" > "$OUTPUT"
+    echo "$OUTPUT"
+done
\ No newline at end of file
diff --git a/pdf_report_first_pages.py b/pdf_report_first_pages.py
new file mode 100644
index 00000000..d7fd3061
--- /dev/null
+++ b/pdf_report_first_pages.py
@@ -0,0 +1,36 @@
+import os
+import gzip
+import glob
+import json
+import click
+from util import *
+
+PDF_DIR = 'datasets/s2/pdf'  # root directory globbed recursively for first-page .txt dumps
+
+@click.command()
+def pdf_report_first_pages():
+    """Scan PDF_DIR for first-page .txt dumps and write the HTML report."""
+    ids = {}  # NOTE(review): process_paper() as written never adds to this -- confirm intent
+    for fn in glob.iglob('{}/**/*.txt'.format(PDF_DIR), recursive=True):
+        process_paper(fn, ids)
+    print("Wrote {} ids".format(len(ids)))  # count exactly what is handed to write_html
+    write_html('reports/first_pages.html', list(ids))
+
+def process_paper(fn, ids):
+    """Return [text]: the lines of `fn` before the first one mentioning
+    'abstract' (any case), skipping lines under 3 chars; `ids` is unused."""
+    lines = []
+    with open(fn, 'r') as f:
+        for line in f:  # iterate the file itself; no need to load every line
+            if 'abstract' in line.lower():
+                break
+            if len(line) >= 3:  # length includes the trailing newline
+                lines.append(line)
+    # Single-element list, matching the original return shape.
+    return [''.join(lines)]
+
+def paper_path(paper_id):  # NOTE(review): DATA_DIR isn't defined in this file; presumably from util -- confirm
+    return '{0}/{1}/{2}'.format(DATA_DIR, paper_id[:2], paper_id)  # <root>/<id[:2]>/<id>
+
+if __name__ == '__main__':  # invoke the click command only when run as a script
+ pdf_report_first_pages()
diff --git a/s2.py b/s2.py
index 696b1a45..f3b1176f 100644
--- a/s2.py
+++ b/s2.py
@@ -122,7 +122,7 @@ class SemanticScholarAPI(object):
@staticmethod
def fetch_file(url, fn, **kwargs):
- resp = requests.get(url, params=kwargs, headers=SemanticScholarAPI.headers)
+ resp = requests.get(url, params=kwargs, headers=SemanticScholarAPI.headers, verify=False)
if resp.status_code != 200:
return None
size = 0
diff --git a/util.py b/util.py
index 9f321465..fde0519f 100644
--- a/util.py
+++ b/util.py
@@ -39,3 +39,28 @@ def write_csv(fn, keys, rows):
writer.writerow(keys)
for row in rows:
writer.writerow(row)
+
+def write_report(fn, title=None, keys=None, rows=()):
+    """Write `rows` to `fn` as a one-table HTML page styled by report.css.
+
+    `title`, when not None, fills both <title> and an <h2> heading; `keys`,
+    when not None, become a header row of <th> cells.  Each item of `rows` is
+    an iterable of cells, one <tr> per item.  Values are interpolated as-is
+    (no HTML escaping).  `rows` gets an empty default only because a
+    non-default parameter may not follow defaulted ones.
+    """
+    with open(fn, 'w') as f:
+        f.write("<!doctype html><html><head>")
+        if title is not None:
+            f.write("<title>{}</title>".format(title))
+        f.write("<link rel='stylesheet' href='report.css'></head><body>")
+        if title is not None:
+            f.write("<h2>{}</h2>".format(title))
+        f.write("<table>")
+        if keys is not None:
+            # <th> cells must sit inside a <tr> to be valid table markup.
+            f.write("<tr>{}</tr>".format("".join("<th>{}</th>".format(k) for k in keys)))
+        for row in rows:
+            f.write("<tr>{}</tr>".format("".join("<td>{}</td>".format(c) for c in row)))
+        f.write("</table>")
+        f.write("</body></html>")