blob: d7fd30612daa37959e806a5eccf094339f56980d (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
|
import os
import gzip
import glob
import json
import click
from util import *
# Root directory that is searched recursively for extracted ``*.txt`` files
# when building the first-pages report.
PDF_DIR = 'datasets/s2/pdf'
@click.command()
def pdf_report_first_pages():
    """Scan the extracted paper text files and write the first-pages report.

    Every ``*.txt`` file found recursively under ``PDF_DIR`` is handed to
    ``process_paper`` together with a shared ``ids`` dict.  The keys of that
    dict are then passed to ``write_html`` to produce
    ``reports/first_pages.html``, and the number of entries is printed.
    """
    # Shared accumulator given to process_paper for each file; only its keys
    # are used afterwards.
    # NOTE(review): the return value of process_paper is discarded, so the
    # report only contains what process_paper stores in `ids` -- confirm that
    # this is the intended data flow.
    ids = {}
    pattern = '{}/**/*.txt'.format(PDF_DIR)
    for fn in glob.iglob(pattern, recursive=True):
        process_paper(fn, ids)

    first_pages = list(ids)
    # Report the size of the list that is actually written out below.
    print("Wrote {} ids".format(len(first_pages)))
    # write_html is not defined in this file; presumably it is provided by
    # the `util` star import -- TODO confirm its expected arguments.
    write_html('reports/first_pages.html', first_pages)
def process_paper(fn, ids):
    """Return the text that precedes the abstract in an extracted paper file.

    The file is read line by line.  Reading stops at the first line that
    contains the word "abstract" (case-insensitive); that line is not kept.
    Lines shorter than three characters (the trailing newline counts) are
    skipped.

    Args:
        fn: Path of the text file to read.
        ids: Accumulator supplied by the caller.  It is neither read nor
            modified here.
            NOTE(review): confirm whether this function is expected to
            record anything in it.

    Returns:
        A one-element list holding the kept lines concatenated into a single
        string (an empty string when no line was kept).
    """
    lines = []
    with open(fn, 'r') as f:
        # Iterate the file object itself: it yields one line at a time
        # without loading the whole file into memory.
        for line in f:
            if 'abstract' in line.lower():
                break
            if len(line) < 3:
                continue
            lines.append(line)
    return [
        ''.join(lines),
    ]
def paper_path(paper_id):
    """Build the storage path for a paper.

    The path has the form ``<DATA_DIR>/<first two characters of id>/<id>``.

    NOTE(review): ``DATA_DIR`` is not defined in the visible part of this
    file; presumably it is supplied by the ``util`` star import -- confirm.
    """
    root = DATA_DIR
    shard = paper_id[:2]
    return '{}/{}/{}'.format(root, shard, paper_id)
# Run the click command only when this file is executed as a script, not
# when it is imported.
if __name__ == '__main__':
    pdf_report_first_pages()
|