summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Jules Laplace <julescarbon@gmail.com> 2018-11-06 01:42:13 +0100
committer Jules Laplace <julescarbon@gmail.com> 2018-11-06 01:42:13 +0100
commit 002e72bb172c34bb71756f9e6c23294913f1ef85 (patch)
tree ea3f3f91bb1a5219801fbf26cf5c12a3eeff1a55
parent 743159991f1bcf2080693424ebe5ad7001865583 (diff)
maybe rm empty txts
-rw-r--r-- pdf_dump_first_page.sh 8
-rw-r--r-- s2-pdf-report.py 8
2 files changed, 15 insertions, 1 deletions
diff --git a/pdf_dump_first_page.sh b/pdf_dump_first_page.sh
index ec1d3bbb..2749915d 100644
--- a/pdf_dump_first_page.sh
+++ b/pdf_dump_first_page.sh
@@ -7,5 +7,13 @@ for i in datasets/s2/pdf/*/*/*.pdf
then
pdf2txt.py -p 1 $i > $OUTPUT
echo $OUTPUT
+ else
+ if [ -s $OUTPUT ]
+ then
+ echo "found $OUTPUT"
+ else
+ echo "rm empty $OUTPUT"
+ rm -f $OUTPUT
+ fi
fi
done
diff --git a/s2-pdf-report.py b/s2-pdf-report.py
index 6ef5c0f7..7c89381f 100644
--- a/s2-pdf-report.py
+++ b/s2-pdf-report.py
@@ -12,10 +12,11 @@ PDF_DIR = 'datasets/s2/pdf'
def pdf_report_first_pages():
rows = []
for fn in glob.iglob('{}/**/*.txt'.format(PDF_DIR), recursive=True):
- row = process_paper(fn)
+ row, institutions = process_paper(fn)
print(row)
rows.append(row)
write_report('reports/first_pages.html', title='First pages', keys=None, rows=rows)
+ write_report('reports/institutions.html', title='Institutions', keys=None, rows=institutions)
print("Wrote {} rows".format(len(rows)))
def process_paper(fn):
@@ -27,6 +28,7 @@ def process_paper(fn):
with open(fn, 'r') as f:
lines = []
emails = []
+ institutions = []
authors = [ (a[0], a[1], a[1].lower(),) for a in paper.authors ]
journal = paper.journal.lower()
found_authors = []
@@ -55,6 +57,7 @@ def process_paper(fn):
# lines.append(NameLine(line))
continue
if 'university' in l or 'universiteit' in l or 'research center' in l:
+ institutions.append(line)
lines.append(BoldLine(line))
continue
lines.append(line)
@@ -63,6 +66,9 @@ def process_paper(fn):
lines,
found_authors,
emails,
+ ], [
+ paper_id,
+ sorted(institutions),
]
class NameLine(object):