summary | refs | log | tree | commit | diff
path: root/s2-pdf-report.py
diff options
context:
space:
mode:
Diffstat (limited to 's2-pdf-report.py')
-rw-r--r-- s2-pdf-report.py 44
1 file changed, 1 insertion, 43 deletions
diff --git a/s2-pdf-report.py b/s2-pdf-report.py
index d659ed15..0748897f 100644
--- a/s2-pdf-report.py
+++ b/s2-pdf-report.py
@@ -22,10 +22,9 @@ def s2_pdf_report():
addresses = AddressBook()
for fn in glob.iglob('{}/**/*.txt'.format(PDF_DIR), recursive=True):
paper_id = fn.replace(PDF_DIR, '').split('/')[2]
- paper = load_paper(paper_id)
total_count += 1
# print(paper_id)
- headings, found_abstract = read_headings(fn, paper)
+ headings, found_abstract = read_headings(fn)
heading_string = '\n'.join(headings[0:20])
found_addresses = []
if not found_abstract:
@@ -67,47 +66,6 @@ def s2_pdf_report():
def percent(a,b):
return round(100 * a / b)
-def read_headings(fn, paper):
- headings = []
- found_abstract = False
- found_authors = []
- journal = paper.journal.lower()
- authors = [ (a[0], a[1], a[1].lower(),) for a in paper.authors ]
- with open(fn, 'r') as f:
- for line in f.readlines():
- line = re.sub(r"\S*@\S*\s?", '', line)
- l = line.lower().strip()
- if len(l) < 5:
- continue
- if line[0] == 'a' or line[0] == 'b' or line[0] == 'c' or line[0] == '1' or line[0] == '2' or line[0] == '3' or line[0] == '4':
- line = line[1:]
- line = line.strip("∗†‡")
- line = line.replace("fl", "fl").replace('ff', 'ff').replace('ffi', 'f‌f‌i').replace('ffl', 'f‌f‌l')
- line = line.strip()
- if 'abstract' in l:
- found_abstract = True
- break
- if journal and journal in l:
- continue
- names = [s.strip() for s in re.split(',| and ', l)]
- was_found = False
- for name in names:
- found = find_authors(authors, name)
- if found:
- was_found = True
- # print("found {}".format(found[1]))
- if found[0]:
- found_authors.append(found)
- continue
- headings.append(line.strip())
- return headings, found_abstract
-
-def find_authors(authors, line):
- for a in authors:
- if a[2] in line:
- return a
- return None
-
def paper_path(paper_id):
return '{}/{}/{}'.format(PDF_DIR, paper_id[0:2], paper_id)