diff options
| author | Jules Laplace <julescarbon@gmail.com> | 2018-11-05 23:34:55 +0100 |
|---|---|---|
| committer | Jules Laplace <julescarbon@gmail.com> | 2018-11-05 23:34:55 +0100 |
| commit | a7529c979563e87fe9c518a3b6a084772d4b63a4 (patch) | |
| tree | 88445643552fcf1e761957cb634012cf4e0bc1a8 | |
| parent | 2cb40f0220c14cc4b42673b4b75fc04406f651ff (diff) | |
k
| -rw-r--r-- | pdf_report_first_pages.py | 23 | ||||
| -rw-r--r-- | split-csv.py | 2 | ||||
| -rw-r--r-- | util.py | 4 |
3 files changed, 16 insertions, 13 deletions
```diff
diff --git a/pdf_report_first_pages.py b/pdf_report_first_pages.py
index d7fd3061..ae080539 100644
--- a/pdf_report_first_pages.py
+++ b/pdf_report_first_pages.py
@@ -6,27 +6,30 @@ import click
 from util import *
 
 PDF_DIR = 'datasets/s2/pdf'
+FIRST_PAGES_KEYS = ''
 
 @click.command()
 def pdf_report_first_pages():
-  ids = {}
+  rows = []
   for fn in glob.iglob('{}/**/*.txt'.format(PDF_DIR), recursive=True):
-    process_paper(fn, ids)
-  first_pages = list(ids.keys())
-  print("Wrote {} ids".format(len(id_list)))
-  write_html('reports/first_pages.html', first_pages)
+    row = process_paper(fn)
+    rows.append(row)
+  write_report('reports/first_pages.html', title='First pages', keys=FIRST_PAGES_KEYS, rows=rows)
+  print("Wrote {} rows".format(len(rows)))
 
-def process_paper(fn, ids):
+def process_paper(fn):
+  index = fn.replace(PDF_DIR, '').split('/')[2]
   with open(fn, 'r') as f:
-    lines = []
-    for line in f.readlines:
+    lines = ''
+    for line in f.readlines():
       if 'abstract' in line.lower():
         break
       if len(line) < 3:
         continue
-      lines.append(line)
+      lines += line + '<br>'
   return [
-    lines.join(''),
+    index,
+    lines
   ]
 
 def paper_path(paper_id):
diff --git a/split-csv.py b/split-csv.py
index 2db45d85..e7c12883 100644
--- a/split-csv.py
+++ b/split-csv.py
@@ -9,7 +9,7 @@ import click
 
 @click.command()
 @click.option('--count', '-c', default=2, help='Number of subdivisions.')
-@click.option('--has_keys/--has_no_keys', '-k', default=False, help='Whether to split off the keys.')
+@click.option('--has_keys/--no_keys', '-k', default=False, help='Whether to split off the keys.')
 @click.option('--shuffle/--no_shuffle', default=False, help='Whether to shuffle.')
 @click.argument('filename')
 def split_csv(count, has_keys, shuffle, filename):
diff --git a/util.py b/util.py
--- a/util.py
+++ b/util.py
@@ -47,12 +47,12 @@ def write_report(fn, title=None, keys=None, rows=[]):
     f.write("<head>")
     if title is not None:
       f.write("<title>{}</title>".format(title))
-    f.write("<link rel='stylesheet' href='report.css'>")
+    f.write("<link rel='stylesheet' href='reports.css'>")
     f.write("</head>")
     f.write("<body>")
     if title is not None:
       f.write("<h2>{}</h2>".format(title))
-    f.write("<table>")
+    f.write("<table border='1' cellpadding='3' cellspacing='3'>")
     if keys is not None:
       for key in keys:
         f.write("<th>{}</th>".format(key))
```
