diff options
| author | Jules Laplace <julescarbon@gmail.com> | 2018-11-03 19:08:06 +0100 |
|---|---|---|
| committer | Jules Laplace <julescarbon@gmail.com> | 2018-11-03 19:08:06 +0100 |
| commit | 8299124ecc34fd4885e7b525b849a44083ab334b (patch) | |
| tree | 77192f6095f7af6bb6c16ab2df594ebc4951d62c | |
| parent | 7f385e46dce654405fc965668a8104e876c8aa6d (diff) | |
s2-dump-db-pdf-urls.py
| -rw-r--r-- | s2-dump-db-pdf-urls.py | 16 |
1 files changed, 16 insertions, 0 deletions
diff --git a/s2-dump-db-pdf-urls.py b/s2-dump-db-pdf-urls.py index 0fd84fd1..473e90af 100644 --- a/s2-dump-db-pdf-urls.py +++ b/s2-dump-db-pdf-urls.py @@ -13,11 +13,27 @@ def s2_dump_pdf_urls(): # store it and the paper id # another script will fetch the urls from this process rows = [] + pdf_count = 0 + ieee_count = 0 + extra_count = 0 + empty_count += 1 for fn in glob.iglob('{}/**/paper.json'.format(PAPER_JSON_DIR), recursive=True): row = process_paper(fn) if row is not None: rows.append(row) + if row[1] is not None: + pdf_count += 1 + if row[2] is not None: + ieee_count += 1 + if row[3] is not None: + extra_count += 1 + if row[1] is None and row[2] is None and row[3] is None: + empty_count += 1 print("Wrote {} rows".format(len(rows))) + print("pdf count: {}".format(pdf_count)) + print("ieee count: {}".format(ieee_count)) + print("url count: {}".format(url_count)) + print("empty count: {}".format(empty_count)) write_csv('db_paper_pdf_list.csv', keys=['Paper ID', 'PDF URL', 'IEEE URL', 'Extra URL'], rows=rows) def process_paper(fn): |
