diff options
| -rw-r--r-- | s2-dump-db-pdf-urls.py | 16 |
1 files changed, 16 insertions, 0 deletions
diff --git a/s2-dump-db-pdf-urls.py b/s2-dump-db-pdf-urls.py index 0fd84fd1..473e90af 100644 --- a/s2-dump-db-pdf-urls.py +++ b/s2-dump-db-pdf-urls.py @@ -13,11 +13,27 @@ def s2_dump_pdf_urls(): # store it and the paper id # another script will fetch the urls from this process rows = [] + pdf_count = 0 + ieee_count = 0 + extra_count = 0 + empty_count = 0 for fn in glob.iglob('{}/**/paper.json'.format(PAPER_JSON_DIR), recursive=True): row = process_paper(fn) if row is not None: rows.append(row) + if row[1] is not None: + pdf_count += 1 + if row[2] is not None: + ieee_count += 1 + if row[3] is not None: + extra_count += 1 + if row[1] is None and row[2] is None and row[3] is None: + empty_count += 1 print("Wrote {} rows".format(len(rows))) + print("pdf count: {}".format(pdf_count)) + print("ieee count: {}".format(ieee_count)) + print("extra count: {}".format(extra_count)) + print("empty count: {}".format(empty_count)) write_csv('db_paper_pdf_list.csv', keys=['Paper ID', 'PDF URL', 'IEEE URL', 'Extra URL'], rows=rows) def process_paper(fn): |
