s2-dump-db-pdf-urls.py

author: Jules Laplace <julescarbon@gmail.com> 2018-11-03 19:08:06 +0100
committer: Jules Laplace <julescarbon@gmail.com> 2018-11-03 19:08:06 +0100
commit: 8299124ecc34fd4885e7b525b849a44083ab334b (patch)
tree: 77192f6095f7af6bb6c16ab2df594ebc4951d62c
parent: 7f385e46dce654405fc965668a8104e876c8aa6d (diff)
1 files changed, 16 insertions, 0 deletions
diff --git a/s2-dump-db-pdf-urls.py b/s2-dump-db-pdf-urls.py
index 0fd84fd1..473e90af 100644
--- a/s2-dump-db-pdf-urls.py
+++ b/s2-dump-db-pdf-urls.py
@@ -13,11 +13,27 @@ def s2_dump_pdf_urls():
   # store it and the paper id
   # another script will fetch the urls from this process
   rows = []
+  pdf_count = 0
+  ieee_count = 0
+  extra_count = 0
+  empty_count = 0  # rows where the PDF, IEEE and extra URL columns are all None
   for fn in glob.iglob('{}/**/paper.json'.format(PAPER_JSON_DIR), recursive=True):
     row = process_paper(fn)
     if row is not None:
       rows.append(row)
+      if row[1] is not None:
+        pdf_count += 1
+      if row[2] is not None:
+        ieee_count += 1
+      if row[3] is not None:
+        extra_count += 1
+      if row[1] is None and row[2] is None and row[3] is None:
+        empty_count += 1
   print("Wrote {} rows".format(len(rows)))
+  print("pdf count: {}".format(pdf_count))
+  print("ieee count: {}".format(ieee_count))
+  print("url count: {}".format(extra_count))  # tally of rows with an 'Extra URL' (row[3])
+  print("empty count: {}".format(empty_count))
   write_csv('db_paper_pdf_list.csv', keys=['Paper ID', 'PDF URL', 'IEEE URL', 'Extra URL'], rows=rows)
 
 def process_paper(fn):
author	Jules Laplace <julescarbon@gmail.com>	2018-11-03 19:08:06 +0100
committer	Jules Laplace <julescarbon@gmail.com>	2018-11-03 19:08:06 +0100
commit	8299124ecc34fd4885e7b525b849a44083ab334b (patch)
tree	77192f6095f7af6bb6c16ab2df594ebc4951d62c
parent	7f385e46dce654405fc965668a8104e876c8aa6d (diff)