diff options
| -rw-r--r-- | README.md | 84 | ||||
| -rw-r--r-- | s2-dump-ids.py | 2 |
2 files changed, 65 insertions, 21 deletions
@@ -3,29 +3,73 @@ ## installation ``` +conda create -n megapixels python=3.7 pip install urllib3 -pip install bs4 -pip install http +pip install requests +pip install simplejson +pip install click +pip install pdfminer.six +pip install csvtool +npm install ``` -## ascii +## workflow ``` -xxxxdddd5xxdddd5555vvvv~xxxxdddd55vvvv7777~~~~xxxxdddddd5555vvvv7777 -xxxxdddd5xxdddd55v~xxxxddddddddd55vvvv7777~~~~xxxxdddd5555v5vvvv7777 -xxxxdddd555vvvv77~~xxxxdddddddd55vvvv77777~~~xxxxddd55555v5vvvv77777 -aaaaxddd5555vvvv77xxxxddddddddd55vvvvv77777~~xxxxddd55555v5vvvvddddd -aaaaayyMMMMMqqqqddeeeexxxxxxdddd55vvvv77777~~xxaayyyyMMMMMqMqqqvdddd -aaaaayyMMMMMqqqqqddeeeaaaaaayyyyMMqqqqqddddeeaaaayyyyMMMMMqMqqqqdddd -ccccaDD%%%%%##qqqddeeaaaaaayyyyMMqqqqddddeeeeaaaayyyyMMMMMqMqqqq@@@@ -ccccc%%%###qqqd#@@eeeaaaaaayyyyMMqqqqddddeeeeacccDDDD%%%%%#%###q@@@@ -cccx%%%%###i##@#@eeeeaccccccDyyyM%##qddd@@eeeeccccDDyMMMMqMqqqq@@@@@ -llcccDDDD%%%%###@eeeecccccccDDDD%%####@@@@eeeecccDDD%%%%#%###q@&&&&& -lllllSDDD%%%%###@llllcccccccDDDD%%####@@@@eeeecclSSD%%%%#%####@&&&&& -llllSSSSQQ%%%##&&&lllllcclllSDDD%%####@@@@eellcllSSD%%%%#%####&&&&&& -aallSSSSQQQQQ%%&&&llllllllllSSSD%Q%%##@@&&llllllllSSQQQQ%Q%%%#&&rrrr -aaaaiiiiVVQQQ%%drrrrrrrrllllSSSSQQ%%%%&&&&llllllllSSSQQQQ%Q%%%%&rrrr -aaaaiiiiVVVVVdddrrrrrrrraaaaiSSSQVdd%%&&rrrrrraaaaiiSQQQQ%Q%%%%rrrrr -xxxxdddd55VVVddv7777~~~~aaaaiiiiVVddddrrrrrrrraaaaiiiiVVVVdVdddd7777 -xxxxdddd5555vvvv7777~~~~xxxxdiiiV555vvvv7777~~~~xxxxdddd5555vvvv7777 +Paper in spreadsheet -> paper_name + -> S2 Search API -> paper_id + -> S2 Paper API -> citations + -> S2 Dataset -> full records with PDF URLs, authors, more citations + -> wget -> .pdf files + -> pdfminer.six -> pdf text + -> Stanford NER -> named entities (organizations) + -> Geocoding service -> lat/lngs ``` + +To begin, export `datasets/citations.csv` from the Google doc. 
+ +## Extracting data from S2 / ORC + +The Open Research Corpus (ORC) is produced by the Allen Institute / Semantic Scholar (S2) / arXiv people. It may be downloaded here: + +http://labs.semanticscholar.org/corpus/ + +### s2-search.py + +Loads titles from the citations file and queries the S2 search API to get paper IDs. + +### s2-papers.py + +Uses the paper IDs from the search entries to query the S2 papers API to get first-degree citations, authors, etc. + +### s2-dump-ids.py + +Extract all the paper IDs and citation IDs from the queried papers. + +### s2-extract-papers.py + +Extracts papers from the ORC dataset which have been queried from the API. + +### s2-dump-pdf-urls.py + +Dump PDF URLs (and also IEEE URLs, etc.) to pdfs.json, ieee.json, etc. + +### s2-fetch-pdfs.py + +Fetch the files listed in pdfs.json and process them. + +### s2-fetch-ieee.py + +Fetch the files listed in ieee.json and process them. + +### s2-extract-first-page.py + +Use pdfminer to extract the first page from the dumped PDFs. + +### s2-extract-entities.py + +Extract named entities from the mined text. + +### s2-geocode.py + +Geocode known entities from the database. diff --git a/s2-dump-ids.py b/s2-dump-ids.py index 66ff6d77..7b424787 100644 --- a/s2-dump-ids.py +++ b/s2-dump-ids.py @@ -24,7 +24,7 @@ def process_paper(fn, ids): ids[cite['paperId']] = True def paper_path(paper_id): - return '{}/{}/{}'.format(DATA_DIR, paper_id[0:3], paper_id) + return '{}/{}/{}'.format(DATA_DIR, paper_id[0:2], paper_id) if __name__ == '__main__': s2_dump_ids() |
