From ee3d0d98e19f1d8177d85af1866fd0ee431fe9ea Mon Sep 17 00:00:00 2001
From: Jules Laplace
Date: Sun, 25 Nov 2018 22:19:15 +0100
Subject: moving stuff

---
 scraper/README.md | 144 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 144 insertions(+)
 create mode 100644 scraper/README.md
(limited to 'scraper/README.md')

diff --git a/scraper/README.md b/scraper/README.md
new file mode 100644
index 00000000..964a3ee3
--- /dev/null
+++ b/scraper/README.md
@@ -0,0 +1,144 @@
+# megapixels dev
+
+## installation
+
+```
+conda create -n megapixels python=3.7
+pip install urllib3
+pip install requests
+pip install simplejson
+pip install click
+pip install pdfminer.six
+pip install csvtool
+npm install
+```
+
+## workflow
+
+```
+Paper in spreadsheet -> paper_name
+  -> S2 Search API -> paper_id
+  -> S2 Paper API -> citations
+  -> S2 Dataset -> full records with PDF URLs, authors, more citations
+  -> wget -> .pdf files
+  -> pdfminer.six -> pdf text
+  -> text mining -> named entities (organizations)
+  -> Geocoding service -> lat/lngs
+```
+
+To begin, export `datasets/citations.csv` from the Google doc.
+
+---
+
+## Extracting data from S2 / ORC
+
+The Open Research Corpus (ORC) is produced by the Allen Institute / Semantic Scholar (S2) / arXiv people. It may be downloaded here:
+
+http://labs.semanticscholar.org/corpus/
+
+### s2-search.py
+
+Loads titles from citations file and queries the S2 search API to get paper IDs.
+
+### s2-papers.py
+
+Uses the paper IDs from the search entries to query the S2 papers API to get first-degree citations, authors, etc.
+
+### s2-dump-ids.py
+
+Extract all the paper IDs and citation IDs from the queried papers.
+
+### s2-extract-papers.py
+
+Extracts papers from the ORC dataset which have been queried from the API.
+
+### s2-raw-papers.py
+
+Some papers are not in the ORC dataset and must be scraped from S2 directly.
+
+---
+
+## Extracting data from Google Scholar
+
+Included in the content-script folder is a Chrome extension which scrapes Google Scholar through the browser, clicking the links and extracting PDF links, number of citations, etc., then saving a JSON file when it is done. It still requires work to process the output (cross-reference with S2 and dump the PDFs).
+
+---
+
+## Scraping Institutions
+
+Once you have the data from S2, you can scrape all the PDFs (and other URLs) you find, and then extract institutions from those and geocode them.
+
+### s2-dump-pdf-urls.py
+
+Dump PDF URLs (and also IEEE URLs, etc.) to CSV files.
+
+### s2-fetch-pdf.py
+
+Fetch the PDFs.
+
+### s2-fetch-doi.py
+
+Fetch the files listed in ieee.json and process them.
+
+### pdf_dump_first_page.sh
+
+Use pdfminer.six to extract the first page from the PDFs.
+
+### s2-pdf-first-pages.py
+
+Perform initial extraction of university-like terms, to be geocoded.
+
+### s2-doi-report.py
+
+Extract named entities from the scraped DOI links (IEEE, ACM, etc.).
+
+### s2-geocode.py
+
+Geocode lists of entities using Nominatim.
+
+### s2-citation-report.py
+
+For each paper in the citations CSV, find the corresponding paper in the database and get all of its citations.
+For each of those citations, try to find an address. Embed the appropriate entries from the institutions list and then render them on a Leaflet map.
+
+---
+
+## Cleaning the Data
+
+After scraping these universities, we got up to a 47% match rate on papers from the dataset. However, there is still more to solve:
+
+- Fix the geocoding - this must be done manually - we will dedupe the entries in the entities table, then extract specific entities from the dataset.
+- Unknown addresses - we have addresses for some places, but we need to a) geocode them again or b) geocode just the city.
+- Match across multiple lines.
+- Empty addresses - some papers need to be gone through by hand? Maybe we can do bigram/trigram analysis on the headings. Just finding common words would help.
+- Make a list of bogus papers - ones where pdfminer.six returned empty results, or which did not contain the word ABSTRACT, or were too long.
+
+### expand-uni-lookup.py
+
+By now I had a list of institutions in `reports/all_institutions.csv` (built by merging the results of the geocoding, which I had run on 4 computers and thus had 4 files of institutions). This file must be gone through manually. This technique geocoded around 47% of papers.
+
+At this point I moved `reports/all_institutions.csv` into Google Sheets. All further results use the CSV on Google Sheets.
+
+### s2-pdf-report.py
+
+Generates reports of the things that could not be found in the PDFs.
+
+### s2-geocode-spreadsheet.py
+
+To add new institutions, simply list them in the spreadsheet with the lat/lng fields empty. Then run this script and anything missing a lat/lng will get one.
+
+### s2-citation-report.py
+
+Generate the main report with maps and citation lists.
+
+---
+
+## Useful scripts for batch processing
+
+### split-csv.py
+
+Shuffle and split a CSV into multiple files.
+
+### merge-csv.py
+
+Merge a folder of CSVs into a single file, deduping based on the first column.
--
cgit v1.2.3-70-g09d2
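The geocoding step is described above in a single line and `s2-geocode.py` itself does not appear in this patch series, so the following is a minimal sketch of the kind of lookup it presumably performs against the public Nominatim search endpoint. The function name, User-Agent string, and return format are illustrative assumptions, not code from the repo.

```
import time
import requests

NOMINATIM_URL = 'https://nominatim.openstreetmap.org/search'

def geocode_institution(name):
  # Return (lat, lng) floats for an institution name, or None if Nominatim finds no match
  resp = requests.get(NOMINATIM_URL, params={
    'q': name,
    'format': 'json',
    'limit': 1,
  }, headers={'User-Agent': 'megapixels-scraper'})  # Nominatim asks for an identifying User-Agent
  if resp.status_code != 200:
    return None
  results = resp.json()
  if not results:
    return None
  # Nominatim returns lat/lon as strings, so convert before filling the lat/lng columns
  return float(results[0]['lat']), float(results[0]['lon'])

if __name__ == '__main__':
  print(geocode_institution('Carnegie Mellon University'))
  time.sleep(1)  # the public endpoint expects at most one request per second
```

Falling back to geocoding just the city, as suggested in the cleaning notes, would amount to retrying the same call with a shorter query string.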
From 602527f66a438ffe9340299a242900057b175926 Mon Sep 17 00:00:00 2001
From: Jules Laplace
Date: Fri, 7 Dec 2018 15:25:06 +0100
Subject: update s2_search

---
 scraper/README.md    |  2 +-
 scraper/s2-search.py | 27 +++++++++++++++++++--------
 2 files changed, 20 insertions(+), 9 deletions(-)
(limited to 'scraper/README.md')

diff --git a/scraper/README.md b/scraper/README.md
index 964a3ee3..a17f1efe 100644
--- a/scraper/README.md
+++ b/scraper/README.md
@@ -3,7 +3,7 @@
 ## installation
 
 ```
-conda create -n megapixels python=3.7
+conda create -n megapixels python=3.6
 pip install urllib3
 pip install requests
 pip install simplejson
diff --git a/scraper/s2-search.py b/scraper/s2-search.py
index 44bab08b..ddecf2f9 100644
--- a/scraper/s2-search.py
+++ b/scraper/s2-search.py
@@ -27,20 +27,31 @@ totalResults
 @click.option('--index', '-n', default=0, help='Index of CSV (query,)')
 def fetch_entries(index):
   keys, lines = read_citation_list(index)
+  citation_lookup = []
   s2 = SemanticScholarAPI()
   for line in lines:
-    label = line[0]
-    title = re.sub(r'[^-0-9a-zA-Z ]+', '', line[1])
-    entry_fn = './datasets/s2/entries/{}.json'.format(title)
-    if not os.path.exists(entry_fn):
+    key = line[0]
+    name = line[1]
+    title = re.sub(r'[^-0-9a-zA-Z ]+', '', line[2])
+    dump_fn = './datasets/s2/dumps/{}.json'.format(key)
+    entry_fn = './datasets/s2/entries/{}.json'.format(key)
+    result = None
+    if os.path.exists(entry_fn):
+      result = read_json(entry_fn)
+    else:
       results = s2.search(title)
       write_json(dump_fn, results)
       if len(results['results']) == 0:
-        print("No results for {}".format(title))
+        print("- {}".format(title))
       else:
-        print(title)
-        write_json(entry_fn, results['results'][0])
-      time.sleep(random.randint(10, 20))
+        print("+ {}".format(title))
+        result = results['results'][0]
+        write_json(entry_fn, result)
+    if result:
+      paper_id = result['id']
+      paper = fetch_paper(paper_id)
+      citation_lookup.append([key, name, title, paper_id])
+  write_csv("datasets/citation_lookup.csv", keys=['key', 'name', 'title', 'paper_id'], rows=citation_lookup)
 
 if __name__ == '__main__':
   fetch_entries()
--
cgit v1.2.3-70-g09d2
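`s2-search.py` leans on helpers pulled in by `from util import *`, and `util.py` never appears in this series. Below is one plausible shape for the three helpers the caching logic above depends on; the real module may differ (and `read_citation_list` is not covered), so treat these signatures as assumptions rather than the repo's actual code.

```
import csv
import os
import simplejson as json

def read_json(fn):
  # Load a previously written response; s2-search.py uses these files as a crude cache
  with open(fn, 'r') as f:
    return json.load(f)

def write_json(fn, data):
  # Create parent directories so the dumps/ and entries/ paths always exist
  os.makedirs(os.path.dirname(fn), exist_ok=True)
  with open(fn, 'w') as f:
    json.dump(data, f, indent=2)

def write_csv(fn, keys, rows):
  # Header row followed by the accumulated citation_lookup rows
  with open(fn, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(keys)
    writer.writerows(rows)
```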
From 927fd8825101749cc8fcdc14f05ffd50d14ed652 Mon Sep 17 00:00:00 2001
From: Jules Laplace
Date: Fri, 7 Dec 2018 17:17:29 +0100
Subject: cleaning up

---
 scraper/README.md    |  6 +-----
 scraper/s2-search.py | 27 +++++++++++++++++++++++----
 scraper/s2.py        | 20 --------------------
 3 files changed, 24 insertions(+), 29 deletions(-)
(limited to 'scraper/README.md')

diff --git a/scraper/README.md b/scraper/README.md
index a17f1efe..782fa30a 100644
--- a/scraper/README.md
+++ b/scraper/README.md
@@ -38,11 +38,7 @@ http://labs.semanticscholar.org/corpus/
 ### s2-search.py
 
-Loads titles from citations file and queries the S2 search API to get paper IDs.
-
-### s2-papers.py
-
-Uses the paper IDs from the search entries to query the S2 papers API to get first-degree citations, authors, etc.
+Loads titles from citations file and queries the S2 search API to get paper IDs, then uses the paper IDs from the search entries to query the S2 papers API to get first-degree citations, authors, etc.
 
 ### s2-dump-ids.py
 
 Extract all the paper IDs and citation IDs from the queried papers.
diff --git a/scraper/s2-search.py b/scraper/s2-search.py
index 169a8d19..db5731d5 100644
--- a/scraper/s2-search.py
+++ b/scraper/s2-search.py
@@ -7,7 +7,7 @@ import random
 import re
 import simplejson as json
 import click
-from s2 import SemanticScholarAPI, fetch_paper
+from s2 import SemanticScholarAPI
 from util import *
 
 '''
@@ -32,8 +32,9 @@ def fetch_entries(index):
   for line in lines:
     key = line[0]
     name = line[1]
-    title = re.sub(r'[^-0-9a-zA-Z ]+', '', line[2])
-    if len(title) < 2:
+    title = line[2].strip()
+    clean_title = re.sub(r'[^-0-9a-zA-Z ]+', '', line[2])
+    if len(clean_title) < 2:
       continue
     dump_fn = './datasets/s2/dumps/{}.json'.format(key)
     entry_fn = './datasets/s2/entries/{}.json'.format(key)
@@ -41,7 +42,7 @@ def fetch_entries(index):
     if os.path.exists(entry_fn):
       result = read_json(entry_fn)
     else:
-      results = s2.search(title)
+      results = s2.search(clean_title)
       write_json(dump_fn, results)
       if len(results['results']) == 0:
         print("- {}".format(title))
@@ -55,5 +56,23 @@ def fetch_entries(index):
       citation_lookup.append([key, name, title, paper_id])
   write_csv("datasets/citation_lookup.csv", keys=['key', 'name', 'title', 'paper_id'], rows=citation_lookup)
 
+def fetch_paper(s2, paper_id):
+  os.makedirs('./datasets/s2/papers/{}/{}'.format(paper_id[0:2], paper_id), exist_ok=True)
+  paper_fn = './datasets/s2/papers/{}/{}/paper.json'.format(paper_id[0:2], paper_id)
+  if os.path.exists(paper_fn):
+    return read_json(paper_fn)
+  print(paper_id)
+  paper = s2.paper(paper_id)
+  if paper is None:
+    print("Got none paper??")
+    # time.sleep(random.randint(1, 2))
+    paper = s2.paper(paper_id)
+    if paper is None:
+      print("Paper not found")
+      return None
+  write_json(paper_fn, paper)
+  # time.sleep(random.randint(1, 2))
+  return paper
+
 if __name__ == '__main__':
   fetch_entries()
diff --git a/scraper/s2.py b/scraper/s2.py
index 4fdd5f28..b1b9742c 100644
--- a/scraper/s2.py
+++ b/scraper/s2.py
@@ -1,7 +1,5 @@
 import os
 import requests
-import time
-import random
 from util import *
 
 class AuthorStub(object):
@@ -195,21 +193,3 @@ class SemanticScholarAPI(object):
       }, headers=SemanticScholarAPI.headers)
     # print(resp.status_code)
     return None if resp.status_code != 200 else resp.json()
-
-def fetch_paper(s2, paper_id):
-  os.makedirs('./datasets/s2/papers/{}/{}'.format(paper_id[0:2], paper_id), exist_ok=True)
-  paper_fn = './datasets/s2/papers/{}/{}/paper.json'.format(paper_id[0:2], paper_id)
-  if os.path.exists(paper_fn):
-    return read_json(paper_fn)
-  print(paper_id)
-  paper = s2.paper(paper_id)
-  if paper is None:
-    print("Got none paper??")
-    # time.sleep(random.randint(1, 2))
-    paper = s2.paper(paper_id)
-    if paper is None:
-      print("Paper not found")
-      return None
-  write_json(paper_fn, paper)
-  # time.sleep(random.randint(1, 2))
-  return paper
--
cgit v1.2.3-70-g09d2
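The `fetch_paper()` moved into `s2-search.py` above retries the API exactly once and has its sleeps commented out. If rate limiting becomes a problem again, a small retry-with-backoff helper is one way to express that logic; this is a sketch with arbitrary delays, not part of the repo.

```
import time

def retry_with_backoff(fetch, attempts=3, base_delay=2.0):
  # Call fetch() until it returns a non-None value, sleeping 2s, 4s, 8s... between tries
  for attempt in range(attempts):
    result = fetch()
    if result is not None:
      return result
    if attempt < attempts - 1:
      time.sleep(base_delay * (2 ** attempt))
  return None

# Hypothetical usage inside fetch_paper:
#   paper = retry_with_backoff(lambda: s2.paper(paper_id))
```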
From 588c96ab6d38f30bbef3aa773163b36838538355 Mon Sep 17 00:00:00 2001
From: Jules Laplace
Date: Fri, 7 Dec 2018 18:46:03 +0100
Subject: path

---
 scraper/README.md            | 8 +++++++-
 scraper/s2-extract-papers.py | 2 +-
 2 files changed, 8 insertions(+), 2 deletions(-)
(limited to 'scraper/README.md')

diff --git a/scraper/README.md b/scraper/README.md
index 782fa30a..318bba9a 100644
--- a/scraper/README.md
+++ b/scraper/README.md
@@ -36,18 +36,24 @@ The Open Research Corpus (ORC) is produced by the Allen Institute / Semantic Sch
 
 http://labs.semanticscholar.org/corpus/
 
+We do a two-stage fetch process as only about 66% of their papers are in this dataset.
+
 ### s2-search.py
 
 Loads titles from citations file and queries the S2 search API to get paper IDs, then uses the paper IDs from the search entries to query the S2 papers API to get first-degree citations, authors, etc.
 
 ### s2-dump-ids.py
 
-Extract all the paper IDs and citation IDs from the queried papers.
+Dump all the paper IDs and citation IDs from the queried papers.
 
 ### s2-extract-papers.py
 
 Extracts papers from the ORC dataset which have been queried from the API.
 
+### s2-dump-missing-paper-ids.py
+
+Dump the citation IDs that were not found in the ORC dataset.
+
 ### s2-raw-papers.py
 
 Some papers are not in the ORC dataset and must be scraped from S2 directly.
diff --git a/scraper/s2-extract-papers.py b/scraper/s2-extract-papers.py
index bd30c24b..7cbe1244 100644
--- a/scraper/s2-extract-papers.py
+++ b/scraper/s2-extract-papers.py
@@ -5,7 +5,7 @@ import click
 from util import *
 
 S2_DIR = '/media/blue/undisclosed/semantic-scholar/corpus-2018-05-03'
-DATA_DIR = '/home/lens/undisclosed/megapixels_dev/datasets/s2/db_papers'
+DATA_DIR = '/home/lens/undisclosed/megapixels_dev/scraper/datasets/s2/db_papers'
 
 @click.command()
 @click.option('--fn', '-f', default='ids.json', help='List of IDs to extract from the big dataset.')
--
cgit v1.2.3-70-g09d2
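This final patch only corrects a path constant in `s2-extract-papers.py`; the extraction pass itself is not shown. A sketch of that kind of scan over the ORC release follows, assuming the corpus is distributed as gzipped JSON-lines shards whose records carry an `id` field (check the release notes for the actual layout); the output location and function name are illustrative.

```
import glob
import gzip
import os
import simplejson as json

S2_DIR = '/media/blue/undisclosed/semantic-scholar/corpus-2018-05-03'  # as in the patch above
OUT_DIR = './datasets/s2/db_papers'  # illustrative output location

def extract_papers(wanted_ids):
  # Copy every corpus record whose id is in wanted_ids into its own JSON file
  os.makedirs(OUT_DIR, exist_ok=True)
  remaining = set(wanted_ids)
  for shard in sorted(glob.glob(os.path.join(S2_DIR, '*.gz'))):
    with gzip.open(shard, 'rt') as f:
      for line in f:
        record = json.loads(line)
        if record.get('id') in remaining:
          with open(os.path.join(OUT_DIR, record['id'] + '.json'), 'w') as out:
            json.dump(record, out)
          remaining.discard(record['id'])
    if not remaining:
      break
  return remaining  # ids never found in the corpus, i.e. candidates for s2-raw-papers.py
```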