| field | value | date |
|---|---|---|
| author | Jules Laplace <julescarbon@gmail.com> | 2019-02-16 14:21:32 +0100 |
| committer | Jules Laplace <julescarbon@gmail.com> | 2019-02-16 14:21:32 +0100 |
| commit | c1ce13b41b595847f18d2f7232850b10cd677e66 (patch) | |
| tree | 7ce70a6899f49ac3633b6364bf26b9da762a89d8 | |
| parent | 3a3a89f2c58eceee07b2cfcfb1700a61b34619e5 (diff) | |
get better pdf url
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | scraper/client/paper/paper.address.js | 44 |
| -rw-r--r-- | scraper/client/paper/paper.css | 3 |
| -rw-r--r-- | scraper/s2-dump-db-pdf-urls.py | 34 |
| -rw-r--r-- | scraper/s2-final-report.py | 3 |
| -rw-r--r-- | scraper/util.py | 30 |
5 files changed, 76 insertions, 38 deletions
diff --git a/scraper/client/paper/paper.address.js b/scraper/client/paper/paper.address.js
index 09b20758..9256d4ad 100644
--- a/scraper/client/paper/paper.address.js
+++ b/scraper/client/paper/paper.address.js
@@ -19,6 +19,7 @@ const initialState = {
   institution_3_vetting: '',
   institution_4_vetting: '',
   notes: '',
+  pdf_index: 0,
 }
 class PaperAddress extends Component {
   state = {
@@ -111,7 +112,25 @@ class PaperAddress extends Component {
       institution_4_vetting: this.state.institution_4_vetting,
       notes: this.state.notes,
     })
-    history.push('/paper/' + this.props.api.paperInfo.dataset.key + '/random/')
+    this.next(false)
+  }
+
+  next() {
+    const { key } = this.props.api.paperInfo.dataset
+    const { unknownCitations } = this.props.api
+    let citationIndex = (unknownCitations.citations || [])
+      .findIndex(f => f.id === this.state.citation.id)
+    if (citationIndex === -1) {
+      history.push('/paper/' + key + '/info/')
+    } else {
+      citationIndex += 1
+      if (citationIndex >= unknownCitations.length) {
+        history.push('/paper/' + key + '/info/')
+      } else {
+        let nextId = unknownCitations.citations[citationIndex].id
+        history.push('/paper/' + key + '/address/' + nextId)
+      }
+    }
   }
 
   render() {
@@ -229,9 +248,30 @@ class PaperAddress extends Component {
             onClick={this.save.bind(this)}
             ref={ref => this.button = ref}
           >Save Institutions</button>
+
+          <button
+            className='btn'
+            onClick={this.next.bind(this)}
+          >{'Next >'}</button>
+        </div>
+
+        <div className='param pdf_links'>
+          {citation.pdf.map((pdf,i) => {
+            const pdf_partz = pdf.replace('www.','').split('/').slice(2,3)[0] || 'unknown'
+            let domain = ''
+            return (
+              <a
+                key={i}
+                onClick={() => this.setState({ pdf_index: i })}
+                className={i === this.state.pdf_index ? 'selected' : ''}
+              >
+                {'[' + domain + ']'}
+              </a>
+            )
+          })}
         </div>
 
-        <iframe className='pdfViewer' src={citation.pdf} />
+        <iframe className='pdfViewer' src={citation.pdf[this.state.pdf_index]} />
       </div>
     )
   }
diff --git a/scraper/client/paper/paper.css b/scraper/client/paper/paper.css
index 430face4..9d55c55c 100644
--- a/scraper/client/paper/paper.css
+++ b/scraper/client/paper/paper.css
@@ -38,6 +38,9 @@ input.notes { width: 400px; }
   margin-right: 10px;
 }
+.pdf_links a.selected {
+  font-weight: bold;
+}
 
 iframe.pdfViewer {
   margin: 10px 0;
   width: 100%;
diff --git a/scraper/s2-dump-db-pdf-urls.py b/scraper/s2-dump-db-pdf-urls.py
index bc702e09..b82ac6dd 100644
--- a/scraper/s2-dump-db-pdf-urls.py
+++ b/scraper/s2-dump-db-pdf-urls.py
@@ -22,9 +22,9 @@ def s2_dump_pdf_urls():
     pdf = []
     doi = []
     for fn in glob.iglob('./datasets/s2/*_papers/**/paper.json', recursive=True):
-        if 'db_paper' in fn:
-            row = process_db_paper(fn)
-        elif 'raw_paper' in fn:
+        # if 'db_paper' in fn:
+        #     row = process_db_paper(fn)
+        if 'raw_paper' in fn:
             row = process_raw_paper(fn)
         if row is not None:
             rows.append(row)
@@ -88,34 +88,16 @@ def process_db_paper(fn):
 
 def process_raw_paper(fn):
     # print(fn)
-    data = read_json(fn)
-    if 'paper' not in data:
-        print(data)
-        return
-    paper = data['paper']
+    # 0 1 2 3 4 5 6 :)
+    # ./datasets/s2/raw_papers/00/00b13d00b13.../paper.json
+    paper_id = fn.split('/')[5]
+    paper = RawPaper(paper_id)
     if paper is None:
         return None
-    paper_id = paper['id']
-    pdf_url = None
+    pdf_url = paper.pdf_link
     ieee_url = None
     doi_url = None
     extra_url = None
-    if 'primaryPaperLink' in paper and 'url' in paper['primaryPaperLink']:
-        primary_url = paper['primaryPaperLink']['url']
-        if 'pdf' in primary_url:
-            pdf_url = primary_url
-        elif 'doi' in primary_url:
-            doi_url = primary_url
-    for link in paper['links']:
-        url = link['url']
-        if 'ieeexplore.ieee.org' in url:
-            ieee_url = url
-        elif 'doi.org' in url:
-            doi_url = url
-        elif pdf_url is None and 'pdf' in url:
-            pdf_url = url
-        else:
-            extra_url = url
     return [paper_id, pdf_url, ieee_url, doi_url, extra_url]
 
 if __name__ == '__main__':
diff --git a/scraper/s2-final-report.py b/scraper/s2-final-report.py
index 283ca4fc..451c1f78 100644
--- a/scraper/s2-final-report.py
+++ b/scraper/s2-final-report.py
@@ -85,7 +85,8 @@ def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_
     res['title'] = paper.title
     # res['journal'] = paper.journal
     res['year'] = paper.year
-    res['pdf'] = paper.pdf_link
+    res['pdf'] = paper.pdf_links()
+    res['doi'] = paper.doi_links()
     # res['authors'] = ', '.join(paper.authors)
 
     # res['citations'] = []
diff --git a/scraper/util.py b/scraper/util.py
index 6c671cec..1b1a0a9b 100644
--- a/scraper/util.py
+++ b/scraper/util.py
@@ -195,25 +195,37 @@
     @property
     def authors(self):
         return [ (author[0]['ids'][0] if len(author[0]['ids']) else '', author[0]['name']) for author in self.data['authors'] ]
-    @property
-    def pdf_link(self):
-        if 'primaryPaperLink' in self.data:
-            link = self.data['primaryPaperLink']
+    def paper_links(self):
+        def url_part(link):
             if type(link) == dict and 'url' in link:
                 return link['url']
             return link
-        return None
+        paper_links = []
+        if 'primaryPaperLink' in self.data:
+            paper_links.append(url_part(self.data['primaryPaperLink']))
+        if 'alternatePaperLinks' in self.data:
+            for link in self.data['alternatePaperLinks']:
+                paper_links.append(url_part(link))
+    def pdf_links(self):
+        return [ link for link in self.paper_links() if 'pdf' in link ]
+    def doi_links(self):
+        return [ link for link in self.paper_links() if 'pdf' not in link ]
+    @property
+    def pdf_link(self):
+        links = self.pdf_links()
+        return links[0] if len(links) else None
     def record(self):
         return [ self.paper_id, self.title, self.journal, self.year ]
 
 def load_paper(paper_id):
-    if os.path.exists(paper_path('db_papers', paper_id)):
-        # print('db paper')
-        return DbPaper(paper_id)
+    # no longer using DB papers :p
+    # if os.path.exists(paper_path('db_papers', paper_id))
+    # print('db paper')
+    # return DbPaper(paper_id)
     if os.path.exists(paper_path('raw_papers', paper_id)):
         # print('raw paper')
         return RawPaper(paper_id)
-    print('no paper')
+    print('no raw paper: {}'.format(paper_id))
     return None
 
 def dedupe(a):
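A minimal sketch of how the new link helpers in scraper/util.py are intended to behave. It assumes that `paper_links()` returns the list it accumulates (the committed hunk does not show an explicit return) and that the paper JSON has the shape the diff reads: a `primaryPaperLink` and `alternatePaperLinks` whose entries are either bare URL strings or dicts with a `url` key. The class and sample data below are stand-ins for illustration, not the scraper's `RawPaper` itself.

```python
# Self-contained sketch; mirrors the shape of RawPaper's new link helpers
# rather than importing scraper/util.py (assumption, not the real module).
class PaperLinks(object):
    def __init__(self, data):
        self.data = data

    def paper_links(self):
        # Normalize a link entry: the dump stores either a URL string or a dict with 'url'.
        def url_part(link):
            if type(link) == dict and 'url' in link:
                return link['url']
            return link
        paper_links = []
        if 'primaryPaperLink' in self.data:
            paper_links.append(url_part(self.data['primaryPaperLink']))
        if 'alternatePaperLinks' in self.data:
            for link in self.data['alternatePaperLinks']:
                paper_links.append(url_part(link))
        # Assumption: the committed paper_links() is meant to return this list.
        return paper_links

    def pdf_links(self):
        # Any URL mentioning 'pdf' is treated as a direct PDF candidate.
        return [link for link in self.paper_links() if 'pdf' in link]

    def doi_links(self):
        # Everything else (DOI and other landing pages) goes to the fallback list.
        return [link for link in self.paper_links() if 'pdf' not in link]

    @property
    def pdf_link(self):
        # Backwards-compatible single URL: first PDF candidate, or None.
        links = self.pdf_links()
        return links[0] if len(links) else None


if __name__ == '__main__':
    # Hypothetical paper.json fragment shaped like the fields the diff reads.
    sample = {
        'primaryPaperLink': {'url': 'https://doi.org/10.0000/example'},
        'alternatePaperLinks': [
            {'url': 'https://example.org/papers/example.pdf'},
            'https://ieeexplore.ieee.org/document/0000000',
        ],
    }
    paper = PaperLinks(sample)
    print(paper.pdf_links())  # ['https://example.org/papers/example.pdf']
    print(paper.doi_links())  # ['https://doi.org/10.0000/example', 'https://ieeexplore.ieee.org/document/0000000']
    print(paper.pdf_link)     # 'https://example.org/papers/example.pdf'
```

Under this reading, `res['pdf']` in s2-final-report.py becomes a list rather than a single URL, which is why paper.address.js now renders a row of selectable links and loads `citation.pdf[this.state.pdf_index]` into the viewer instead of one fixed URL.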
