summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Jules Laplace <julescarbon@gmail.com> 2019-02-16 14:21:32 +0100
committer Jules Laplace <julescarbon@gmail.com> 2019-02-16 14:21:32 +0100
commit c1ce13b41b595847f18d2f7232850b10cd677e66 (patch)
tree 7ce70a6899f49ac3633b6364bf26b9da762a89d8
parent 3a3a89f2c58eceee07b2cfcfb1700a61b34619e5 (diff)
get better pdf url
-rw-r--r-- scraper/client/paper/paper.address.js 44
-rw-r--r-- scraper/client/paper/paper.css 3
-rw-r--r-- scraper/s2-dump-db-pdf-urls.py 34
-rw-r--r-- scraper/s2-final-report.py 3
-rw-r--r-- scraper/util.py 30
5 files changed, 76 insertions, 38 deletions
diff --git a/scraper/client/paper/paper.address.js b/scraper/client/paper/paper.address.js
index 09b20758..9256d4ad 100644
--- a/scraper/client/paper/paper.address.js
+++ b/scraper/client/paper/paper.address.js
@@ -19,6 +19,7 @@ const initialState = {
institution_3_vetting: '',
institution_4_vetting: '',
notes: '',
+ pdf_index: 0,
}
class PaperAddress extends Component {
state = {
@@ -111,7 +112,25 @@ class PaperAddress extends Component {
institution_4_vetting: this.state.institution_4_vetting,
notes: this.state.notes,
})
- history.push('/paper/' + this.props.api.paperInfo.dataset.key + '/random/')
+ this.next(false)
+ }
+
+ // Go to the citation after the current one. Fall back to the paper info
+ // page when the current citation is not in the list or is the last one.
+ next() {
+ const { key } = this.props.api.paperInfo.dataset
+ const { unknownCitations } = this.props.api
+ const citations = unknownCitations.citations || []
+ const currentIndex = citations.findIndex(f => f.id === this.state.citation.id)
+ // Bound-check against the citations array itself, not its wrapper object.
+ const isLast = currentIndex + 1 >= citations.length
+ if (currentIndex === -1 || isLast) {
+ history.push('/paper/' + key + '/info/')
+ } else {
+ const nextId = citations[currentIndex + 1].id
+ history.push('/paper/' + key + '/address/' + nextId)
+ }
+ }
+
render() {
@@ -229,9 +248,30 @@ class PaperAddress extends Component {
onClick={this.save.bind(this)}
ref={ref => this.button = ref}
>Save Institutions</button>
+
+ <button
+ className='btn'
+ onClick={this.next.bind(this)}
+ >{'Next >'}</button>
+ </div>
+
+ <div className='param pdf_links'>
+ {citation.pdf.map((pdf,i) => {
+ // Label each link with its host (e.g. "arxiv.org"), or 'unknown' if the URL has none.
+ const domain = pdf.replace('www.','').split('/').slice(2,3)[0] || 'unknown'
+ return (
+ <a
+ key={i}
+ onClick={() => this.setState({ pdf_index: i })}
+ className={i === this.state.pdf_index ? 'selected' : ''}
+ >
+ {'[' + domain + ']'}
+ </a>
+ )
+ })}
+ </div>
- <iframe className='pdfViewer' src={citation.pdf} />
+ <iframe className='pdfViewer' src={citation.pdf[this.state.pdf_index]} />
</div>
)
}
diff --git a/scraper/client/paper/paper.css b/scraper/client/paper/paper.css
index 430face4..9d55c55c 100644
--- a/scraper/client/paper/paper.css
+++ b/scraper/client/paper/paper.css
@@ -38,6 +38,9 @@ input.notes { width: 400px; }
margin-right: 10px;
}
+.pdf_links a.selected { /* the link whose PDF is currently shown in the viewer */
+ font-weight: bold;
+}
iframe.pdfViewer {
margin: 10px 0;
width: 100%;
diff --git a/scraper/s2-dump-db-pdf-urls.py b/scraper/s2-dump-db-pdf-urls.py
index bc702e09..b82ac6dd 100644
--- a/scraper/s2-dump-db-pdf-urls.py
+++ b/scraper/s2-dump-db-pdf-urls.py
@@ -22,9 +22,9 @@ def s2_dump_pdf_urls():
pdf = []
doi = []
for fn in glob.iglob('./datasets/s2/*_papers/**/paper.json', recursive=True):
- if 'db_paper' in fn:
- row = process_db_paper(fn)
- elif 'raw_paper' in fn:
+ # db papers are skipped now; reset row so a stale one is never re-appended
+ row = None
+ if 'raw_paper' in fn:
row = process_raw_paper(fn)
if row is not None:
rows.append(row)
@@ -88,34 +88,16 @@ def process_db_paper(fn):
def process_raw_paper(fn):
# print(fn)
- data = read_json(fn)
- if 'paper' not in data:
- print(data)
- return
- paper = data['paper']
+ # The paper id is the name of the directory holding paper.json, e.g.
+ # ./datasets/s2/raw_papers/00/00b13d00b13.../paper.json
+ paper_id = fn.split('/')[-2]
+ paper = RawPaper(paper_id)
if paper is None:
return None
- paper_id = paper['id']
- pdf_url = None
+ pdf_url = paper.pdf_link
ieee_url = None
doi_url = None
extra_url = None
- if 'primaryPaperLink' in paper and 'url' in paper['primaryPaperLink']:
- primary_url = paper['primaryPaperLink']['url']
- if 'pdf' in primary_url:
- pdf_url = primary_url
- elif 'doi' in primary_url:
- doi_url = primary_url
- for link in paper['links']:
- url = link['url']
- if 'ieeexplore.ieee.org' in url:
- ieee_url = url
- elif 'doi.org' in url:
- doi_url = url
- elif pdf_url is None and 'pdf' in url:
- pdf_url = url
- else:
- extra_url = url
return [paper_id, pdf_url, ieee_url, doi_url, extra_url]
if __name__ == '__main__':
diff --git a/scraper/s2-final-report.py b/scraper/s2-final-report.py
index 283ca4fc..451c1f78 100644
--- a/scraper/s2-final-report.py
+++ b/scraper/s2-final-report.py
@@ -85,7 +85,8 @@ def process_single_paper(row, paper_id, addresses, aggregate_citations, unknown_
res['title'] = paper.title
# res['journal'] = paper.journal
res['year'] = paper.year
- res['pdf'] = paper.pdf_link
+ res['pdf'] = paper.pdf_links()
+ res['doi'] = paper.doi_links()
# res['authors'] = ', '.join(paper.authors)
# res['citations'] = []
diff --git a/scraper/util.py b/scraper/util.py
index 6c671cec..1b1a0a9b 100644
--- a/scraper/util.py
+++ b/scraper/util.py
@@ -195,25 +195,37 @@ class RawPaper(object):
@property
def authors(self):
return [ (author[0]['ids'][0] if len(author[0]['ids']) else '', author[0]['name']) for author in self.data['authors'] ]
- @property
- def pdf_link(self):
- if 'primaryPaperLink' in self.data:
- link = self.data['primaryPaperLink']
+ def paper_links(self):
+ def url_part(link):
if type(link) == dict and 'url' in link:
return link['url']
return link
- return None
+ # Return every paper URL as a string: primary link first, then alternates.
+ links = [self.data.get('primaryPaperLink')]
+ links.extend(self.data.get('alternatePaperLinks') or [])
+ # url_part() hands back non-dict values untouched, so drop anything that
+ # is not a string before callers run substring checks on the result.
+ return [url for url in map(url_part, links) if isinstance(url, str)]
+ def pdf_links(self):
+ return [url for url in self.paper_links() if 'pdf' in url]  # links whose URL contains 'pdf'
+ def doi_links(self):
+ return [url for url in self.paper_links() if 'pdf' not in url]  # every link without 'pdf' in it
+ @property
+ def pdf_link(self):
+ # First entry of pdf_links(), or None when that list is empty.
+ return next(iter(self.pdf_links()), None)
def record(self):
return [ self.paper_id, self.title, self.journal, self.year ]
def load_paper(paper_id):
- if os.path.exists(paper_path('db_papers', paper_id)):
- # print('db paper')
- return DbPaper(paper_id)
+ # DB papers are no longer used, so only the raw_papers tree is checked.
+ # The previous lookup is kept here for reference:
+ # if os.path.exists(paper_path('db_papers', paper_id)):
+ # return DbPaper(paper_id)
if os.path.exists(paper_path('raw_papers', paper_id)):
# print('raw paper')
return RawPaper(paper_id)
- print('no paper')
+ print('no raw paper: {}'.format(paper_id))
return None
def dedupe(a):