diff options
Diffstat (limited to 'scraper/content-script/check.js')
| -rw-r--r-- | scraper/content-script/check.js | 102 |
1 files changed, 102 insertions, 0 deletions
// scraper/content-script/check.js
//
// Content script. On load it asks the extension for its status; when the
// status is "on" it waits a few seconds, collects one record per `.gs_r`
// result element on the page, sends the records to the extension as a JSON
// string, and schedules a click on the "next page" link. The extension can
// also drive it with { method: 'start' } / { method: 'stop' } messages.
(function () {

  // Message type codes placed in the `type` field of outgoing messages.
  const TYPES = { Status: 0, Connected: 1, Disconnected: 2, Image: 3, Text: 4 };

  // Delay between a positive status reply and the first scrape.
  const START_DELAY_MS = 5000;
  // The next-page click happens after a random 19-40 second pause.
  const NEXT_PAGE_MIN_DELAY_MS = 19000;
  const NEXT_PAGE_JITTER_MS = 21000;

  let listening = false; // true once a scrape has started; cleared by 'stop'
  let loaded = false;    // true once an "on" status reply has been handled
  let startTimer = null;    // pending delayed start(), if any
  let nextPageTimer = null; // pending next-page click, if any

  const toArray = (a) => Array.prototype.slice.call(a);
  const $ = (s) => document.querySelector(s);

  /** Entry point: does nothing on local development hosts, otherwise wires up messaging. */
  function init() {
    const href = window.location.href;
    if (href.includes('lvh.me') || href.includes('localhost')) return;
    bind();
  }

  /** Registers the message listener and asks the extension for its current status. */
  function bind() {
    // chrome.runtime.* replaces the deprecated chrome.extension.* messaging aliases.
    chrome.runtime.onMessage.addListener(onMessage);
    chrome.runtime.sendMessage({ type: TYPES.Status }, gotStatus);
  }

  /**
   * Handles the reply to the status request.
   * @param {{status?: string}|undefined} response - reply from the extension;
   *   only `status === "on"` triggers anything.
   */
  function gotStatus(response) {
    console.log('got status', response);
    if (!response || response.status !== 'on') return;
    console.log('its on', loaded);
    if (!loaded) {
      console.log('started');
      startTimer = setTimeout(start, START_DELAY_MS);
    }
    loaded = true;
  }

  /**
   * Handles control messages from the extension.
   * @param {{method?: string}} request - 'start' begins scraping, 'stop' halts it.
   */
  function onMessage(request, sender, sendResponse) {
    if (!request) return;
    switch (request.method) {
      case 'start':
        start();
        break;
      case 'stop':
        stop();
        break;
    }
  }

  /** Begins scraping unless already running or the page is not on the target site. */
  function start() {
    console.log(window.location.href, listening);
    // NOTE(review): the host substring is assembled from fragments in the
    // original code, presumably on purpose; it is kept as written.
    if (!window.location.href.includes('schol' + 'ar' + '.go' + 'og' + 'le')) return;
    if (listening) return;
    listening = true;
    energize();
  }

  /**
   * Halts scraping. Besides clearing the flag, this cancels any pending
   * delayed start and next-page click so nothing fires after a 'stop'.
   */
  function stop() {
    listening = false;
    clearTimeout(startTimer);
    clearTimeout(nextPageTimer);
    startTimer = null;
    nextPageTimer = null;
  }

  /**
   * Parses the number out of a "Cited by N" label.
   * @param {string} label - link text.
   * @returns {number} the count, or -1 when no number can be parsed.
   */
  function parseCitationCount(label) {
    const count = Number.parseInt(label.replace(/^\s*Cited by /, ''), 10);
    // Only an unparseable label maps to -1; a genuine 0 is preserved.
    return Number.isNaN(count) ? -1 : count;
  }

  /**
   * Builds a record from one `.gs_r` result element. Fields are present only
   * when the corresponding element exists.
   * @param {Element} el - result container.
   * @returns {Object} record with optional link, pdfLink, attribution,
   *   attributionLinks, snippet, citationLink and citationCount.
   */
  function scrapeResult(el) {
    const data = {};

    const link = el.querySelector('h3 a');
    if (link) {
      data.link = link.href;
    }

    const pdfLink = el.querySelector('.gs_or_ggsm a');
    if (pdfLink) {
      data.pdfLink = pdfLink.href;
    }

    const attribution = el.querySelector('.gs_a');
    if (attribution) {
      data.attribution = attribution.innerText;
      data.attributionLinks = toArray(attribution.querySelectorAll('a')).map((a) => ({
        href: a.href,
        name: a.innerText,
      }));
    }

    // The original read `.gs_a` here too, so the snippet duplicated the
    // attribution. NOTE(review): `.gs_rs` is assumed to be the snippet
    // container - confirm against the live markup.
    const snippet = el.querySelector('.gs_rs');
    if (snippet) {
      data.snippet = snippet.innerText;
    }

    // Find the link by its text rather than by position among its siblings.
    const citationLink = toArray(el.querySelectorAll('.gs_fl a'))
      .find((a) => a.innerText.includes('Cited by'));
    if (citationLink) {
      data.citationLink = citationLink.href;
      data.citationCount = parseCitationCount(citationLink.innerText);
    }

    return data;
  }

  /**
   * Scrapes every result on the page, sends the batch as JSON, schedules the
   * next-page click (if a next link exists) and then sends "done".
   */
  function energize() {
    console.log('energize');
    const records = toArray(document.querySelectorAll('.gs_r')).map(scrapeResult);

    const titleEl = document.querySelector('title');
    const record = {
      title: titleEl ? titleEl.innerText : '', // page may lack a <title>
      url: window.location.href,
      records: records,
    };
    send(JSON.stringify(record));

    const nextLink = $('#gs_n td:last-child a');
    if (nextLink) {
      const delay = NEXT_PAGE_MIN_DELAY_MS + (Math.random() * NEXT_PAGE_JITTER_MS);
      nextPageTimer = setTimeout(() => { nextLink.click(); }, delay);
    }
    send('done');
  }

  /**
   * Sends a text payload to the extension.
   * @param {string} text - payload placed in the message's `data` field.
   */
  function send(text) {
    chrome.runtime.sendMessage({ type: TYPES.Text, data: text }, function () {});
  }

  init();

})();
