diff options
| author | Jules Laplace <julescarbon@gmail.com> | 2017-12-08 02:52:19 +0100 |
|---|---|---|
| committer | Jules Laplace <julescarbon@gmail.com> | 2017-12-08 02:56:41 +0100 |
| commit | 192abb9db60f95968953b515ce18700c6b2da090 (patch) | |
| tree | e0e77c935ad95ca9e641c6a1f19a22556df9b8d2 /bucky | |
| parent | bbbd8bbab8737f5067c85376daf79cd8a5a9c4cb (diff) | |
snippets and middleware
Diffstat (limited to 'bucky')
| -rw-r--r-- | bucky/app/bucky.js | 2 | ||||
| -rw-r--r-- | bucky/app/router.js | 9 | ||||
| -rw-r--r-- | bucky/db/index.js | 6 | ||||
| -rw-r--r-- | bucky/search/middleware.js | 44 | ||||
| -rw-r--r-- | bucky/search/search.js | 91 | ||||
| -rw-r--r-- | bucky/search/snippet.js | 88 | ||||
| -rw-r--r-- | bucky/search/stopwords.js | 18 |
7 files changed, 257 insertions, 1 deletions
diff --git a/bucky/app/bucky.js b/bucky/app/bucky.js index 757592a..d1aad4d 100644 --- a/bucky/app/bucky.js +++ b/bucky/app/bucky.js @@ -157,7 +157,7 @@ var bucky = module.exports = { next() }) }, - + /* MAIL */ ensureMailboxes: function (req, res, next){ diff --git a/bucky/app/router.js b/bucky/app/router.js index c3af565..a87e1ec 100644 --- a/bucky/app/router.js +++ b/bucky/app/router.js @@ -4,6 +4,7 @@ var fortune = require('./fortune') var bucky = require('./bucky') var db = require('./db') var util = require('./util') +var search = require('../search/middleware') module.exports = function(app){ app.all('*', middleware.ensureLocals) @@ -91,6 +92,14 @@ module.exports = function(app){ // delete a comment }) + app.get("/api/search", + middleware.ensureAuthenticated, + search.search, + search.getComments, + search.getFiles, + search.logQuery, + search.success + ) app.get("/api/keyword/:keyword", middleware.ensureAuthenticated, diff --git a/bucky/db/index.js b/bucky/db/index.js index f376308..dcd5f20 100644 --- a/bucky/db/index.js +++ b/bucky/db/index.js @@ -92,6 +92,9 @@ db.getFileCounts = function(ids){ db.getFileSizes = function(ids){ return knex.column('thread').sum('size as size').select().from('files').where('thread', 'in', ids).groupBy('thread') } +db.getFilesById = function(ids){ + return File.where("id", "in", ids) +} db.createFile = function(data){ return new db.File(data).save() } @@ -117,6 +120,9 @@ db.getCommentsForThread = function (id, limit, offset, order){ return comments }) } +db.getCommentsById = function(ids){ + return Comment.where("id", "in", ids) +} db.getCommentCounts = function(ids){ return knex.column('thread').count('* as count').select().from('comments').where('thread', 'in', ids).groupBy('thread') } diff --git a/bucky/search/middleware.js b/bucky/search/middleware.js new file mode 100644 index 0000000..64ddd28 --- /dev/null +++ b/bucky/search/middleware.js @@ -0,0 +1,44 @@ +var search = require('./search') +var snippet = require('./snippet') +var db = require('../db') + +module.exports = { + + search: function (req, res, next) { + var results = search.search(req.body.query, req.body.start, req.body.limit) + res.search = results + next() + }, + + getComments: function (req, res, next){ + var comment_ids = res.search.comment_ids; + if (! comment_ids || ! comment_ids.length) { + return next() + } + db.getCommentsById(comment_ids).then(function(comments){ + res.search.comments = comments + next() + }) + }, + + getFiles: function (req, res, next){ + var file_ids = res.search.file_ids + if (! file_ids || ! file_ids.length) { + return next() + } + db.getFilesById(file_ids).then(function(files){ + res.search.files = files + next() + }) + }, + + logQuery: function(req, res, next) { + // req.search.query, req.search.count + next() + }, + + success: function(req, res, next){ + res.send(res.search) + } + +} diff --git a/bucky/search/search.js b/bucky/search/search.js new file mode 100644 index 0000000..afa9609 --- /dev/null +++ b/bucky/search/search.js @@ -0,0 +1,91 @@ +var db = require('../db') +var STOPWORDS = require('./stopwords') + +var bdb_lib = require('berkeleydb') +var bdb = new bdb_lib.Db() +bdb.open('search.db') + +var wordRegexp = new RegExp("(\W+)"); +var wordBoundaryRegexp = new RegExp("\W"); +function parse_terms (s) { + return s.toLowerCase().split(wordRegexp).filter((term) => { + if (! term.match(wordBoundaryRegexp)) { + return true + } + return false + }) +} +function cmp (a,b){ return (a<b)?a:(a===b)?0:1 } + +function find_term(term) { + var matches = bdb.get(term).split(",").map((s) => { + var partz = s.split(" ") + var match = { + thread: s[0], + comment: s[1], + file: s[2], + strength: s[3], + } + }) + return matches +} + +function search (query, start, limit) { + if (!query) return + start = start || 0; + limit = limit || 10; + var scores = {}; + var terms = parse_terms($query); + var i = 0 + var total + var to_display = limit + var threads = {} + var thread_ids = [] + var comment_ids = [] + var file_ids = [] + var results = [] + + terms.forEach((term) => { + if (STOPWORDS.has(term)) return; + var results = find_term(term); + if (!results) return; + results.forEach((result) => { + var score = scores[result.thread] = scores[result.thread] || { count: 0, strength: 0 } + score.thread = score.thread || result.thread + score.comment = score.comment || result.comment + score.file = score.file || result.file + score.strength += result.strength + score.count += 1 + }) + }) + total = Object.keys(scores).length + Object.values(scores).sort((a,b) => { + if (b.count !== a.count) { + return cmp(b.count, a.count) + } + return cmp(b.strength * b.count, a.strength * a.count) + }).some((match) => { + if (i++ < start) return false + if (to_display-- === 0) return true + results.push(match) + thread_ids.push(match.thread) + if (match.comment) comment_ids.push(match.comment) + if (match.file) file_ids.push(match.file) + return false + }) + + return { + query: query, + start: start, + next: start + limit, + limit: limit, + total: total, + results: results, + thread_ids: thread_ids, + comment_ids: comment_ids, + file_ids: $file_ids, + terms: terms, + }; +} + +module.exports = { search: search } diff --git a/bucky/search/snippet.js b/bucky/search/snippet.js new file mode 100644 index 0000000..cd0657f --- /dev/null +++ b/bucky/search/snippet.js @@ -0,0 +1,88 @@ +var util = require('../util/util') +var STOPWORDS = require('./stopwords') + +function bold_snippet(s, terms) { + return bold_terms(snippet(s, terms), terms) +} +function bold_terms (s, terms) { + s = util.sanitize(s) + terms.forEach( (term) => { + s.replace(new RegExp("\b" + term + "\b", "i"), "<b>" + term + "</b>") + }) +} +function snippet(s, terms) { + s = util.sanitize(s) + var term_re = new RegExp("\b(" + terms.join("|") + ")\b", "i") + var words = s.split(/\s+/) + var snippet = ""; + + // deduper for matching @words indexes, so we don't add a word twice + var index_matches = {} + + // words in the eventual snippet + var words_matched = [] + + // counter for aggregating context after a match + var aggr = 0; + + // amount of context to show, in number of words surrounding a match + var $pad = 4; + + // loop over each of the words in the string + words.some((word, i) => { + // if the word matches... + if (term_re.match(word) && ! STOPWORDS.has(word.toLowerCase())) { + // if we aren't already aggregating, add an ellipsis + if (! $aggr) { + words_matched.push("...") + } + + // look backward $pad words + var idx; + for (var j = -pad; j < 1; j++) { + // create a new index from the offset + idx = i + j; + + // is this a valid index? has it already been encountered? + if (idx < 0) continue; + if (idx > words.length) continue; + if (index_matches[idx]) continue; + + // checks out, save this word + words_matched.push(words[idx]) + + // note the matching index in our deduper + index_matches[idx] = 1; + } + // enter aggregate mode -- add the next (pad) words + aggr = pad; + } + + // have we been told to aggregate? + else if (aggr) { + // save this word + words_matched.push(word) + + // add index to the deduper + index_matches[i] = 1; + + // one less word to aggregate + aggr--; + } + + // keep snippets to a modest length + return words_matched.length > 30; + }) + + // add a trailing ellipsis + words_matched.push("...") + + // create the snippet from the saved context words + snippet = words_matched.join(" ") + + return snippet +} + +module.exports = { + bold_snippet, bold_terms, snippet, +}
\ No newline at end of file diff --git a/bucky/search/stopwords.js b/bucky/search/stopwords.js new file mode 100644 index 0000000..ceffe14 --- /dev/null +++ b/bucky/search/stopwords.js @@ -0,0 +1,18 @@ +module.exports = new Set( + "a about above across adj after again against all almost alone along also " + + "although always am among an and another any anybody anyone anything anywhere " + + "apart are around as aside at away be because been before behind being below " + + "besides between beyond both but by can cannot could did do does doing done " + + "down downwards during each either else enough etc even ever every everybody " + + "everyone except far few for forth from get gets got had hardly has have having " + + "her here herself him himself his how however i if in indeed instead into inward " + + "is it its itself just kept many maybe might mine more most mostly much must " + + "myself near neither next no nobody none nor not nothing nowhere of off often on " + + "only onto or other others ought our ours out outside over own p per please plus " + + "pp quite rather really said seem self selves several shall she should since so " + + "some somebody somewhat still such than that the their theirs them themselves " + + "then there therefore these they this thorough thoroughly those through thus to " + + "together too toward towards under until up upon v very was well were what " + + "whatever when whenever where whether which while who whom whose will with" + + "within without would yet young your yourself s".split(" ") +); |
