From 192abb9db60f95968953b515ce18700c6b2da090 Mon Sep 17 00:00:00 2001 From: Jules Laplace Date: Fri, 8 Dec 2017 02:52:19 +0100 Subject: snippets and middleware --- bucky/app/bucky.js | 2 +- bucky/app/router.js | 9 ++++ bucky/db/index.js | 6 +++ bucky/search/middleware.js | 44 ++++++++++++++++++ bucky/search/search.js | 91 +++++++++++++++++++++++++++++++++++++ bucky/search/snippet.js | 88 ++++++++++++++++++++++++++++++++++++ bucky/search/stopwords.js | 18 ++++++++ lib/search/index.js | 110 --------------------------------------------- lib/search/snippet.js | 103 ------------------------------------------ 9 files changed, 257 insertions(+), 214 deletions(-) create mode 100644 bucky/search/middleware.js create mode 100644 bucky/search/search.js create mode 100644 bucky/search/snippet.js create mode 100644 bucky/search/stopwords.js delete mode 100644 lib/search/index.js delete mode 100644 lib/search/snippet.js diff --git a/bucky/app/bucky.js b/bucky/app/bucky.js index 757592a..d1aad4d 100644 --- a/bucky/app/bucky.js +++ b/bucky/app/bucky.js @@ -157,7 +157,7 @@ var bucky = module.exports = { next() }) }, - + /* MAIL */ ensureMailboxes: function (req, res, next){ diff --git a/bucky/app/router.js b/bucky/app/router.js index c3af565..a87e1ec 100644 --- a/bucky/app/router.js +++ b/bucky/app/router.js @@ -4,6 +4,7 @@ var fortune = require('./fortune') var bucky = require('./bucky') var db = require('./db') var util = require('./util') +var search = require('../search/middleware') module.exports = function(app){ app.all('*', middleware.ensureLocals) @@ -91,6 +92,14 @@ module.exports = function(app){ // delete a comment }) + app.get("/api/search", + middleware.ensureAuthenticated, + search.search, + search.getComments, + search.getFiles, + search.logQuery, + search.success + ) app.get("/api/keyword/:keyword", middleware.ensureAuthenticated, diff --git a/bucky/db/index.js b/bucky/db/index.js index f376308..dcd5f20 100644 --- a/bucky/db/index.js +++ b/bucky/db/index.js @@ -92,6 
/**
 * Fetch the file rows whose `id` is in `ids`.
 *
 * The search middleware chains `.then()` on the return value, so the query
 * has to be executed here; `where()` on its own only builds the query.
 * NOTE(review): assumes `File` is a Bookshelf model, as `new db.File(data).save()`
 * in this module suggests -- confirm.
 *
 * @param {Array} ids - file ids to look up
 * @returns {Promise} resolves with the matching files
 */
db.getFilesById = function(ids){
  return File.where("id", "in", ids).fetchAll()
}

/**
 * Fetch the comment rows whose `id` is in `ids`.
 *
 * Executes the query for the same reason as `db.getFilesById`.
 * NOTE(review): assumes `Comment` is a Bookshelf model -- confirm.
 *
 * @param {Array} ids - comment ids to look up
 * @returns {Promise} resolves with the matching comments
 */
db.getCommentsById = function(ids){
  return Comment.where("id", "in", ids).fetchAll()
}
/**
 * Break a query string into lowercase word terms.
 *
 * @param {string} s - raw query text
 * @returns {string[]} the runs of word characters in `s`, lowercased, in order
 */
function parse_terms (s) {
  // A regex literal is required here: inside a string literal "\W" is just
  // "W", so `new RegExp("(\W+)")` matched the letter W rather than non-word
  // runs. Splitting without a capture group keeps the separators out of the
  // result, and the filter drops the empty strings that leading or trailing
  // punctuation leaves behind.
  return String(s || "").toLowerCase().split(/\W+/).filter(Boolean)
}

/**
 * Rank the threads that match a query and return one page of them.
 *
 * Each non-stopword term is looked up with `find_term`; hits are grouped by
 * thread, and threads are ordered by how many hits they collected, then by
 * accumulated strength.
 *
 * @param {string} query - raw search text; an empty query yields an empty result
 * @param {number|string} [start=0] - number of ranked threads to skip
 * @param {number|string} [limit=10] - maximum number of threads to return
 * @returns {{query: string, start: number, next: number, limit: number,
 *            total: number, results: Object[], thread_ids: Array,
 *            comment_ids: Array, file_ids: Array, terms: string[]}}
 */
function search (query, start, limit) {
  // Paging values come from the request as strings; coerce them so that
  // `start + limit` is addition rather than concatenation.
  start = Math.max(parseInt(start, 10) || 0, 0)
  limit = parseInt(limit, 10)
  if (!(limit > 0)) limit = 10

  var text = query ? String(query) : ""
  var terms = parse_terms(text)
  var scores = {}

  terms.forEach((term) => {
    if (STOPWORDS.has(term)) return
    var matches = find_term(term)
    if (!matches) return
    matches.forEach((match) => {
      var score = scores[match.thread] = scores[match.thread] || { count: 0, strength: 0 }
      // the first comment/file seen for a thread is the one reported for it
      score.thread = score.thread || match.thread
      score.comment = score.comment || match.comment
      score.file = score.file || match.file
      // strengths may be strings parsed out of the index, so add numerically
      score.strength += Number(match.strength) || 0
      score.count += 1
    })
  })

  var ranked = Object.values(scores).sort((a, b) => {
    if (b.count !== a.count) return b.count - a.count
    // counts are equal here, so comparing strength alone gives the same
    // order as comparing strength * count
    return b.strength - a.strength
  })
  var results = ranked.slice(start, start + limit)

  return {
    query: text,
    start: start,
    next: start + limit,
    limit: limit,
    total: ranked.length,
    results: results,
    thread_ids: results.map((match) => match.thread),
    comment_ids: results.filter((match) => match.comment).map((match) => match.comment),
    file_ids: results.filter((match) => match.file).map((match) => match.file),
    terms: terms,
  }
}
+ if (idx < 0) continue; + if (idx > words.length) continue; + if (index_matches[idx]) continue; + + // checks out, save this word + words_matched.push(words[idx]) + + // note the matching index in our deduper + index_matches[idx] = 1; + } + // enter aggregate mode -- add the next (pad) words + aggr = pad; + } + + // have we been told to aggregate? + else if (aggr) { + // save this word + words_matched.push(word) + + // add index to the deduper + index_matches[i] = 1; + + // one less word to aggregate + aggr--; + } + + // keep snippets to a modest length + return words_matched.length > 30; + }) + + // add a trailing ellipsis + words_matched.push("...") + + // create the snippet from the saved context words + snippet = words_matched.join(" ") + + return snippet +} + +module.exports = { + bold_snippet, bold_terms, snippet, +} \ No newline at end of file diff --git a/bucky/search/stopwords.js b/bucky/search/stopwords.js new file mode 100644 index 0000000..ceffe14 --- /dev/null +++ b/bucky/search/stopwords.js @@ -0,0 +1,18 @@ +module.exports = new Set( + "a about above across adj after again against all almost alone along also " + + "although always am among an and another any anybody anyone anything anywhere " + + "apart are around as aside at away be because been before behind being below " + + "besides between beyond both but by can cannot could did do does doing done " + + "down downwards during each either else enough etc even ever every everybody " + + "everyone except far few for forth from get gets got had hardly has have having " + + "her here herself him himself his how however i if in indeed instead into inward " + + "is it its itself just kept many maybe might mine more most mostly much must " + + "myself near neither next no nobody none nor not nothing nowhere of off often on " + + "only onto or other others ought our ours out outside over own p per please plus " + + "pp quite rather really said seem self selves several shall she should since so " + + 
"some somebody somewhat still such than that the their theirs them themselves " + + "then there therefore these they this thorough thoroughly those through thus to " + + "together too toward towards under until up upon v very was well were what " + + "whatever when whenever where whether which while who whom whose will with" + + "within without would yet young your yourself s".split(" ") +); diff --git a/lib/search/index.js b/lib/search/index.js deleted file mode 100644 index 27f436f..0000000 --- a/lib/search/index.js +++ /dev/null @@ -1,110 +0,0 @@ -var db = require('../db') -var bdb_lib = require('berkeleydb') -var bdb = new bdb_lib.Db() -bdb.open('search.db') - -var wordRegexp = new RegExp("(\W+)"); -var wordBoundaryRegexp = new RegExp("\W"); -function parse_terms (s) { - return s.toLowerCase().split(wordRegexp).filter((term) => { - if (! term.match(wordBoundaryRegexp)) { - return true - } - return false - }) -} -function cmp (a,b){ return (a { - var partz = s.split(" ") - var match = { - thread: s[0], - comment: s[1], - file: s[2], - strength: s[3], - } - }) - return matches -} - -function search (query, start, limit) { - if (!query) return - start = start || 0; - limit = limit || 10; - var scores = {}; - var terms = parse_terms($query); - var i = 0 - var total - var to_display = limit - var threads = {} - var comment_ids = [] - var file_ids = [] - var results = [] - - terms.forEach((term) => { - if (STOPWORDS.has(term)) return; - var results = find_term(term); - if (!results) return; - results.forEach((result) => { - var score = scores[result.thread] = scores[result.thread] || { count: 0, strength: 0 } - score.thread = score.thread || result.thread - score.comment = score.comment || result.comment - score.file = score.file || result.file - score.strength += result.strength - score.count += 1 - }) - }) - total = Object.keys(scores).length - Object.values(scores).sort((a,b) => { - if (b.count !== a.count) { - return cmp(b.count, a.count) - } - return 
cmp(b.strength * b.count, a.strength * a.count) - }).some((match) => { - if (i++ < start) return false - if (to_display-- === 0) return true - results.push(match) - thread_ids.push(match.thread) - if (match.comment) comment_ids.push(match.comment) - if (match.file) file_ids.push(match.file) - return false - }) - - db.storeQuery(query, total) - - my $files = $self->files_by_id($files_to_get); - my $comments = $self->comments_by_id($comments_to_get); - $self->log_query($query, $total); - return { - start => $start + $limit, - limit => $limit, - total => $total, - results => $results, - threads => $threads, - comments => $comments, - files => $files, - terms => $terms, - }; -} - -module.exports = { search: search } diff --git a/lib/search/snippet.js b/lib/search/snippet.js deleted file mode 100644 index de71911..0000000 --- a/lib/search/snippet.js +++ /dev/null @@ -1,103 +0,0 @@ -var util = require('../util/util') - -function bold_terms (s, terms) { - -} -sub bold_terms - { - my ($self, $string, $terms) = @_; - $string = $self->strip_html($string); - foreach my $term (@$terms) - { - $string =~ s/\b($term)\b/$1<\/b>/gi; - } - return $string; - } -sub bold_snippet - { - my ($self, $string, $terms) = @_; - my $snippet = $self->snippet($string, $terms); - return $self->bold_terms($snippet, $terms); - } -sub snippet - { - my ($self, $string, $terms) = @_; - - # clean up the string we got - $string = $self->strip_html($string); - - # create a regex out of the search terms - my $term_re = join "|", @$terms; - - # take the string to be snippetized and split it into words - my @words = split /\s+/, $string; - - # deduper for matching @words indexes, so we don't add a word twice - my $index_matches = {}; - - # words in the eventual snippet - my @words_matched; - - # the snippet itself - my $snippet = ''; - - # counter for aggregating context after a match - my $aggr = 0; - - # amount of context to show, in number of words surrounding a match - my $pad = 4; - - # loop over each 
of the words in the string - for (my $i = 0; $i < scalar @words; $i++) - { - # does this word contain a match? - if ($words[$i] =~ /\b($term_re)\b/i && ! $self->is_stopword($1)) - { - # if we aren't already aggregating, add an ellipsis - if (! $aggr) - { - push @words_matched, "..."; - } - # look backward $pad words - for (my $j = -$pad; $j < 1; $j++) - { - # create a new index from the offset - my $idx = $i + $j; - - # is this a valid index? has it already been encountered? - next if $idx < 0; - next if $idx > scalar @words; - next if exists $index_matches->{$i+$j}; - - # checks out, save this word - push @words_matched, $words[$i+$j]; - - # note the matching index in our deduper - $index_matches->{$i+$j} ++; - } - # enter aggregate mode -- add the next $pad words - $aggr = $pad; - } - # have we been told to aggregate? - elsif ($aggr) - { - # save this word - push @words_matched, $words[$i]; - - # add index to the deduper - $index_matches->{$i} ++; - - # one less word to aggregate - $aggr--; - } - # keep snippets to a modest length - last if scalar @words_matched > 30; - } - # add a trailing ellipsis - push @words_matched, "..."; - - # create the snippet from the saved context words - $snippet = join " ", @words_matched; - - return $snippet; - } \ No newline at end of file -- cgit v1.2.3-70-g09d2