diff options
| author | Jules Laplace <julescarbon@gmail.com> | 2017-12-08 02:52:19 +0100 |
|---|---|---|
| committer | Jules Laplace <julescarbon@gmail.com> | 2017-12-08 02:56:41 +0100 |
| commit | 192abb9db60f95968953b515ce18700c6b2da090 (patch) | |
| tree | e0e77c935ad95ca9e641c6a1f19a22556df9b8d2 | |
| parent | bbbd8bbab8737f5067c85376daf79cd8a5a9c4cb (diff) | |
snippets and middleware
| -rw-r--r-- | bucky/app/bucky.js | 2 | ||||
| -rw-r--r-- | bucky/app/router.js | 9 | ||||
| -rw-r--r-- | bucky/db/index.js | 6 | ||||
| -rw-r--r-- | bucky/search/middleware.js | 44 | ||||
| -rw-r--r-- | bucky/search/search.js (renamed from lib/search/index.js) | 47 | ||||
| -rw-r--r-- | bucky/search/snippet.js | 88 | ||||
| -rw-r--r-- | bucky/search/stopwords.js | 18 | ||||
| -rw-r--r-- | lib/search/snippet.js | 103 |
8 files changed, 180 insertions, 137 deletions
diff --git a/bucky/app/bucky.js b/bucky/app/bucky.js index 757592a..d1aad4d 100644 --- a/bucky/app/bucky.js +++ b/bucky/app/bucky.js @@ -157,7 +157,7 @@ var bucky = module.exports = { next() }) }, - + /* MAIL */ ensureMailboxes: function (req, res, next){ diff --git a/bucky/app/router.js b/bucky/app/router.js index c3af565..a87e1ec 100644 --- a/bucky/app/router.js +++ b/bucky/app/router.js @@ -4,6 +4,7 @@ var fortune = require('./fortune') var bucky = require('./bucky') var db = require('./db') var util = require('./util') +var search = require('../search/middleware') module.exports = function(app){ app.all('*', middleware.ensureLocals) @@ -91,6 +92,14 @@ module.exports = function(app){ // delete a comment }) + app.get("/api/search", + middleware.ensureAuthenticated, + search.search, + search.getComments, + search.getFiles, + search.logQuery, + search.success + ) app.get("/api/keyword/:keyword", middleware.ensureAuthenticated, diff --git a/bucky/db/index.js b/bucky/db/index.js index f376308..dcd5f20 100644 --- a/bucky/db/index.js +++ b/bucky/db/index.js @@ -92,6 +92,9 @@ db.getFileCounts = function(ids){ db.getFileSizes = function(ids){ return knex.column('thread').sum('size as size').select().from('files').where('thread', 'in', ids).groupBy('thread') } +db.getFilesById = function(ids){ + return File.where("id", "in", ids) +} db.createFile = function(data){ return new db.File(data).save() } @@ -117,6 +120,9 @@ db.getCommentsForThread = function (id, limit, offset, order){ return comments }) } +db.getCommentsById = function(ids){ + return Comment.where("id", "in", ids) +} db.getCommentCounts = function(ids){ return knex.column('thread').count('* as count').select().from('comments').where('thread', 'in', ids).groupBy('thread') } diff --git a/bucky/search/middleware.js b/bucky/search/middleware.js new file mode 100644 index 0000000..64ddd28 --- /dev/null +++ b/bucky/search/middleware.js @@ -0,0 +1,44 @@ +var search = require('./search') +var snippet = require('./snippet') +var db = require('../db') + +module.exports = { + + search: function (req, res, next) { + var results = search.search(req.body.query, req.body.start, req.body.limit) + res.search = results + next() + }, + + getComments: function (req, res, next){ + var comment_ids = res.search.comment_ids; + if (! comment_ids || ! comment_ids.length) { + return next() + } + db.getCommentsById(comment_ids).then(function(comments){ + res.search.comments = comments + next() + }) + }, + + getFiles: function (req, res, next){ + var file_ids = res.search.file_ids + if (! file_ids || ! file_ids.length) { + return next() + } + db.getFilesById(file_ids).then(function(files){ + res.search.files = files + next() + }) + }, + + logQuery: function(req, res, next) { + // req.search.query, req.search.count + next() + }, + + success: function(req, res, next){ + res.send(res.search) + } + +} diff --git a/lib/search/index.js b/bucky/search/search.js index 27f436f..afa9609 100644 --- a/lib/search/index.js +++ b/bucky/search/search.js @@ -1,4 +1,6 @@ var db = require('../db') +var STOPWORDS = require('./stopwords') + var bdb_lib = require('berkeleydb') var bdb = new bdb_lib.Db() bdb.open('search.db') @@ -15,25 +17,6 @@ function parse_terms (s) { } function cmp (a,b){ return (a<b)?a:(a===b)?0:1 } -var STOPWORDS = new Set( - "a about above across adj after again against all almost alone along also " + - "although always am among an and another any anybody anyone anything anywhere " + - "apart are around as aside at away be because been before behind being below " + - "besides between beyond both but by can cannot could did do does doing done " + - "down downwards during each either else enough etc even ever every everybody " + - "everyone except far few for forth from get gets got had hardly has have having " + - "her here herself him himself his how however i if in indeed instead into inward " + - "is it its itself just kept many maybe might mine more most mostly much must " + - "myself near neither next no nobody none nor not nothing nowhere of off often on " + - "only onto or other others ought our ours out outside over own p per please plus " + - "pp quite rather really said seem self selves several shall she should since so " + - "some somebody somewhat still such than that the their theirs them themselves " + - "then there therefore these they this thorough thoroughly those through thus to " + - "together too toward towards under until up upon v very was well were what " + - "whatever when whenever where whether which while who whom whose will with" + - "within without would yet young your yourself s".split(" ") -); - function find_term(term) { var matches = bdb.get(term).split(",").map((s) => { var partz = s.split(" ") @@ -57,6 +40,7 @@ function search (query, start, limit) { var total var to_display = limit var threads = {} + var thread_ids = [] var comment_ids = [] var file_ids = [] var results = [] @@ -89,21 +73,18 @@ function search (query, start, limit) { if (match.file) file_ids.push(match.file) return false }) - - db.storeQuery(query, total) - - my $files = $self->files_by_id($files_to_get); - my $comments = $self->comments_by_id($comments_to_get); - $self->log_query($query, $total); + return { - start => $start + $limit, - limit => $limit, - total => $total, - results => $results, - threads => $threads, - comments => $comments, - files => $files, - terms => $terms, + query: query, + start: start, + next: start + limit, + limit: limit, + total: total, + results: results, + thread_ids: thread_ids, + comment_ids: comment_ids, + file_ids: $file_ids, + terms: terms, }; } diff --git a/bucky/search/snippet.js b/bucky/search/snippet.js new file mode 100644 index 0000000..cd0657f --- /dev/null +++ b/bucky/search/snippet.js @@ -0,0 +1,88 @@ +var util = require('../util/util') +var STOPWORDS = require('./stopwords') + +function bold_snippet(s, terms) { + return bold_terms(snippet(s, terms), terms) +} +function bold_terms (s, terms) { + s = util.sanitize(s) + terms.forEach( (term) => { + s.replace(new RegExp("\b" + term + "\b", "i"), "<b>" + term + "</b>") + }) +} +function snippet(s, terms) { + s = util.sanitize(s) + var term_re = new RegExp("\b(" + terms.join("|") + ")\b", "i") + var words = s.split(/\s+/) + var snippet = ""; + + // deduper for matching @words indexes, so we don't add a word twice + var index_matches = {} + + // words in the eventual snippet + var words_matched = [] + + // counter for aggregating context after a match + var aggr = 0; + + // amount of context to show, in number of words surrounding a match + var $pad = 4; + + // loop over each of the words in the string + words.some((word, i) => { + // if the word matches... + if (term_re.match(word) && ! STOPWORDS.has(word.toLowerCase())) { + // if we aren't already aggregating, add an ellipsis + if (! $aggr) { + words_matched.push("...") + } + + // look backward $pad words + var idx; + for (var j = -pad; j < 1; j++) { + // create a new index from the offset + idx = i + j; + + // is this a valid index? has it already been encountered? + if (idx < 0) continue; + if (idx > words.length) continue; + if (index_matches[idx]) continue; + + // checks out, save this word + words_matched.push(words[idx]) + + // note the matching index in our deduper + index_matches[idx] = 1; + } + // enter aggregate mode -- add the next (pad) words + aggr = pad; + } + + // have we been told to aggregate? + else if (aggr) { + // save this word + words_matched.push(word) + + // add index to the deduper + index_matches[i] = 1; + + // one less word to aggregate + aggr--; + } + + // keep snippets to a modest length + return words_matched.length > 30; + }) + + // add a trailing ellipsis + words_matched.push("...") + + // create the snippet from the saved context words + snippet = words_matched.join(" ") + + return snippet +} + +module.exports = { + bold_snippet, bold_terms, snippet, +}
\ No newline at end of file diff --git a/bucky/search/stopwords.js b/bucky/search/stopwords.js new file mode 100644 index 0000000..ceffe14 --- /dev/null +++ b/bucky/search/stopwords.js @@ -0,0 +1,18 @@ +module.exports = new Set( + "a about above across adj after again against all almost alone along also " + + "although always am among an and another any anybody anyone anything anywhere " + + "apart are around as aside at away be because been before behind being below " + + "besides between beyond both but by can cannot could did do does doing done " + + "down downwards during each either else enough etc even ever every everybody " + + "everyone except far few for forth from get gets got had hardly has have having " + + "her here herself him himself his how however i if in indeed instead into inward " + + "is it its itself just kept many maybe might mine more most mostly much must " + + "myself near neither next no nobody none nor not nothing nowhere of off often on " + + "only onto or other others ought our ours out outside over own p per please plus " + + "pp quite rather really said seem self selves several shall she should since so " + + "some somebody somewhat still such than that the their theirs them themselves " + + "then there therefore these they this thorough thoroughly those through thus to " + + "together too toward towards under until up upon v very was well were what " + + "whatever when whenever where whether which while who whom whose will with" + + "within without would yet young your yourself s".split(" ") +); diff --git a/lib/search/snippet.js b/lib/search/snippet.js deleted file mode 100644 index de71911..0000000 --- a/lib/search/snippet.js +++ /dev/null @@ -1,103 +0,0 @@ -var util = require('../util/util') - -function bold_terms (s, terms) { - -} -sub bold_terms - { - my ($self, $string, $terms) = @_; - $string = $self->strip_html($string); - foreach my $term (@$terms) - { - $string =~ s/\b($term)\b/<b>$1<\/b>/gi; - } - return $string; - } -sub bold_snippet - { - my ($self, $string, $terms) = @_; - my $snippet = $self->snippet($string, $terms); - return $self->bold_terms($snippet, $terms); - } -sub snippet - { - my ($self, $string, $terms) = @_; - - # clean up the string we got - $string = $self->strip_html($string); - - # create a regex out of the search terms - my $term_re = join "|", @$terms; - - # take the string to be snippetized and split it into words - my @words = split /\s+/, $string; - - # deduper for matching @words indexes, so we don't add a word twice - my $index_matches = {}; - - # words in the eventual snippet - my @words_matched; - - # the snippet itself - my $snippet = ''; - - # counter for aggregating context after a match - my $aggr = 0; - - # amount of context to show, in number of words surrounding a match - my $pad = 4; - - # loop over each of the words in the string - for (my $i = 0; $i < scalar @words; $i++) - { - # does this word contain a match? - if ($words[$i] =~ /\b($term_re)\b/i && ! $self->is_stopword($1)) - { - # if we aren't already aggregating, add an ellipsis - if (! $aggr) - { - push @words_matched, "..."; - } - # look backward $pad words - for (my $j = -$pad; $j < 1; $j++) - { - # create a new index from the offset - my $idx = $i + $j; - - # is this a valid index? has it already been encountered? - next if $idx < 0; - next if $idx > scalar @words; - next if exists $index_matches->{$i+$j}; - - # checks out, save this word - push @words_matched, $words[$i+$j]; - - # note the matching index in our deduper - $index_matches->{$i+$j} ++; - } - # enter aggregate mode -- add the next $pad words - $aggr = $pad; - } - # have we been told to aggregate? - elsif ($aggr) - { - # save this word - push @words_matched, $words[$i]; - - # add index to the deduper - $index_matches->{$i} ++; - - # one less word to aggregate - $aggr--; - } - # keep snippets to a modest length - last if scalar @words_matched > 30; - } - # add a trailing ellipsis - push @words_matched, "..."; - - # create the snippet from the saved context words - $snippet = join " ", @words_matched; - - return $snippet; - }
\ No newline at end of file |
