diff options
Diffstat (limited to 'lib/search')
| -rw-r--r-- | lib/search/index.js | 110 | ||||
| -rw-r--r-- | lib/search/snippet.js | 103 |
2 files changed, 213 insertions, 0 deletions
// ---------------------------------------------------------------------------
// lib/search/index.js
// ---------------------------------------------------------------------------
const db = require('../db');
const bdb_lib = require('berkeleydb');

const bdb = new bdb_lib.Db();
bdb.open('search.db');

// Regex literal on purpose: inside a string literal "\W" collapses to "W",
// which would split queries on the capital letter W instead of on non-word
// characters.
const wordSeparatorRegexp = /\W+/;

/**
 * Split a query string into lowercase word terms.
 *
 * @param {string} s - raw query text
 * @returns {string[]} terms in query order; separators and empty strings
 *   are dropped
 */
function parse_terms (s) {
  return s
    .toLowerCase()
    .split(wordSeparatorRegexp)
    .filter((term) => term.length > 0);
}

/**
 * Three-way comparison usable as (part of) an Array.prototype.sort comparator.
 *
 * @param {*} a
 * @param {*} b
 * @returns {number} -1 when a < b, 0 when a === b, 1 otherwise
 */
function cmp (a, b) {
  if (a < b) return -1;
  if (a === b) return 0;
  return 1;
}

// The concatenation is parenthesised so split() applies to the whole list;
// without the parentheses it binds to the last literal only and the Set ends
// up holding the individual characters of one long string.
const STOPWORDS = new Set((
  "a about above across adj after again against all almost alone along also " +
  "although always am among an and another any anybody anyone anything anywhere " +
  "apart are around as aside at away be because been before behind being below " +
  "besides between beyond both but by can cannot could did do does doing done " +
  "down downwards during each either else enough etc even ever every everybody " +
  "everyone except far few for forth from get gets got had hardly has have having " +
  "her here herself him himself his how however i if in indeed instead into inward " +
  "is it its itself just kept many maybe might mine more most mostly much must " +
  "myself near neither next no nobody none nor not nothing nowhere of off often on " +
  "only onto or other others ought our ours out outside over own p per please plus " +
  "pp quite rather really said seem self selves several shall she should since so " +
  "some somebody somewhat still such than that the their theirs them themselves " +
  "then there therefore these they this thorough thoroughly those through thus to " +
  "together too toward towards under until up upon v very was well were what " +
  "whatever when whenever where whether which while who whom whose will with " +
  "within without would yet young your yourself s"
).split(" "));

/**
 * Look up the index entries stored for a single term.
 *
 * NOTE(review): the record layout (comma-separated records, each a
 * space-separated "thread comment file strength") is inferred from the field
 * order the original code assigned -- confirm against the indexer.
 *
 * @param {string} term - a single lowercase search term
 * @returns {Array<{thread: string, comment: string, file: string, strength: number}>|null}
 *   one entry per stored record, or null when the term has no entry
 */
function find_term (term) {
  const raw = bdb.get(term);
  // A term that is not in the index must not crash the whole search.
  if (raw == null || raw === '') return null;

  return String(raw).split(",").map((record) => {
    const [thread, comment, file, strength] = record.split(" ");
    return {
      thread,
      comment,
      file,
      // Numeric so that summing strengths adds instead of concatenating.
      strength: Number(strength) || 0,
    };
  });
}

/**
 * Rank threads against a free-text query and return one page of matches.
 *
 * Threads are ordered by the number of term hits, then by
 * strength * hit count, both descending.
 *
 * @param {string} query - raw query text
 * @param {number} [start=0] - number of ranked matches to skip
 * @param {number} [limit=10] - maximum number of matches to return
 * @returns {object|undefined} undefined for an empty query, otherwise
 *   `{ start, limit, total, results, thread_ids, comment_ids, file_ids, terms }`
 *   where `start` is the offset of the next page
 */
function search (query, start, limit) {
  if (!query) return;
  const offset = start || 0;
  const pageSize = limit || 10;
  const terms = parse_terms(query);

  // Per-thread accumulator, keyed by thread id.
  const scores = new Map();

  terms.forEach((term) => {
    if (STOPWORDS.has(term)) return;
    const hits = find_term(term);
    if (!hits) return;
    hits.forEach((hit) => {
      let score = scores.get(hit.thread);
      if (!score) {
        score = { count: 0, strength: 0 };
        scores.set(hit.thread, score);
      }
      // The first hit seen for a thread decides which comment/file represents it.
      score.thread = score.thread || hit.thread;
      score.comment = score.comment || hit.comment;
      score.file = score.file || hit.file;
      score.strength += hit.strength;
      score.count += 1;
    });
  });

  const total = scores.size;

  const ranked = [...scores.values()].sort((a, b) => (
    cmp(b.count, a.count) ||
    cmp(b.strength * b.count, a.strength * a.count)
  ));
  const results = ranked.slice(offset, offset + pageSize);

  const thread_ids = results.map((match) => match.thread);
  const comment_ids = results.filter((match) => match.comment).map((match) => match.comment);
  const file_ids = results.filter((match) => match.file).map((match) => match.file);

  db.storeQuery(query, total);

  // TODO(port): the Perl original also returned hydrated records under
  // `threads`, `comments` and `files`:
  //   my $files = $self->files_by_id($files_to_get);
  //   my $comments = $self->comments_by_id($comments_to_get);
  // The JS equivalents of those lookups are not visible from this file, so
  // only the id lists are returned for now.
  return {
    start: offset + pageSize,
    limit: pageSize,
    total,
    results,
    thread_ids,
    comment_ids,
    file_ids,
    terms,
  };
}

module.exports = { search: search };

// ---------------------------------------------------------------------------
// lib/search/snippet.js
// (separate file in the original commit -- split here when restoring files)
// ---------------------------------------------------------------------------
const util = require('../util/util');

/**
 * Wrap each occurrence of the search terms in <b> tags.
 *
 * TODO(port): not implemented yet; currently returns undefined. See the Perl
 * reference implementation below.
 *
 * @param {string} s - text to highlight
 * @param {string[]} terms - search terms
 */
function bold_terms (s, terms) {

}

/*
 * TODO(port): Perl reference implementation, kept verbatim until it is ported.
 * It relies on $self->strip_html and $self->is_stopword, whose JavaScript
 * equivalents are not visible from this file.

sub bold_terms
  {
    my ($self, $string, $terms) = @_;
    $string = $self->strip_html($string);
    foreach my $term (@$terms)
      {
        $string =~ s/\b($term)\b/<b>$1<\/b>/gi;
      }
    return $string;
  }
sub bold_snippet
  {
    my ($self, $string, $terms) = @_;
    my $snippet = $self->snippet($string, $terms);
    return $self->bold_terms($snippet, $terms);
  }
sub snippet
  {
    my ($self, $string, $terms) = @_;

    # clean up the string we got
    $string = $self->strip_html($string);

    # create a regex out of the search terms
    my $term_re = join "|", @$terms;

    # take the string to be snippetized and split it into words
    my @words = split /\s+/, $string;

    # deduper for matching @words indexes, so we don't add a word twice
    my $index_matches = {};

    # words in the eventual snippet
    my @words_matched;

    # the snippet itself
    my $snippet = '';

    # counter for aggregating context after a match
    my $aggr = 0;

    # amount of context to show, in number of words surrounding a match
    my $pad = 4;

    # loop over each of the words in the string
    for (my $i = 0; $i < scalar @words; $i++)
      {
        # does this word contain a match?
        if ($words[$i] =~ /\b($term_re)\b/i && ! $self->is_stopword($1))
          {
            # if we aren't already aggregating, add an ellipsis
            if (! $aggr)
              {
                push @words_matched, "...";
              }
            # look backward $pad words
            for (my $j = -$pad; $j < 1; $j++)
              {
                # create a new index from the offset
                my $idx = $i + $j;

                # is this a valid index? has it already been encountered?
                next if $idx < 0;
                next if $idx > scalar @words;
                next if exists $index_matches->{$i+$j};

                # checks out, save this word
                push @words_matched, $words[$i+$j];

                # note the matching index in our deduper
                $index_matches->{$i+$j} ++;
              }
            # enter aggregate mode -- add the next $pad words
            $aggr = $pad;
          }
        # have we been told to aggregate?
        elsif ($aggr)
          {
            # save this word
            push @words_matched, $words[$i];

            # add index to the deduper
            $index_matches->{$i} ++;

            # one less word to aggregate
            $aggr--;
          }
        # keep snippets to a modest length
        last if scalar @words_matched > 30;
      }
    # add a trailing ellipsis
    push @words_matched, "...";

    # create the snippet from the saved context words
    $snippet = join " ", @words_matched;

    return $snippet;
  }
*/
\ No newline at end of file |
