summaryrefslogtreecommitdiff
path: root/lib/search
diff options
context:
space:
mode:
Diffstat (limited to 'lib/search')
-rw-r--r--lib/search/index.js110
-rw-r--r--lib/search/snippet.js103
2 files changed, 0 insertions, 213 deletions
diff --git a/lib/search/index.js b/lib/search/index.js
deleted file mode 100644
index 27f436f..0000000
--- a/lib/search/index.js
+++ /dev/null
@@ -1,110 +0,0 @@
-var db = require('../db')
-var bdb_lib = require('berkeleydb')
-var bdb = new bdb_lib.Db()
-bdb.open('search.db')
-
-var wordRegexp = new RegExp("(\W+)");
-var wordBoundaryRegexp = new RegExp("\W");
-function parse_terms (s) {
- return s.toLowerCase().split(wordRegexp).filter((term) => {
- if (! term.match(wordBoundaryRegexp)) {
- return true
- }
- return false
- })
-}
-function cmp (a,b){ return (a<b)?a:(a===b)?0:1 }
-
-var STOPWORDS = new Set(
- "a about above across adj after again against all almost alone along also " +
- "although always am among an and another any anybody anyone anything anywhere " +
- "apart are around as aside at away be because been before behind being below " +
- "besides between beyond both but by can cannot could did do does doing done " +
- "down downwards during each either else enough etc even ever every everybody " +
- "everyone except far few for forth from get gets got had hardly has have having " +
- "her here herself him himself his how however i if in indeed instead into inward " +
- "is it its itself just kept many maybe might mine more most mostly much must " +
- "myself near neither next no nobody none nor not nothing nowhere of off often on " +
- "only onto or other others ought our ours out outside over own p per please plus " +
- "pp quite rather really said seem self selves several shall she should since so " +
- "some somebody somewhat still such than that the their theirs them themselves " +
- "then there therefore these they this thorough thoroughly those through thus to " +
- "together too toward towards under until up upon v very was well were what " +
- "whatever when whenever where whether which while who whom whose will with" +
- "within without would yet young your yourself s".split(" ")
-);
-
-function find_term(term) {
- var matches = bdb.get(term).split(",").map((s) => {
- var partz = s.split(" ")
- var match = {
- thread: s[0],
- comment: s[1],
- file: s[2],
- strength: s[3],
- }
- })
- return matches
-}
-
-function search (query, start, limit) {
- if (!query) return
- start = start || 0;
- limit = limit || 10;
- var scores = {};
- var terms = parse_terms($query);
- var i = 0
- var total
- var to_display = limit
- var threads = {}
- var comment_ids = []
- var file_ids = []
- var results = []
-
- terms.forEach((term) => {
- if (STOPWORDS.has(term)) return;
- var results = find_term(term);
- if (!results) return;
- results.forEach((result) => {
- var score = scores[result.thread] = scores[result.thread] || { count: 0, strength: 0 }
- score.thread = score.thread || result.thread
- score.comment = score.comment || result.comment
- score.file = score.file || result.file
- score.strength += result.strength
- score.count += 1
- })
- })
- total = Object.keys(scores).length
- Object.values(scores).sort((a,b) => {
- if (b.count !== a.count) {
- return cmp(b.count, a.count)
- }
- return cmp(b.strength * b.count, a.strength * a.count)
- }).some((match) => {
- if (i++ < start) return false
- if (to_display-- === 0) return true
- results.push(match)
- thread_ids.push(match.thread)
- if (match.comment) comment_ids.push(match.comment)
- if (match.file) file_ids.push(match.file)
- return false
- })
-
- db.storeQuery(query, total)
-
- my $files = $self->files_by_id($files_to_get);
- my $comments = $self->comments_by_id($comments_to_get);
- $self->log_query($query, $total);
- return {
- start => $start + $limit,
- limit => $limit,
- total => $total,
- results => $results,
- threads => $threads,
- comments => $comments,
- files => $files,
- terms => $terms,
- };
-}
-
-module.exports = { search: search }
diff --git a/lib/search/snippet.js b/lib/search/snippet.js
deleted file mode 100644
index de71911..0000000
--- a/lib/search/snippet.js
+++ /dev/null
@@ -1,103 +0,0 @@
-var util = require('../util/util')
-
-function bold_terms (s, terms) {
-
-}
-sub bold_terms
- {
- my ($self, $string, $terms) = @_;
- $string = $self->strip_html($string);
- foreach my $term (@$terms)
- {
- $string =~ s/\b($term)\b/<b>$1<\/b>/gi;
- }
- return $string;
- }
-sub bold_snippet
- {
- my ($self, $string, $terms) = @_;
- my $snippet = $self->snippet($string, $terms);
- return $self->bold_terms($snippet, $terms);
- }
-sub snippet
- {
- my ($self, $string, $terms) = @_;
-
- # clean up the string we got
- $string = $self->strip_html($string);
-
- # create a regex out of the search terms
- my $term_re = join "|", @$terms;
-
- # take the string to be snippetized and split it into words
- my @words = split /\s+/, $string;
-
- # deduper for matching @words indexes, so we don't add a word twice
- my $index_matches = {};
-
- # words in the eventual snippet
- my @words_matched;
-
- # the snippet itself
- my $snippet = '';
-
- # counter for aggregating context after a match
- my $aggr = 0;
-
- # amount of context to show, in number of words surrounding a match
- my $pad = 4;
-
- # loop over each of the words in the string
- for (my $i = 0; $i < scalar @words; $i++)
- {
- # does this word contain a match?
- if ($words[$i] =~ /\b($term_re)\b/i && ! $self->is_stopword($1))
- {
- # if we aren't already aggregating, add an ellipsis
- if (! $aggr)
- {
- push @words_matched, "...";
- }
- # look backward $pad words
- for (my $j = -$pad; $j < 1; $j++)
- {
- # create a new index from the offset
- my $idx = $i + $j;
-
- # is this a valid index? has it already been encountered?
- next if $idx < 0;
- next if $idx > scalar @words;
- next if exists $index_matches->{$i+$j};
-
- # checks out, save this word
- push @words_matched, $words[$i+$j];
-
- # note the matching index in our deduper
- $index_matches->{$i+$j} ++;
- }
- # enter aggregate mode -- add the next $pad words
- $aggr = $pad;
- }
- # have we been told to aggregate?
- elsif ($aggr)
- {
- # save this word
- push @words_matched, $words[$i];
-
- # add index to the deduper
- $index_matches->{$i} ++;
-
- # one less word to aggregate
- $aggr--;
- }
- # keep snippets to a modest length
- last if scalar @words_matched > 30;
- }
- # add a trailing ellipsis
- push @words_matched, "...";
-
- # create the snippet from the saved context words
- $snippet = join " ", @words_matched;
-
- return $snippet;
- } \ No newline at end of file