From 192abb9db60f95968953b515ce18700c6b2da090 Mon Sep 17 00:00:00 2001 From: Jules Laplace Date: Fri, 8 Dec 2017 02:52:19 +0100 Subject: snippets and middleware --- lib/search/index.js | 110 -------------------------------------------------- lib/search/snippet.js | 103 ---------------------------------------------- 2 files changed, 213 deletions(-) delete mode 100644 lib/search/index.js delete mode 100644 lib/search/snippet.js (limited to 'lib') diff --git a/lib/search/index.js b/lib/search/index.js deleted file mode 100644 index 27f436f..0000000 --- a/lib/search/index.js +++ /dev/null @@ -1,110 +0,0 @@ -var db = require('../db') -var bdb_lib = require('berkeleydb') -var bdb = new bdb_lib.Db() -bdb.open('search.db') - -var wordRegexp = new RegExp("(\W+)"); -var wordBoundaryRegexp = new RegExp("\W"); -function parse_terms (s) { - return s.toLowerCase().split(wordRegexp).filter((term) => { - if (! term.match(wordBoundaryRegexp)) { - return true - } - return false - }) -} -function cmp (a,b){ return (a { - var partz = s.split(" ") - var match = { - thread: s[0], - comment: s[1], - file: s[2], - strength: s[3], - } - }) - return matches -} - -function search (query, start, limit) { - if (!query) return - start = start || 0; - limit = limit || 10; - var scores = {}; - var terms = parse_terms($query); - var i = 0 - var total - var to_display = limit - var threads = {} - var comment_ids = [] - var file_ids = [] - var results = [] - - terms.forEach((term) => { - if (STOPWORDS.has(term)) return; - var results = find_term(term); - if (!results) return; - results.forEach((result) => { - var score = scores[result.thread] = scores[result.thread] || { count: 0, strength: 0 } - score.thread = score.thread || result.thread - score.comment = score.comment || result.comment - score.file = score.file || result.file - score.strength += result.strength - score.count += 1 - }) - }) - total = Object.keys(scores).length - Object.values(scores).sort((a,b) => { - if (b.count !== a.count) { - return cmp(b.count, a.count) - } - return cmp(b.strength * b.count, a.strength * a.count) - }).some((match) => { - if (i++ < start) return false - if (to_display-- === 0) return true - results.push(match) - thread_ids.push(match.thread) - if (match.comment) comment_ids.push(match.comment) - if (match.file) file_ids.push(match.file) - return false - }) - - db.storeQuery(query, total) - - my $files = $self->files_by_id($files_to_get); - my $comments = $self->comments_by_id($comments_to_get); - $self->log_query($query, $total); - return { - start => $start + $limit, - limit => $limit, - total => $total, - results => $results, - threads => $threads, - comments => $comments, - files => $files, - terms => $terms, - }; -} - -module.exports = { search: search } diff --git a/lib/search/snippet.js b/lib/search/snippet.js deleted file mode 100644 index de71911..0000000 --- a/lib/search/snippet.js +++ /dev/null @@ -1,103 +0,0 @@ -var util = require('../util/util') - -function bold_terms (s, terms) { - -} -sub bold_terms - { - my ($self, $string, $terms) = @_; - $string = $self->strip_html($string); - foreach my $term (@$terms) - { - $string =~ s/\b($term)\b/$1<\/b>/gi; - } - return $string; - } -sub bold_snippet - { - my ($self, $string, $terms) = @_; - my $snippet = $self->snippet($string, $terms); - return $self->bold_terms($snippet, $terms); - } -sub snippet - { - my ($self, $string, $terms) = @_; - - # clean up the string we got - $string = $self->strip_html($string); - - # create a regex out of the search terms - my $term_re = join "|", @$terms; - - # take the string to be snippetized and split it into words - my @words = split /\s+/, $string; - - # deduper for matching @words indexes, so we don't add a word twice - my $index_matches = {}; - - # words in the eventual snippet - my @words_matched; - - # the snippet itself - my $snippet = ''; - - # counter for aggregating context after a match - my $aggr = 0; - - # amount of context to show, in number of words surrounding a match - my $pad = 4; - - # loop over each of the words in the string - for (my $i = 0; $i < scalar @words; $i++) - { - # does this word contain a match? - if ($words[$i] =~ /\b($term_re)\b/i && ! $self->is_stopword($1)) - { - # if we aren't already aggregating, add an ellipsis - if (! $aggr) - { - push @words_matched, "..."; - } - # look backward $pad words - for (my $j = -$pad; $j < 1; $j++) - { - # create a new index from the offset - my $idx = $i + $j; - - # is this a valid index? has it already been encountered? - next if $idx < 0; - next if $idx > scalar @words; - next if exists $index_matches->{$i+$j}; - - # checks out, save this word - push @words_matched, $words[$i+$j]; - - # note the matching index in our deduper - $index_matches->{$i+$j} ++; - } - # enter aggregate mode -- add the next $pad words - $aggr = $pad; - } - # have we been told to aggregate? - elsif ($aggr) - { - # save this word - push @words_matched, $words[$i]; - - # add index to the deduper - $index_matches->{$i} ++; - - # one less word to aggregate - $aggr--; - } - # keep snippets to a modest length - last if scalar @words_matched > 30; - } - # add a trailing ellipsis - push @words_matched, "..."; - - # create the snippet from the saved context words - $snippet = join " ", @words_matched; - - return $snippet; - } \ No newline at end of file -- cgit v1.2.3-70-g09d2