2 files changed, 213 insertions, 0 deletions
diff --git a/lib/search/index.js b/lib/search/index.js
new file mode 100644
index 0000000..27f436f
--- /dev/null
+++ b/lib/search/index.js
@@ -0,0 +1,110 @@
+var db = require('../db')
+var bdb_lib = require('berkeleydb')
+var bdb = new bdb_lib.Db() // single module-wide database handle; find_term() reads from it
+bdb.open('search.db') // opened at require time; NOTE(review): relative path, presumably resolved against the process cwd — confirm
+
+var wordRegexp = new RegExp("(\W+)");
+var wordBoundaryRegexp = new RegExp("\W");
+function parse_terms (s) {
+  return s.toLowerCase().split(wordRegexp).filter((term) => {
+	  if (! term.match(wordBoundaryRegexp)) {
+	    return true
+	  }
+	  return false
+	})
+}
+function cmp (a,b){ return (a<b)?-1:(a===b)?0:1 } // three-way comparator for sort(): -1 if a<b, 0 if equal, else 1
+
+var STOPWORDS = new Set(
+  "a about above across adj after again against all almost alone along also " +
+  "although always am among an and another any anybody anyone anything anywhere " +
+  "apart are around as aside at away be because been before behind being below " +
+  "besides between beyond both but by can cannot could did do does doing done " +
+  "down downwards during each either else enough etc even ever every everybody " +
+  "everyone except far few for forth from get gets got had hardly has have having " +
+  "her here herself him himself his how however i if in indeed instead into inward " +
+  "is it its itself just kept many maybe might mine more most mostly much must " +
+  "myself near neither next no nobody none nor not nothing nowhere of off often on " +
+  "only onto or other others ought our ours out outside over own p per please plus " +
+  "pp quite rather really said seem self selves several shall she should since so " +
+  "some somebody somewhat still such than that the their theirs them themselves " +
+  "then there therefore these they this thorough thoroughly those through thus to " +
+  "together too toward towards under until up upon v very was well were what " +
+  "whatever when whenever where whether which while who whom whose will with" +
+  "within without would yet young your yourself s".split(" ")
+);
+
+function find_term(term) {
+  var matches = bdb.get(term).split(",").map((s) => {
+    var partz = s.split(" ")
+    var match = {
+      thread: s[0],
+      comment: s[1],
+      file: s[2],
+      strength: s[3],
+    }
+  })
+  return matches
+}
+
+function search (query, start, limit) {
+  if (!query) return
+	start = start || 0;
+	limit = limit || 10;
+	var scores = {};
+	var terms = parse_terms($query);
+  var i = 0
+  var total
+  var to_display = limit
+  var threads = {}
+  var comment_ids = []
+  var file_ids = []
+  var results = []
+
+  terms.forEach((term) => {
+    if (STOPWORDS.has(term)) return;
+    var results = find_term(term);
+    if (!results) return;
+    results.forEach((result) => {
+      var score = scores[result.thread] = scores[result.thread] || { count: 0, strength: 0 }
+      score.thread = score.thread || result.thread
+      score.comment = score.comment || result.comment
+      score.file = score.file || result.file
+      score.strength += result.strength
+      score.count += 1
+    })
+  })
+  total = Object.keys(scores).length
+  Object.values(scores).sort((a,b) => {
+    if (b.count !== a.count) {
+      return cmp(b.count, a.count)
+    }
+    return cmp(b.strength * b.count, a.strength * a.count)
+  }).some((match) => {
+    if (i++ < start) return false
+    if (to_display-- === 0) return true
+    results.push(match)
+    thread_ids.push(match.thread)
+    if (match.comment) comment_ids.push(match.comment)
+    if (match.file) file_ids.push(match.file)
+    return false
+  })
+  
+  db.storeQuery(query, total)
+  
+	my $files = $self->files_by_id($files_to_get);
+	my $comments = $self->comments_by_id($comments_to_get);
+	$self->log_query($query, $total);
+  return {
+		start => $start + $limit,
+		limit => $limit,
+    total => $total,
+    results => $results,
+		threads => $threads,
+		comments => $comments,
+		files => $files,
+		terms => $terms,
+  };
+}
+
+module.exports = { search: search } // search() is the only export; the other functions in this file stay module-private
diff --git a/lib/search/snippet.js b/lib/search/snippet.js
new file mode 100644
index 0000000..de71911
--- /dev/null
+++ b/lib/search/snippet.js
@@ -0,0 +1,103 @@
+var util = require('../util/util')
+
+function bold_terms (s, terms) {
+  // NOTE(review): empty stub, returns undefined. TODO: port the Perl `sub bold_terms` that follows it in this file.
+}
+sub bold_terms	# NOTE(review): this is Perl, not JavaScript; presumably kept as the reference for the port — the .js module cannot parse while it remains
+	{
+	my ($self, $string, $terms) = @_;	# $terms is an array ref of search terms
+	$string = $self->strip_html($string);
+	foreach my $term (@$terms)	# substitutions accumulate, so a later term such as "b" also matches inside tags inserted earlier
+		{
+		$string =~ s/\b($term)\b/<b>$1<\/b>/gi;	# wrap whole-word, case-insensitive matches in <b>; $term is interpolated as a regex, unescaped
+		}
+	return $string;
+	}
+sub bold_snippet	# Perl (unported): build a snippet for $string, then bold the search terms inside it
+	{
+	my ($self, $string, $terms) = @_;
+	my $snippet = $self->snippet($string, $terms);
+	return $self->bold_terms($snippet, $terms);	# bold_terms calls strip_html again; snippet() already called it once
+	}
+sub snippet	# Perl (unported): collect the words around each term match in $string, with "..." between runs
+	{
+	my ($self, $string, $terms) = @_;	# $terms is an array ref of search terms
+
+	# clean up the string we got
+	$string = $self->strip_html($string);
+
+	# create a regex alternation out of the search terms (joined as-is, not escaped)
+	my $term_re = join "|", @$terms;
+
+	# take the string to be snippetized and split it into words
+	my @words = split /\s+/, $string;
+
+	# deduper for matching @words indexes, so we don't add a word twice
+	my $index_matches = {};
+
+	# words in the eventual snippet
+	my @words_matched;
+
+	# the snippet itself
+	my $snippet = '';
+
+	# counter for aggregating context after a match
+	my $aggr = 0;
+
+	# amount of context to show, in number of words surrounding a match
+	my $pad = 4;
+
+	# loop over each of the words in the string
+	for (my $i = 0; $i < scalar @words; $i++)
+		{
+		# does this word contain a match that is not a stopword? ($1 is the matched term)
+		if ($words[$i] =~ /\b($term_re)\b/i && ! $self->is_stopword($1))
+			{
+			# if we aren't already aggregating, add an ellipsis
+			if (! $aggr)
+				{
+				push @words_matched, "...";
+				}
+			# look backward $pad words, up to and including the matching word itself ($j ends at 0)
+			for (my $j = -$pad; $j < 1; $j++)
+				{
+				# create a new index from the offset
+				my $idx = $i + $j;
+
+				# is this a valid index? has it already been encountered?
+				next if $idx < 0;
+				next if $idx > scalar @words;	# NOTE(review): cannot trigger — $j <= 0 keeps $idx <= $i, which is below scalar @words
+				next if exists $index_matches->{$i+$j};
+
+				# checks out, save this word
+				push @words_matched, $words[$i+$j];
+
+				# note the matching index in our deduper
+				$index_matches->{$i+$j} ++;
+				}
+			# enter aggregate mode -- add the next $pad words
+			$aggr = $pad;
+			}
+		# have we been told to aggregate? (words added here are recorded in the deduper but not checked against it)
+		elsif ($aggr)
+			{
+			# save this word
+			push @words_matched, $words[$i];
+
+			# add index to the deduper
+			$index_matches->{$i} ++;
+
+			# one less word to aggregate
+			$aggr--;
+			}
+		# keep snippets to a modest length (the count includes the "..." markers)
+		last if scalar @words_matched > 30;
+		}
+	# add a trailing ellipsis (so a string with no matches yields just "...")
+	push @words_matched, "...";
+
+	# create the snippet from the saved context words
+	$snippet = join " ", @words_matched;
+
+	return $snippet;
+	}
+\ No newline at end of file