From 192abb9db60f95968953b515ce18700c6b2da090 Mon Sep 17 00:00:00 2001
From: Jules Laplace <julescarbon@gmail.com>
Date: Fri, 8 Dec 2017 02:52:19 +0100
Subject: snippets and middleware

---
 bucky/app/bucky.js         |   2 +-
 bucky/app/router.js        |   9 ++++
 bucky/db/index.js          |   6 +++
 bucky/search/middleware.js |  44 ++++++++++++++++++
 bucky/search/search.js     |  91 +++++++++++++++++++++++++++++++++++++
 bucky/search/snippet.js    |  88 ++++++++++++++++++++++++++++++++++++
 bucky/search/stopwords.js  |  18 ++++++++
 lib/search/index.js        | 110 ---------------------------------------------
 lib/search/snippet.js      | 103 ------------------------------------------
 9 files changed, 257 insertions(+), 214 deletions(-)
 create mode 100644 bucky/search/middleware.js
 create mode 100644 bucky/search/search.js
 create mode 100644 bucky/search/snippet.js
 create mode 100644 bucky/search/stopwords.js
 delete mode 100644 lib/search/index.js
 delete mode 100644 lib/search/snippet.js

diff --git a/bucky/app/bucky.js b/bucky/app/bucky.js
index 757592a..d1aad4d 100644
--- a/bucky/app/bucky.js
+++ b/bucky/app/bucky.js
@@ -157,7 +157,7 @@ var bucky = module.exports = {
       next()
     })
   },
-  
+ 
   /* MAIL */
 
   ensureMailboxes: function (req, res, next){
diff --git a/bucky/app/router.js b/bucky/app/router.js
index c3af565..a87e1ec 100644
--- a/bucky/app/router.js
+++ b/bucky/app/router.js
@@ -4,6 +4,7 @@ var fortune = require('./fortune')
 var bucky = require('./bucky')
 var db = require('./db')
 var util = require('./util')
+var search = require('../search/middleware')
 
 module.exports = function(app){
 	app.all('*', middleware.ensureLocals)
@@ -91,6 +92,14 @@ module.exports = function(app){
       // delete a comment
   })
   
+  app.get("/api/search",
+    middleware.ensureAuthenticated,
+    search.search,
+    search.getComments,
+    search.getFiles,
+    search.logQuery,
+    search.success
+  )
 
   app.get("/api/keyword/:keyword",
     middleware.ensureAuthenticated,
diff --git a/bucky/db/index.js b/bucky/db/index.js
index f376308..dcd5f20 100644
--- a/bucky/db/index.js
+++ b/bucky/db/index.js
@@ -92,6 +92,9 @@ db.getFileCounts = function(ids){
 db.getFileSizes = function(ids){
   return knex.column('thread').sum('size as size').select().from('files').where('thread', 'in', ids).groupBy('thread')
 }
+db.getFilesById = function(ids){ // look up files by a list of ids; callers chain .then() on the result
+  return File.where("id", "in", ids).fetchAll() // where() only builds the query; fetchAll() runs it and returns a promise (assumes File is a Bookshelf model -- TODO confirm)
+}
 db.createFile = function(data){
   return new db.File(data).save()
 }
@@ -117,6 +120,9 @@ db.getCommentsForThread = function (id, limit, offset, order){
     return comments
   })
 }
+db.getCommentsById = function(ids){ // look up comments by a list of ids; callers chain .then() on the result
+  return Comment.where("id", "in", ids).fetchAll() // where() only builds the query; fetchAll() runs it and returns a promise (assumes Comment is a Bookshelf model -- TODO confirm)
+}
 db.getCommentCounts = function(ids){
   return knex.column('thread').count('* as count').select().from('comments').where('thread', 'in', ids).groupBy('thread')
 }
diff --git a/bucky/search/middleware.js b/bucky/search/middleware.js
new file mode 100644
index 0000000..64ddd28
--- /dev/null
+++ b/bucky/search/middleware.js
@@ -0,0 +1,44 @@
+var search = require('./search')
+var snippet = require('./snippet')
+var db = require('../db')
+
+// Express middleware steps for GET /api/search; each step reads or extends res.search.
+module.exports = {
+  // Run the index search. The route is a GET, so parameters are read from req.query (req.body is a fallback).
+  search: function (req, res, next) {
+    var params = Object.assign({}, req.body, req.query)
+    // search.search() returns undefined for an empty query; store {} so the later steps can still read res.search
+    res.search = search.search(params.query, parseInt(params.start, 10), parseInt(params.limit, 10)) || {}
+    next()
+  },
+
+  // Load the comments named by res.search.comment_ids, if any, into res.search.comments.
+  getComments: function (req, res, next){
+    var comment_ids = res.search.comment_ids
+    if (! comment_ids || ! comment_ids.length) return next()
+    db.getCommentsById(comment_ids).then(function(comments){
+      res.search.comments = comments
+      next()
+    }, next) // a failed lookup is passed to next(err) instead of leaving the request hanging
+  },
+
+  // Load the files named by res.search.file_ids, if any, into res.search.files.
+  getFiles: function (req, res, next){
+    var file_ids = res.search.file_ids
+    if (! file_ids || ! file_ids.length) return next()
+    db.getFilesById(file_ids).then(function(files){
+      res.search.files = files
+      next()
+    }, next) // a failed lookup is passed to next(err) instead of leaving the request hanging
+  },
+
+  // Placeholder: nothing is logged yet (the query and its total are available on res.search).
+  logQuery: function(req, res, next) {
+    next()
+  },
+
+  // Send the accumulated search data as the response.
+  success: function(req, res, next){
+    res.send(res.search)
+  }
+}
diff --git a/bucky/search/search.js b/bucky/search/search.js
new file mode 100644
index 0000000..afa9609
--- /dev/null
+++ b/bucky/search/search.js
@@ -0,0 +1,91 @@
+var db = require('../db')
+var STOPWORDS = require('./stopwords')
+
+var bdb_lib = require('berkeleydb')
+var bdb = new bdb_lib.Db()
+bdb.open('search.db')
+
+// Regex literals are used because in a string literal "\W" is just the letter W.
+var wordRegexp = /(\W+)/;
+var wordBoundaryRegexp = /\W/;
+// Lowercase `s` and return its words; the separators kept by the capturing split and empty strings are dropped.
+function parse_terms (s) {
+  return s.toLowerCase().split(wordRegexp).filter((term) => {
+    // keep only non-empty pieces that contain no non-word character
+    return term.length > 0 && ! wordBoundaryRegexp.test(term)
+  })
+}
+function cmp (a,b){ return (a<b)?-1:(a===b)?0:1 } // three-way compare for Array#sort: -1 if a<b, 0 if equal, 1 otherwise
+
+// Look up one term in the index and return its matches as { thread, comment, file, strength } objects.
+// Assumes each record is a comma-separated list of "thread comment file strength" tuples -- TODO confirm.
+function find_term(term) {
+  var raw = bdb.get(term)
+  if (! raw) return []   // nothing stored for this term: no matches
+  return String(raw).split(",").map((s) => {
+    var partz = s.split(" ")
+    // read the fields from the split tuple (indexing `s` itself would yield single characters);
+    // strength is converted to a number so that scores add up instead of concatenating
+    return { thread: partz[0], comment: partz[1], file: partz[2], strength: Number(partz[3]) || 0 }
+  })
+}
+
+/**
+ * Rank threads against the words of `query` and return one page of results.
+ *
+ * Threads matching more query terms come first; ties are broken by
+ * accumulated strength times match count.
+ *
+ * @param {string} query - raw search string
+ * @param {number|string} [start=0] - offset of the first result to return
+ * @param {number|string} [limit=10] - maximum number of results to return
+ * @returns {Object|undefined} the page of results plus the thread, comment and
+ *   file ids they reference, or undefined when `query` is empty
+ */
+function search (query, start, limit) {
+  if (!query) return
+  // paging values may arrive as query-string text, so coerce before doing arithmetic
+  start = Math.max(parseInt(start, 10) || 0, 0)
+  limit = Math.max(parseInt(limit, 10) || 10, 1)
+  var scores = {}
+  var terms = parse_terms(query)
+
+  // accumulate one score entry per thread across all non-stopword terms
+  terms.forEach((term) => {
+    if (STOPWORDS.has(term)) return
+    var matches = find_term(term)
+    if (!matches) return
+    matches.forEach((match) => {
+      var score = scores[match.thread] = scores[match.thread] || { count: 0, strength: 0 }
+      // the first comment/file seen for a thread is the one reported
+      score.thread = score.thread || match.thread
+      score.comment = score.comment || match.comment
+      score.file = score.file || match.file
+      score.strength += match.strength
+      score.count += 1
+    })
+  })
+
+  // most matched terms first, then highest strength * count
+  var ranked = Object.values(scores).sort((a,b) => {
+    if (b.count !== a.count) return cmp(b.count, a.count)
+    return cmp(b.strength * b.count, a.strength * a.count)
+  })
+  // cut out the requested page
+  var results = ranked.slice(start, start + limit)
+
+  return {
+    query: query,
+    start: start,
+    next: start + limit,
+    limit: limit,
+    total: ranked.length,
+    results: results,
+    thread_ids: results.map((match) => match.thread),
+    comment_ids: results.filter((match) => match.comment).map((match) => match.comment),
+    file_ids: results.filter((match) => match.file).map((match) => match.file),
+    terms: terms,
+  }
+}
+
+module.exports = { search: search }
diff --git a/bucky/search/snippet.js b/bucky/search/snippet.js
new file mode 100644
index 0000000..cd0657f
--- /dev/null
+++ b/bucky/search/snippet.js
@@ -0,0 +1,88 @@
+var util = require('../util/util')
+var STOPWORDS = require('./stopwords')
+
+function bold_snippet(s, terms) { // pass `s` through snippet(), then pass that result through bold_terms(), both with `terms`
+  return bold_terms(snippet(s, terms), terms)
+}
+// Sanitize `s`, then wrap each whole-word, case-insensitive occurrence of every term in <b> tags.
+function bold_terms (s, terms) {
+  // terms are escaped to match literally; "\\b" is a regex word boundary ("\b" in a string is a backspace)
+  var escaped = terms.filter(Boolean).map((term) => term.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"))
+  return escaped.reduce((out, term) => out.replace(new RegExp("\\b(" + term + ")\\b", "gi"), "<b>$1</b>"), util.sanitize(s))
+}
+// Build a short excerpt of `s`: every word containing a search term plus up to `pad` words of
+// context on either side, with "..." before each run of context and at the very end.
+// `s` is passed through util.sanitize first; `terms` is an array of search-term strings.
+function snippet(s, terms) {
+  s = util.sanitize(s)
+  // "\\b" is a regex word boundary (a bare "\b" in a string literal is a backspace);
+  // terms are escaped so that they match literally
+  var escaped = terms.filter(Boolean).map((term) => term.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"))
+  var term_re = new RegExp("\\b(" + escaped.join("|") + ")\\b", "i")
+  var words = s.split(/\s+/)
+
+  // deduper for matching word indexes, so we don't add a word twice
+  var index_matches = {}
+
+  // words in the eventual snippet
+  var words_matched = []
+
+  // counter for aggregating context after a match
+  var aggr = 0
+
+  // amount of context to show, in number of words surrounding a match
+  var pad = 4
+
+  // loop over each of the words in the string
+  words.some((word, i) => {
+    // does this word contain a search term that is not a stopword?
+    var found = escaped.length ? term_re.exec(word) : null
+    if (found && ! STOPWORDS.has(found[1].toLowerCase())) {
+      // if we aren't already aggregating, add an ellipsis
+      if (! aggr) {
+        words_matched.push("...")
+      }
+      // look backward `pad` words, ending on the match itself
+      for (var j = -pad; j < 1; j++) {
+        // create a new index from the offset
+        var idx = i + j
+
+        // is this a valid index? has it already been encountered?
+        if (idx < 0) continue
+        if (index_matches[idx]) continue
+
+        // checks out, save this word
+        words_matched.push(words[idx])
+
+        // note the matching index in our deduper
+        index_matches[idx] = 1
+      }
+      // enter aggregate mode -- add the next `pad` words
+      aggr = pad
+    }
+    // have we been told to aggregate?
+    else if (aggr) {
+      // save this word
+      words_matched.push(word)
+
+      // add index to the deduper
+      index_matches[i] = 1
+
+      // one less word to aggregate
+      aggr--
+    }
+
+    // keep snippets to a modest length
+    return words_matched.length > 30
+  })
+
+  // add a trailing ellipsis
+  words_matched.push("...")
+
+  // create the snippet from the saved context words
+  return words_matched.join(" ")
+}
+
+module.exports = {
+  bold_snippet, bold_terms, snippet,
+}
\ No newline at end of file
diff --git a/bucky/search/stopwords.js b/bucky/search/stopwords.js
new file mode 100644
index 0000000..ceffe14
--- /dev/null
+++ b/bucky/search/stopwords.js
@@ -0,0 +1,18 @@
+module.exports = new Set((  // common words that search and snippet matching ignore
+  "a about above across adj after again against all almost alone along also " +
+  "although always am among an and another any anybody anyone anything anywhere " +
+  "apart are around as aside at away be because been before behind being below " +
+  "besides between beyond both but by can cannot could did do does doing done " +
+  "down downwards during each either else enough etc even ever every everybody " +
+  "everyone except far few for forth from get gets got had hardly has have having " +
+  "her here herself him himself his how however i if in indeed instead into inward " +
+  "is it its itself just kept many maybe might mine more most mostly much must " +
+  "myself near neither next no nobody none nor not nothing nowhere of off often on " +
+  "only onto or other others ought our ours out outside over own p per please plus " +
+  "pp quite rather really said seem self selves several shall she should since so " +
+  "some somebody somewhat still such than that the their theirs them themselves " +
+  "then there therefore these they this thorough thoroughly those through thus to " +
+  "together too toward towards under until up upon v very was well were what " +
+  "whatever when whenever where whether which while who whom whose will with " +
+  "within without would yet young your yourself s"
+).split(" "));  // parenthesized so split() applies to the whole list, not only to the last literal
diff --git a/lib/search/index.js b/lib/search/index.js
deleted file mode 100644
index 27f436f..0000000
--- a/lib/search/index.js
+++ /dev/null
@@ -1,110 +0,0 @@
-var db = require('../db')
-var bdb_lib = require('berkeleydb')
-var bdb = new bdb_lib.Db()
-bdb.open('search.db')
-
-var wordRegexp = new RegExp("(\W+)");
-var wordBoundaryRegexp = new RegExp("\W");
-function parse_terms (s) {
-  return s.toLowerCase().split(wordRegexp).filter((term) => {
-	  if (! term.match(wordBoundaryRegexp)) {
-	    return true
-	  }
-	  return false
-	})
-}
-function cmp (a,b){ return (a<b)?a:(a===b)?0:1 }
-
-var STOPWORDS = new Set(
-  "a about above across adj after again against all almost alone along also " +
-  "although always am among an and another any anybody anyone anything anywhere " +
-  "apart are around as aside at away be because been before behind being below " +
-  "besides between beyond both but by can cannot could did do does doing done " +
-  "down downwards during each either else enough etc even ever every everybody " +
-  "everyone except far few for forth from get gets got had hardly has have having " +
-  "her here herself him himself his how however i if in indeed instead into inward " +
-  "is it its itself just kept many maybe might mine more most mostly much must " +
-  "myself near neither next no nobody none nor not nothing nowhere of off often on " +
-  "only onto or other others ought our ours out outside over own p per please plus " +
-  "pp quite rather really said seem self selves several shall she should since so " +
-  "some somebody somewhat still such than that the their theirs them themselves " +
-  "then there therefore these they this thorough thoroughly those through thus to " +
-  "together too toward towards under until up upon v very was well were what " +
-  "whatever when whenever where whether which while who whom whose will with" +
-  "within without would yet young your yourself s".split(" ")
-);
-
-function find_term(term) {
-  var matches = bdb.get(term).split(",").map((s) => {
-    var partz = s.split(" ")
-    var match = {
-      thread: s[0],
-      comment: s[1],
-      file: s[2],
-      strength: s[3],
-    }
-  })
-  return matches
-}
-
-function search (query, start, limit) {
-  if (!query) return
-	start = start || 0;
-	limit = limit || 10;
-	var scores = {};
-	var terms = parse_terms($query);
-  var i = 0
-  var total
-  var to_display = limit
-  var threads = {}
-  var comment_ids = []
-  var file_ids = []
-  var results = []
-
-  terms.forEach((term) => {
-    if (STOPWORDS.has(term)) return;
-    var results = find_term(term);
-    if (!results) return;
-    results.forEach((result) => {
-      var score = scores[result.thread] = scores[result.thread] || { count: 0, strength: 0 }
-      score.thread = score.thread || result.thread
-      score.comment = score.comment || result.comment
-      score.file = score.file || result.file
-      score.strength += result.strength
-      score.count += 1
-    })
-  })
-  total = Object.keys(scores).length
-  Object.values(scores).sort((a,b) => {
-    if (b.count !== a.count) {
-      return cmp(b.count, a.count)
-    }
-    return cmp(b.strength * b.count, a.strength * a.count)
-  }).some((match) => {
-    if (i++ < start) return false
-    if (to_display-- === 0) return true
-    results.push(match)
-    thread_ids.push(match.thread)
-    if (match.comment) comment_ids.push(match.comment)
-    if (match.file) file_ids.push(match.file)
-    return false
-  })
-  
-  db.storeQuery(query, total)
-  
-	my $files = $self->files_by_id($files_to_get);
-	my $comments = $self->comments_by_id($comments_to_get);
-	$self->log_query($query, $total);
-  return {
-		start => $start + $limit,
-		limit => $limit,
-    total => $total,
-    results => $results,
-		threads => $threads,
-		comments => $comments,
-		files => $files,
-		terms => $terms,
-  };
-}
-
-module.exports = { search: search }
diff --git a/lib/search/snippet.js b/lib/search/snippet.js
deleted file mode 100644
index de71911..0000000
--- a/lib/search/snippet.js
+++ /dev/null
@@ -1,103 +0,0 @@
-var util = require('../util/util')
-
-function bold_terms (s, terms) {
-  
-}
-sub bold_terms
-	{
-	my ($self, $string, $terms) = @_;
-	$string = $self->strip_html($string);
-	foreach my $term (@$terms)
-		{
-		$string =~ s/\b($term)\b/<b>$1<\/b>/gi;
-		}
-	return $string;
-	}
-sub bold_snippet
-	{
-	my ($self, $string, $terms) = @_;
-	my $snippet = $self->snippet($string, $terms);
-	return $self->bold_terms($snippet, $terms);
-	}
-sub snippet
-	{
-	my ($self, $string, $terms) = @_;
-
-	# clean up the string we got
-	$string = $self->strip_html($string);
-
-	# create a regex out of the search terms
-	my $term_re = join "|", @$terms;
-
-	# take the string to be snippetized and split it into words
-	my @words = split /\s+/, $string;
-
-	# deduper for matching @words indexes, so we don't add a word twice
-	my $index_matches = {};
-
-	# words in the eventual snippet
-	my @words_matched;
-
-	# the snippet itself
-	my $snippet = '';
-
-	# counter for aggregating context after a match
-	my $aggr = 0;
-
-	# amount of context to show, in number of words surrounding a match
-	my $pad = 4;
-
-	# loop over each of the words in the string
-	for (my $i = 0; $i < scalar @words; $i++)
-		{
-		# does this word contain a match?
-		if ($words[$i] =~ /\b($term_re)\b/i && ! $self->is_stopword($1))
-			{
-			# if we aren't already aggregating, add an ellipsis
-			if (! $aggr)
-				{
-				push @words_matched, "...";
-				}
-			# look backward $pad words
-			for (my $j = -$pad; $j < 1; $j++)
-				{
-				# create a new index from the offset
-				my $idx = $i + $j;
-
-				# is this a valid index? has it already been encountered?
-				next if $idx < 0;
-				next if $idx > scalar @words;
-				next if exists $index_matches->{$i+$j};
-
-				# checks out, save this word
-				push @words_matched, $words[$i+$j];
-
-				# note the matching index in our deduper
-				$index_matches->{$i+$j} ++;
-				}
-			# enter aggregate mode -- add the next $pad words
-			$aggr = $pad;
-			}
-		# have we been told to aggregate?
-		elsif ($aggr)
-			{
-			# save this word
-			push @words_matched, $words[$i];
-
-			# add index to the deduper
-			$index_matches->{$i} ++;
-
-			# one less word to aggregate
-			$aggr--;
-			}
-		# keep snippets to a modest length
-		last if scalar @words_matched > 30;
-		}
-	# add a trailing ellipsis
-	push @words_matched, "...";
-
-	# create the snippet from the saved context words
-	$snippet = join " ", @words_matched;
-
-	return $snippet;
-	}
\ No newline at end of file
-- 
cgit v1.2.3-70-g09d2