snippets and middleware

author: Jules Laplace <julescarbon@gmail.com> 2017-12-08 02:52:19 +0100
committer: Jules Laplace <julescarbon@gmail.com> 2017-12-08 02:56:41 +0100
commit: 192abb9db60f95968953b515ce18700c6b2da090 (patch)
tree: e0e77c935ad95ca9e641c6a1f19a22556df9b8d2 /bucky/search/snippet.js
parent: bbbd8bbab8737f5067c85376daf79cd8a5a9c4cb (diff)
1 files changed, 88 insertions, 0 deletions
diff --git a/bucky/search/snippet.js b/bucky/search/snippet.js
new file mode 100644
index 0000000..cd0657f
--- /dev/null
+++ b/bucky/search/snippet.js
@@ -0,0 +1,88 @@
+var util = require('../util/util')
+var STOPWORDS = require('./stopwords')
+
+function bold_snippet(s, terms) {
+  return bold_terms(snippet(s, terms), terms)
+}
+function bold_terms (s, terms) {
+  s = util.sanitize(s)
+  terms.forEach( (term) => {
+    s.replace(new RegExp("\b" + term + "\b", "i"), "<b>" + term + "</b>")
+  })
+}
+function snippet(s, terms) {
+  s = util.sanitize(s)
+  var term_re = new RegExp("\b(" + terms.join("|") + ")\b", "i")
+  var words = s.split(/\s+/)
+  var snippet = "";
+  
+	// deduper for matching @words indexes, so we don't add a word twice
+  var index_matches = {}
+
+	// words in the eventual snippet
+  var words_matched = []
+
+	// counter for aggregating context after a match
+  var aggr = 0;
+
+	// amount of context to show, in number of words surrounding a match
+	var $pad = 4;
+
+	// loop over each of the words in the string
+	words.some((word, i) => {
+    // if the word matches...
+	  if (term_re.match(word) && ! STOPWORDS.has(word.toLowerCase())) {
+			// if we aren't already aggregating, add an ellipsis
+			if (! $aggr) {
+				words_matched.push("...")
+			}
+			
+			// look backward $pad words
+			var idx;
+			for (var j = -pad; j < 1; j++) {
+				// create a new index from the offset
+				idx = i + j;
+
+				// is this a valid index? has it already been encountered?
+				if (idx < 0) continue;
+				if (idx > words.length) continue;
+				if (index_matches[idx]) continue;
+
+				// checks out, save this word
+				words_matched.push(words[idx])
+
+				// note the matching index in our deduper
+				index_matches[idx] = 1;
+      }
+			// enter aggregate mode -- add the next (pad) words
+			aggr = pad;
+    }
+
+		// have we been told to aggregate?
+		else if (aggr) {
+			// save this word
+			words_matched.push(word)
+
+			// add index to the deduper
+			index_matches[i] = 1;
+
+			// one less word to aggregate
+			aggr--;
+    }
+		
+		// keep snippets to a modest length
+		return words_matched.length > 30;
+  })
+
+	// add a trailing ellipsis
+	words_matched.push("...")
+
+	// create the snippet from the saved context words
+	snippet = words_matched.join(" ")
+	
+  return snippet
+}
+
+module.exports = {
+  bold_snippet, bold_terms, snippet,
+}
+\ No newline at end of file
author	Jules Laplace <julescarbon@gmail.com>	2017-12-08 02:52:19 +0100
committer	Jules Laplace <julescarbon@gmail.com>	2017-12-08 02:56:41 +0100
commit	192abb9db60f95968953b515ce18700c6b2da090 (patch)
tree	e0e77c935ad95ca9e641c6a1f19a22556df9b8d2 /bucky/search/snippet.js
parent	bbbd8bbab8737f5067c85376daf79cd8a5a9c4cb (diff)