summaryrefslogtreecommitdiff
path: root/bucky/search/snippet.js
blob: cd0657f380e7f927105d548876a02c3196b5a602 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
var util = require('../util/util')
var STOPWORDS = require('./stopwords')

/**
 * Excerpt `s` around the given terms, then pass the excerpt through
 * bold_terms with the same terms.
 *
 * @param {string} s - Text to excerpt.
 * @param {string[]} terms - Search terms handed to both snippet and bold_terms.
 * @returns {*} Whatever bold_terms returns for the excerpt.
 */
function bold_snippet(s, terms) {
  var excerpt = snippet(s, terms)
  var highlighted = bold_terms(excerpt, terms)
  return highlighted
}
/**
 * Wrap every whole-word, case-insensitive occurrence of each term in <b> tags.
 *
 * The text is first passed through util.sanitize. All terms are then matched
 * in a single pass, so a later term (e.g. "b") can never match inside a
 * <b> tag inserted for an earlier one. The matched text keeps its own casing.
 *
 * @param {string} s - Text to highlight.
 * @param {string[]} terms - Terms to embolden; treated as literal text.
 * @returns {string} The sanitized text with matches wrapped in <b>...</b>.
 */
function bold_terms (s, terms) {
  const text = util.sanitize(s)

  // Escape regex metacharacters so a term like "(" or "c++" cannot break
  // (or alter the meaning of) the generated pattern; skip empty terms.
  const patterns = terms
    .filter((term) => term)
    .map((term) => String(term).replace(/[.*+?^${}()|[\]\\]/g, "\\$&"))

  // With no usable terms there is nothing to highlight.
  if (patterns.length === 0) return text

  // "\\b" (not "\b", which is a backspace character inside a string literal)
  // yields a regex word boundary.
  const term_re = new RegExp(`\\b(?:${patterns.join("|")})\\b`, "gi")

  // "$&" re-inserts the text exactly as it appeared in the source.
  return text.replace(term_re, "<b>$&</b>")
}
/**
 * Build a short excerpt of `s` made of the words surrounding each term match.
 *
 * The text is passed through util.sanitize and split on whitespace. Each word
 * that contains a term as a whole word (case-insensitive) and is not in
 * STOPWORDS contributes itself, up to `pad` words before it and up to `pad`
 * words after it. Every separate run of context is introduced by "..." and
 * the excerpt always ends with "...". Scanning stops once the excerpt holds
 * more than `max_words` entries.
 *
 * @param {string} s - Text to excerpt.
 * @param {string[]} terms - Search terms; treated as literal text.
 * @returns {string} Space-joined excerpt; just "..." when nothing matches.
 */
function snippet(s, terms) {
  // amount of context to show, in number of words surrounding a match
  const pad = 4

  // keep snippets to a modest length
  const max_words = 30

  const words = util.sanitize(s).split(/\s+/)

  // Escape regex metacharacters so a term like "(" cannot break the
  // generated pattern; skip empty terms, which would match every word.
  const patterns = terms
    .filter((term) => term)
    .map((term) => String(term).replace(/[.*+?^${}()|[\]\\]/g, "\\$&"))

  // "\\b" (not "\b", which is a backspace character inside a string literal)
  // yields a regex word boundary. No "g" flag, so test() stays stateless.
  const term_re = patterns.length > 0
    ? new RegExp(`\\b(?:${patterns.join("|")})\\b`, "i")
    : null

  // deduper for matching word indexes, so we don't add a word twice
  const index_matches = new Set()

  // words in the eventual snippet
  const words_matched = []

  // counter for aggregating context after a match
  let aggr = 0

  // loop over each of the words in the string; some() stops at the first
  // callback that returns true, which is how the length cap is enforced
  words.some((word, i) => {
    // NOTE(review): STOPWORDS is assumed to expose has() (e.g. a Set) --
    // confirm against ./stopwords
    const is_match = term_re !== null &&
      term_re.test(word) &&
      !STOPWORDS.has(word.toLowerCase())

    if (is_match) {
      // if we aren't already aggregating, this starts a new run of context
      if (aggr === 0) {
        words_matched.push("...")
      }

      // look backward up to `pad` words, ending on the match itself
      for (let idx = Math.max(0, i - pad); idx <= i; idx++) {
        // skip words already emitted as trailing context of an earlier match
        if (index_matches.has(idx)) continue

        words_matched.push(words[idx])
        index_matches.add(idx)
      }

      // enter aggregate mode -- add the next `pad` words
      aggr = pad
    } else if (aggr > 0) {
      // trailing context after a match
      words_matched.push(word)
      index_matches.add(i)
      aggr--
    }

    return words_matched.length > max_words
  })

  // add a trailing ellipsis
  words_matched.push("...")

  // create the snippet from the saved context words
  return words_matched.join(" ")
}

module.exports = {
  bold_snippet, bold_terms, snippet,
}