1 files changed, 120 insertions, 0 deletions
diff --git a/bucky/search/lexicon.js b/bucky/search/lexicon.js
new file mode 100644
index 0000000..2cf0f21
--- /dev/null
+++ b/bucky/search/lexicon.js
@@ -0,0 +1,120 @@
+require('dotenv').load();
+
+var STOPWORDS = require('./stopwords')
+var bdb = require('./bdb')
+var db = require('../db')
+
+// term -> { thread id -> match record }; created without a prototype so an indexed word such as "constructor" cannot resolve to an inherited Object member
+var lexicon = Object.create(null)
+var total = 0 // running sum of the counts returned by parse_terms
+build_index() // starts the indexing run when the script is loaded
+
+// Runs the parsers in order (threads, comments, files), prints the totals, stores the lexicon and exits.
+function build_index() {
+  parse_threads().then(parse_comments).then(parse_files).then( () => {
+    console.log( "--- WORD COUNT: ", total );
+    console.log( "--- UNIQUE WORDS: ", Object.keys(lexicon).length );
+    lexicon_store();
+    console.log( "Done!")
+    process.exit() // NOTE(review): presumably bdb.put is synchronous, otherwise writes could be cut short -- confirm
+  }).catch( (err) => {
+    console.error(err) // report a failed query or parse instead of leaving an unhandled rejection
+    process.exit(1)
+  })
+}
+// Fetches every thread whose id is greater than 1 and indexes its title under that thread id.
+// Adds the term counts to `total`; the returned promise settles after all titles were parsed.
+function parse_threads() {
+  var query = db.Thread.where('id', '>', 1)
+  return query.fetchAll().then( (rows) => {
+    console.log('got threads', rows.length)
+    rows.forEach( (row) => {
+      total += parse_terms({ string: row.get('title'), thread: row.get('id') })
+    })
+  })
+}
+// Fetches every comment whose thread column is greater than 1 and indexes its text (after toString()),
+// recording the owning thread and the comment id on each match. Adds the term counts to `total`.
+function parse_comments() {
+  var query = db.Comment.where('thread', '>', 1)
+  return query.fetchAll().then( (rows) => {
+    console.log('got comments', rows.length)
+    rows.forEach( (row) => {
+      var text = row.get('comment').toString()
+      total += parse_terms({ string: text, thread: row.get('thread'), comment: row.get('id') })
+    })
+  })
+}
+// Fetches every file record and indexes its filename under the file's thread,
+// recording the file id on each match. Adds the term counts to `total`.
+function parse_files() {
+  return db.File.fetchAll().then( (rows) => {
+    console.log('got files', rows.length)
+    rows.forEach( (row) => {
+      var name = row.get('filename')
+      var owner = row.get('thread')
+      total += parse_terms({ string: name, thread: owner, file: row.get('id') })
+    })
+  })
+}
+
+var underscoreRegexp = new RegExp('_', 'g') // underscores are turned into spaces before splitting
+var spaceRegexp = new RegExp('[^a-zA-Z]+', 'g') // any run of non-letters separates two terms
+/**
+ * Split a string into lowercase alphabetic terms and record each one in `lexicon` under its thread.
+ * @param {Object} opt
+ * @param {string} opt.string - text to index
+ * @param {*} opt.thread - id of the owning thread (must be truthy)
+ * @param {*} [opt.comment] - comment id when the text is a comment; defaults to 0
+ * @param {*} [opt.file] - file id when the text is a filename; defaults to 0
+ * @returns {number} number of terms recorded; 0 when the string or the thread is missing
+ */
+function parse_terms (opt) {
+  var thread = opt.thread
+  var comment = opt.comment || 0
+  var file = opt.file || 0
+  if (!opt.string || !thread) return 0
+  // Splitting leaves '' at the edges when the text starts or ends with a non-letter; drop those.
+  var terms = opt.string.replace(underscoreRegexp, ' ').split(spaceRegexp).filter(Boolean)
+  terms.forEach((term) => {
+    var t = term.toLowerCase()
+    var lookup = lexicon[t] = lexicon[t] || {}
+    var res = lookup[thread] = lookup[thread] || { strength: 0 }
+    res.thread = res.thread || thread
+    res.comment = res.comment || comment
+    res.file = res.file || file
+    // A hit carrying neither a comment id nor a file id is a thread-title hit and counts double.
+    res.strength += (!comment && !file) ? 2 : 1
+  })
+  return terms.length
+}
+
+var put_total = 0 // number of terms handed to bdb.put so far
+// Writes every lexicon term that is not a stopword and has a non-empty serialized match list
+// to bdb, logging the running count each time it is a multiple of 5000 (starting at 0).
+function lexicon_store () {
+  console.log('writing db...')
+  for (var term of Object.keys(lexicon)) {
+    if (STOPWORDS.has(term)) continue
+    var payload = serialize_matches(lexicon[term])
+    if (! payload) continue
+    if (put_total % 5000 === 0) console.log(put_total + '...')
+    put_total += 1
+    bdb.put(term, payload)
+  }
+}
+/**
+ * Flatten the match records of one term into a single string.
+ * Each record becomes "thread comment file strength" (space separated) and the
+ * records are joined with commas; falsy records are skipped.
+ *
+ * @param {Object} matches - match records; only the values are read
+ * @returns {string|undefined} the joined string, or undefined when nothing was serialized
+ */
+function serialize_matches (matches) {
+  var parts = Object.values(matches)
+    .filter((entry) => entry)
+    .map((entry) => [ entry.thread, entry.comment, entry.file, entry.strength ].join(' '))
+    .filter((text) => text)
+  return parts.length ? parts.join(',') : undefined
+}
+\ No newline at end of file