Diffstat (limited to 'bucky/search/lexicon.js')
-rw-r--r--   bucky/search/lexicon.js   268
1 file changed, 162 insertions, 106 deletions
diff --git a/bucky/search/lexicon.js b/bucky/search/lexicon.js
index dc1d7ab..d725777 100644
--- a/bucky/search/lexicon.js
+++ b/bucky/search/lexicon.js
@@ -1,129 +1,185 @@
-require('dotenv').load();
+require("dotenv").load();
 
-var STOPWORDS = require('./stopwords')
-var bdb = require('./bdb')
-var db = require('../db')
+var STOPWORDS = require("./stopwords");
+var bdb = require("./bdb");
+var db = require("../db");
+var parse_term = require("./parse_term");
 
-var search_db = bdb('search')
+var search_db = bdb("search");
 
-var lexicon = {}
-var lex_counts = {}
-var total = 0
+var lexicon = new Map();
+var lex_counts = new Map();
+var total = 0;
 
-module.exports = { build: build_index }
+module.exports = {
+  build: build_index,
+  watch: watch_index,
+  save: () => search_db.save(),
+};
+
+var BUILD_DELAY = 1000 * 60 * 60 * 24;
+function watch_index() {
+  build_index();
+  console.log(
+    "rebuilding search index every",
+    BUILD_DELAY / (60 * 60 * 1000),
+    "hours",
+  );
+  var interval = setInterval(build_index, BUILD_DELAY);
+}
 
 function build_index(cb) {
-  console.log("building index")
+  console.log("building search index");
+  lexicon = new Map();
+  lex_counts = new Map();
+  total = 0;
   return parse_threads()
     .then(parse_comments)
     .then(parse_files)
-    .then( () => {
-      var unique = Object.keys(lexicon).length
-      console.log( "--- WORD COUNT: ", total );
-      console.log( "--- UNIQUE WORDS: ", unique );
+    .then(() => {
+      var unique = lexicon.size;
+      console.log("--- WORD COUNT: ", total);
+      console.log("--- UNIQUE WORDS: ", unique);
       lexicon_store();
-      console.log( "Done!")
-      return { total, unique }
-    })
+      console.log("Done!");
+      return { total, unique };
+    });
 }
 
 function parse_threads() {
-  return db.Thread.where('id', '>', 1).fetchAll().then( (threads) => {
-    console.log('got threads', threads.length)
-    threads.forEach( (thread) => {
-      total += parse_terms({
-        string: thread.get('title'),
-        thread: thread.get('id'),
-      })
-    })
-  })
+  return db.Thread.where("id", ">", 1)
+    .fetchAll()
+    .then((threads) => {
+      console.log("got threads", threads.length);
+      for (const thread of threads) {
+        total += parse_terms({
+          string: thread.get("title"),
+          thread: thread.get("id"),
+        });
+      }
+    });
 }
 
 function parse_comments() {
-  return db.Comment.where('thread', '>', 1).fetchAll().then( (comments) => {
-    console.log('got comments', comments.length)
-    comments.forEach( (comment) => {
-      total += parse_terms({
-        string: comment.get('comment').toString(),
-        thread: comment.get('thread'),
-        comment: comment.get('id'),
-      })
-    })
-  })
+  return db.Comment.where("thread", ">", 1)
+    .fetchAll()
+    .then((comments) => {
+      console.log("got comments", comments.length);
+      for (const comment of comments) {
+        total += parse_terms({
+          string: comment.get("comment").toString(),
+          thread: comment.get("thread"),
+          comment: comment.get("id"),
+        });
+      }
+    });
 }
 
 function parse_files() {
-  return db.File.fetchAll().then( (files) => {
-    console.log('got files', files.length)
-    files.forEach( (file) => {
+  return db.File.fetchAll().then((files) => {
+    console.log("got files", files.length);
+    for (const file of files) {
       total += parse_terms({
-        string: file.get('filename'),
-        thread: file.get('thread'),
-        file: file.get('id'),
-      })
-    })
-  })
+        string: file.get("filename"),
+        thread: file.get("thread"),
+        file: file.get("id"),
+      });
+    }
+  });
 }
-var underscoreRegexp = new RegExp('_', 'g')
-var spaceRegexp = new RegExp('[^a-zA-Z0-9]+', 'g')
+var underscoreRegexp = new RegExp("_", "g");
+var spaceRegexp = new RegExp("[^a-zA-Z0-9]+", "g");
-function parse_terms (opt) {
-  var thread = opt.thread
-  var comment = opt.comment || 0
-  var file = opt.file || 0
-  var string = opt.string
-  if (!string || !thread) return 0
-  var count = 0
-  var terms = string
-    .replace(underscoreRegexp, ' ')
-    .split(spaceRegexp)
-    .forEach((term) => {
-      var t = term.toLowerCase()
-      var lookup = lexicon[t] = lexicon[t] || {}
-      var res = lookup[thread] = lookup[thread] || { strength: 0 }
-      res.thread = res.thread || thread
-      res.comment = res.comment || comment
-      res.file = res.file || file
-      // prioritize threads
-      if (!comment && !file) {
-        res.strength += 2
-      }
-      else {
-        res.strength += 1
-      }
-      count += 1
-      lex_counts[term] = lex_counts[term] || 0
-      lex_counts[term] += 1
-    })
-  return count || 0
+/**
+ * For each term, create mappings:
+ *   - lexicon[term][thread] => {thread, comment, file, strength}
+ *   - lex_counts[term] => document frequency
+ *   - total terms ++
+ */
+function parse_terms(opt) {
+  var thread = opt.thread;
+  var comment = opt.comment || 0;
+  var file = opt.file || 0;
+  var string = opt.string;
+  if (!string || !thread) {
+    return 0;
+  }
+  var term_count = 0;
+  var terms = string.replace(underscoreRegexp, " ").split(spaceRegexp);
+  for (const term of terms) {
+    var parsedTerm = parse_term(term);
+    if (STOPWORDS.has(parsedTerm)) {
+      continue;
+    }
+    if (!term || !parsedTerm) {
+      continue;
+    }
+    if (!lexicon.has(parsedTerm)) {
+      lexicon.set(parsedTerm, {});
+    }
+    var lookup = lexicon.get(parsedTerm);
+    lookup[thread] = lookup[thread] || { strength: 1 };
+
+    var res = lookup[thread];
+    res.thread = res.thread || thread;
+    res.comment = res.comment || comment;
+    res.file = res.file || file;
+
+    // prioritize threads
+    if (!comment && !file) {
+      res.strength += 100;
+    } else {
+      res.strength += 1;
+    }
+    term_count += 1;
+
+    if (!lex_counts.has(parsedTerm)) {
+      lex_counts.set(parsedTerm, new Set());
+    }
+    const lex_count = lex_counts.get(parsedTerm);
+
+    try {
+      lex_count.add(res.thread);
+    } catch (error) {
+      console.error(error);
+      console.log(term, terms, lex_count);
+    }
+  }
+  return term_count || 0;
 }
-var put_total = 0
-function lexicon_store () {
-  console.log('writing db...')
-  Object.keys(lexicon).forEach( (term) => {
-    if (STOPWORDS.has(term)) return
-    var serialized = serialize_matches(term);
-    if (!serialized) return;
-    if ((put_total % 5000) === 0) console.log(put_total + '...')
-    put_total += 1
-    // if (put_total > 10) return
-    // console.log(term)
-    search_db.put(term, serialized)
-  })
+var put_total = 0;
+function lexicon_store() {
+  console.log("writing db...");
+  // console.log(Object.keys(lexicon));
+  search_db.reset();
+  for (const term of lexicon.keys()) {
+    var serialized = serialize_matches(term);
+    if (!serialized) return;
+    if (put_total % 5000 === 0) console.log(put_total + "...");
+    put_total += 1;
+    // if (put_total > 10) return
+    // console.log(term)
+    search_db.put(term, serialized);
+  }
+  // search_db.save();
+}
+function serialize_matches(term) {
+  var matches = lexicon.get(term);
+  var lex_count = lex_counts.get(term)?.size || 0;
+  if (!lex_count) {
+    return null;
+  }
+  var idf = Math.log(total / lex_count);
+  var serialized_matches = [];
+  Object.values(matches).forEach((match) => {
+    if (!match) return;
+    var s = [
+      match.thread,
+      match.comment,
+      match.file,
+      Number((match.strength * idf).toFixed(2)),
+    ];
+    if (s) serialized_matches.push(s);
+  });
+  if (!serialized_matches.length) return;
+  return serialized_matches;
 }
-function serialize_matches (term) {
-  var matches = lexicon[term]
-  var idf = Math.log(total / lex_counts[term])
-  var serialized_matches = [];
-  Object.values(matches).forEach( (match) => {
-    if (!match) return
-    var s = [
-      match.thread,
-      match.comment,
-      match.file,
-      match.strength * idf
-    ].join(' ')
-    if (s) serialized_matches.push(s)
-  })
-  if (!serialized_matches.length) return
-  return serialized_matches.join(',')
-}
\ No newline at end of file
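
A few notes on the patch, with illustrative sketches rather than authoritative code. First, tokenization: both the old and new parse_terms split input with the same regex pair, turning underscores into spaces and then treating any run of non-alphanumerics as a delimiter. A standalone example (the sample filename is invented; any lowercasing or stemming now happens in parse_term, whose source is not part of this diff):

```js
// Term splitting as in parse_terms: underscores become spaces, then any
// run of non-alphanumeric characters delimits terms.
var underscoreRegexp = new RegExp("_", "g");
var spaceRegexp = new RegExp("[^a-zA-Z0-9]+", "g");

var tokens = "my_thesis-draft (v2).final.pdf"
  .replace(underscoreRegexp, " ")
  .split(spaceRegexp);
console.log(tokens); // [ 'my', 'thesis', 'draft', 'v2', 'final', 'pdf' ]
```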
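Second, the new posting layout. parse_terms now keeps two Maps: lexicon maps a term to an object keyed by thread id (one posting per thread, accumulating strength, with title hits weighted far above comment and file hits), and lex_counts maps a term to a Set of thread ids so document frequency counts each thread once. A minimal re-implementation of the per-term update, with invented ids, to show the resulting state:

```js
// Sketch of the per-term update in the new parse_terms (ids are invented).
var lexicon = new Map();
var lex_counts = new Map();

function add_hit(term, thread, comment, file) {
  if (!lexicon.has(term)) lexicon.set(term, {});
  var lookup = lexicon.get(term);
  var res = (lookup[thread] = lookup[thread] || { strength: 1 });
  res.thread = res.thread || thread;
  res.comment = res.comment || comment;
  res.file = res.file || file;
  res.strength += !comment && !file ? 100 : 1; // thread titles dominate
  if (!lex_counts.has(term)) lex_counts.set(term, new Set());
  lex_counts.get(term).add(thread); // per-thread document frequency
}

add_hit("oven", 7, 0, 0);  // hit in the title of thread 7
add_hit("oven", 7, 42, 0); // hit in comment 42 on the same thread
console.log(lexicon.get("oven")[7]);
// { strength: 102, thread: 7, comment: 42, file: 0 }  (1 + 100 + 1)
console.log(lex_counts.get("oven").size); // 1
```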
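Finally, scoring. serialize_matches weights each posting by inverse document frequency, idf = Math.log(total / lex_count), and now emits arrays of numbers (weight rounded to two decimals) instead of the old space-and-comma-joined strings. A worked example with invented corpus numbers:

```js
// Invented stats: 50,000 term occurrences indexed overall, with "oven"
// appearing in 12 distinct threads.
var total = 50000;
var lex_count = 12;
var idf = Math.log(total / lex_count); // ≈ 8.33; rarer terms weigh more

var match = { thread: 7, comment: 42, file: 0, strength: 102 };
var serialized = [
  match.thread,
  match.comment,
  match.file,
  Number((match.strength * idf).toFixed(2)),
];
console.log(serialized); // [ 7, 42, 0, 850.16 ]
```

Note that in this module total is the running count of all term occurrences rather than the number of documents, so the score is a variant of the textbook tf-idf formula, not the classic definition.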
