path: root/bucky/search/lexicon.js
Diffstat (limited to 'bucky/search/lexicon.js')
 -rw-r--r--  bucky/search/lexicon.js | 232
 1 file changed, 132 insertions(+), 100 deletions(-)
diff --git a/bucky/search/lexicon.js b/bucky/search/lexicon.js
index dc1d7ab..0783512 100644
--- a/bucky/search/lexicon.js
+++ b/bucky/search/lexicon.js
@@ -1,129 +1,161 @@
-require('dotenv').load();
+require("dotenv").load();
-var STOPWORDS = require('./stopwords')
-var bdb = require('./bdb')
-var db = require('../db')
+var STOPWORDS = require("./stopwords");
+var bdb = require("./bdb");
+var db = require("../db");
+var parse_term = require("./parse_term");
-var search_db = bdb('search')
+var search_db = bdb("search");
-var lexicon = {}
-var lex_counts = {}
-var total = 0
+var lexicon = {};
+var lex_counts = {};
+var total = 0;
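+// in-memory index: lexicon[term][thread] = { thread, comment, file, strength };
+// lex_counts[term] counts how many times each term was seen in total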
-module.exports = { build: build_index }
+module.exports = {
+ build: build_index,
+ watch: watch_index,
+ save: () => search_db.save(),
+};
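+// usage sketch (hypothetical caller):
+//   var lexicon = require("./lexicon");
+//   lexicon.watch(); // build immediately, then rebuild once a day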
+
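+// rebuild interval: 24 hours, expressed in milliseconds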
+var BUILD_DELAY = 1000 * 60 * 60 * 24;
+function watch_index() {
+ build_index();
+ console.log(
+ "rebuilding search index every",
+ BUILD_DELAY / (60 * 60 * 1000),
+ "hours",
+ );
+  setInterval(build_index, BUILD_DELAY);
+}
function build_index(cb) {
- console.log("building index")
+ console.log("building search index");
+ lexicon = {};
+ lex_counts = {};
+ total = 0;
return parse_threads()
.then(parse_comments)
.then(parse_files)
- .then( () => {
- var unique = Object.keys(lexicon).length
- console.log( "--- WORD COUNT: ", total );
- console.log( "--- UNIQUE WORDS: ", unique );
+ .then(() => {
+ var unique = Object.keys(lexicon).length;
+ console.log("--- WORD COUNT: ", total);
+ console.log("--- UNIQUE WORDS: ", unique);
lexicon_store();
- console.log( "Done!")
- return { total, unique }
- })
+ console.log("Done!");
+ return { total, unique };
+ });
}
function parse_threads() {
- return db.Thread.where('id', '>', 1).fetchAll().then( (threads) => {
- console.log('got threads', threads.length)
- threads.forEach( (thread) => {
- total += parse_terms({
- string: thread.get('title'),
- thread: thread.get('id'),
- })
- })
- })
+ return db.Thread.where("id", ">", 1)
+ .fetchAll()
+ .then((threads) => {
+ console.log("got threads", threads.length);
+ threads.forEach((thread) => {
+ total += parse_terms({
+ string: thread.get("title"),
+ thread: thread.get("id"),
+ });
+ });
+ });
}
function parse_comments() {
- return db.Comment.where('thread', '>', 1).fetchAll().then( (comments) => {
- console.log('got comments', comments.length)
- comments.forEach( (comment) => {
- total += parse_terms({
- string: comment.get('comment').toString(),
- thread: comment.get('thread'),
- comment: comment.get('id'),
- })
- })
- })
+ return db.Comment.where("thread", ">", 1)
+ .fetchAll()
+ .then((comments) => {
+ console.log("got comments", comments.length);
+ comments.forEach((comment) => {
+ total += parse_terms({
+ string: comment.get("comment").toString(),
+ thread: comment.get("thread"),
+ comment: comment.get("id"),
+ });
+ });
+ });
}
function parse_files() {
- return db.File.fetchAll().then( (files) => {
- console.log('got files', files.length)
- files.forEach( (file) => {
+ return db.File.fetchAll().then((files) => {
+ console.log("got files", files.length);
+ files.forEach((file) => {
total += parse_terms({
- string: file.get('filename'),
- thread: file.get('thread'),
- file: file.get('id'),
- })
- })
- })
+ string: file.get("filename"),
+ thread: file.get("thread"),
+ file: file.get("id"),
+ });
+ });
+ });
}
-var underscoreRegexp = new RegExp('_', 'g')
-var spaceRegexp = new RegExp('[^a-zA-Z0-9]+', 'g')
+var underscoreRegexp = new RegExp("_", "g");
+var spaceRegexp = new RegExp("[^a-zA-Z0-9]+", "g");
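+// tokenizer: treat underscores as spaces, then split on runs of non-alphanumeric characters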
-function parse_terms (opt) {
- var thread = opt.thread
- var comment = opt.comment || 0
- var file = opt.file || 0
- var string = opt.string
- if (!string || !thread) return 0
- var count = 0
+function parse_terms(opt) {
+ var thread = opt.thread;
+ var comment = opt.comment || 0;
+ var file = opt.file || 0;
+ var string = opt.string;
+ if (!string || !thread) return 0;
+ var count = 0;
var terms = string
- .replace(underscoreRegexp, ' ')
+ .replace(underscoreRegexp, " ")
.split(spaceRegexp)
.forEach((term) => {
- var t = term.toLowerCase()
- var lookup = lexicon[t] = lexicon[t] || {}
- var res = lookup[thread] = lookup[thread] || { strength: 0 }
- res.thread = res.thread || thread
- res.comment = res.comment || comment
- res.file = res.file || file
+      var t = parse_term(term);
+      if (!t) {
+        return;
+      }
+ var lookup = (lexicon[t] = lexicon[t] || {});
+ var res = (lookup[thread] = lookup[thread] || { strength: 0 });
+ res.thread = res.thread || thread;
+ res.comment = res.comment || comment;
+ res.file = res.file || file;
// prioritize threads
if (!comment && !file) {
- res.strength += 2
- }
- else {
- res.strength += 1
+ res.strength += 2;
+ } else {
+ res.strength += 1;
}
- count += 1
- lex_counts[term] = lex_counts[term] || 0
- lex_counts[term] += 1
- })
- return count || 0
+ count += 1;
+      lex_counts[t] = lex_counts[t] || 0;
+      lex_counts[t] += 1;
+ });
+ return count || 0;
}
-var put_total = 0
-function lexicon_store () {
- console.log('writing db...')
- Object.keys(lexicon).forEach( (term) => {
- if (STOPWORDS.has(term)) return
- var serialized = serialize_matches(term);
- if (! serialized) return;
- if ((put_total % 5000) === 0) console.log(put_total + '...')
- put_total += 1
- // if (put_total > 10) return
- // console.log(term)
- search_db.put(term, serialized)
- })
+var put_total = 0;
+function lexicon_store() {
+ console.log("writing db...");
+ // console.log(Object.keys(lexicon));
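+  // reset clears the previous index before the rebuilt terms are written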
+ search_db.reset();
+ Object.keys(lexicon).forEach((term) => {
+ if (STOPWORDS.has(term)) return;
+ var serialized = serialize_matches(term);
+ if (!serialized) return;
+ if (put_total % 5000 === 0) console.log(put_total + "...");
+ put_total += 1;
+ // if (put_total > 10) return
+ // console.log(term)
+ search_db.put(term, serialized);
+ });
+ search_db.save();
+}
+function serialize_matches(term) {
+ var matches = lexicon[term];
+ var lex_count = lex_counts[term];
+ if (!lex_count) {
+ return null;
+ }
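+  // IDF-style weight: the rarer a term is across all indexed text, the higher it scores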
+ var idf = Math.log(total / lex_count);
+ var serialized_matches = [];
+ Object.values(matches).forEach((match) => {
+ if (!match) return;
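+    // each match serializes to [thread, comment, file, score], where score = strength * idf rounded to 2 decimals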
+ var s = [
+ match.thread,
+ match.comment,
+ match.file,
+ Number((match.strength * idf).toFixed(2)),
+ ];
+ if (s) serialized_matches.push(s);
+ });
+ if (!serialized_matches.length) return;
+ return serialized_matches;
}
-function serialize_matches (term) {
- var matches = lexicon[term]
- var idf = Math.log(total / lex_counts[term])
- var serialized_matches = [];
- Object.values(matches).forEach( (match) => {
- if (!match) return
- var s = [
- match.thread,
- match.comment,
- match.file,
- match.strength * idf
- ].join(' ')
- if (s) serialized_matches.push(s)
- })
- if (!serialized_matches.length) return
- return serialized_matches.join(',')
-}
\ No newline at end of file