require("dotenv").load(); var STOPWORDS = require("./stopwords"); var bdb = require("./bdb"); var db = require("../db"); var parse_term = require("./parse_term"); var search_db = bdb("search"); var lexicon = {}; var lex_counts = {}; var total = 0; module.exports = { build: build_index, watch: watch_index, save: () => search_db.save(), }; var BUILD_DELAY = 1000 * 60 * 60 * 24; function watch_index() { build_index(); console.log( "rebuilding search index every", BUILD_DELAY / (60 * 60 * 1000), "hours", ); var interval = setInterval(build_index, BUILD_DELAY); } function build_index(cb) { console.log("building search index"); lexicon = {}; lex_counts = {}; total = 0; return parse_threads() .then(parse_comments) .then(parse_files) .then(() => { var unique = Object.keys(lexicon).length; console.log("--- WORD COUNT: ", total); console.log("--- UNIQUE WORDS: ", unique); lexicon_store(); console.log("Done!"); return { total, unique }; }); } function parse_threads() { return db.Thread.where("id", ">", 1) .fetchAll() .then((threads) => { console.log("got threads", threads.length); threads.forEach((thread) => { total += parse_terms({ string: thread.get("title"), thread: thread.get("id"), }); }); }); } function parse_comments() { return db.Comment.where("thread", ">", 1) .fetchAll() .then((comments) => { console.log("got comments", comments.length); comments.forEach((comment) => { total += parse_terms({ string: comment.get("comment").toString(), thread: comment.get("thread"), comment: comment.get("id"), }); }); }); } function parse_files() { return db.File.fetchAll().then((files) => { console.log("got files", files.length); files.forEach((file) => { total += parse_terms({ string: file.get("filename"), thread: file.get("thread"), file: file.get("id"), }); }); }); } var underscoreRegexp = new RegExp("_", "g"); var spaceRegexp = new RegExp("[^a-zA-Z0-9]+", "g"); /** * For each term, create mappings: * - lexicon[term][thread] => {thread, comment, file, strength} * - lex_counts[term] => document frequency * - total terms ++ */ function parse_terms(opt) { var thread = opt.thread; var comment = opt.comment || 0; var file = opt.file || 0; var string = opt.string; if (!string || !thread) return 0; var count = 0; var terms = string .replace(underscoreRegexp, " ") .split(spaceRegexp) .forEach((term) => { var t = parse_term(term); if (!term) { return; } var lookup = (lexicon[t] = lexicon[t] || {}); var res = (lookup[thread] = lookup[thread] || { strength: 1 }); res.thread = res.thread || thread; res.comment = res.comment || comment; res.file = res.file || file; // prioritize threads if (!comment && !file) { res.strength += 4; } else if (file) { res.strength += 1.5; } count += 1; lex_counts[term] = lex_counts[term] || new Set(); try { lex_counts[term].add(res.thread); } catch (error) { console.error(error); console.log(lex_counts[term]); } }); return count || 0; } var put_total = 0; function lexicon_store() { console.log("writing db..."); // console.log(Object.keys(lexicon)); search_db.reset(); Object.keys(lexicon).forEach((term) => { if (STOPWORDS.has(term)) return; var serialized = serialize_matches(term); if (!serialized) return; if (put_total % 5000 === 0) console.log(put_total + "..."); put_total += 1; // if (put_total > 10) return // console.log(term) search_db.put(term, serialized); }); // search_db.save(); } function serialize_matches(term) { var matches = lexicon[term]; var lex_count = lex_counts[term]?.size || 0; if (!lex_count) { return null; } var idf = Math.log(total / lex_count); var 
serialized_matches = []; Object.values(matches).forEach((match) => { if (!match) return; var s = [ match.thread, match.comment, match.file, Number((match.strength * idf).toFixed(2)), ]; if (s) serialized_matches.push(s); }); if (!serialized_matches.length) return; return serialized_matches; }
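
/*
 * Example usage (a sketch, not part of this module; the require path below
 * is an assumption about where this file lives in the project):
 *
 *   var search_index = require("./search");
 *
 *   // one-off rebuild, then persist the index:
 *   search_index.build().then((stats) => {
 *     console.log("indexed", stats.total, "terms,", stats.unique, "unique");
 *     search_index.save();
 *   });
 *
 *   // or, in a long-running process, rebuild every 24 hours:
 *   // search_index.watch();
 */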