require("dotenv").load(); var STOPWORDS = require("./stopwords"); var bdb = require("./bdb"); var db = require("../db"); var parse_term = require("./parse_term"); var search_db = bdb("search"); var lexicon = new Map(); var lex_counts = new Map(); var total = 0; module.exports = { build: build_index, watch: watch_index, save: () => search_db.save(), }; var BUILD_DELAY = 1000 * 60 * 60 * 24; function watch_index() { build_index(); console.log( "rebuilding search index every", BUILD_DELAY / (60 * 60 * 1000), "hours", ); var interval = setInterval(build_index, BUILD_DELAY); } function build_index(cb) { console.log("building search index"); lexicon = new Map(); lex_counts = new Map(); total = 0; return parse_threads() .then(parse_comments) .then(parse_files) .then(() => { var unique = lexicon.size; console.log("--- WORD COUNT: ", total); console.log("--- UNIQUE WORDS: ", unique); lexicon_store(); console.log("Done!"); return { total, unique }; }); } function parse_threads() { return db.Thread.where("id", ">", 1) .fetchAll() .then((threads) => { console.log("got threads", threads.length); for (const thread of threads) { total += parse_terms({ string: thread.get("title"), thread: thread.get("id"), }); } }); } function parse_comments() { return db.Comment.where("thread", ">", 1) .fetchAll() .then((comments) => { console.log("got comments", comments.length); for (const comment of comments) { total += parse_terms({ string: comment.get("comment").toString(), thread: comment.get("thread"), comment: comment.get("id"), }); } }); } function parse_files() { return db.File.fetchAll().then((files) => { console.log("got files", files.length); for (const file of files) { total += parse_terms({ string: file.get("filename"), thread: file.get("thread"), file: file.get("id"), }); } }); } var underscoreRegexp = new RegExp("_", "g"); var spaceRegexp = new RegExp("[^a-zA-Z0-9]+", "g"); /** * For each term, create mappings: * - lexicon[term][thread] => {thread, comment, file, strength} * - lex_counts[term] => document frequency * - total terms ++ */ function parse_terms(opt) { var thread = opt.thread; var comment = opt.comment || 0; var file = opt.file || 0; var string = opt.string; if (!string || !thread) { return 0; } var term_count = 0; var terms = string.replace(underscoreRegexp, " ").split(spaceRegexp); for (const term of terms) { var parsedTerm = parse_term(term); if (!term || !parsedTerm) { return; } if (!lexicon.has(parsedTerm)) { lexicon.set(parsedTerm, {}); } var lookup = lexicon.get(parsedTerm); lookup[thread] = lookup[thread] || { strength: 1 }; var res = lookup[thread]; res.thread = res.thread || thread; res.comment = res.comment || comment; res.file = res.file || file; // prioritize threads if (!comment && !file) { res.strength += 100; } else { res.strength += 1; } term_count += 1; if (!lex_counts.has(parsedTerm)) { lex_counts.set(parsedTerm, new Set()); } const lex_count = lex_counts.get(parsedTerm); try { lex_count.add(res.thread); } catch (error) { console.error(error); console.log(term, terms, lex_count); } } return term_count || 0; } var put_total = 0; function lexicon_store() { console.log("writing db..."); // console.log(Object.keys(lexicon)); search_db.reset(); for (const term of lexicon.keys()) { if (STOPWORDS.has(term)) return; var serialized = serialize_matches(term); if (!serialized) return; if (put_total % 5000 === 0) console.log(put_total + "..."); put_total += 1; // if (put_total > 10) return // console.log(term) search_db.put(term, serialized); } // search_db.save(); } function serialize_matches(term) { var matches = lexicon.get(term); var lex_count = lex_counts.get(term)?.size || 0; if (!lex_count) { return null; } var idf = Math.log(total / lex_count); var serialized_matches = []; Object.values(matches).forEach((match) => { if (!match) return; var s = [ match.thread, match.comment, match.file, Number((match.strength * idf).toFixed(2)), ]; if (s) serialized_matches.push(s); }); if (!serialized_matches.length) return; return serialized_matches; }