Diffstat (limited to 'bucky/search/lexicon.js')
-rw-r--r--   bucky/search/lexicon.js   268
1 file changed, 162 insertions, 106 deletions
diff --git a/bucky/search/lexicon.js b/bucky/search/lexicon.js
index dc1d7ab..d725777 100644
--- a/bucky/search/lexicon.js
+++ b/bucky/search/lexicon.js
@@ -1,129 +1,185 @@
-require('dotenv').load();
+require("dotenv").load();
 
-var STOPWORDS = require('./stopwords')
-var bdb = require('./bdb')
-var db = require('../db')
+var STOPWORDS = require("./stopwords");
+var bdb = require("./bdb");
+var db = require("../db");
+var parse_term = require("./parse_term");
 
-var search_db = bdb('search')
+var search_db = bdb("search");
 
-var lexicon = {}
-var lex_counts = {}
-var total = 0
+var lexicon = new Map();
+var lex_counts = new Map();
+var total = 0;
 
-module.exports = { build: build_index }
+module.exports = {
+  build: build_index,
+  watch: watch_index,
+  save: () => search_db.save(),
+};
+
+var BUILD_DELAY = 1000 * 60 * 60 * 24;
+function watch_index() {
+  build_index();
+  console.log(
+    "rebuilding search index every",
+    BUILD_DELAY / (60 * 60 * 1000),
+    "hours",
+  );
+  var interval = setInterval(build_index, BUILD_DELAY);
+}
 
 function build_index(cb) {
-  console.log("building index")
+  console.log("building search index");
+  lexicon = new Map();
+  lex_counts = new Map();
+  total = 0;
   return parse_threads()
     .then(parse_comments)
     .then(parse_files)
-    .then( () => {
-      var unique = Object.keys(lexicon).length
-      console.log( "--- WORD COUNT: ", total );
-      console.log( "--- UNIQUE WORDS: ", unique );
+    .then(() => {
+      var unique = lexicon.size;
+      console.log("--- WORD COUNT: ", total);
+      console.log("--- UNIQUE WORDS: ", unique);
       lexicon_store();
-      console.log( "Done!")
-      return { total, unique }
-    })
+      console.log("Done!");
+      return { total, unique };
+    });
 }
 
 function parse_threads() {
-  return db.Thread.where('id', '>', 1).fetchAll().then( (threads) => {
-    console.log('got threads', threads.length)
-    threads.forEach( (thread) => {
-      total += parse_terms({
-        string: thread.get('title'),
-        thread: thread.get('id'),
-      })
-    })
-  })
+  return db.Thread.where("id", ">", 1)
+    .fetchAll()
+    .then((threads) => {
+      console.log("got threads", threads.length);
+      for (const thread of threads) {
+        total += parse_terms({
+          string: thread.get("title"),
+          thread: thread.get("id"),
+        });
+      }
+    });
 }
 
 function parse_comments() {
-  return db.Comment.where('thread', '>', 1).fetchAll().then( (comments) => {
-    console.log('got comments', comments.length)
-    comments.forEach( (comment) => {
-      total += parse_terms({
-        string: comment.get('comment').toString(),
-        thread: comment.get('thread'),
-        comment: comment.get('id'),
-      })
-    })
-  })
+  return db.Comment.where("thread", ">", 1)
+    .fetchAll()
+    .then((comments) => {
+      console.log("got comments", comments.length);
+      for (const comment of comments) {
+        total += parse_terms({
+          string: comment.get("comment").toString(),
+          thread: comment.get("thread"),
+          comment: comment.get("id"),
+        });
+      }
+    });
 }
 
 function parse_files() {
-  return db.File.fetchAll().then( (files) => {
-    console.log('got files', files.length)
-    files.forEach( (file) => {
+  return db.File.fetchAll().then((files) => {
+    console.log("got files", files.length);
+    for (const file of files) {
       total += parse_terms({
-        string: file.get('filename'),
-        thread: file.get('thread'),
-        file: file.get('id'),
-      })
-    })
-  })
+        string: file.get("filename"),
+        thread: file.get("thread"),
+        file: file.get("id"),
+      });
+    }
+  });
 }
-var underscoreRegexp = new RegExp('_', 'g')
-var spaceRegexp = new RegExp('[^a-zA-Z0-9]+', 'g')
+var underscoreRegexp = new RegExp("_", "g");
+var spaceRegexp = new RegExp("[^a-zA-Z0-9]+", "g");
-function parse_terms (opt) {
-  var thread = opt.thread
-  var comment = opt.comment || 0
-  var file = opt.file || 0
-  var string = opt.string
-  if (!string || !thread) return 0
-  var count = 0
-  var terms = string
-    .replace(underscoreRegexp, ' ')
-    .split(spaceRegexp)
-    .forEach((term) => {
-      var t = term.toLowerCase()
-      var lookup = lexicon[t] = lexicon[t] || {}
-      var res = lookup[thread] = lookup[thread] || { strength: 0 }
-      res.thread = res.thread || thread
-      res.comment = res.comment || comment
-      res.file = res.file || file
-      // prioritize threads
-      if (!comment && !file) {
-        res.strength += 2
-      }
-      else {
-        res.strength += 1
-      }
-      count += 1
-      lex_counts[term] = lex_counts[term] || 0
-      lex_counts[term] += 1
-    })
-  return count || 0
+/**
+ * For each term, create mappings:
+ *   - lexicon[term][thread] => {thread, comment, file, strength}
+ *   - lex_counts[term] => document frequency
+ *   - total terms ++
+ */
+function parse_terms(opt) {
+  var thread = opt.thread;
+  var comment = opt.comment || 0;
+  var file = opt.file || 0;
+  var string = opt.string;
+  if (!string || !thread) {
+    return 0;
+  }
+  var term_count = 0;
+  var terms = string.replace(underscoreRegexp, " ").split(spaceRegexp);
+  for (const term of terms) {
+    var parsedTerm = parse_term(term);
+    if (STOPWORDS.has(parsedTerm)) {
+      continue;
+    }
+    if (!term || !parsedTerm) {
+      continue;
+    }
+    if (!lexicon.has(parsedTerm)) {
+      lexicon.set(parsedTerm, {});
+    }
+    var lookup = lexicon.get(parsedTerm);
+    lookup[thread] = lookup[thread] || { strength: 1 };
+
+    var res = lookup[thread];
+    res.thread = res.thread || thread;
+    res.comment = res.comment || comment;
+    res.file = res.file || file;
+
+    // prioritize threads
+    if (!comment && !file) {
+      res.strength += 100;
+    } else {
+      res.strength += 1;
+    }
+    term_count += 1;
+
+    if (!lex_counts.has(parsedTerm)) {
+      lex_counts.set(parsedTerm, new Set());
+    }
+    const lex_count = lex_counts.get(parsedTerm);
+
+    try {
+      lex_count.add(res.thread);
+    } catch (error) {
+      console.error(error);
+      console.log(term, terms, lex_count);
+    }
+  }
+  return term_count || 0;
 }
-var put_total = 0
-function lexicon_store () {
-  console.log('writing db...')
-  Object.keys(lexicon).forEach( (term) => {
-    if (STOPWORDS.has(term)) return
-    var serialized = serialize_matches(term);
-    if (!serialized) return;
-    if ((put_total % 5000) === 0) console.log(put_total + '...')
-    put_total += 1
-    // if (put_total > 10) return
-    // console.log(term)
-    search_db.put(term, serialized)
-  })
+var put_total = 0;
+function lexicon_store() {
+  console.log("writing db...");
+  // console.log(Object.keys(lexicon));
+  search_db.reset();
+  for (const term of lexicon.keys()) {
+    var serialized = serialize_matches(term);
+    if (!serialized) return;
+    if (put_total % 5000 === 0) console.log(put_total + "...");
+    put_total += 1;
+    // if (put_total > 10) return
+    // console.log(term)
+    search_db.put(term, serialized);
+  }
+  // search_db.save();
+}
+function serialize_matches(term) {
+  var matches = lexicon.get(term);
+  var lex_count = lex_counts.get(term)?.size || 0;
+  if (!lex_count) {
+    return null;
+  }
+  var idf = Math.log(total / lex_count);
+  var serialized_matches = [];
+  Object.values(matches).forEach((match) => {
+    if (!match) return;
+    var s = [
+      match.thread,
+      match.comment,
+      match.file,
+      Number((match.strength * idf).toFixed(2)),
+    ];
+    if (s) serialized_matches.push(s);
+  });
+  if (!serialized_matches.length) return;
+  return serialized_matches;
 }
-function serialize_matches (term) {
-  var matches = lexicon[term]
-  var idf = Math.log(total / lex_counts[term])
-  var serialized_matches = [];
-  Object.values(matches).forEach( (match) => {
-    if (!match) return
-    var s = [
-      match.thread,
-      match.comment,
-      match.file,
-      match.strength * idf
-    ].join(' ')
-    if (s) serialized_matches.push(s)
-  })
-  if (!serialized_matches.length) return
-  return serialized_matches.join(',')
-}
\ No newline at end of file
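
A few notes on the patch, with illustrative sketches rather than authoritative code. First, tokenization: both the old and new parse_terms split input with the same regex pair, turning underscores into spaces and then treating any run of non-alphanumerics as a delimiter. A standalone example (the sample filename is invented; any lowercasing or stemming now happens in parse_term, whose source is not part of this diff):

```js
// Term splitting as in parse_terms: underscores become spaces, then any
// run of non-alphanumeric characters delimits terms.
var underscoreRegexp = new RegExp("_", "g");
var spaceRegexp = new RegExp("[^a-zA-Z0-9]+", "g");

var tokens = "my_thesis-draft (v2).final.pdf"
  .replace(underscoreRegexp, " ")
  .split(spaceRegexp);
console.log(tokens); // [ 'my', 'thesis', 'draft', 'v2', 'final', 'pdf' ]
```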
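Second, the new posting layout. parse_terms now keeps two Maps: lexicon maps a term to an object keyed by thread id (one posting per thread, accumulating strength, with title hits weighted far above comment and file hits), and lex_counts maps a term to a Set of thread ids so document frequency counts each thread once. A minimal re-implementation of the per-term update, with invented ids, to show the resulting state:

```js
// Sketch of the per-term update in the new parse_terms (ids are invented).
var lexicon = new Map();
var lex_counts = new Map();

function add_hit(term, thread, comment, file) {
  if (!lexicon.has(term)) lexicon.set(term, {});
  var lookup = lexicon.get(term);
  var res = (lookup[thread] = lookup[thread] || { strength: 1 });
  res.thread = res.thread || thread;
  res.comment = res.comment || comment;
  res.file = res.file || file;
  res.strength += !comment && !file ? 100 : 1; // thread titles dominate
  if (!lex_counts.has(term)) lex_counts.set(term, new Set());
  lex_counts.get(term).add(thread); // per-thread document frequency
}

add_hit("oven", 7, 0, 0);  // hit in the title of thread 7
add_hit("oven", 7, 42, 0); // hit in comment 42 on the same thread
console.log(lexicon.get("oven")[7]);
// { strength: 102, thread: 7, comment: 42, file: 0 }  (1 + 100 + 1)
console.log(lex_counts.get("oven").size); // 1
```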
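Finally, scoring. serialize_matches weights each posting by inverse document frequency, idf = Math.log(total / lex_count), and now emits arrays of numbers (weight rounded to two decimals) instead of the old space-and-comma-joined strings. A worked example with invented corpus numbers:

```js
// Invented stats: 50,000 term occurrences indexed overall, with "oven"
// appearing in 12 distinct threads.
var total = 50000;
var lex_count = 12;
var idf = Math.log(total / lex_count); // ≈ 8.33; rarer terms weigh more

var match = { thread: 7, comment: 42, file: 0, strength: 102 };
var serialized = [
  match.thread,
  match.comment,
  match.file,
  Number((match.strength * idf).toFixed(2)),
];
console.log(serialized); // [ 7, 42, 0, 850.16 ]
```

Note that in this module total is the running count of all term occurrences rather than the number of documents, so the score is a variant of the textbook tf-idf formula, not the classic definition.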
