| field     | value                                                             | date                      |
|-----------|-------------------------------------------------------------------|---------------------------|
| author    | julian laplace <julescarbon@gmail.com>                            | 2026-01-18 14:47:28 +0100 |
| committer | julian laplace <julescarbon@gmail.com>                            | 2026-01-18 14:47:28 +0100 |
| commit    | d08fa34987e0792a8722d77dd052a1cacd96db10 (patch)                  |                           |
| tree      | 1ec95b121e3c597b0d03ce9906565fcf2055ec8d /bucky/search/lexicon.js |                           |
| parent    | 9f1b85f69a2129622fd60c858247292f30f7da35 (diff)                   |                           |
fixing search
Diffstat (limited to 'bucky/search/lexicon.js')
| mode       | file                    | lines changed |
|------------|-------------------------|---------------|
| -rw-r--r-- | bucky/search/lexicon.js | 232           |

1 file changed, 132 insertions, 100 deletions
```diff
diff --git a/bucky/search/lexicon.js b/bucky/search/lexicon.js
index dc1d7ab..0783512 100644
--- a/bucky/search/lexicon.js
+++ b/bucky/search/lexicon.js
@@ -1,129 +1,161 @@
-require('dotenv').load();
+require("dotenv").load();
 
-var STOPWORDS = require('./stopwords')
-var bdb = require('./bdb')
-var db = require('../db')
+var STOPWORDS = require("./stopwords");
+var bdb = require("./bdb");
+var db = require("../db");
+var parse_term = require("./parse_term");
 
-var search_db = bdb('search')
+var search_db = bdb("search");
 
-var lexicon = {}
-var lex_counts = {}
-var total = 0
+var lexicon = {};
+var lex_counts = {};
+var total = 0;
 
-module.exports = { build: build_index }
+module.exports = {
+  build: build_index,
+  watch: watch_index,
+  save: () => search_db.save(),
+};
+
+var BUILD_DELAY = 1000 * 60 * 60 * 24;
+function watch_index() {
+  build_index();
+  console.log(
+    "rebuilding search index every",
+    BUILD_DELAY / (60 * 60 * 1000),
+    "hours",
+  );
+  var interval = setInterval(build_index, BUILD_DELAY);
+}
 
 function build_index(cb) {
-  console.log("building index")
+  console.log("building search index");
+  lexicon = {};
+  lex_counts = {};
+  total = 0;
   return parse_threads()
     .then(parse_comments)
     .then(parse_files)
-    .then( () => {
-      var unique = Object.keys(lexicon).length
-      console.log( "--- WORD COUNT: ", total );
-      console.log( "--- UNIQUE WORDS: ", unique );
+    .then(() => {
+      var unique = Object.keys(lexicon).length;
+      console.log("--- WORD COUNT: ", total);
+      console.log("--- UNIQUE WORDS: ", unique);
       lexicon_store();
-      console.log( "Done!")
-      return { total, unique }
-    })
+      console.log("Done!");
+      return { total, unique };
+    });
 }
 
 function parse_threads() {
-  return db.Thread.where('id', '>', 1).fetchAll().then( (threads) => {
-    console.log('got threads', threads.length)
-    threads.forEach( (thread) => {
-      total += parse_terms({
-        string: thread.get('title'),
-        thread: thread.get('id'),
-      })
-    })
-  })
+  return db.Thread.where("id", ">", 1)
+    .fetchAll()
+    .then((threads) => {
+      console.log("got threads", threads.length);
+      threads.forEach((thread) => {
+        total += parse_terms({
+          string: thread.get("title"),
+          thread: thread.get("id"),
+        });
+      });
+    });
 }
 
 function parse_comments() {
-  return db.Comment.where('thread', '>', 1).fetchAll().then( (comments) => {
-    console.log('got comments', comments.length)
-    comments.forEach( (comment) => {
-      total += parse_terms({
-        string: comment.get('comment').toString(),
-        thread: comment.get('thread'),
-        comment: comment.get('id'),
-      })
-    })
-  })
+  return db.Comment.where("thread", ">", 1)
+    .fetchAll()
+    .then((comments) => {
+      console.log("got comments", comments.length);
+      comments.forEach((comment) => {
+        total += parse_terms({
+          string: comment.get("comment").toString(),
+          thread: comment.get("thread"),
+          comment: comment.get("id"),
+        });
+      });
+    });
 }
 
 function parse_files() {
-  return db.File.fetchAll().then( (files) => {
-    console.log('got files', files.length)
-    files.forEach( (file) => {
+  return db.File.fetchAll().then((files) => {
+    console.log("got files", files.length);
+    files.forEach((file) => {
       total += parse_terms({
-        string: file.get('filename'),
-        thread: file.get('thread'),
-        file: file.get('id'),
-      })
-    })
-  })
+        string: file.get("filename"),
+        thread: file.get("thread"),
+        file: file.get("id"),
+      });
+    });
+  });
 }
 
-var underscoreRegexp = new RegExp('_', 'g')
-var spaceRegexp = new RegExp('[^a-zA-Z0-9]+', 'g')
+var underscoreRegexp = new RegExp("_", "g");
+var spaceRegexp = new RegExp("[^a-zA-Z0-9]+", "g");
 
-function parse_terms (opt) {
-  var thread = opt.thread
-  var comment = opt.comment || 0
-  var file = opt.file || 0
-  var string = opt.string
-  if (!string || !thread) return 0
-  var count = 0
+function parse_terms(opt) {
+  var thread = opt.thread;
+  var comment = opt.comment || 0;
+  var file = opt.file || 0;
+  var string = opt.string;
+  if (!string || !thread) return 0;
+  var count = 0;
   var terms = string
-    .replace(underscoreRegexp, ' ')
+    .replace(underscoreRegexp, " ")
    .split(spaceRegexp)
    .forEach((term) => {
-      var t = term.toLowerCase()
-      var lookup = lexicon[t] = lexicon[t] || {}
-      var res = lookup[thread] = lookup[thread] || { strength: 0 }
-      res.thread = res.thread || thread
-      res.comment = res.comment || comment
-      res.file = res.file || file
+      var t = parse_term(term);
+      if (!term) {
+        return;
+      }
+      var lookup = (lexicon[t] = lexicon[t] || {});
+      var res = (lookup[thread] = lookup[thread] || { strength: 0 });
+      res.thread = res.thread || thread;
+      res.comment = res.comment || comment;
+      res.file = res.file || file;
      // prioritize threads
      if (!comment && !file) {
-        res.strength += 2
-      }
-      else {
-        res.strength += 1
+        res.strength += 2;
+      } else {
+        res.strength += 1;
      }
-      count += 1
-      lex_counts[term] = lex_counts[term] || 0
-      lex_counts[term] += 1
-    })
-  return count || 0
+      count += 1;
+      lex_counts[term] = lex_counts[term] || 0;
+      lex_counts[term] += 1;
+    });
+  return count || 0;
 }
 
-var put_total = 0
-function lexicon_store () {
-  console.log('writing db...')
-  Object.keys(lexicon).forEach( (term) => {
-    if (STOPWORDS.has(term)) return
-    var serialized = serialize_matches(term);
-    if (! serialized) return;
-    if ((put_total % 5000) === 0) console.log(put_total + '...')
-    put_total += 1
-    // if (put_total > 10) return
-    // console.log(term)
-    search_db.put(term, serialized)
-  })
+var put_total = 0;
+function lexicon_store() {
+  console.log("writing db...");
+  // console.log(Object.keys(lexicon));
+  search_db.reset();
+  Object.keys(lexicon).forEach((term) => {
+    if (STOPWORDS.has(term)) return;
+    var serialized = serialize_matches(term);
+    if (!serialized) return;
+    if (put_total % 5000 === 0) console.log(put_total + "...");
+    put_total += 1;
+    // if (put_total > 10) return
+    // console.log(term)
+    search_db.put(term, serialized);
+  });
+  search_db.save();
+}
+function serialize_matches(term) {
+  var matches = lexicon[term];
+  var lex_count = lex_counts[term];
+  if (!lex_count) {
+    return null;
+  }
+  var idf = Math.log(total / lex_count);
+  var serialized_matches = [];
+  Object.values(matches).forEach((match) => {
+    if (!match) return;
+    var s = [
+      match.thread,
+      match.comment,
+      match.file,
+      Number((match.strength * idf).toFixed(2)),
+    ];
+    if (s) serialized_matches.push(s);
+  });
+  if (!serialized_matches.length) return;
+  return serialized_matches;
 }
-function serialize_matches (term) {
-  var matches = lexicon[term]
-  var idf = Math.log(total / lex_counts[term])
-  var serialized_matches = [];
-  Object.values(matches).forEach( (match) => {
-    if (!match) return
-    var s = [
-      match.thread,
-      match.comment,
-      match.file,
-      match.strength * idf
-    ].join(' ')
-    if (s) serialized_matches.push(s)
-  })
-  if (!serialized_matches.length) return
-  return serialized_matches.join(',')
-}
\ No newline at end of file
```
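
The rewrite moves token normalization out of `parse_terms` into a new `./parse_term` module, which this diff does not include. A minimal sketch of what that module might look like, assuming it keeps the old `toLowerCase()` behavior and returns a falsy value for tokens the indexer should skip (the module name comes from the diff; its body here is a guess):

```js
// parse_term.js — hypothetical sketch; the real module is not shown in this diff.
// Normalizes a raw token into a lexicon key, returning "" for tokens
// that should not be indexed so the caller can skip them.
module.exports = function parse_term(term) {
  var t = (term || "").toLowerCase().trim();
  if (t.length < 2) return ""; // drop empty and single-character tokens
  if (/^\d+$/.test(t)) return ""; // drop purely numeric tokens
  return t;
};
```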
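Each index entry stored by `lexicon_store` is now an array of `[thread, comment, file, weight]` tuples rather than a joined string, where `weight` is a TF-IDF-style score: the raw `strength` (thread titles counted double) scaled by `idf = Math.log(total / lex_counts[term])` and rounded to two decimals. A hedged sketch of how a query side might rank threads against that format; `search_db.get` is an assumption, since the `bdb` wrapper's read API does not appear in this diff:

```js
// Hypothetical query-side sketch; assumes bdb("search").get(term) returns
// the [thread, comment, file, weight] tuples written by lexicon_store.
var bdb = require("./bdb");
var parse_term = require("./parse_term");

var search_db = bdb("search");

function search(query) {
  var scores = {}; // thread id -> accumulated weight
  query.split(/[^a-zA-Z0-9]+/).forEach(function (raw) {
    var term = parse_term(raw);
    if (!term) return;
    (search_db.get(term) || []).forEach(function (match) {
      var thread = match[0];
      var weight = match[3];
      scores[thread] = (scores[thread] || 0) + weight;
    });
  });
  // highest cumulative weight first
  return Object.keys(scores).sort(function (a, b) {
    return scores[b] - scores[a];
  });
}
```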
