| author | julian laplace <julescarbon@gmail.com> | 2026-01-18 14:47:28 +0100 |
| committer | julian laplace <julescarbon@gmail.com> | 2026-01-18 14:47:28 +0100 |
| commit | d08fa34987e0792a8722d77dd052a1cacd96db10 (patch) |
| tree | 1ec95b121e3c597b0d03ce9906565fcf2055ec8d /bucky |
| parent | 9f1b85f69a2129622fd60c858247292f30f7da35 (diff) |
fixing search
Diffstat (limited to 'bucky')
| -rw-r--r-- | bucky/app/site.js | 4 |
| -rw-r--r-- | bucky/bin/build-search.js | 8 |
| -rw-r--r-- | bucky/search/bdb.js | 106 |
| -rw-r--r-- | bucky/search/lexicon.js | 232 |
| -rw-r--r-- | bucky/search/middleware.js | 160 |
| -rw-r--r-- | bucky/search/parse_term.js | 7 |
| -rw-r--r-- | bucky/search/search.js | 9 |
| -rw-r--r-- | bucky/search/snippet.js | 57 |
| -rw-r--r-- | bucky/search/stopwords.js | 32 |
9 files changed, 317 insertions, 298 deletions
```diff
diff --git a/bucky/app/site.js b/bucky/app/site.js
index 3627bac..de42155 100644
--- a/bucky/app/site.js
+++ b/bucky/app/site.js
@@ -19,6 +19,7 @@
 var RedisStore = require("connect-redis")(session);
 var redisClient = redis.createClient();
 var upload = require("../util/upload");
+var lexicon = require("../search/lexicon");
 
 var app, server;
 
@@ -83,7 +84,7 @@ site.init = function () {
   server = http.createServer(app).listen(process.env.PORT || 5000, function () {
     console.log(
       "Bucky listening at http://" + process.env.HOST_NAME + ":%s",
-      server.address().port
+      server.address().port,
     );
   });
 
@@ -101,6 +102,7 @@ site.init = function () {
   if (process.env.NODE_ENV === "production") {
     require("../bin/build-scripts");
   }
+  lexicon.watch();
 };
 
 site.api = require("./api");
 site.pages = require("./pages");
```

```diff
diff --git a/bucky/bin/build-search.js b/bucky/bin/build-search.js
index 23657b3..cb12c23 100644
--- a/bucky/bin/build-search.js
+++ b/bucky/bin/build-search.js
@@ -1,4 +1,6 @@
-var lexicon = require('../search/lexicon')
-
-lexicon.build().then(() => process.exit())
+var lexicon = require("../search/lexicon");
+
+lexicon.build().then(() => {
+  lexicon.save();
+  process.exit();
+});
```
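Taken together, these two changes split persistence out of the build: `lexicon.build()` only fills the in-memory index, the CLI script persists it explicitly with `lexicon.save()`, and the server schedules periodic rebuilds through the new `lexicon.watch()` (defined in `lexicon.js` below). A minimal sketch of the server-side behavior, assuming the module layout shown in the diffstat:

```js
// Sketch, not part of the commit: how site.init consumes the lexicon API.
var lexicon = require("./search/lexicon");

// watch() builds the index immediately, then rebuilds it on an interval.
// BUILD_DELAY is 1000 * 60 * 60 * 24 ms, so the startup log reads
// "rebuilding search index every 24 hours".
lexicon.watch();
```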
```diff
diff --git a/bucky/search/bdb.js b/bucky/search/bdb.js
index 0495666..e62f59e 100644
--- a/bucky/search/bdb.js
+++ b/bucky/search/bdb.js
@@ -1,68 +1,48 @@
 var fs = require("fs");
 
-function berkeleydb(fn) {
-  var db;
-  var bdb_lib = require("berkeleydb");
-  var dbenv = new bdb_lib.DbEnv();
-  var bdb_status = dbenv.open("./search/db/env");
-  if (bdb_status) {
-    console.log("open dbenv failed:", bdb_status);
-    process.exit();
-  }
-
-  fn = "./" + fn + ".db";
-
-  function exitHandler(options, err) {
-    if (db) db.close();
-    // if (options.cleanup) console.log('clean');
-    if (err) console.log(err.stack);
-    if (options.exit) process.exit();
-  }
-
-  // do something when app is closing
-  process.on("exit", exitHandler.bind(null, { cleanup: true }));
-
-  // catches ctrl+c event
-  process.on("SIGINT", exitHandler.bind(null, { exit: true }));
-
-  // catches "kill pid" (for example: nodemon restart)
-  process.on("SIGUSR1", exitHandler.bind(null, { exit: true }));
-  process.on("SIGUSR2", exitHandler.bind(null, { exit: true }));
+var databases = {};
 
-  //catches uncaught exceptions
-  process.on("uncaughtException", exitHandler.bind(null, { exit: true }));
-
-  function open(fn) {
-    if (db) db.close();
-    var _db = new bdb_lib.Db(dbenv);
-    var bdb_status = _db.open(fn);
-    if (bdb_status) {
-      console.log("open " + fn + " failed:", bdb_status);
-      process.exit();
-    }
-    db = _db;
+function jsondb(dbName) {
+  if (databases[dbName]) {
+    return databases[dbName];
   }
 
-  open(fn);
+  let db = {};
+  let filename = "./" + dbName + ".db";
 
-  return {
+  // Store context for this database
+  var controller = {
+    load: function () {
+      if (fs.existsSync(filename)) {
+        try {
+          db = JSON.parse(fs.readFileSync(filename));
+        } catch (err) {
+          console.error("couldn't read " + filename);
+          process.exit();
+        }
+      } else {
+        db = {};
+      }
+    },
+    save: function () {
+      fs.writeFileSync(filename, JSON.stringify(db, false, 0));
+    },
+    reset: function () {
+      db = {};
+    },
     put: function (term, serialized) {
-      db.put(term, serialized);
+      db[term] = serialized;
     },
     get: function (term) {
-      return db.get(term);
+      return db[term];
     },
   };
-}
 
-function jsondb(fn) {
-  let db;
-
-  fn = "./" + fn + ".db";
+  databases[dbName] = controller;
 
   function exitHandler(options, err) {
-    if (db) {
-      fs.writeFileSync(fn, JSON.stringify(db, false, 0));
-    }
+    // if (db) {
+    //   fs.writeFileSync(fn, JSON.stringify(db, false, 0));
+    // }
     // if (options.cleanup) console.log('clean');
     if (err) console.log(err.stack);
     if (options.exit) process.exit();
@@ -81,24 +61,8 @@ function jsondb(fn) {
   //catches uncaught exceptions
   process.on("uncaughtException", exitHandler.bind(null, { exit: true }));
 
-  if (fs.existsSync(fn)) {
-    try {
-      db = JSON.parse(fs.readFileSync(fn));
-    } catch (err) {
-      console.error("couldn't read " + fn);
-      process.exit();
-    }
-  } else {
-    db = {};
-  }
+  controller.load();
 
-  return {
-    put: function (term, serialized) {
-      db[term] = serialized;
-    },
-    get: function (term) {
-      return db[term];
-    },
-  };
+  return controller;
 }
 
-module.exports = process.env.USE_BDB === "true" ? berkeleydb : jsondb;
+module.exports = jsondb;
```
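The BerkeleyDB backend is gone; `bdb.js` now returns a cached, in-memory JSON store per database name, and writes to disk only when `save()` is called. A hedged usage sketch (the only name the codebase creates is `"search"`, persisted to `./search.db`; the stored values mirror the `[thread, comment, file, weight]` tuples that `lexicon.js` produces):

```js
// Sketch assuming the module layout above; not part of the commit.
var jsondb = require("./search/bdb");

var store = jsondb("search"); // cached: calling jsondb("search") again
                              // returns the same controller object
store.put("term", [[12, 0, 0, 3.5]]); // plain object assignment in memory
store.get("term");            // => [[12, 0, 0, 3.5]]
store.save();                 // JSON.stringify the whole object to ./search.db
store.reset();                // drop all entries (used before a rebuild)
```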
```diff
diff --git a/bucky/search/lexicon.js b/bucky/search/lexicon.js
index dc1d7ab..0783512 100644
--- a/bucky/search/lexicon.js
+++ b/bucky/search/lexicon.js
@@ -1,129 +1,161 @@
-require('dotenv').load();
+require("dotenv").load();
 
-var STOPWORDS = require('./stopwords')
-var bdb = require('./bdb')
-var db = require('../db')
+var STOPWORDS = require("./stopwords");
+var bdb = require("./bdb");
+var db = require("../db");
+var parse_term = require("./parse_term");
 
-var search_db = bdb('search')
+var search_db = bdb("search");
 
-var lexicon = {}
-var lex_counts = {}
-var total = 0
+var lexicon = {};
+var lex_counts = {};
+var total = 0;
 
-module.exports = { build: build_index }
+module.exports = {
+  build: build_index,
+  watch: watch_index,
+  save: () => search_db.save(),
+};
+
+var BUILD_DELAY = 1000 * 60 * 60 * 24;
+
+function watch_index() {
+  build_index();
+  console.log(
+    "rebuilding search index every",
+    BUILD_DELAY / (60 * 60 * 1000),
+    "hours",
+  );
+  var interval = setInterval(build_index, BUILD_DELAY);
+}
 
 function build_index(cb) {
-  console.log("building index")
+  console.log("building search index");
+  lexicon = {};
+  lex_counts = {};
+  total = 0;
   return parse_threads()
     .then(parse_comments)
     .then(parse_files)
-    .then( () => {
-      var unique = Object.keys(lexicon).length
-      console.log( "--- WORD COUNT: ", total );
-      console.log( "--- UNIQUE WORDS: ", unique );
+    .then(() => {
+      var unique = Object.keys(lexicon).length;
+      console.log("--- WORD COUNT: ", total);
+      console.log("--- UNIQUE WORDS: ", unique);
       lexicon_store();
-      console.log( "Done!")
-      return { total, unique }
-    })
+      console.log("Done!");
+      return { total, unique };
+    });
 }
 
 function parse_threads() {
-  return db.Thread.where('id', '>', 1).fetchAll().then( (threads) => {
-    console.log('got threads', threads.length)
-    threads.forEach( (thread) => {
-      total += parse_terms({
-        string: thread.get('title'),
-        thread: thread.get('id'),
-      })
-    })
-  })
+  return db.Thread.where("id", ">", 1)
+    .fetchAll()
+    .then((threads) => {
+      console.log("got threads", threads.length);
+      threads.forEach((thread) => {
+        total += parse_terms({
+          string: thread.get("title"),
+          thread: thread.get("id"),
+        });
+      });
+    });
 }
 
 function parse_comments() {
-  return db.Comment.where('thread', '>', 1).fetchAll().then( (comments) => {
-    console.log('got comments', comments.length)
-    comments.forEach( (comment) => {
-      total += parse_terms({
-        string: comment.get('comment').toString(),
-        thread: comment.get('thread'),
-        comment: comment.get('id'),
-      })
-    })
-  })
+  return db.Comment.where("thread", ">", 1)
+    .fetchAll()
+    .then((comments) => {
+      console.log("got comments", comments.length);
+      comments.forEach((comment) => {
+        total += parse_terms({
+          string: comment.get("comment").toString(),
+          thread: comment.get("thread"),
+          comment: comment.get("id"),
+        });
+      });
+    });
 }
 
 function parse_files() {
-  return db.File.fetchAll().then( (files) => {
-    console.log('got files', files.length)
-    files.forEach( (file) => {
+  return db.File.fetchAll().then((files) => {
+    console.log("got files", files.length);
+    files.forEach((file) => {
       total += parse_terms({
-        string: file.get('filename'),
-        thread: file.get('thread'),
-        file: file.get('id'),
-      })
-    })
-  })
+        string: file.get("filename"),
+        thread: file.get("thread"),
+        file: file.get("id"),
+      });
+    });
+  });
 }
 
-var underscoreRegexp = new RegExp('_', 'g')
-var spaceRegexp = new RegExp('[^a-zA-Z0-9]+', 'g')
+var underscoreRegexp = new RegExp("_", "g");
+var spaceRegexp = new RegExp("[^a-zA-Z0-9]+", "g");
 
-function parse_terms (opt) {
-  var thread = opt.thread
-  var comment = opt.comment || 0
-  var file = opt.file || 0
-  var string = opt.string
-  if (!string || !thread) return 0
-  var count = 0
+function parse_terms(opt) {
+  var thread = opt.thread;
+  var comment = opt.comment || 0;
+  var file = opt.file || 0;
+  var string = opt.string;
+  if (!string || !thread) return 0;
+  var count = 0;
   var terms = string
-    .replace(underscoreRegexp, ' ')
+    .replace(underscoreRegexp, " ")
    .split(spaceRegexp)
    .forEach((term) => {
-      var t = term.toLowerCase()
-      var lookup = lexicon[t] = lexicon[t] || {}
-      var res = lookup[thread] = lookup[thread] || { strength: 0 }
-      res.thread = res.thread || thread
-      res.comment = res.comment || comment
-      res.file = res.file || file
+      var t = parse_term(term);
+      if (!t) {
+        return;
+      }
+      var lookup = (lexicon[t] = lexicon[t] || {});
+      var res = (lookup[thread] = lookup[thread] || { strength: 0 });
+      res.thread = res.thread || thread;
+      res.comment = res.comment || comment;
+      res.file = res.file || file;
       // prioritize threads
       if (!comment && !file) {
-        res.strength += 2
-      }
-      else {
-        res.strength += 1
+        res.strength += 2;
+      } else {
+        res.strength += 1;
       }
-      count += 1
-      lex_counts[term] = lex_counts[term] || 0
-      lex_counts[term] += 1
-    })
-  return count || 0
+      count += 1;
+      lex_counts[t] = lex_counts[t] || 0;
+      lex_counts[t] += 1;
+    });
+  return count || 0;
 }
 
-var put_total = 0
-function lexicon_store () {
-  console.log('writing db...')
-  Object.keys(lexicon).forEach( (term) => {
-    if (STOPWORDS.has(term)) return
-    var serialized = serialize_matches(term);
-    if (! serialized) return;
-    if ((put_total % 5000) === 0) console.log(put_total + '...')
-    put_total += 1
-    // if (put_total > 10) return
-    // console.log(term)
-    search_db.put(term, serialized)
-  })
+var put_total = 0;
+
+function lexicon_store() {
+  console.log("writing db...");
+  // console.log(Object.keys(lexicon));
+  search_db.reset();
+  Object.keys(lexicon).forEach((term) => {
+    if (STOPWORDS.has(term)) return;
+    var serialized = serialize_matches(term);
+    if (!serialized) return;
+    if (put_total % 5000 === 0) console.log(put_total + "...");
+    put_total += 1;
+    // if (put_total > 10) return
+    // console.log(term)
+    search_db.put(term, serialized);
+  });
+  search_db.save();
+}
+
+function serialize_matches(term) {
+  var matches = lexicon[term];
+  var lex_count = lex_counts[term];
+  if (!lex_count) {
+    return null;
+  }
+  var idf = Math.log(total / lex_count);
+  var serialized_matches = [];
+  Object.values(matches).forEach((match) => {
+    if (!match) return;
+    var s = [
+      match.thread,
+      match.comment,
+      match.file,
+      Number((match.strength * idf).toFixed(2)),
+    ];
+    if (s) serialized_matches.push(s);
+  });
+  if (!serialized_matches.length) return;
+  return serialized_matches;
 }
-
-function serialize_matches (term) {
-  var matches = lexicon[term]
-  var idf = Math.log(total / lex_counts[term])
-  var serialized_matches = [];
-  Object.values(matches).forEach( (match) => {
-    if (!match) return
-    var s = [
-      match.thread,
-      match.comment,
-      match.file,
-      match.strength * idf
-    ].join(' ')
-    if (s) serialized_matches.push(s)
-  })
-  if (!serialized_matches.length) return
-  return serialized_matches.join(',')
-}
\ No newline at end of file
```
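The rewritten `serialize_matches` stores each posting as a `[thread, comment, file, weight]` array instead of a space-joined string, where `weight = strength * idf` and `idf = Math.log(total / lex_count)`. Toy numbers (not from the real index) showing how the inverse document frequency damps common terms:

```js
// Hypothetical counts for illustration only.
var total = 100000; // total words indexed across threads, comments, files

var idf_rare = Math.log(total / 10);      // ≈ 9.21 for a term seen 10 times
var idf_common = Math.log(total / 50000); // ≈ 0.69 for a term seen 50,000 times

// A thread-title hit earns strength 2, a comment or file hit strength 1,
// so a title hit on a rare term far outweighs a comment hit on a common one:
Number((2 * idf_rare).toFixed(2));   // => 18.42
Number((1 * idf_common).toFixed(2)); // => 0.69
```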
```diff
diff --git a/bucky/search/middleware.js b/bucky/search/middleware.js
index 0cca05c..a93ee7f 100644
--- a/bucky/search/middleware.js
+++ b/bucky/search/middleware.js
@@ -1,111 +1,121 @@
-var db = require('../db')
+var db = require("../db");
 
-var search = require('./search')
-var snippet = require('./snippet')
-var lexicon = require('./lexicon')
+var search = require("./search");
+var snippet = require("./snippet");
+var lexicon = require("./lexicon");
 
 module.exports = {
-
   search: function (req, res, next) {
-    res.search = search.search(req.query.query, req.query.start, req.query.limit)
-    if (! res.search) {
-      res.sendStatus(400)
-      return
+    res.search = search.search(
+      req.query.query,
+      req.query.start,
+      req.query.limit,
+    );
+    if (!res.search) {
+      res.sendStatus(400);
+      return;
     }
-    next()
+    next();
   },
 
-  getThreads: function (req, res, next){
+  getThreads: function (req, res, next) {
     var thread_ids = res.search.thread_ids;
-    if (! thread_ids || ! thread_ids.length) {
-      res.search.threads = []
-      return next()
+    if (!thread_ids || !thread_ids.length) {
+      res.search.threads = [];
+      return next();
     }
-    db.getThreadsById(thread_ids).then(function(threads){
+    db.getThreadsById(thread_ids).then(function (threads) {
       threads.forEach((thread) => {
-        var flag_id = thread.get('flagged')
+        var flag_id = thread.get("flagged");
         if (flag_id) {
-          res.search.file_ids.push(flag_id)
+          res.search.file_ids.push(flag_id);
         }
-      })
-      res.search.threads = threads
-      next()
-    })
+      });
+      res.search.threads = threads;
+      next();
+    });
   },
 
-  getComments: function (req, res, next){
+  getComments: function (req, res, next) {
     var comment_ids = res.search.comment_ids;
-    if (! comment_ids || ! comment_ids.length) {
-      res.search.comments = []
-      return next()
+    if (!comment_ids || !comment_ids.length) {
+      res.search.comments = [];
+      return next();
     }
-    db.getCommentsById(comment_ids).then(function(comments){
-      var terms = res.search.meta.terms
-      comments.forEach(function(comment){
-        const snip = snippet(comment.get('comment').toString(), terms)
-        comment.set('comment', snip)
-      })
-      res.search.comments = comments
-      next()
-    })
+    db.getCommentsById(comment_ids).then(function (comments) {
+      var terms = res.search.meta.terms;
+      comments.forEach(function (comment) {
+        const snip = snippet(comment.get("comment").toString(), terms);
+        comment.set("comment", snip);
+      });
+      res.search.comments = comments;
+      next();
+    });
   },
 
-  getFiles: function (req, res, next){
-    var file_ids = res.search.file_ids
-    if (! file_ids || ! file_ids.length) {
-      res.search.files = []
-      return next()
+  getFiles: function (req, res, next) {
+    var file_ids = res.search.file_ids;
+    if (!file_ids || !file_ids.length) {
+      res.search.files = [];
+      return next();
    }
-    db.getFilesById(file_ids).then(function(files){
-      res.search.files = files
-      next()
-    })
+    db.getFilesById(file_ids).then(function (files) {
+      res.search.files = files;
+      next();
+    });
   },
 
-  logQuery: function(req, res, next) {
+  logQuery: function (req, res, next) {
     // req.search.query, req.search.count
-    next()
+    next();
   },
 
-  success: function(req, res, next){
-    var terms = res.search.meta.terms
-    var threads = {}, comments = {}, files = {}
-    res.search.threads.forEach((t) => { threads[t.id] = t })
-    res.search.comments.forEach((t) => { comments[t.id] = t })
-    res.search.files.forEach((t) => { files[t.id] = t })
+  success: function (req, res, next) {
+    var terms = res.search.meta.terms;
+    var threads = {},
+      comments = {},
+      files = {};
+    res.search.threads.forEach((t) => {
+      threads[t.id] = t;
+    });
+    res.search.comments.forEach((t) => {
+      comments[t.id] = t;
+    });
+    res.search.files.forEach((t) => {
+      files[t.id] = t;
+    });
     var results = res.search.results.map((r) => {
-      var m = {}
-      m.thread = threads[r.thread]
-      m.comment = comments[r.comment]
-      m.file = files[r.file]
-      m.count = r.count
-      m.strength = r.strength
+      var m = {};
+      m.thread = threads[r.thread];
+      m.comment = comments[r.comment];
+      m.file = files[r.file];
+      m.count = r.count;
+      m.strength = r.strength;
       if (m.thread) {
-        var flagged = m.thread.get('flagged')
+        var flagged = m.thread.get("flagged");
         if (flagged) {
-          m.thread.set('flagged', files[flagged])
+          m.thread.set("flagged", files[flagged]);
         }
-        var allowed = m.thread.get('allowed')
+        var allowed = m.thread.get("allowed");
         if (allowed) {
-          m.thread.set('allowed', allowed.toString().split(" "))
+          m.thread.set("allowed", allowed.toString().split(" "));
         }
-        var display = m.thread.get('display')
+        var display = m.thread.get("display");
         if (display) {
-          m.thread.set('display', display.toString().split(" "))
+          m.thread.set("display", display.toString().split(" "));
         }
       }
-      return m
-    })
+      return m;
+    });
     res.json({
       meta: res.search.meta,
       results: results,
-    })
+    });
   },
 
-  rebuild: function(req, res, next){
-    lexicon.build().then( (data) => {
-      res.json(data)
-    })
+  rebuild: function (req, res, next) {
+    lexicon.build().then((data) => {
+      res.json(data);
+    });
   },
-
-}
+};
```
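These handlers are meant to be chained as Express middleware, each one enriching `res.search` before `success` serializes the response. The route registration is not part of this diff, so the wiring below is an assumed illustration:

```js
// Hypothetical wiring — the real routes live elsewhere in bucky/app.
var express = require("express");
var app = express();
var middleware = require("./search/middleware");

app.get(
  "/api/search",
  middleware.search,      // runs the query, attaches res.search
  middleware.getThreads,  // loads matched threads, queues flagged file ids
  middleware.getComments, // loads matched comments, swaps bodies for snippets
  middleware.getFiles,    // loads matched files
  middleware.logQuery,    // placeholder, currently just calls next()
  middleware.success,     // joins everything and responds with JSON
);
```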
```diff
diff --git a/bucky/search/parse_term.js b/bucky/search/parse_term.js
new file mode 100644
index 0000000..470d371
--- /dev/null
+++ b/bucky/search/parse_term.js
@@ -0,0 +1,7 @@
+module.exports = function parse_term(term) {
+  return term
+    ? String(term)
+        .toLowerCase()
+        .replace(/(es|ing|ly|ed|er)?s?$/, "")
+    : "";
+};
```

```diff
diff --git a/bucky/search/search.js b/bucky/search/search.js
index fb3bb2d..8924b1f 100644
--- a/bucky/search/search.js
+++ b/bucky/search/search.js
@@ -1,12 +1,14 @@
 var db = require("../db");
 var bdb = require("./bdb")("search");
 var STOPWORDS = require("./stopwords");
+var parse_term = require("./parse_term");
 
 var wordRegexp = new RegExp("[^a-z0-9]+", "g");
 
 function parse_terms(s) {
   return s
     .toLowerCase()
     .split(wordRegexp)
+    .map(parse_term)
     .filter((term) => !!term);
 }
 
 function cmp(a, b) {
@@ -16,12 +18,11 @@ function cmp(a, b) {
 function find_term(term) {
   var row = bdb.get(term);
   if (!row) return [];
-  var res = row.toString();
+  var res = row;
   // console.log(res)
   if (!res.length) return [];
-  var matches = res.split(",").map((s) => {
-    if (!s.length) return;
-    var partz = s.split(" ");
+  var matches = res.map((partz) => {
+    if (!partz.length) return;
     return {
       thread: parseInt(partz[0]),
       comment: parseInt(partz[1]),
```
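`parse_term` is the piece that actually fixes search: the same crude suffix-stripper now normalizes terms at index time (`lexicon.js`), query time (`search.js`), and highlight time (`snippet.js`), so all three stages agree on a token's form. A few evaluated examples (outputs computed from the regex above):

```js
var parse_term = require("./parse_term");

parse_term("Searching"); // => "search"  (lowercased, "ing" stripped)
parse_term("threads");   // => "thread"  (trailing "s" stripped)
parse_term("files");     // => "fil"     ("es" stripped — note "file" stays
                         //    "file", so "file" and "files" stem apart)
parse_term("");          // => ""        (falsy input yields the empty string)
```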
```diff
diff --git a/bucky/search/snippet.js b/bucky/search/snippet.js
index 17988d2..787a53f 100644
--- a/bucky/search/snippet.js
+++ b/bucky/search/snippet.js
@@ -1,35 +1,36 @@
-var util = require('../util/util')
-var STOPWORDS = require('./stopwords')
+var util = require("../util/util");
+var STOPWORDS = require("./stopwords");
+var parse_term = require("./parse_term");
 
 function snippet(s, terms) {
-  s = util.sanitize(s)
-  var term_set = new Set(terms)
-
-  var words = s.split(/[^a-zA-Z0-9]+/)
-  var snippet = "";
-
+  s = util.sanitize(s);
+  var term_set = new Set(terms);
+
+  var words = s.split(/[^a-zA-Z0-9]+/);
+  var snippet = "";
+
   // deduper for matching @words indexes, so we don't add a word twice
-  var index_matches = {}
+  var index_matches = {};
 
   // words in the eventual snippet
-  var words_matched = []
+  var words_matched = [];
 
   // counter for aggregating context after a match
   var aggr = 0;
 
   // amount of context to show, in number of words surrounding a match
   var pad = 10;
 
   // loop over each of the words in the string
-  var word
   for (var i = 0, len = words.length; i < len; i++) {
-    word = words[i]
+    var word = words[i];
+    var term = parse_term(word);
 
     // if the word matches...
-    if (term_set.has(word.toLowerCase()) && ! STOPWORDS.has(word.toLowerCase())) {
+    if (term && term_set.has(term) && !STOPWORDS.has(term.toLowerCase())) {
       // if we aren't already aggregating, add an ellipsis
-      if (! aggr) {
-        words_matched.push("...")
+      if (!aggr) {
+        words_matched.push("...");
       }
 
       // look backward $pad words
@@ -44,38 +45,38 @@ function snippet(s, terms) {
         if (index_matches[idx]) continue INNER;
 
         // checks out, save this word
-        words_matched.push(words[idx])
+        words_matched.push(words[idx]);
 
         // note the matching index in our deduper
         index_matches[idx] = 1;
-      }
+      }
 
       // enter aggregate mode -- add the next (pad) words
       aggr = pad;
-    }
+    }
 
     // have we been told to aggregate?
     else if (aggr) {
       // save this word
-      words_matched.push(word)
+      words_matched.push(word);
 
       // add index to the deduper
       index_matches[i] = 1;
 
       // one less word to aggregate
       aggr--;
-    }
+    }
 
     // keep snippets to a modest length
-    if (words_matched.length > 30) break
-  }
+    if (words_matched.length > 30) break;
+  }
 
   // add a trailing ellipsis
-  words_matched.push("...")
+  words_matched.push("...");
 
   // create the snippet from the saved context words
-  snippet = words_matched.join(" ")
+  snippet = words_matched.join(" ");
 
-  return snippet
+  return snippet;
 }
 
-module.exports = snippet
\ No newline at end of file
+module.exports = snippet;
```

```diff
diff --git a/bucky/search/stopwords.js b/bucky/search/stopwords.js
index ceffe14..735e94d 100644
--- a/bucky/search/stopwords.js
+++ b/bucky/search/stopwords.js
@@ -1,18 +1,18 @@
 module.exports = new Set(
-  "a about above across adj after again against all almost alone along also " +
-    "although always am among an and another any anybody anyone anything anywhere " +
-    "apart are around as aside at away be because been before behind being below " +
-    "besides between beyond both but by can cannot could did do does doing done " +
-    "down downwards during each either else enough etc even ever every everybody " +
-    "everyone except far few for forth from get gets got had hardly has have having " +
-    "her here herself him himself his how however i if in indeed instead into inward " +
-    "is it its itself just kept many maybe might mine more most mostly much must " +
-    "myself near neither next no nobody none nor not nothing nowhere of off often on " +
-    "only onto or other others ought our ours out outside over own p per please plus " +
-    "pp quite rather really said seem self selves several shall she should since so " +
-    "some somebody somewhat still such than that the their theirs them themselves " +
-    "then there therefore these they this thorough thoroughly those through thus to " +
-    "together too toward towards under until up upon v very was well were what " +
-    "whatever when whenever where whether which while who whom whose will with" +
-    "within without would yet young your yourself s".split(" ")
+  "a adj " +
+    "am an and " +
+    "are as at be been " +
+    "but by can could did do does doing done " +
+    "down " +
+    "far few for forth from get gets got had hardly has have having " +
+    "her here herself him himself his how i if in into " +
+    "is it its itself just kept many maybe might mine more much must " +
+    "myself near neither next no none nor not of off often on " +
+    "only onto or other others ought our ours out over own p per please plus " +
+    "pp quite rather really said seem self selves several shall she should since so " +
+    "some somebody somewhat still such than that the their theirs them themselves " +
+    "then there therefore these they this thorough thoroughly those through thus to " +
+    "together too toward towards under until up upon v very was well were what " +
+    "whatever when whenever where whether which while who whom whose will with " +
+    "within without would yet young your yourself s".split(" "),
 );
```
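`snippet` now stems each candidate word before testing membership in the query's term set, so a pre-stemmed term like `search` highlights `searching` or `searches` in a comment body; the trimmed stopword list in the same commit also makes formerly ignored words such as "about", "after", and "every" indexable. A rough usage sketch (exact output depends on `util.sanitize` and the context window):

```js
// Sketch: terms arrive pre-stemmed from search.js via res.search.meta.terms.
var snippet = require("./search/snippet");

var body = "We were searching the archive for older threads about geodesics";
snippet(body, ["search"]);
// => approximately
// "... We were searching the archive for older threads about geodesics ..."
// ("searching" stems to "search"; pad = 10 words of context on each side)
```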
