diff options
| -rw-r--r-- | .gitignore | 1 | ||||
| -rw-r--r-- | bucky/app/router.js | 1 | ||||
| -rw-r--r-- | bucky/db/bookshelf.js | 3 | ||||
| -rw-r--r-- | bucky/db/index.js | 7 | ||||
| -rw-r--r-- | bucky/search/bdb.js | 38 | ||||
| -rw-r--r-- | bucky/search/lexicon.js | 120 | ||||
| -rw-r--r-- | bucky/search/middleware.js | 15 | ||||
| -rw-r--r-- | bucky/search/search.js | 40 | ||||
| -rw-r--r-- | search/db/.gitkeep | 0 | ||||
| -rw-r--r-- | search/db/env/.gitkeep | 0 |
10 files changed, 202 insertions, 23 deletions
@@ -13,4 +13,5 @@ public/data .my.cnf search.db search.db.1 +search/db/env/* diff --git a/bucky/app/router.js b/bucky/app/router.js index fde4278..8104bd5 100644 --- a/bucky/app/router.js +++ b/bucky/app/router.js @@ -95,6 +95,7 @@ module.exports = function(app){ app.get("/api/search", middleware.ensureAuthenticated, search.search, + search.getThreads, search.getComments, search.getFiles, search.logQuery, diff --git a/bucky/db/bookshelf.js b/bucky/db/bookshelf.js index 69157cc..32f4aba 100644 --- a/bucky/db/bookshelf.js +++ b/bucky/db/bookshelf.js @@ -11,6 +11,9 @@ var knex = require('knex')({ if (field.type == 'BLOB') { return field.string() } + if (field.type == 'TINYBLOB') { + return field.string() + } return next() } } diff --git a/bucky/db/index.js b/bucky/db/index.js index dcd5f20..f7adb7a 100644 --- a/bucky/db/index.js +++ b/bucky/db/index.js @@ -73,6 +73,9 @@ db.getThreadsForKeyword = function (keyword) { db.getThread = function (id) { return Thread.query("where", "id", "=", id).fetch() } +db.getThreadsById = function(ids){ + return Thread.where("id", "in", ids).fetchAll() +} db.createThread = function(data){ return new db.Thread(data).save() } @@ -93,7 +96,7 @@ db.getFileSizes = function(ids){ return knex.column('thread').sum('size as size').select().from('files').where('thread', 'in', ids).groupBy('thread') } db.getFilesById = function(ids){ - return File.where("id", "in", ids) + return File.where("id", "in", ids).fetchAll() } db.createFile = function(data){ return new db.File(data).save() @@ -121,7 +124,7 @@ db.getCommentsForThread = function (id, limit, offset, order){ }) } db.getCommentsById = function(ids){ - return Comment.where("id", "in", ids) + return Comment.where("id", "in", ids).fetchAll() } db.getCommentCounts = function(ids){ return knex.column('thread').count('* as count').select().from('comments').where('thread', 'in', ids).groupBy('thread') diff --git a/bucky/search/bdb.js b/bucky/search/bdb.js new file mode 100644 index 0000000..ba0124d --- /dev/null +++ b/bucky/search/bdb.js @@ -0,0 +1,38 @@ +var bdb_lib = require('berkeleydb') +var dbenv = new bdb_lib.DbEnv(); +var bdb_status = dbenv.open('./search/db/env') +console.log('openĀ /search/db:', bdb_status) + +var db + +function exitHandler(options, err) { + db.close() + // if (options.cleanup) console.log('clean'); + if (err) console.log(err.stack); + if (options.exit) process.exit(); +} + +// do something when app is closing +process.on('exit', exitHandler.bind(null, {cleanup: true})); + +// catches ctrl+c event +process.on('SIGINT', exitHandler.bind(null, {exit: true})); + +// catches "kill pid" (for example: nodemon restart) +process.on('SIGUSR1', exitHandler.bind(null, {exit: true})); +process.on('SIGUSR2', exitHandler.bind(null, {exit: true})); + +//catches uncaught exceptions +process.on('uncaughtException', exitHandler.bind(null, {exit:true})); + +function open(){ + if (db) db.close() + var _db = new bdb_lib.Db(dbenv); + var bdb_status = _db.open('./search.db') + console.log('openĀ ./search.db:', bdb_status) + db = _db +} + +open() + +module.exports = db diff --git a/bucky/search/lexicon.js b/bucky/search/lexicon.js new file mode 100644 index 0000000..2cf0f21 --- /dev/null +++ b/bucky/search/lexicon.js @@ -0,0 +1,120 @@ +require('dotenv').load(); + +var STOPWORDS = require('./stopwords') +var bdb = require('./bdb') +var db = require('../db') + +var lexicon = {} +var total = 0 + +build_index() + +function build_index() { + parse_threads() + .then(parse_comments) + .then(parse_files) + .then( () => { + var unique = Object.keys(lexicon).length + console.log( "--- WORD COUNT: ", total ); + console.log( "--- UNIQUE WORDS: ", unique ); + lexicon_store(); + console.log( "Done!") + process.exit() + }) +} +function parse_threads() { + return db.Thread.where('id', '>', 1).fetchAll().then( (threads) => { + console.log('got threads', threads.length) + threads.forEach( (thread) => { + total += parse_terms({ + string: thread.get('title'), + thread: thread.get('id'), + }) + }) + }) +} +function parse_comments() { + return db.Comment.where('thread', '>', 1).fetchAll().then( (comments) => { + console.log('got comments', comments.length) + comments.forEach( (comment) => { + total += parse_terms({ + string: comment.get('comment').toString(), + thread: comment.get('thread'), + comment: comment.get('id'), + }) + }) + }) +} +function parse_files() { + return db.File.fetchAll().then( (files) => { + console.log('got files', files.length) + files.forEach( (file) => { + total += parse_terms({ + string: file.get('filename'), + thread: file.get('thread'), + file: file.get('id'), + }) + }) + }) +} + +var underscoreRegexp = new RegExp('_', 'g') +var spaceRegexp = new RegExp('[^a-zA-Z]+', 'g') + +function parse_terms (opt) { + var thread = opt.thread + var comment = opt.comment || 0 + var file = opt.file || 0 + var string = opt.string + if (!string || !thread) return 0 + var count = 0 + var terms = string + .replace(underscoreRegexp, ' ') + .split(spaceRegexp) + .forEach((term) => { + var t = term.toLowerCase() + var lookup = lexicon[t] = lexicon[t] || {} + var res = lookup[thread] = lookup[thread] || { strength: 0 } + res.thread = res.thread || thread + res.comment = res.comment || comment + res.file = res.file || file + if (!comment || !file) { + res.strength += 2 + } + else { + res.strength += 1 + } + count += 1 + }) + return count || 0 +} + +var put_total = 0 +function lexicon_store () { + console.log('writing db...') + Object.keys(lexicon).forEach( (term) => { + if (STOPWORDS.has(term)) return + var serialized = serialize_matches(lexicon[term]); + if (! serialized) return; + if ((put_total % 5000) === 0) console.log(put_total + '...') + put_total += 1 + // if (put_total > 10) return + // console.log(term) + bdb.put(term, serialized) + }) +} +function serialize_matches (matches) { + var serialized_matches = []; + Object.values(matches).forEach( (match) => { + if (!match) return + var s = [ + match.thread, + match.comment, + match.file, + match.strength + ].join(' ') + if (s) serialized_matches.push(s) + }) + if (!serialized_matches.length) return + return serialized_matches.join(',') +}
\ No newline at end of file diff --git a/bucky/search/middleware.js b/bucky/search/middleware.js index b9487b1..39d7a71 100644 --- a/bucky/search/middleware.js +++ b/bucky/search/middleware.js @@ -7,16 +7,29 @@ module.exports = { search: function (req, res, next) { res.search = search.search(req.query.query, req.query.start, req.query.limit) console.log(res.search) - next() }, + getThreads: function (req, res, next){ + var thread_ids = res.search.thread_ids; + if (! thread_ids || ! thread_ids.length) { + return next() + } + db.getThreadsById(thread_ids).then(function(threads){ + res.search.threads = threads + next() + }) + }, + getComments: function (req, res, next){ var comment_ids = res.search.comment_ids; if (! comment_ids || ! comment_ids.length) { return next() } db.getCommentsById(comment_ids).then(function(comments){ + comments.forEach(function(comment){ + comment.set('comment', comment.get('comment').toString()) + }) res.search.comments = comments next() }) diff --git a/bucky/search/search.js b/bucky/search/search.js index 1d06aea..a28d49c 100644 --- a/bucky/search/search.js +++ b/bucky/search/search.js @@ -12,33 +12,32 @@ function parse_terms (s) { return false }) } -function cmp (a,b){ return (a<b)?a:(a===b)?0:1 } +function cmp (a,b){ return (a<b)?-1:(a===b)?0:1 } function find_term(term) { var res = bdb.get(term).toString() - console.log(res) + // console.log(res) if (! res.length) return [] var matches = res.split(",").map((s) => { if (! s.length) return; - console.log(s) var partz = s.split(" ") return { - thread: s[0], - comment: s[1], - file: s[2], - strength: s[3], + thread: parseInt(partz[0]), + comment: parseInt(partz[1]), + file: parseInt(partz[2]), + strength: parseInt(partz[3]) || 1, } }) + console.log(matches) return matches } function search (query, start, limit) { if (!query) return - start = start || 0; - limit = limit || 10; + start = parseInt(start) || 0; + limit = parseInt(limit) || 10; var scores = {}; var terms = parse_terms(query); - var i = 0 var total var to_display = limit var threads = {} @@ -53,23 +52,24 @@ function search (query, start, limit) { if (!results) return; results.forEach((result) => { var score = scores[result.thread] = scores[result.thread] || { count: 0, strength: 0 } - score.thread = score.thread || result.thread - score.comment = score.comment || result.comment - score.file = score.file || result.file + score.thread = score.thread || parseInt(result.thread) + score.comment = score.comment || parseInt(result.comment) + score.file = score.file || parseInt(result.file) score.strength += result.strength score.count += 1 }) }) total = Object.keys(scores).length - Object.values(scores).sort((a,b) => { - if (b.count !== a.count) { - return cmp(b.count, a.count) - } - return cmp(b.strength * b.count, a.strength * a.count) - }).some((match) => { - if (i++ < start) return false + Object.values(scores).sort((b,a) => { + // if (a.count !== b.count) { + // return cmp(a.count, b.count) + // } + return cmp(a.strength, b.strength) + }).some((match, i) => { + if (i < start) return false if (to_display-- === 0) return true results.push(match) + console.log(match) thread_ids.push(match.thread) if (match.comment) comment_ids.push(match.comment) if (match.file) file_ids.push(match.file) diff --git a/search/db/.gitkeep b/search/db/.gitkeep new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/search/db/.gitkeep diff --git a/search/db/env/.gitkeep b/search/db/env/.gitkeep new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/search/db/env/.gitkeep |
