summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- .gitignore 1
-rw-r--r-- bucky/app/router.js 1
-rw-r--r-- bucky/db/bookshelf.js 3
-rw-r--r-- bucky/db/index.js 7
-rw-r--r-- bucky/search/bdb.js 38
-rw-r--r-- bucky/search/lexicon.js 120
-rw-r--r-- bucky/search/middleware.js 15
-rw-r--r-- bucky/search/search.js 40
-rw-r--r-- search/db/.gitkeep 0
-rw-r--r-- search/db/env/.gitkeep 0
10 files changed, 202 insertions, 23 deletions
diff --git a/.gitignore b/.gitignore
index 94b4c80..e86c1b8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,4 +13,5 @@ public/data
.my.cnf
search.db
search.db.1
+search/db/env/*
diff --git a/bucky/app/router.js b/bucky/app/router.js
index fde4278..8104bd5 100644
--- a/bucky/app/router.js
+++ b/bucky/app/router.js
@@ -95,6 +95,7 @@ module.exports = function(app){
app.get("/api/search",
middleware.ensureAuthenticated,
search.search,
+ search.getThreads,
search.getComments,
search.getFiles,
search.logQuery,
diff --git a/bucky/db/bookshelf.js b/bucky/db/bookshelf.js
index 69157cc..32f4aba 100644
--- a/bucky/db/bookshelf.js
+++ b/bucky/db/bookshelf.js
@@ -11,6 +11,9 @@ var knex = require('knex')({
if (field.type == 'BLOB') {
return field.string()
}
+ if (field.type == 'TINYBLOB') {
+ return field.string()
+ }
return next()
}
}
diff --git a/bucky/db/index.js b/bucky/db/index.js
index dcd5f20..f7adb7a 100644
--- a/bucky/db/index.js
+++ b/bucky/db/index.js
@@ -73,6 +73,9 @@ db.getThreadsForKeyword = function (keyword) {
db.getThread = function (id) {
return Thread.query("where", "id", "=", id).fetch()
}
+db.getThreadsById = function(ids){
+ return Thread.where("id", "in", ids).fetchAll()
+}
db.createThread = function(data){
return new db.Thread(data).save()
}
@@ -93,7 +96,7 @@ db.getFileSizes = function(ids){
return knex.column('thread').sum('size as size').select().from('files').where('thread', 'in', ids).groupBy('thread')
}
db.getFilesById = function(ids){
- return File.where("id", "in", ids)
+ return File.where("id", "in", ids).fetchAll()
}
db.createFile = function(data){
return new db.File(data).save()
@@ -121,7 +124,7 @@ db.getCommentsForThread = function (id, limit, offset, order){
})
}
db.getCommentsById = function(ids){
- return Comment.where("id", "in", ids)
+ return Comment.where("id", "in", ids).fetchAll()
}
db.getCommentCounts = function(ids){
return knex.column('thread').count('* as count').select().from('comments').where('thread', 'in', ids).groupBy('thread')
diff --git a/bucky/search/bdb.js b/bucky/search/bdb.js
new file mode 100644
index 0000000..ba0124d
--- /dev/null
+++ b/bucky/search/bdb.js
@@ -0,0 +1,38 @@
+var bdb_lib = require('berkeleydb')
+var dbenv = new bdb_lib.DbEnv();
+var bdb_status = dbenv.open('./search/db/env')
+console.log('open /search/db:', bdb_status)
+
+var db
+
+function exitHandler(options, err) {
+ db.close()
+ // if (options.cleanup) console.log('clean');
+ if (err) console.log(err.stack);
+ if (options.exit) process.exit();
+}
+
+// do something when app is closing
+process.on('exit', exitHandler.bind(null, {cleanup: true}));
+
+// catches ctrl+c event
+process.on('SIGINT', exitHandler.bind(null, {exit: true}));
+
+// catches "kill pid" (for example: nodemon restart)
+process.on('SIGUSR1', exitHandler.bind(null, {exit: true}));
+process.on('SIGUSR2', exitHandler.bind(null, {exit: true}));
+
+//catches uncaught exceptions
+process.on('uncaughtException', exitHandler.bind(null, {exit:true}));
+
+function open(){
+ if (db) db.close()
+ var _db = new bdb_lib.Db(dbenv);
+ var bdb_status = _db.open('./search.db')
+ console.log('open ./search.db:', bdb_status)
+ db = _db
+}
+
+open()
+
+module.exports = db
diff --git a/bucky/search/lexicon.js b/bucky/search/lexicon.js
new file mode 100644
index 0000000..2cf0f21
--- /dev/null
+++ b/bucky/search/lexicon.js
@@ -0,0 +1,120 @@
+require('dotenv').load();
+
+var STOPWORDS = require('./stopwords')
+var bdb = require('./bdb')
+var db = require('../db')
+
+var lexicon = {}
+var total = 0
+
+build_index()
+
+function build_index() {
+ parse_threads()
+ .then(parse_comments)
+ .then(parse_files)
+ .then( () => {
+ var unique = Object.keys(lexicon).length
+ console.log( "--- WORD COUNT: ", total );
+ console.log( "--- UNIQUE WORDS: ", unique );
+ lexicon_store();
+ console.log( "Done!")
+ process.exit()
+ })
+}
+function parse_threads() {
+ return db.Thread.where('id', '>', 1).fetchAll().then( (threads) => {
+ console.log('got threads', threads.length)
+ threads.forEach( (thread) => {
+ total += parse_terms({
+ string: thread.get('title'),
+ thread: thread.get('id'),
+ })
+ })
+ })
+}
+function parse_comments() {
+ return db.Comment.where('thread', '>', 1).fetchAll().then( (comments) => {
+ console.log('got comments', comments.length)
+ comments.forEach( (comment) => {
+ total += parse_terms({
+ string: comment.get('comment').toString(),
+ thread: comment.get('thread'),
+ comment: comment.get('id'),
+ })
+ })
+ })
+}
+function parse_files() {
+ return db.File.fetchAll().then( (files) => {
+ console.log('got files', files.length)
+ files.forEach( (file) => {
+ total += parse_terms({
+ string: file.get('filename'),
+ thread: file.get('thread'),
+ file: file.get('id'),
+ })
+ })
+ })
+}
+
+var underscoreRegexp = new RegExp('_', 'g')
+var spaceRegexp = new RegExp('[^a-zA-Z]+', 'g')
+
+function parse_terms (opt) {
+ var thread = opt.thread
+ var comment = opt.comment || 0
+ var file = opt.file || 0
+ var string = opt.string
+ if (!string || !thread) return 0
+ var count = 0
+ var terms = string
+ .replace(underscoreRegexp, ' ')
+ .split(spaceRegexp)
+ .forEach((term) => {
+ var t = term.toLowerCase()
+ var lookup = lexicon[t] = lexicon[t] || {}
+ var res = lookup[thread] = lookup[thread] || { strength: 0 }
+ res.thread = res.thread || thread
+ res.comment = res.comment || comment
+ res.file = res.file || file
+ if (!comment || !file) {
+ res.strength += 2
+ }
+ else {
+ res.strength += 1
+ }
+ count += 1
+ })
+ return count || 0
+}
+
+var put_total = 0
+function lexicon_store () {
+ console.log('writing db...')
+ Object.keys(lexicon).forEach( (term) => {
+ if (STOPWORDS.has(term)) return
+ var serialized = serialize_matches(lexicon[term]);
+ if (! serialized) return;
+ if ((put_total % 5000) === 0) console.log(put_total + '...')
+ put_total += 1
+ // if (put_total > 10) return
+ // console.log(term)
+ bdb.put(term, serialized)
+ })
+}
+function serialize_matches (matches) {
+ var serialized_matches = [];
+ Object.values(matches).forEach( (match) => {
+ if (!match) return
+ var s = [
+ match.thread,
+ match.comment,
+ match.file,
+ match.strength
+ ].join(' ')
+ if (s) serialized_matches.push(s)
+ })
+ if (!serialized_matches.length) return
+ return serialized_matches.join(',')
+}
\ No newline at end of file
diff --git a/bucky/search/middleware.js b/bucky/search/middleware.js
index b9487b1..39d7a71 100644
--- a/bucky/search/middleware.js
+++ b/bucky/search/middleware.js
@@ -7,16 +7,29 @@ module.exports = {
search: function (req, res, next) {
res.search = search.search(req.query.query, req.query.start, req.query.limit)
console.log(res.search)
-
next()
},
+ getThreads: function (req, res, next){
+ var thread_ids = res.search.thread_ids;
+ if (! thread_ids || ! thread_ids.length) {
+ return next()
+ }
+ db.getThreadsById(thread_ids).then(function(threads){
+ res.search.threads = threads
+ next()
+ })
+ },
+
getComments: function (req, res, next){
var comment_ids = res.search.comment_ids;
if (! comment_ids || ! comment_ids.length) {
return next()
}
db.getCommentsById(comment_ids).then(function(comments){
+ comments.forEach(function(comment){
+ comment.set('comment', comment.get('comment').toString())
+ })
res.search.comments = comments
next()
})
diff --git a/bucky/search/search.js b/bucky/search/search.js
index 1d06aea..a28d49c 100644
--- a/bucky/search/search.js
+++ b/bucky/search/search.js
@@ -12,33 +12,32 @@ function parse_terms (s) {
return false
})
}
-function cmp (a,b){ return (a<b)?a:(a===b)?0:1 }
+function cmp (a,b){ return (a<b)?-1:(a===b)?0:1 }
function find_term(term) {
var res = bdb.get(term).toString()
- console.log(res)
+ // console.log(res)
if (! res.length) return []
var matches = res.split(",").map((s) => {
if (! s.length) return;
- console.log(s)
var partz = s.split(" ")
return {
- thread: s[0],
- comment: s[1],
- file: s[2],
- strength: s[3],
+ thread: parseInt(partz[0]),
+ comment: parseInt(partz[1]),
+ file: parseInt(partz[2]),
+ strength: parseInt(partz[3]) || 1,
}
})
+ console.log(matches)
return matches
}
function search (query, start, limit) {
if (!query) return
- start = start || 0;
- limit = limit || 10;
+ start = parseInt(start) || 0;
+ limit = parseInt(limit) || 10;
var scores = {};
var terms = parse_terms(query);
- var i = 0
var total
var to_display = limit
var threads = {}
@@ -53,23 +52,24 @@ function search (query, start, limit) {
if (!results) return;
results.forEach((result) => {
var score = scores[result.thread] = scores[result.thread] || { count: 0, strength: 0 }
- score.thread = score.thread || result.thread
- score.comment = score.comment || result.comment
- score.file = score.file || result.file
+ score.thread = score.thread || parseInt(result.thread)
+ score.comment = score.comment || parseInt(result.comment)
+ score.file = score.file || parseInt(result.file)
score.strength += result.strength
score.count += 1
})
})
total = Object.keys(scores).length
- Object.values(scores).sort((a,b) => {
- if (b.count !== a.count) {
- return cmp(b.count, a.count)
- }
- return cmp(b.strength * b.count, a.strength * a.count)
- }).some((match) => {
- if (i++ < start) return false
+ Object.values(scores).sort((b,a) => {
+ // if (a.count !== b.count) {
+ // return cmp(a.count, b.count)
+ // }
+ return cmp(a.strength, b.strength)
+ }).some((match, i) => {
+ if (i < start) return false
if (to_display-- === 0) return true
results.push(match)
+ console.log(match)
thread_ids.push(match.thread)
if (match.comment) comment_ids.push(match.comment)
if (match.file) file_ids.push(match.file)
diff --git a/search/db/.gitkeep b/search/db/.gitkeep
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/search/db/.gitkeep
diff --git a/search/db/env/.gitkeep b/search/db/env/.gitkeep
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/search/db/env/.gitkeep