summary | refs | log | tree | commit | diff
path: root/bucky
diff options
context:
space:
mode:
author Jules Laplace <julescarbon@gmail.com> 2017-12-14 23:59:08 +0100
committer Jules Laplace <julescarbon@gmail.com> 2017-12-14 23:59:08 +0100
commit 0541e7d7457d646dceca375b7fa6e1f382232772 (patch)
tree 7a4ae50aa3c09b421c5fe2baab550f35a5e45cbd /bucky
parent 6cedbcf2987a74a01ddbe6abe0fed15fd1595e27 (diff)
tf-idf to the rescue!!
Diffstat (limited to 'bucky')
-rw-r--r-- bucky/search/lexicon.js 17
-rw-r--r-- bucky/search/search.js 2
-rw-r--r-- bucky/util/middleware.js 2
3 files changed, 13 insertions, 8 deletions
diff --git a/bucky/search/lexicon.js b/bucky/search/lexicon.js
index e6a9e84..2415e81 100644
--- a/bucky/search/lexicon.js
+++ b/bucky/search/lexicon.js
@@ -7,11 +7,13 @@ var db = require('../db')
var search_db = bdb('search')
var lexicon = {}
+var lex_counts = {}
var total = 0
module.exports = { build: build_index }
function build_index() {
+ console.log("building index")
parse_threads()
.then(parse_comments)
.then(parse_files)
@@ -61,7 +63,7 @@ function parse_files() {
}
var underscoreRegexp = new RegExp('_', 'g')
-var spaceRegexp = new RegExp('[^a-zA-Z]+', 'g')
+var spaceRegexp = new RegExp('[^a-zA-Z0-9]+', 'g')
function parse_terms (opt) {
var thread = opt.thread
@@ -80,13 +82,16 @@ function parse_terms (opt) {
res.thread = res.thread || thread
res.comment = res.comment || comment
res.file = res.file || file
- if (!comment || !file) {
+ // prioritize threads
+ if (!comment && !file) {
res.strength += 2
}
else {
res.strength += 1
}
count += 1
+ lex_counts[term] = lex_counts[term] || 0
+ lex_counts[term] += 1
})
return count || 0
}
@@ -96,7 +101,7 @@ function lexicon_store () {
console.log('writing db...')
Object.keys(lexicon).forEach( (term) => {
if (STOPWORDS.has(term)) return
- var serialized = serialize_matches(lexicon[term]);
+ var serialized = serialize_matches(term);
if (! serialized) return;
if ((put_total % 5000) === 0) console.log(put_total + '...')
put_total += 1
@@ -105,7 +110,9 @@ function lexicon_store () {
search_db.put(term, serialized)
})
}
-function serialize_matches (matches) {
+function serialize_matches (term) {
+ var matches = lexicon[term]
+ var idf = Math.log(total / lex_counts[term])
var serialized_matches = [];
Object.values(matches).forEach( (match) => {
if (!match) return
@@ -113,7 +120,7 @@ function serialize_matches (matches) {
match.thread,
match.comment,
match.file,
- match.strength
+ match.strength * idf
].join(' ')
if (s) serialized_matches.push(s)
})
diff --git a/bucky/search/search.js b/bucky/search/search.js
index 1236a4c..1a56f53 100644
--- a/bucky/search/search.js
+++ b/bucky/search/search.js
@@ -49,7 +49,7 @@ function search (query, start, limit) {
score.thread = score.thread || parseInt(result.thread)
score.comment = score.comment || parseInt(result.comment)
score.file = score.file || parseInt(result.file)
- score.strength += result.strength
+ score.strength += parseFloat(result.strength)
score.count += 1
})
})
diff --git a/bucky/util/middleware.js b/bucky/util/middleware.js
index 08935e1..20ec323 100644
--- a/bucky/util/middleware.js
+++ b/bucky/util/middleware.js
@@ -12,8 +12,6 @@ var middleware = module.exports = {
res.locals.csrfToken = req.csrfToken()
res.locals.title = "bucky"
res.locals.env = process.env.NODE_ENV
- console.log(res.locals.env)
-
if (req.isAuthenticated()) {
res.locals.show_header = true
}