summary | refs | log | tree | commit | diff
path: root/bucky/search/lexicon.js
diff options
context:
space:
mode:
Diffstat (limited to 'bucky/search/lexicon.js')
-rw-r--r-- bucky/search/lexicon.js 17
1 files changed, 12 insertions, 5 deletions
diff --git a/bucky/search/lexicon.js b/bucky/search/lexicon.js
index e6a9e84..2415e81 100644
--- a/bucky/search/lexicon.js
+++ b/bucky/search/lexicon.js
@@ -7,11 +7,13 @@ var db = require('../db')
var search_db = bdb('search')
var lexicon = {}
+var lex_counts = {}
var total = 0
module.exports = { build: build_index }
function build_index() {
+ console.log("building index")
parse_threads()
.then(parse_comments)
.then(parse_files)
@@ -61,7 +63,7 @@ function parse_files() {
}
var underscoreRegexp = new RegExp('_', 'g')
-var spaceRegexp = new RegExp('[^a-zA-Z]+', 'g')
+var spaceRegexp = new RegExp('[^a-zA-Z0-9]+', 'g')
function parse_terms (opt) {
var thread = opt.thread
@@ -80,13 +82,16 @@ function parse_terms (opt) {
res.thread = res.thread || thread
res.comment = res.comment || comment
res.file = res.file || file
- if (!comment || !file) {
+ // prioritize threads
+ if (!comment && !file) {
res.strength += 2
}
else {
res.strength += 1
}
count += 1
+ lex_counts[term] = lex_counts[term] || 0
+ lex_counts[term] += 1
})
return count || 0
}
@@ -96,7 +101,7 @@ function lexicon_store () {
console.log('writing db...')
Object.keys(lexicon).forEach( (term) => {
if (STOPWORDS.has(term)) return
- var serialized = serialize_matches(lexicon[term]);
+ var serialized = serialize_matches(term);
if (! serialized) return;
if ((put_total % 5000) === 0) console.log(put_total + '...')
put_total += 1
@@ -105,7 +110,9 @@ function lexicon_store () {
search_db.put(term, serialized)
})
}
-function serialize_matches (matches) {
+function serialize_matches (term) {
+ var matches = lexicon[term]
+ var idf = Math.log(total / lex_counts[term])
var serialized_matches = [];
Object.values(matches).forEach( (match) => {
if (!match) return
@@ -113,7 +120,7 @@ function serialize_matches (matches) {
match.thread,
match.comment,
match.file,
- match.strength
+ match.strength * idf
].join(' ')
if (s) serialized_matches.push(s)
})