diff options
Diffstat (limited to 'bucky/search/lexicon.js')
| -rw-r--r-- | bucky/search/lexicon.js | 17 |
1 files changed, 12 insertions, 5 deletions
diff --git a/bucky/search/lexicon.js b/bucky/search/lexicon.js
index e6a9e84..2415e81 100644
--- a/bucky/search/lexicon.js
+++ b/bucky/search/lexicon.js
@@ -7,11 +7,13 @@ var db = require('../db')
 var search_db = bdb('search')
 
 var lexicon = {}
+var lex_counts = {}
 var total = 0
 
 module.exports = { build: build_index }
 
 function build_index() {
+  console.log("building index")
   parse_threads()
     .then(parse_comments)
     .then(parse_files)
@@ -61,7 +63,7 @@ function parse_files() {
 }
 
 var underscoreRegexp = new RegExp('_', 'g')
-var spaceRegexp = new RegExp('[^a-zA-Z]+', 'g')
+var spaceRegexp = new RegExp('[^a-zA-Z0-9]+', 'g')
 
 function parse_terms (opt) {
   var thread = opt.thread
@@ -80,13 +82,16 @@ function parse_terms (opt) {
     res.thread = res.thread || thread
     res.comment = res.comment || comment
     res.file = res.file || file
-    if (!comment || !file) {
+    // prioritize threads
+    if (!comment && !file) {
       res.strength += 2
     }
     else {
       res.strength += 1
     }
 
     count += 1
+    lex_counts[term] = lex_counts[term] || 0
+    lex_counts[term] += 1
   })
   return count || 0
 }
@@ -96,7 +101,7 @@ function lexicon_store () {
   console.log('writing db...')
   Object.keys(lexicon).forEach( (term) => {
     if (STOPWORDS.has(term)) return
-    var serialized = serialize_matches(lexicon[term]);
+    var serialized = serialize_matches(term);
     if (! serialized) return;
     if ((put_total % 5000) === 0) console.log(put_total + '...')
     put_total += 1
@@ -105,7 +110,9 @@ function lexicon_store () {
     search_db.put(term, serialized)
   })
 }
-function serialize_matches (matches) {
+function serialize_matches (term) {
+  var matches = lexicon[term]
+  var idf = Math.log(total / lex_counts[term])
   var serialized_matches = [];
   Object.values(matches).forEach( (match) => {
     if (!match) return
@@ -113,7 +120,7 @@ function serialize_matches (matches) {
       match.thread,
       match.comment,
       match.file,
-      match.strength
+      match.strength * idf
     ].join(' ')
     if (s) serialized_matches.push(s)
   })
