diff options
Diffstat (limited to 'bucky/search/lexicon.js')
| -rw-r--r-- | bucky/search/lexicon.js | 120 |
1 files changed, 120 insertions, 0 deletions
// bucky/search/lexicon.js
//
// One-shot script that builds a search lexicon: it reads every thread title,
// comment body and file name from the database, splits them into lowercase
// alphabetic terms, accumulates per-term / per-thread match records in
// memory, and finally writes one serialized record per term into `bdb`.

// NOTE(review): `load()` is the legacy dotenv entry point; newer dotenv
// releases only expose `config()` - confirm against the pinned version.
require('dotenv').load();

const STOPWORDS = require('./stopwords');
const bdb = require('./bdb');
const db = require('../db');

// term -> { threadId -> { thread, comment, file, strength } }
// Created without a prototype so that ordinary words which happen to be
// Object.prototype members (e.g. "constructor") are plain keys instead of
// resolving to inherited properties.
const lexicon = Object.create(null);

// Running count of every term occurrence seen (stopwords included).
let total = 0;

// Number of terms handed to bdb.put() so far; used only for progress logs.
let put_total = 0;

const underscoreRegexp = /_/g;
// Any run of characters other than ASCII letters separates two terms.
const spaceRegexp = /[^a-zA-Z]+/g;

/**
 * Runs the full indexing pipeline: threads, then comments, then files,
 * then persists the lexicon and exits the process.
 *
 * @returns {Promise<void>} settles right before the process exits.
 */
function build_index() {
  return parse_threads()
    .then(parse_comments)
    .then(parse_files)
    .then(() => {
      const unique = Object.keys(lexicon).length;
      console.log("--- WORD COUNT: ", total);
      console.log("--- UNIQUE WORDS: ", unique);
      lexicon_store();
      console.log("Done!");
      // NOTE(review): this exits immediately after the put() calls were
      // issued; if bdb.put() is asynchronous, pending writes could be lost.
      process.exit();
    })
    .catch((err) => {
      // Surface query/indexing failures instead of leaving an unhandled
      // rejection, and signal the failure through the exit code.
      console.error("lexicon build failed:", err);
      process.exit(1);
    });
}

/**
 * Indexes the title of every thread whose id is greater than 1.
 *
 * @returns {Promise<void>}
 */
function parse_threads() {
  return db.Thread.where('id', '>', 1).fetchAll().then((threads) => {
    console.log('got threads', threads.length);
    threads.forEach((thread) => {
      total += parse_terms({
        string: thread.get('title'),
        thread: thread.get('id'),
      });
    });
  });
}

/**
 * Indexes the body of every comment whose thread id is greater than 1.
 *
 * @returns {Promise<void>}
 */
function parse_comments() {
  return db.Comment.where('thread', '>', 1).fetchAll().then((comments) => {
    console.log('got comments', comments.length);
    comments.forEach((comment) => {
      const body = comment.get('comment');
      total += parse_terms({
        // A missing body is indexed as an empty string (i.e. skipped)
        // rather than throwing on `.toString()`.
        string: body == null ? '' : body.toString(),
        thread: comment.get('thread'),
        comment: comment.get('id'),
      });
    });
  });
}

/**
 * Indexes the filename of every file record (no thread filter is applied).
 *
 * @returns {Promise<void>}
 */
function parse_files() {
  return db.File.fetchAll().then((files) => {
    console.log('got files', files.length);
    files.forEach((file) => {
      total += parse_terms({
        string: file.get('filename'),
        thread: file.get('thread'),
        file: file.get('id'),
      });
    });
  });
}

/**
 * Splits a string into terms and records each occurrence in `lexicon`.
 *
 * Underscores and every other non-letter run act as separators; terms are
 * lowercased. For each (term, thread) pair one match record is kept: the
 * first comment id / file id seen for that pair is retained, and `strength`
 * grows with every occurrence.
 *
 * @param {Object} opt
 * @param {*} opt.string - text to index; falsy values are skipped.
 * @param {*} opt.thread - owning thread id; falsy values are skipped.
 * @param {*} [opt.comment] - comment id when the text is a comment body.
 * @param {*} [opt.file] - file id when the text is a file name.
 * @returns {number} how many terms were recorded.
 */
function parse_terms(opt) {
  const thread = opt.thread;
  const comment = opt.comment || 0;
  const file = opt.file || 0;
  const string = opt.string;
  if (!string || !thread) return 0;

  // Text attached to neither a comment nor a file is a thread title; title
  // hits weigh double. (The previous `!comment || !file` test was true for
  // every caller, so the single-weight branch could never run.)
  const weight = (!comment && !file) ? 2 : 1;

  const terms = String(string)
    .replace(underscoreRegexp, ' ')
    .split(spaceRegexp);

  let count = 0;
  terms.forEach((term) => {
    // split() yields '' when the text starts or ends with a separator.
    if (!term) return;
    const t = term.toLowerCase();
    const lookup = lexicon[t] = lexicon[t] || Object.create(null);
    const res = lookup[thread] = lookup[thread] || { strength: 0 };
    res.thread = res.thread || thread;
    res.comment = res.comment || comment;
    res.file = res.file || file;
    res.strength += weight;
    count += 1;
  });
  return count;
}

/**
 * Writes every non-stopword term of the lexicon to `bdb`, keyed by the
 * term, logging progress every 5000 terms.
 */
function lexicon_store() {
  console.log('writing db...');
  Object.keys(lexicon).forEach((term) => {
    if (STOPWORDS.has(term)) return;
    const serialized = serialize_matches(lexicon[term]);
    if (!serialized) return;
    if ((put_total % 5000) === 0) console.log(put_total + '...');
    put_total += 1;
    bdb.put(term, serialized);
  });
}

/**
 * Serializes the match records of one term.
 *
 * Each record becomes "thread comment file strength" (space separated) and
 * records are joined with commas.
 *
 * @param {Object} matches - threadId -> match record.
 * @returns {string|undefined} the serialized list, or undefined when there
 *   is nothing to store.
 */
function serialize_matches(matches) {
  const serialized_matches = Object.values(matches)
    .filter(Boolean)
    .map((match) => [
      match.thread,
      match.comment,
      match.file,
      match.strength,
    ].join(' '));
  if (!serialized_matches.length) return;
  return serialized_matches.join(',');
}

// Kick off indexing once all module-level state above is initialised.
build_index();
\ No newline at end of file |
