// Build a full-text search index: tokenize thread titles, comments, and
// filenames, weight each term per thread, and persist the results to the
// 'search' key/value store.
require('dotenv').load()

var STOPWORDS = require('./stopwords')
var bdb = require('./bdb')
var db = require('../db')

var search_db = bdb('search')

// term -> { thread id -> { thread, comment, file, strength } }
var lexicon = {}
// term -> total number of occurrences across all sources
var lex_counts = {}
// running count of all indexed terms
var total = 0

module.exports = {
  build: build_index
}

function build_index () {
  console.log('building index')
  parse_threads()
    .then(parse_comments)
    .then(parse_files)
    .then(() => {
      var unique = Object.keys(lexicon).length
      console.log('--- WORD COUNT: ', total)
      console.log('--- UNIQUE WORDS: ', unique)
      lexicon_store()
      console.log('Done!')
      process.exit()
    })
}

function parse_threads () {
  return db.Thread.where('id', '>', 1).fetchAll().then((threads) => {
    console.log('got threads', threads.length)
    threads.forEach((thread) => {
      total += parse_terms({
        string: thread.get('title'),
        thread: thread.get('id')
      })
    })
  })
}

function parse_comments () {
  return db.Comment.where('thread', '>', 1).fetchAll().then((comments) => {
    console.log('got comments', comments.length)
    comments.forEach((comment) => {
      total += parse_terms({
        string: comment.get('comment').toString(),
        thread: comment.get('thread'),
        comment: comment.get('id')
      })
    })
  })
}

function parse_files () {
  return db.File.fetchAll().then((files) => {
    console.log('got files', files.length)
    files.forEach((file) => {
      total += parse_terms({
        string: file.get('filename'),
        thread: file.get('thread'),
        file: file.get('id')
      })
    })
  })
}

var underscoreRegexp = new RegExp('_', 'g')
var spaceRegexp = new RegExp('[^a-zA-Z0-9]+', 'g')

// Tokenize a string and record each term against its thread (and the comment
// or file it came from, if any). Returns the number of terms counted.
function parse_terms (opt) {
  var thread = opt.thread
  var comment = opt.comment || 0
  var file = opt.file || 0
  var string = opt.string
  if (!string || !thread) return 0

  var count = 0
  string
    .replace(underscoreRegexp, ' ')
    .split(spaceRegexp)
    .forEach((term) => {
      var t = term.toLowerCase()
      if (!t) return // skip empty fragments left by leading/trailing separators

      var lookup = lexicon[t] = lexicon[t] || {}
      var res = lookup[thread] = lookup[thread] || { strength: 0 }
      res.thread = res.thread || thread
      res.comment = res.comment || comment
      res.file = res.file || file

      // prioritize matches in thread titles over comments and filenames
      if (!comment && !file) {
        res.strength += 2
      } else {
        res.strength += 1
      }
      count += 1

      // count occurrences under the lowercased term so the lookup in
      // serialize_matches() (which iterates lexicon keys) finds it
      lex_counts[t] = lex_counts[t] || 0
      lex_counts[t] += 1
    })

  return count
}

var put_total = 0

// Write every non-stopword term and its serialized matches to the search db.
function lexicon_store () {
  console.log('writing db...')
  Object.keys(lexicon).forEach((term) => {
    if (STOPWORDS.has(term)) return
    var serialized = serialize_matches(term)
    if (!serialized) return
    if ((put_total % 5000) === 0) console.log(put_total + '...')
    put_total += 1
    search_db.put(term, serialized)
  })
}

// Serialize a term's matches as comma-separated entries of
// "<thread> <comment> <file> <score>", where score is the match strength
// scaled by an inverse-frequency weight (log of the total term count over
// this term's count).
function serialize_matches (term) {
  var matches = lexicon[term]
  var idf = Math.log(total / lex_counts[term])
  var serialized_matches = []

  Object.values(matches).forEach((match) => {
    if (!match) return
    serialized_matches.push([
      match.thread,
      match.comment,
      match.file,
      match.strength * idf
    ].join(' '))
  })

  if (!serialized_matches.length) return
  return serialized_matches.join(',')
}
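
// Query-side sketch (not used by this build script): assuming search_db.get()
// returns the exact string written by serialize_matches() above, a stored
// entry can be decoded back into match objects roughly like this. The name
// deserialize_matches and the object shape are illustrative, not part of the
// module's API.
function deserialize_matches (serialized) {
  return serialized.split(',').map((entry) => {
    var parts = entry.split(' ')
    return {
      thread: parseInt(parts[0], 10),
      comment: parseInt(parts[1], 10),
      file: parseInt(parts[2], 10),
      score: parseFloat(parts[3])
    }
  })
}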