summaryrefslogtreecommitdiff
path: root/bucky/search/lexicon.js
diff options
context:
space:
mode:
Diffstat (limited to 'bucky/search/lexicon.js')
-rw-r--r--bucky/search/lexicon.js120
1 file changed, 120 insertions, 0 deletions
diff --git a/bucky/search/lexicon.js b/bucky/search/lexicon.js
new file mode 100644
index 0000000..2cf0f21
--- /dev/null
+++ b/bucky/search/lexicon.js
@@ -0,0 +1,120 @@
+require('dotenv').load();
+
+var STOPWORDS = require('./stopwords')
+var bdb = require('./bdb')
+var db = require('../db')
+
+var lexicon = {}
+var total = 0
+
+build_index()
+
+function build_index() {
+ parse_threads()
+ .then(parse_comments)
+ .then(parse_files)
+ .then( () => {
+ var unique = Object.keys(lexicon).length
+ console.log( "--- WORD COUNT: ", total );
+ console.log( "--- UNIQUE WORDS: ", unique );
+ lexicon_store();
+ console.log( "Done!")
+ process.exit()
+ })
+}
+function parse_threads() {
+ return db.Thread.where('id', '>', 1).fetchAll().then( (threads) => {
+ console.log('got threads', threads.length)
+ threads.forEach( (thread) => {
+ total += parse_terms({
+ string: thread.get('title'),
+ thread: thread.get('id'),
+ })
+ })
+ })
+}
+function parse_comments() {
+ return db.Comment.where('thread', '>', 1).fetchAll().then( (comments) => {
+ console.log('got comments', comments.length)
+ comments.forEach( (comment) => {
+ total += parse_terms({
+ string: comment.get('comment').toString(),
+ thread: comment.get('thread'),
+ comment: comment.get('id'),
+ })
+ })
+ })
+}
+function parse_files() {
+ return db.File.fetchAll().then( (files) => {
+ console.log('got files', files.length)
+ files.forEach( (file) => {
+ total += parse_terms({
+ string: file.get('filename'),
+ thread: file.get('thread'),
+ file: file.get('id'),
+ })
+ })
+ })
+}
+
+var underscoreRegexp = new RegExp('_', 'g')
+var spaceRegexp = new RegExp('[^a-zA-Z]+', 'g')
+
+function parse_terms (opt) {
+ var thread = opt.thread
+ var comment = opt.comment || 0
+ var file = opt.file || 0
+ var string = opt.string
+ if (!string || !thread) return 0
+ var count = 0
+ var terms = string
+ .replace(underscoreRegexp, ' ')
+ .split(spaceRegexp)
+ .forEach((term) => {
+ var t = term.toLowerCase()
+ var lookup = lexicon[t] = lexicon[t] || {}
+ var res = lookup[thread] = lookup[thread] || { strength: 0 }
+ res.thread = res.thread || thread
+ res.comment = res.comment || comment
+ res.file = res.file || file
+ if (!comment || !file) {
+ res.strength += 2
+ }
+ else {
+ res.strength += 1
+ }
+ count += 1
+ })
+ return count || 0
+}
+
+var put_total = 0
+function lexicon_store () {
+ console.log('writing db...')
+ Object.keys(lexicon).forEach( (term) => {
+ if (STOPWORDS.has(term)) return
+ var serialized = serialize_matches(lexicon[term]);
+ if (! serialized) return;
+ if ((put_total % 5000) === 0) console.log(put_total + '...')
+ put_total += 1
+ // if (put_total > 10) return
+ // console.log(term)
+ bdb.put(term, serialized)
+ })
+}
+function serialize_matches (matches) {
+ var serialized_matches = [];
+ Object.values(matches).forEach( (match) => {
+ if (!match) return
+ var s = [
+ match.thread,
+ match.comment,
+ match.file,
+ match.strength
+ ].join(' ')
+ if (s) serialized_matches.push(s)
+ })
+ if (!serialized_matches.length) return
+ return serialized_matches.join(',')
+} \ No newline at end of file