// bucky/search/lexicon.js
require('dotenv').config()

var STOPWORDS = require('./stopwords')
var bdb = require('./bdb')
var db = require('../db')

var search_db = bdb('search')

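// In-memory index state, built up by the parse_* passes below:
//   lexicon:    term -> { threadId: { thread, comment, file, strength } }
//   lex_counts: term -> number of occurrences across all indexed text
//   total:      running count of every term occurrence seen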
var lexicon = {}
var lex_counts = {}
var total = 0

module.exports = { build: build_index }

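// Build the full inverted index: walk threads, then comments, then file
// names, accumulating term statistics in memory, and finally flush the
// result to the on-disk search store.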
function build_index() {
  console.log("building index")
  return parse_threads()
    .then(parse_comments)
    .then(parse_files)
    .then( () => {
      var unique = Object.keys(lexicon).length
      console.log( "--- WORD COUNT: ", total );
      console.log( "--- UNIQUE WORDS: ", unique );
      lexicon_store();
      console.log( "Done!")
      return { total, unique }
    })
}
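// Index thread titles. Only threads with id > 1 are fetched; id 1 appears
// to be excluded deliberately (presumably a seed or meta thread).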
function parse_threads() {
  return db.Thread.where('id', '>', 1).fetchAll().then( (threads) => {
    console.log('got threads', threads.length)
    threads.forEach( (thread) => {
      total += parse_terms({
        string: thread.get('title'),
        thread: thread.get('id'),
      })
    })
  })
}
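// Index comment bodies, recording both the owning thread and the comment id.
// The body is coerced to a string, presumably because the driver may return
// it as a buffer.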
function parse_comments() {
  return db.Comment.where('thread', '>', 1).fetchAll().then( (comments) => {
    console.log('got comments', comments.length)
    comments.forEach( (comment) => {
      total += parse_terms({
        string: comment.get('comment').toString(),
        thread: comment.get('thread'),
        comment: comment.get('id'),
      })
    })
  })
}
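// Index attachment filenames; underscores are common in filenames and are
// treated as word separators by parse_terms.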
function parse_files() {
  return db.File.fetchAll().then( (files) => {
    console.log('got files', files.length)
    files.forEach( (file) => {
      total += parse_terms({
        string: file.get('filename'),
        thread: file.get('thread'),
        file: file.get('id'),
      })
    })
  })
}

var underscoreRegexp = /_/g
var nonAlphaNumRegexp = /[^a-zA-Z0-9]+/g

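// Tokenize a string and record one weighted match per (term, thread) pair.
// Title terms (no comment or file id) add 2 to a match's strength, comment
// and filename terms add 1, so title hits rank higher. Returns the number
// of tokens indexed.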
function parse_terms (opt) {
  var thread = opt.thread
  var comment = opt.comment || 0
  var file = opt.file || 0
  var string = opt.string
  if (!string || !thread) return 0
  var count = 0
  string
    .replace(underscoreRegexp, ' ')
    .split(nonAlphaNumRegexp)
    .forEach((term) => {
      var t = term.toLowerCase()
      if (!t) return // splitting can leave empty strings at the edges
      var lookup = lexicon[t] = lexicon[t] || {}
      var res = lookup[thread] = lookup[thread] || { strength: 0 }
      res.thread = res.thread || thread
      res.comment = res.comment || comment
      res.file = res.file || file
      // prioritize threads
      if (!comment && !file) {
        res.strength += 2
      }
      else {
        res.strength += 1
      }
      count += 1
      // key counts by the same lowercased term used for the lexicon, so the
      // idf lookup in serialize_matches cannot miss on capitalized tokens
      lex_counts[t] = (lex_counts[t] || 0) + 1
    })
  return count
}

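// Flush the index: write every non-stopword term and its serialized match
// list to the search store; put_total drives the periodic progress log.
// Assuming bdb wraps a LevelDB-style store, each put() is asynchronous and
// fire-and-forget, so "Done!" can log before the final writes have settled.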
var put_total = 0
function lexicon_store () {
  console.log('writing db...')
  Object.keys(lexicon).forEach( (term) => {
    if (STOPWORDS.has(term)) return
    var serialized = serialize_matches(term)
    if (!serialized) return
    if ((put_total % 5000) === 0) console.log(put_total + '...')
    put_total += 1
    search_db.put(term, serialized)
  })
}
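// Serialize one term's matches as comma-separated "thread comment file score"
// tuples, e.g. "12 0 0 4.16,15 88 0 2.08" (values illustrative). The idf
// factor divides by raw occurrence counts rather than document counts, so
// the score is a tf-idf-style approximation rather than textbook idf.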
function serialize_matches (term) {
  var matches = lexicon[term]
  var idf = Math.log(total / lex_counts[term])
  var serialized_matches = []
  Object.values(matches).forEach( (match) => {
    if (!match) return
    var s = [
      match.thread,
      match.comment,
      match.file,
      match.strength * idf
    ].join(' ')
    serialized_matches.push(s)
  })
  if (!serialized_matches.length) return
  return serialized_matches.join(',')
}
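
/*
 * Usage sketch (hypothetical caller, assuming this module is driven from a
 * build script; build() resolves with the totals once the store pass starts):
 *
 *   require('./lexicon').build().then(function (stats) {
 *     console.log('indexed', stats.total, 'terms,', stats.unique, 'unique')
 *   })
 */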