path: root/bucky/search/lexicon.js
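
// lexicon.js -- builds an inverted search index over thread titles,
// comments, and file names, then stores one serialized entry per term
// in redis. Match strength is a simple term frequency (thread titles
// weighted double), scaled by an IDF factor at serialization time.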
// Load environment variables; recent dotenv versions expose config()
// (the old load() alias was removed).
require('dotenv').config();

var STOPWORDS = require('./stopwords');
var db = require('../db');
var redisClient = require('./redis-client');

// In-memory inverted index: term -> { threadId -> match record }
var lexicon = {};
// Occurrences per term, used for the IDF weighting in serialize_matches
var lex_counts = {};
// Running count of all terms seen across threads, comments, and files
var total = 0;

module.exports = { build: build_index };
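// Usage sketch (illustrative, not part of the original file): build()
// returns a promise resolving to the final counts, so a caller might do:
//   require('./lexicon').build().then((stats) => {
//     console.log(stats.total + ' words, ' + stats.unique + ' unique')
//   })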

function build_index() {
  console.log('building index');
  return parse_threads()
    .then(parse_comments)
    .then(parse_files)
    .then(() => {
      var unique = Object.keys(lexicon).length;
      console.log('--- WORD COUNT: ', total);
      console.log('--- UNIQUE WORDS: ', unique);
      // note: lexicon_store fires redis writes without awaiting them
      lexicon_store();
      console.log('Done!');
      return { total, unique };
    });
}

// Index thread titles (weighted more heavily in parse_terms).
function parse_threads() {
  return db.Thread.where('id', '>', 1).fetchAll().then((threads) => {
    console.log('got threads', threads.length);
    threads.forEach((thread) => {
      total += parse_terms({
        string: thread.get('title'),
        thread: thread.get('id'),
      });
    });
  });
}

// Index comment bodies, keyed by their parent thread.
function parse_comments() {
  return db.Comment.where('thread', '>', 1).fetchAll().then((comments) => {
    console.log('got comments', comments.length);
    comments.forEach((comment) => {
      total += parse_terms({
        string: comment.get('comment').toString(),
        thread: comment.get('thread'),
        comment: comment.get('id'),
      });
    });
  });
}

// Index file names, keyed by their parent thread.
function parse_files() {
  return db.File.fetchAll().then((files) => {
    console.log('got files', files.length);
    files.forEach((file) => {
      total += parse_terms({
        string: file.get('filename'),
        thread: file.get('thread'),
        file: file.get('id'),
      });
    });
  });
}

// Treat underscores as word separators, then split on any run of
// non-alphanumeric characters.
var underscoreRegexp = /_/g;
var spaceRegexp = /[^a-zA-Z0-9]+/g;
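// Example (illustrative): "photo_of_cat-v2.JPG" splits into
// ["photo", "of", "cat", "v2", "JPG"]; each token is lowercased below.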

function parse_terms (opt) {
  var thread = opt.thread;
  var comment = opt.comment || 0;
  var file = opt.file || 0;
  var string = opt.string;
  if (!string || !thread) return 0;
  var count = 0;
  string
    .replace(underscoreRegexp, ' ')
    .split(spaceRegexp)
    .forEach((term) => {
      var t = term.toLowerCase();
      if (!t) return; // splitting can yield empty strings at the edges
      var lookup = lexicon[t] = lexicon[t] || {};
      var res = lookup[thread] = lookup[thread] || { strength: 0 };
      res.thread = res.thread || thread;
      res.comment = res.comment || comment;
      res.file = res.file || file;
      // prioritize matches in thread titles over comments and files
      if (!comment && !file) {
        res.strength += 2;
      } else {
        res.strength += 1;
      }
      count += 1;
      // count occurrences under the lowercased key, so the IDF lookup in
      // serialize_matches (which iterates lexicon keys) stays consistent
      lex_counts[t] = (lex_counts[t] || 0) + 1;
    });
  return count;
}
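
// Example of the in-memory shape after parsing (illustrative values):
//   lexicon['cat']    = { '42': { thread: 42, comment: 0, file: 7, strength: 3 } }
//   lex_counts['cat'] = 3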

var put_total = 0;

// Write one redis key per term, skipping stopwords and empty entries.
function lexicon_store () {
  console.log('writing db...');
  Object.keys(lexicon).forEach((term) => {
    if (STOPWORDS.has(term)) return;
    var serialized = serialize_matches(term);
    if (!serialized) return;
    if ((put_total % 5000) === 0) console.log(put_total + '...'); // progress
    put_total += 1;
    // if (put_total > 10) return
    // console.log(term)
    redisClient.set(term, serialized);
  });
}
// Serialize a term's matches as space-separated "thread comment file score"
// records joined by commas. Strength is scaled by the term's IDF
// (log of total words over this term's occurrences) so rarer terms score higher.
function serialize_matches (term) {
  var matches = lexicon[term];
  var idf = Math.log(total / lex_counts[term]);
  var serialized_matches = [];
  Object.values(matches).forEach((match) => {
    if (!match) return;
    var s = [
      match.thread,
      match.comment,
      match.file,
      match.strength * idf
    ].join(' ');
    if (s) serialized_matches.push(s);
  });
  if (!serialized_matches.length) return;
  return serialized_matches.join(',');
}
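
// Illustrative only: a query-side reader for the format written above.
// Assumes redisClient follows the classic node_redis callback API; the
// function name lookup_term is hypothetical, not part of the original file.
function lookup_term (term, cb) {
  redisClient.get(term, (err, reply) => {
    if (err || !reply) return cb(err, []);
    var matches = reply.split(',').map((record) => {
      var parts = record.split(' ').map(Number);
      return { thread: parts[0], comment: parts[1], file: parts[2], score: parts[3] };
    });
    cb(null, matches);
  });
}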