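// Offline builder for the full-text search store.
//
// Walks every thread title, comment body, and file name in the main db,
// tokenizes them into a lexicon shaped like
//   term -> { threadId -> { thread, comment, file, strength } }
// and writes each term's serialized match list into the 'search' bdb store.
//
// Intended to run as a one-off script; with the export below it can be
// invoked as require('<this file>').build()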
require('dotenv').config() // load .env variables; config() supersedes the old load() alias
var STOPWORDS = require('./stopwords') // expected to be a Set of common words to skip
var bdb = require('./bdb')
var db = require('../db')

var search_db = bdb('search')
var lexicon = {} // term -> { threadId -> { thread, comment, file, strength } }
var total = 0    // running total of indexed terms

module.exports = { build: build_index }
function build_index () {
  parse_threads()
    .then(parse_comments)
    .then(parse_files)
    .then(() => {
      var unique = Object.keys(lexicon).length
      console.log('--- WORD COUNT:', total)
      console.log('--- UNIQUE WORDS:', unique)
      lexicon_store()
      console.log('Done!')
      // note: if search_db.put() is asynchronous, exiting here can cut
      // the final writes short; flushing or awaiting them first would be safer
      process.exit()
    })
    .catch((err) => {
      console.error(err)
      process.exit(1)
    })
}
function parse_threads () {
  return db.Thread.where('id', '>', 1).fetchAll().then((threads) => {
    console.log('got threads', threads.length)
    threads.forEach((thread) => {
      // index every word of the title against the thread itself
      total += parse_terms({
        string: thread.get('title'),
        thread: thread.get('id'),
      })
    })
  })
}
function parse_comments () {
  return db.Comment.where('thread', '>', 1).fetchAll().then((comments) => {
    console.log('got comments', comments.length)
    comments.forEach((comment) => {
      // comment bodies may come back as buffers, hence the toString()
      total += parse_terms({
        string: comment.get('comment').toString(),
        thread: comment.get('thread'),
        comment: comment.get('id'),
      })
    })
  })
}
function parse_files () {
  return db.File.fetchAll().then((files) => {
    console.log('got files', files.length)
    files.forEach((file) => {
      // file names are searchable too
      total += parse_terms({
        string: file.get('filename'),
        thread: file.get('thread'),
        file: file.get('id'),
      })
    })
  })
}
var underscoreRegexp = new RegExp('_', 'g')
var nonLetterRegexp = new RegExp('[^a-zA-Z]+', 'g')

// Tokenizes opt.string and folds each term into the lexicon, keyed by
// term and then by thread id. Returns how many terms were indexed.
function parse_terms (opt) {
  var thread = opt.thread
  var comment = opt.comment || 0
  var file = opt.file || 0
  var string = opt.string
  if (!string || !thread) return 0
  var count = 0
  string
    .replace(underscoreRegexp, ' ') // treat snake_case as separate words
    .split(nonLetterRegexp)         // split on any run of non-letters
    .forEach((term) => {
      var t = term.toLowerCase()
      if (!t) return // splitting can leave empty strings at the edges
      var lookup = lexicon[t] = lexicon[t] || {}
      var res = lookup[thread] = lookup[thread] || { strength: 0 }
      res.thread = res.thread || thread
      res.comment = res.comment || comment
      res.file = res.file || file
      // a hit in a thread title (no comment or file id) counts double
      if (!comment && !file) {
        res.strength += 2
      }
      else {
        res.strength += 1
      }
      count += 1
    })
  return count
}
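// For example, parse_terms({ string: 'My_first-post!', thread: 7 })
// indexes 'my', 'first', and 'post' under thread 7 and returns 3.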
var put_total = 0

// Writes every non-stopword term and its serialized match list into the
// search store, logging progress every 5000 puts. If bdb's put() is
// asynchronous, these writes race the process.exit() in build_index(),
// so batching or awaiting them would be more robust.
function lexicon_store () {
  console.log('writing db...')
  Object.keys(lexicon).forEach((term) => {
    if (STOPWORDS.has(term)) return
    var serialized = serialize_matches(lexicon[term])
    if (!serialized) return
    if ((put_total % 5000) === 0) console.log(put_total + '...')
    put_total += 1
    search_db.put(term, serialized)
  })
}
// Flattens one term's matches into a compact string: each match becomes
// "thread comment file strength" and matches are comma-separated,
// e.g. "12 0 0 2,12 340 0 1". Returns undefined when there is nothing
// worth storing.
function serialize_matches (matches) {
  var serialized_matches = []
  Object.values(matches).forEach((match) => {
    if (!match) return
    serialized_matches.push([
      match.thread,
      match.comment,
      match.file,
      match.strength
    ].join(' '))
  })
  if (!serialized_matches.length) return
  return serialized_matches.join(',')
}
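// A consumer would read matches back with the store's get (method name
// assumed here; check bdb's actual API), e.g.:
//   search_db.get('hello', (err, val) => {
//     // val looks like "12 0 0 2,12 340 0 1"; split on ',' then ' '
//   })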