summaryrefslogtreecommitdiff
path: root/bucky/search/lexicon.js
blob: 2b7a8a99770f1029d56a5c5fb23cd93fb651f912 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
require("dotenv").load();

var STOPWORDS = require("./stopwords");
var bdb = require("./bdb");
var db = require("../db");
var parse_term = require("./parse_term");

var search_db = bdb("search");

// In-memory inverted index, rebuilt from scratch on every build_index() run.
var lexicon = {};    // term -> { threadId -> { thread, comment, file, strength } }
var lex_counts = {}; // term -> Set of thread ids (document frequency)
var total = 0;       // running count of all indexed terms (corpus size for IDF)

// Public API: one-shot build, daily rebuild loop, or flush the on-disk store.
module.exports = {
  build: build_index,
  watch: watch_index,
  save: () => search_db.save(),
};

// Rebuild the index once every 24 hours.
var BUILD_DELAY = 1000 * 60 * 60 * 24;

/**
 * Build the search index immediately, then rebuild it on a fixed timer.
 *
 * The interval handle is deliberately discarded: there is no stop/unwatch
 * API, so the timer runs for the life of the process. (The original code
 * stored it in an unused local, which suggested a cancel path that never
 * existed.)
 */
function watch_index() {
  build_index();
  console.log(
    "rebuilding search index every",
    BUILD_DELAY / (60 * 60 * 1000),
    "hours",
  );
  setInterval(build_index, BUILD_DELAY);
}

/**
 * Rebuild the whole inverted index from the database — thread titles,
 * then comment bodies, then file names — and persist it via lexicon_store().
 *
 * The original signature declared a `cb` parameter that was never invoked;
 * it has been dropped (extra arguments are ignored in JS, so existing
 * callers are unaffected). Use the returned Promise instead.
 *
 * @returns {Promise<{total: number, unique: number}>} total term count and
 *   number of distinct indexed terms.
 */
function build_index() {
  console.log("building search index");
  // Start from a clean slate so entries from a prior build cannot go stale.
  lexicon = {};
  lex_counts = {};
  total = 0;
  return parse_threads()
    .then(parse_comments)
    .then(parse_files)
    .then(() => {
      var unique = Object.keys(lexicon).length;
      console.log("--- WORD COUNT: ", total);
      console.log("--- UNIQUE WORDS: ", unique);
      lexicon_store();
      console.log("Done!");
      return { total, unique };
    });
}
/**
 * Index the title of every thread with id > 1, accumulating into the
 * module-level lexicon and term counter.
 *
 * @returns {Promise<void>} resolves once all titles are tokenized.
 */
function parse_threads() {
  var query = db.Thread.where("id", ">", 1);
  return query.fetchAll().then(function (threads) {
    console.log("got threads", threads.length);
    threads.forEach(function (thread) {
      var indexed = parse_terms({
        string: thread.get("title"),
        thread: thread.get("id"),
      });
      total += indexed;
    });
  });
}
/**
 * Index the body of every comment attached to a thread with id > 1,
 * accumulating into the module-level lexicon and term counter.
 *
 * @returns {Promise<void>} resolves once all comments are tokenized.
 */
function parse_comments() {
  var query = db.Comment.where("thread", ">", 1);
  return query.fetchAll().then(function (comments) {
    console.log("got comments", comments.length);
    comments.forEach(function (comment) {
      var indexed = parse_terms({
        string: comment.get("comment").toString(),
        thread: comment.get("thread"),
        comment: comment.get("id"),
      });
      total += indexed;
    });
  });
}
/**
 * Index the filename of every uploaded file, accumulating into the
 * module-level lexicon and term counter.
 *
 * @returns {Promise<void>} resolves once all filenames are tokenized.
 */
function parse_files() {
  return db.File.fetchAll().then(function (files) {
    console.log("got files", files.length);
    files.forEach(function (file) {
      var indexed = parse_terms({
        string: file.get("filename"),
        thread: file.get("thread"),
        file: file.get("id"),
      });
      total += indexed;
    });
  });
}

// Underscores become spaces so snake_case identifiers split into words.
var underscoreRegexp = /_/g;
// Any run of non-alphanumeric characters delimits terms.
var spaceRegexp = /[^a-zA-Z0-9]+/g;

/**
 * Tokenize `opt.string` and fold each normalized term into the index.
 *
 * For each normalized term t = parse_term(token):
 *   - lexicon[t][thread] => {thread, comment, file, strength}
 *   - lex_counts[t]      => Set of thread ids (document frequency)
 *   - returns how many terms were indexed (caller adds to `total`)
 *
 * Fixes over the previous version:
 *   - The empty-token guard now runs BEFORE parse_term, and the parsed
 *     result `t` is also checked — previously a falsy parse result was
 *     indexed under the literal key "undefined".
 *   - lex_counts is keyed by the NORMALIZED term `t` (same key space as
 *     `lexicon`); it was keyed by the raw token, so serialize_matches —
 *     which iterates lexicon keys — found no frequency for any term the
 *     normalizer rewrote, and silently dropped it from the stored index.
 *   - Removed an unused `terms` local (forEach returns undefined) and a
 *     leftover debugging try/catch around Set.prototype.add.
 *
 * @param {{string: string, thread: number, comment?: number, file?: number}} opt
 * @returns {number} count of terms indexed from this string (0 if no-op).
 */
function parse_terms(opt) {
  var thread = opt.thread;
  var comment = opt.comment || 0;
  var file = opt.file || 0;
  var string = opt.string;
  if (!string || !thread) return 0;
  var count = 0;
  string
    .replace(underscoreRegexp, " ")
    .split(spaceRegexp)
    .forEach((term) => {
      if (!term) return; // split() yields empty strings at the edges
      var t = parse_term(term);
      if (!t) return; // normalization rejected this token
      var lookup = (lexicon[t] = lexicon[t] || {});
      var res = (lookup[thread] = lookup[thread] || { strength: 1 });
      res.thread = res.thread || thread;
      res.comment = res.comment || comment;
      res.file = res.file || file;
      // prioritize threads (title hits), then filenames, over comments
      if (!comment && !file) {
        res.strength += 4;
      } else if (file) {
        res.strength += 1.5;
      }
      count += 1;
      // Document frequency, keyed identically to `lexicon` so that
      // serialize_matches() can look both up with the same key.
      var seen = (lex_counts[t] = lex_counts[t] || new Set());
      seen.add(res.thread);
    });
  return count;
}

// Progress counter for the 5000-row log lines below; reset on every run.
var put_total = 0;

/**
 * Persist the in-memory lexicon to the on-disk search database.
 * Stopwords and terms with no serializable matches are skipped.
 *
 * Fix: put_total is now reset at the start of each run — it previously
 * accumulated across daily rebuilds, making the progress log meaningless
 * after the first build. (Note: save() is a separate exported step; this
 * only stages puts into search_db.)
 */
function lexicon_store() {
  console.log("writing db...");
  search_db.reset();
  put_total = 0;
  Object.keys(lexicon).forEach((term) => {
    if (STOPWORDS.has(term)) return;
    var serialized = serialize_matches(term);
    if (!serialized) return;
    if (put_total % 5000 === 0) console.log(put_total + "...");
    put_total += 1;
    search_db.put(term, serialized);
  });
}
/**
 * Serialize a term's match map into compact tuples with a TF-IDF-style
 * score: [thread, comment, file, round2(strength * idf)].
 *
 * Fixes: the old `if (s) ...push(s)` guard tested an array literal, which
 * is always truthy (dead code), and the function returned `null` for a
 * missing frequency but `undefined` for an empty match list — it now
 * returns `null` consistently (both are falsy, so callers are unaffected).
 *
 * @param {string} term - a normalized key present in `lexicon`.
 * @returns {Array<Array<number>>|null} serialized tuples, or null when the
 *   term has no document frequency or no matches.
 */
function serialize_matches(term) {
  var matches = lexicon[term];
  var lex_count = lex_counts[term]?.size || 0;
  if (!lex_count) return null;
  // Inverse document frequency: rarer terms score higher.
  var idf = Math.log(total / lex_count);
  var serialized = Object.values(matches)
    .filter(Boolean)
    .map((match) => [
      match.thread,
      match.comment,
      match.file,
      Number((match.strength * idf).toFixed(2)),
    ]);
  return serialized.length ? serialized : null;
}