summaryrefslogtreecommitdiff
path: root/scripts/fix_fav_counts.py
blob: 0472488093b9bab09445c56b5d1f05febbcb5a70 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
"""
   this needs python 3 due to py-postgresql...

   before running this command, please run the following SQL, which tallies the faves per post:

   COPY (SELECT messages.user_id, tags.message_id, COUNT(tags.message_id)
                AS mycount, TO_CHAR(messages.created_on, 'YYYYMMDD')
           FROM tags, messages
          WHERE tags.message_id = messages.message_id AND tags.tag = 'favorite'
       GROUP BY tags.message_id, messages.user_id, messages.created_on)
   TO '/tmp/fav_counts69.csv' WITH CSV;

   this file will be owned by the postgres user,
   so change the 69 to some other number (janky i know)

   then run ./sort_faves.sh /tmp/fav_counts69.csv
   ...which will pre-sort the data for this script.

   then run python3 fix_fav_counts.py
   ...this script.

   if you run this twice, don't worry, data will not be duplicated.
"""

import re
import sys
import postgresql
import redis
import csv

# Connection to the dumpfm Postgres database (py-postgresql "pq://" URL).
# NOTE(review): credentials are hard-coded — fine for a one-off fixup
# script, but don't reuse this pattern elsewhere.
db = postgresql.open("pq://postgres:root@localhost/dumpfm")
db.execute("SET CLIENT_ENCODING to 'UNICODE'")

# Redis instance holding the sorted sets ("favscores", "popular:*",
# "hall", "hall:daily:*") that this script rebuilds.
r = redis.Redis("192.168.156.111")

def fetch_users():
  """Prepare the user-listing query and return its chunked-rows iterator.

  The caller invokes the returned ``chunks`` attribute to stream
  (user_id, nick) rows in batches.
  """
  prepared = db.prepare("""SELECT user_id, nick FROM users""")
  return prepared.chunks

# by_date.csv  by_user.csv  counts_sorted.csv  hall.csv
# field order: user_id, message_id, score, date

def load_faves_by_user():
  """Rebuild the per-user "favscores" and "popular:<nick>" sorted sets.

  Reads faves/by_user.csv, which must be pre-sorted by user_id (see
  sort_faves.sh). Column order: user_id, message_id, score, date.
  For each user this sums the per-post fave counts into the global
  "favscores" zset and stores that user's first ~30 posts in a
  "popular:<nick>" zset.

  NOTE(review): zadd(key, member, score) is the pre-3.0 redis-py
  argument order; modern redis-py requires zadd(key, {member: score}).
  Confirm the installed client version before rerunning.
  """
  print("fixing favscores...")
  nicks = load_nicks()
  counter = 0        # rows written for the current user (caps "popular:*")
  user_counter = 0   # distinct users seen, for progress output
  score = 0          # running fave total for the current user
  user_id = 0
  key = ""
  with open('faves/by_user.csv', 'r') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
      if row[0] != user_id:
        # User changed: flush the previous user's accumulated total.
        if score != 0:
          r.zadd("favscores", nicks[user_id], score)
        counter = 0
        user_counter += 1
        score = 0
        user_id = row[0]
        key = "popular:" + nicks[user_id]
        if (user_counter % 1000) == 0:
          print(str(user_counter) + " ...")
      score += int(row[2])
      if counter > 30:
        continue
      r.zadd(key, row[1], int(row[2]))
      counter += 1
  # BUG FIX: the original only flushed totals on a user-id change, so
  # the last user in the file never made it into "favscores".
  if score != 0:
    r.zadd("favscores", nicks[user_id], score)

def load_faves_by_date():
  """Populate the per-day "hall:daily:<YYYYMMDD>" sorted sets.

  Reads faves/by_date.csv, pre-sorted by date (column order:
  user_id, message_id, score, date). Only the first ~30 rows seen
  for each day are written.
  """
  print("fixing daily halls...")
  days_seen = 0
  rows_this_day = 0
  current_date = ""
  redis_key = ""
  with open('faves/by_date.csv', 'r') as infile:
    for record in csv.reader(infile):
      row_date = record[3]
      if row_date != current_date:
        rows_this_day = 0
        days_seen += 1
        current_date = row_date
        redis_key = "hall:daily:" + row_date
        # Print progress roughly once per month (day-of-month == 01).
        if (int(row_date) % 100) == 1:
          print(redis_key)
      if rows_this_day > 30:
        continue
      r.zadd(redis_key, record[1], int(record[2]))
      rows_this_day += 1

def load_hall():
  """Rebuild the all-time "hall" sorted set from faves/hall.csv.

  Every row (user_id, message_id, score, date) contributes its
  message_id with the given score.
  """
  print("fixing hall...")
  with open('faves/hall.csv', 'r') as infile:
    for record in csv.reader(infile):
      r.zadd('hall', record[1], int(record[2]))

def load_nicks():
  """Return a dict mapping str(user_id) -> nick for every user.

  Keys are stringified because the CSV rows compare user ids as text.
  """
  mapping = {}
  for chunk in fetch_users()():
    for user_id, nick in chunk:
      mapping[str(user_id)] = nick
  return mapping

if __name__ == "__main__":
  # Rebuild each Redis structure in turn; all three are idempotent
  # (zadd overwrites scores), so rerunning the script is safe, as the
  # module docstring promises.
  load_hall()
  load_faves_by_user()
  load_faves_by_date()
  print("done!")