1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
|
(ns feed
(:import java.util.Date)
(:require [clojure.contrib.str-utils2 :as s])
(:use clojure.contrib.condition
clojure.contrib.duck-streams
clojure.contrib.seq-utils
clojure.contrib.sql
compojure
rooms
scheduled-agent
utils))
(def *feeds-path* "docs/feeds.csv")
(defn parse-line [line]
(let [r (s/split line #",")]
(zipmap [:room-key :desc :feed-link :site-link :contact]
(map #(.trim (.replaceAll % "\"" ""))
r))))
(defn read-feeds []
(rest (map parse-line (read-lines *feeds-path*))))
(def *image-posted-qry* "
SELECT *
FROM UNNEST(?) as v
WHERE NOT EXISTS (SELECT 1
FROM feed_images f
WHERE f.image_url = v
AND f.room_id = ?)
")
(defn filter-posted-images [urls room-id]
(if (empty? urls)
[]
(map :v
(do-select [*image-posted-qry*
(sql-array "text" urls)
room-id]))))
(defn insert-feed-image-to-db! [room-id feed img user-id]
(with-connection *db*
(transaction
(let [acc (comp :message_id first)
m-id (acc
(do-select ["INSERT INTO messages (user_id, room_id, content, is_image)
VALUES (?, ?, ?, true) RETURNING message_id"
user-id room-id img]))]
(do-prepared "INSERT INTO feed_images
(feed_url, image_url, room_id, message_id)
VALUES (?, ?, ?, ?)"
[feed img room-id m-id])
m-id))))
; http://stackoverflow.com/questions/169625/regex-to-check-if-valid-url-that-ends-in-jpg-png-or-gif
(def *image-regex*
#"(?i)https?://(?:[a-z0-9\-]+\.)+[a-z]{2,6}(?:/[^/#?]+)+\.(?:jpeg|jpg|gif|png)")
(defn extract-images [text]
(re-seq *image-regex* text))
(defn is-thumbnail? [img]
(boolean (re-find #"(?i)[-._](thumb|small|thumbs)[-._]" img)))
(def image-filters [["THUMBNAIL" is-thumbnail?]])
(defn filter-image [img]
(or (some
(fn [[r f]] (if (f img) [img r]))
image-filters)
[img nil]))
(defn classify-images [imgs]
(let [good? (comp not boolean second)
res (group-by good? (map filter-image imgs))]
[(map first (res true))
(res false)]))
(defn classify-images-from-feed [feed]
(let [[ms text] (with-timing (download-http-url feed))
[g b] (classify-images (extract-images text))]
[g b ms]))
(defn process-feed [f]
(let [room-key (:room-key f)
room-id (get-or-create-room! room-key)
[bot-nick bot-id] (get-or-create-room-bot! room-key)
feed (:feed-link f)
[good bad time] (classify-images-from-feed feed)
filtered-good (filter-posted-images good room-id)]
(doseq [img filtered-good]
(println (format "Inserting %s into room-id %s" img room-key))
(let [msg-id (insert-feed-image-to-db! room-id feed img bot-id)
msg {:msg_id msg-id
:nick bot-nick
:created_on (new Date)
:content img}]
(dosync
(add-message msg (lookup-room room-key)))))))
(defn process-all-feeds! []
(doseq [f (shuffle (read-feeds))]
(try
(if (and (:room-key f) (:feed-link f))
(process-feed f)
(println "Incomplete feed " f))
(catch Exception e
(print-stack-trace e)))))
;; Feed download schedule
(def *feed-refresh-period-sec* (* 30 60))
;(def *feed-downloader*
; (scheduled-agent process-all-feeds!
; *feed-refresh-period-sec*
; nil))
;; Testing
(defn feed-test-page [session]
(if-vip
(html [:body
[:h1 "Feed Test"]
[:form {:action "/feed-test" :method "post"}
[:input {:type "text" :name "url"}]
[:input {:type "submit" :value "Send"}]]])))
(defn show-bad-images [imgs]
(for [[img reason] imgs]
[:div
reason
[:a {:href img}
[:img {:src img}]]]))
(defn show-good-images [imgs]
(for [img imgs]
[:div
[:a {:href img}
[:img {:src img}]]]))
(defn feed-test [session params]
(if-vip
(if-let [feed (params :url)]
(let [[slurp-ms text] (with-timing (download-http-url feed))
[process-ms imgs] (with-timing (extract-images text))
[good-imgs bad-imgs] (classify-images imgs)]
(html [:body
[:h1 (str "Images for " feed)]
[:div (format "Downloaded in %s ms" slurp-ms)]
[:div (format "Processed in %s ms" process-ms)]
[:hr]
[:h2 "Good Images"]
(show-good-images good-imgs)
[:hr]
[:h2 "Filtered Out Images"]
(show-bad-images bad-imgs)
[:hr]
[:h2 "Raw Feed Contents"]
[:pre (escape-html text)]]))
(redirect-to "/feed-test"))))
|