blob: 8d27745782413f6cf096977be1fd18c4831fc2b9 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
|
(ns feed
(:import java.net.URL
java.util.Date
javax.imageio.ImageIO
com.sun.syndication.io.SyndFeedInput
com.sun.syndication.io.XmlReader
org.htmlcleaner.HtmlCleaner)
(:require [clojure.contrib.str-utils2 :as s])
(:use clojure.contrib.condition
clojure.contrib.duck-streams
clojure.contrib.seq-utils
clojure.contrib.sql
clojure.contrib.core
compojure
rooms
scheduled-agent
utils))
;; CSV file listing the feeds to poll. Columns (per parse-line):
;; room-key, desc, feed-url, site-link, contact. The first line is a
;; header row — read-feeds drops it with `rest`.
(def *feeds-path* "docs/feeds.csv")
(defn queue-image!
  "Records a feed image in feed_images so it can later be posted into the
   room identified by room-key."
  [room-key {:keys [feed-url link title url]}]
  (do-insert :feed_images
             [:feed_url :link :title :image_url :room]
             [feed-url link title url room-key]))
(defn parse-line
  "Parses one CSV line from the feeds file into a map with keys
   :room-key :desc :feed-url :site-link :contact.
   Splits only on commas that fall OUTSIDE double-quoted fields (the
   lookahead requires an even number of quotes between the comma and end of
   line), so quoted values containing commas survive intact; the previous
   bare #\",\" split corrupted them. Quotes are then stripped and each field
   trimmed, exactly as before — unquoted lines parse identically."
  [line]
  (let [fields (s/split line #",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)")]
    (zipmap [:room-key :desc :feed-url :site-link :contact]
            (map #(.trim (.replaceAll % "\"" ""))
                 fields))))
(defn read-feeds
  "Reads *feeds-path* and returns a seq of feed maps (see parse-line);
   the file's first line is a header row and is skipped."
  []
  (->> (read-lines *feeds-path*)
       (map parse-line)
       rest))
;; Parameters: (1) an SQL array of candidate image URLs, (2) a room key.
;; Returns (as column v) the subset of URLs that have neither already been
;; queued for that room (feed_images) nor rejected globally
;; (invalid_feed_images). Used by filter-posted-urls.
(def *image-posted-qry* "
SELECT v
FROM UNNEST(?) as v
WHERE NOT EXISTS (SELECT 1
FROM feed_images f
WHERE f.image_url = v
AND f.room = ?)
AND NOT EXISTS (SELECT 1
FROM invalid_feed_images
WHERE image_url = v)
")
(defn filter-posted-urls
  "Returns, as a set, the urls not yet queued for room-key and not globally
   blacklisted. Short-circuits on empty input to avoid a pointless query."
  [room-key urls]
  (if (seq urls)
    (into #{}
          (map :v
               (do-select [*image-posted-qry*
                           (sql-array "text" urls)
                           room-key])))
    #{}))
(defn insert-feed-image-to-db! [room-id feed img user-id]
  ;; Inserts the image as a chat message and records the corresponding
  ;; feed_images row inside ONE transaction, so a failure cannot post a
  ;; message without marking the image as used (or vice versa).
  ;; Returns the new message id.
  (with-connection *db*
    (transaction
     (let [m-id (insert-message-into-db! user-id room-id img true)]
       (do-prepared "INSERT INTO feed_images
(feed_url, image_url, room_id, message_id)
VALUES (?, ?, ?, ?)"
                    [feed img room-id m-id])
       m-id))))
(defn rome-feed
  "Downloads and parses the RSS/Atom feed at url using the ROME library,
   returning a SyndFeed."
  [url]
  (->> url
       (URL.)
       (XmlReader.)
       (.build (SyndFeedInput.))))
(defn html-clean
  "Parses an HTML string into an HtmlCleaner DOM node."
  [html]
  (let [cleaner (HtmlCleaner.)]
    (.clean cleaner html)))
(defn- filter-seq
  "Drops nils from s and returns the survivors as a seq, or nil when none
   remain — so the result composes with `or` fallthrough."
  [s]
  (seq (keep identity s)))
; http://stackoverflow.com/questions/169625/regex-to-check-if-valid-url-that-ends-in-jpg-png-or-gif
; TODO: use site.clj regex
;; Matches an absolute http(s) URL whose path ends in a common image
;; extension (case-insensitive). Used with re-matches in
;; extract-linked-images, so the ENTIRE href must match.
(def image-regex
  #"(?i)https?://(?:[a-z0-9\-]+\.)+[a-z]{2,6}(?:/[^/#?]+)+\.(?:jpeg|jpg|gif|png|svg)")
(defn extract-linked-images
  "Returns a seq of hrefs from anchor tags in node that point directly at
   an image file (per image-regex), or nil when there are none."
  [node]
  (filter-seq
   (map (fn [anchor]
          (when-let [href (.getAttributeByName anchor "href")]
            (when (re-matches image-regex href)
              href)))
        (.getElementsByName node "a" true))))
(defn extract-inline-images
  "Returns a seq of src attributes from img tags in node, or nil when
   there are none."
  [node]
  (filter-seq
   (map #(.getAttributeByName % "src")
        (.getElementsByName node "img" true))))
(defn pull-images-from-html
  "Extracts candidate image URLs from an HTML fragment; nil when none.
   Prefers linked images over inline ones (see extract-images-from-entry
   for the rationale). Now delegates HTML parsing to the existing
   html-clean helper instead of duplicating (.clean (new HtmlCleaner) ...)
   inline."
  [html]
  (let [node (html-clean html)]
    (or (extract-linked-images node)
        (extract-inline-images node))))
(defn extract-images-from-entry
  "Extracts image URLs from a ROME feed entry, or nil when none are found.
   Tries the entry's description first, then its first content node. The
   theory: when an entry has linked images they point at the full-size
   versions of embedded thumbnails; when nothing is linked, the inline
   images themselves are full size.
   TODO: only extract a linked image if the anchor tag contains a child image tag
   TODO: try extracting images from other content nodes besides the first
   TODO: just download the suckers and test for image size as a last resort"
  [e]
  (letfn [(images-in [content]
            ;; nil-safe at every step, mirroring the original -?> chains
            (when content
              (when-let [html (.getValue content)]
                (pull-images-from-html html))))]
    (or (images-in (.getDescription e))
        (images-in (first (.getContents e))))))
(defn extract-feed-images
  "Downloads feed-url and returns a seq of image maps, one per image found
   in each entry: {:url :feed-url :feed-title :title :link}. Titles default
   to \"\" when the feed/entry has none."
  [feed-url]
  (let [feed (rome-feed feed-url)
        feed-title (or (.getTitle feed) "")]
    (for [entry (.getEntries feed)
          image-url (extract-images-from-entry entry)]
      {:url image-url
       :feed-url feed-url
       :feed-title feed-title
       :title (or (.getTitle entry) "")
       :link (.getLink entry)})))
;; fetch-and-validate-image rejects images smaller than this in BOTH
;; dimensions (likely thumbnails) and larger than this in EITHER dimension.
(def min-image-dimensions {:height 400 :width 400})
(def max-image-dimensions {:height 2000 :width 2000})
(defn fetch-and-validate-image
  "Downloads the image at url and checks its dimensions against
   min-image-dimensions / max-image-dimensions.
   Returns nil when the image is acceptable, otherwise a string describing
   the rejection (dimension failure or fetch/decode error)."
  [url]
  (try
    (println "fetching " url)
    (let [img (ImageIO/read (new URL url))]
      (if (nil? img)
        ;; ImageIO/read returns nil — it does not throw — when the bytes are
        ;; not decodable image data. Previously this fell through to an NPE
        ;; whose (.getMessage e) could be nil, which made the broken image
        ;; look VALID and got it queued.
        "NOT AN IMAGE"
        (let [h (.getHeight img)
              w (.getWidth img)]
          (cond (and (< h (:height min-image-dimensions))
                     (< w (:width min-image-dimensions)))
                (format "TOO SMALL (%sx%s)" h w)
                (or (> h (:height max-image-dimensions))
                    (> w (:width max-image-dimensions)))
                (format "TOO BIG (%sx%s)" h w)
                :else nil))))
    (catch Exception e
      ;; Guard against exceptions with a nil message, which would otherwise
      ;; be indistinguishable from "valid".
      (or (.getMessage e) (str e)))))
(defn fetch-and-queue-feed!
  "Pulls feed-url, skips images already queued for room-key or globally
   blacklisted, validates the rest, and either queues each image for the
   room or records the rejection reason in invalid_feed_images."
  [{feed-url :feed-url room-key :room-key}]
  (let [images (extract-feed-images feed-url)
        fresh-urls (filter-posted-urls room-key (map :url images))]
    (doseq [img images
            :when (contains? fresh-urls (:url img))]
      (if-let [reason (fetch-and-validate-image (:url img))]
        (do-insert :invalid_feed_images [:image_url :reason] [(:url img) reason])
        (queue-image! room-key img)))))
(defn fetch-and-queue-all-feeds!
  "Runs fetch-and-queue-feed! over every configured feed that has both a
   room key and a feed URL. A failure in one feed is printed and does not
   stop the others."
  []
  (doseq [feed (read-feeds)]
    (try
      (when (and (:room-key feed) (:feed-url feed))
        (fetch-and-queue-feed! feed))
      (catch Exception e
        (print-stack-trace e)))))
;; Image posting
(defn- post-queued-message-db-txn! [bot-id room-key room-id image-url]
  ;; Atomically posts the image as a chat message and "claims" the queued
  ;; feed_images row by stamping it with the new message id. If the row was
  ;; already claimed (update count != 1 — e.g. a concurrent poster got there
  ;; first) the whole transaction is rolled back and false is returned;
  ;; otherwise returns the new message id.
  (with-connection *db*
    (transaction
     (let [msg-id (insert-message-into-db! bot-id room-id image-url true)
           update-count (first
                         (update-values :feed_images
                                        ["room = ? AND image_url = ? AND message_id IS NULL"
                                         room-key image-url]
                                        {:message_id msg-id}))]
       (if (= update-count 1)
         msg-id
         (do (set-rollback-only)
             false))))))
(defn post-queued-messages! []
  ;; Drains at most ONE queued image per room per call: for every room with
  ;; unposted feed_images rows, posts the oldest one as the room's bot and
  ;; mirrors the new message into the in-memory room state. The scheduler
  ;; (feed-inserter) invokes this repeatedly to work through the queue.
  (doseq [room-key (map :room (do-select ["SELECT DISTINCT(room) FROM feed_images
WHERE message_id IS NULL"]))]
    (let [room-id (get-or-create-room! room-key)
          [bot-nick bot-id] (get-or-create-room-bot! room-key)]
      (if-let [msg (first
                    (do-select ["SELECT * FROM feed_images
WHERE room = ? AND message_id IS NULL
ORDER BY queued_on ASC LIMIT 1" room-key]))]
        ;; post-queued-message-db-txn! returns false when the row was
        ;; claimed concurrently; only mirror into memory on success.
        (if-let [msg-id (post-queued-message-db-txn! bot-id room-key room-id (:image_url msg))]
          (do (dosync
               (add-message (build-msg bot-nick (:image_url msg) msg-id)
                            (lookup-room room-key)))
              (println "Inserted" (:image_url msg) "into" room-key))
          (println "error inserting" msg))))))
;; Feed download and insertion schedule
;; Seconds between full feed re-downloads (feed-downloader).
(def feed-refresh-period-sec (* 60 60))
;; Seconds between draining queued images into rooms (feed-inserter).
(def feed-insert-period-sec 60)
;; Background agent that re-downloads all configured feeds every
;; feed-refresh-period-sec and queues any fresh images.
(def feed-downloader
  (scheduled-agent fetch-and-queue-all-feeds!
                   feed-refresh-period-sec
                   nil))
;; Background agent that posts queued images into rooms. Runs every
;; feed-insert-period-sec (60s). Bug fix: this previously passed
;; feed-refresh-period-sec (one hour) — leaving feed-insert-period-sec
;; entirely unused — so the queue drained at only one image per room per
;; hour instead of per minute.
(def feed-inserter
  (scheduled-agent post-queued-messages!
                   feed-insert-period-sec
                   nil))
;; Testing
;; (defn feed-test-page [session]
;; (if-vip
;; (html [:body
;; [:h1 "Feed Test"]
;; [:form {:action "/feed-test" :method "post"}
;; [:input {:type "text" :name "url"}]
;; [:input {:type "submit" :value "Send"}]]])))
;; (defn show-bad-images [imgs]
;; (for [[img reason] imgs]
;; [:div
;; reason
;; [:a {:href img}
;; [:img {:src img}]]]))
;; (defn show-good-images [imgs]
;; (for [img imgs]
;; [:div
;; [:a {:href img}
;; [:img {:src img}]]]))
;; (defn feed-test [session params]
;; (if-vip
;; (if-let [feed (params :url)]
;; (let [[slurp-ms text] (with-timing (download-http-url feed))
;; [process-ms imgs] (with-timing (extract-images text))
;; [good-imgs bad-imgs] (classify-images imgs)]
;; (html [:body
;; [:h1 (str "Images for " feed)]
;; [:div (format "Downloaded in %s ms" slurp-ms)]
;; [:div (format "Processed in %s ms" process-ms)]
;; [:hr]
;; [:h2 "Images"]
;; (show-good-images good-imgs)
;; [:hr]
;; [:h2 "Filtered Out Images"]
;; (show-bad-images bad-imgs)
;; [:hr]
;; [:h2 "Raw Feed Contents"]
;; [:pre (escape-html text)]]))
;; (redirect-to "/feed-test"))))
;; (redirect-to "/feed-test2"))))
|