summaryrefslogtreecommitdiff
path: root/src/feed.clj
blob: c8454d02d5adcb3cccac0ee1f7fd7a69074fb5e6 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
(ns feed
  (:import java.net.URL
           java.util.Date
           javax.imageio.ImageIO
           com.sun.syndication.io.SyndFeedInput
           com.sun.syndication.io.XmlReader
           org.htmlcleaner.HtmlCleaner)
  (:require [clojure.contrib.str-utils2 :as s])
  (:use clojure.contrib.condition
        clojure.contrib.duck-streams
        clojure.contrib.seq-utils
        clojure.contrib.sql
        clojure.contrib.core
        compojure
        rooms
        scheduled-agent
        utils))

;; CSV file listing the feeds to poll; each row is
;; room-key, description, feed URL, site link, contact (see parse-line).
(def *feeds-path* "docs/feeds.csv")

(defn queue-image!
  "Record one feed image in the feed_images table, queued for posting
   to the room identified by room-key."
  [room-key img]
  (let [{:keys [feed-url link title url]} img]
    (do-insert :feed_images
               [:feed_url :link :title :image_url :room]
               [feed-url link title url room-key])))


(defn parse-line
  "Parse one comma-separated line of the feeds file into a map,
   stripping double quotes and surrounding whitespace from each field."
  [line]
  (let [fields (map #(-> % (.replaceAll "\"" "") .trim)
                    (s/split line #","))]
    (zipmap [:room-key :desc :feed-url :site-link :contact] fields)))

(defn read-feeds
  "Read every feed definition from *feeds-path*, skipping the header row."
  []
  (->> (read-lines *feeds-path*)
       (map parse-line)
       rest))

;; Given an array of candidate URLs (param 1) and a room key (param 2),
;; returns the URLs that are neither already queued/posted for that room
;; (feed_images) nor globally blacklisted (invalid_feed_images).
(def *image-posted-qry* "
SELECT v
FROM   UNNEST(?) as v
WHERE  NOT EXISTS (SELECT 1 
                   FROM   feed_images f
                   WHERE  f.image_url = v
                   AND f.room = ?)
AND    NOT EXISTS (SELECT 1
                   FROM invalid_feed_images
                   WHERE image_url = v)
")

(defn filter-posted-urls
  "Return the subset of urls not yet posted to room-key and not known-bad,
   as a set. Short-circuits to an empty set when urls is empty."
  [room-key urls]
  (if (seq urls)
    (->> (do-select [*image-posted-qry*
                     (sql-array "text" urls)
                     room-key])
         (map :v)
         set)
    #{}))

(defn insert-feed-image-to-db!
  "In a single transaction, insert a chat message for img and record the
   (feed, image, room, message) association in feed_images.
   Returns the new message id."
  [room-id feed img user-id]
  (with-connection *db*
   (transaction
    (let [message-id (insert-message-into-db! user-id room-id img true)]
      (do-prepared "INSERT INTO feed_images
                   (feed_url, image_url, room_id, message_id)
                   VALUES (?, ?, ?, ?)"
                   [feed img room-id message-id])
      message-id))))

(defn rome-feed
  "Fetch and parse the RSS/Atom feed at url using the ROME library."
  [url]
  (let [input (SyndFeedInput.)]
    (.build input (XmlReader. (URL. url)))))

(defn html-clean
  "Parse an HTML string into an HtmlCleaner DOM node."
  [html]
  (let [cleaner (HtmlCleaner.)]
    (.clean cleaner html)))

(defn- filter-seq
  "Drop nils from s; return the remaining items as a seq, or nil when
   nothing is left (so the result works directly in an `or` chain)."
  [s]
  (seq (filter (complement nil?) s)))

; Matches an http(s) URL whose path ends in a common image extension.
; http://stackoverflow.com/questions/169625/regex-to-check-if-valid-url-that-ends-in-jpg-png-or-gif
; TODO: use site.clj regex
(def image-regex
     #"(?i)https?://(?:[a-z0-9\-]+\.)+[a-z]{2,6}(?:/[^/#?]+)+\.(?:jpeg|jpg|gif|png|svg)")

(defn extract-linked-images
  "Collect the hrefs of <a> tags under node that point directly at an
   image file (per image-regex). Returns a seq of URLs, or nil when none."
  [node]
  (filter-seq
   (map (fn [anchor]
          (let [href (.getAttributeByName anchor "href")]
            (and href (re-matches image-regex href) href)))
        (.getElementsByName node "a" true))))

(defn extract-inline-images
  "Collect the src attribute of every <img> tag under node.
   Returns a seq of URLs, or nil when none are found."
  [node]
  (filter-seq
   (map #(.getAttributeByName % "src")
        (.getElementsByName node "img" true))))

(defn pull-images-from-html
  "Extract image URLs from an HTML fragment.
   Prefers linked images (assumed full-size), falling back to inline <img>
   tags. Returns a seq of URLs, or nil when the fragment has no images."
  [html]
  ;; Reuse the html-clean helper instead of constructing a second
  ;; HtmlCleaner inline (was duplicated here).
  (let [node (html-clean html)]
    (or (extract-linked-images node)
        (extract-inline-images node))))

(defn extract-images-from-entry
  "Parsing strategy is to first try to extract linked images, then try to
   extract inline images.
   The theory is that when an entry has linked images, they link to the full
   versions of included thumbnails. When there are no linked images, then the
   inline images are fullsize.
   Returns a seq of image URLs, or nil when the entry yields none.
   TODO: only extract a linked image if the anchor tag contains a child image tag
   TODO: try extracting images from other content nodes besides the first
   TODO: just download the suckers and test for image size as a last resort"
  [e]
  ;; -?> is nil-safe threading: it stops and yields nil as soon as any step
  ;; returns nil, so an entry with no description (or no contents) simply
  ;; falls through to the next alternative in the `or`.
  (or (-?> e .getDescription .getValue pull-images-from-html)
      (-?> e .getContents first .getValue pull-images-from-html)))

(defn extract-feed-images
  "Download feed-url and return one map per image found in its entries,
   carrying the feed/entry metadata needed to queue the image."
  [feed-url]
  (let [feed (rome-feed feed-url)]
    (for [entry     (.getEntries feed)
          image-url (extract-images-from-entry entry)]
      {:url        image-url
       :feed-url   feed-url
       :feed-title (or (.getTitle feed) "")
       :title      (or (.getTitle entry) "")
       :link       (.getLink entry)})))

;; Pixel bounds enforced by fetch-and-validate-image: images smaller than
;; min in BOTH dimensions, or larger than max in EITHER, are rejected.
(def min-image-dimensions {:height 400 :width 400})
(def max-image-dimensions {:height 2000 :width 2000})

(defn fetch-and-validate-image
  "Download the image at url and check its dimensions against
   min-image-dimensions / max-image-dimensions.
   Returns nil when the image is acceptable, otherwise a string describing
   why it is invalid (unreadable, too small, or too big)."
  [url]
  (try
   (println "fetching " url)
   (let [img (ImageIO/read (new URL url))]
     ;; ImageIO/read returns nil when the URL's content has no registered
     ;; image reader; previously that NPE'd on .getHeight and the NPE's nil
     ;; message made the image look valid.
     (if (nil? img)
       "UNREADABLE (no image decoder)"
       (let [h (.getHeight img)
             w (.getWidth img)]
         (cond (and (< h (:height min-image-dimensions))
                    (< w (:width min-image-dimensions))) (format "TOO SMALL (%sx%s)" h w)
               (or (> h (:height max-image-dimensions))
                   (> w (:width max-image-dimensions))) (format "TOO BIG (%sx%s)" h w)
               :else nil))))
   ;; Some exceptions (notably NPE) carry a nil message; fall back to the
   ;; exception's string form so a failure never reads as "valid".
   (catch Exception e (or (.getMessage e) (str e)))))

(defn fetch-and-queue-feed!
  "Download one feed, skip images already posted to room-key or globally
   blacklisted, validate the rest, and either queue each image or record
   the rejection reason in invalid_feed_images."
  [{feed-url :feed-url room-key :room-key}]
  (let [images     (extract-feed-images feed-url)
        fresh-urls (filter-posted-urls room-key (map :url images))]
    (doseq [img images
            :when (contains? fresh-urls (:url img))]
      (if-let [reason (fetch-and-validate-image (:url img))]
        (do-insert :invalid_feed_images [:image_url :reason] [(:url img) reason])
        (queue-image! room-key img)))))

(defn fetch-and-queue-all-feeds!
  "Fetch and queue every configured feed. Failures are printed but not
   propagated, so one broken feed cannot stop the rest."
  []
  (doseq [f (read-feeds)]
    (try
     ;; `when` instead of a one-armed `if`; skip rows missing required fields.
     (when (and (:room-key f) (:feed-url f))
       (fetch-and-queue-feed! f))
     (catch Exception e
       (print-stack-trace e)))))

;; Image posting

(defn- post-queued-message-db-txn!
  "In one transaction, insert a chat message for image-url and claim the
   matching un-posted feed_images row by stamping it with the message id.
   Returns the new message id on success; rolls back the whole transaction
   and returns false when exactly one row was not claimed (e.g. the row was
   already posted or missing)."
  [bot-id room-key room-id image-url]
  (with-connection *db*
    (transaction
     (let [msg-id       (insert-message-into-db! bot-id room-id image-url true)
           update-count (first
                         (update-values :feed_images
                                        ["room = ? AND image_url = ? AND message_id IS NULL"
                                         room-key image-url]
                                        {:message_id msg-id}))]
       ;; Anything other than exactly one claimed row means the message
       ;; insert must be undone too — hence set-rollback-only.
       (if (= update-count 1)
         msg-id
         (do (set-rollback-only)
             false))))))


(defn post-queued-messages!
  "For every room that has queued feed images, post the oldest un-posted
   image as a message from the room's bot, then mirror the message into the
   in-memory room state. Intended to be run periodically (see feed-inserter)."
  []
  (doseq [room-key (map :room (do-select ["SELECT DISTINCT(room) FROM feed_images
                                           WHERE message_id IS NULL"]))]
    (let [room-id            (get-or-create-room! room-key)
          [bot-nick bot-id]  (get-or-create-room-bot! room-key)]
      ;; Oldest queued image for this room, if any remain.
      (if-let [msg           (first
                              (do-select ["SELECT * FROM feed_images
                                           WHERE room = ? AND message_id IS NULL
                                           ORDER BY queued_on ASC LIMIT 1" room-key]))]
        ;; DB transaction returns the message id, or false if the row could
        ;; not be claimed (see post-queued-message-db-txn!).
        (if-let [msg-id (post-queued-message-db-txn! bot-id room-key room-id (:image_url msg))]
          (do (dosync
               (add-message (build-msg bot-nick (:image_url msg) msg-id)
                            (lookup-room room-key)))
              (println "Inserted" (:image_url msg) "into" room-key))
          (println "error inserting" msg))))))
           


;; Feed download and insertion schedule

;; Schedule periods, in seconds: feeds are re-downloaded hourly and one
;; queued image per room is posted every two minutes.
(def feed-refresh-period-sec (* 60 60))
(def feed-insert-period-sec (* 2 60))

;; Agent that re-downloads all feeds and queues fresh images on a schedule.
(def feed-downloader
     (scheduled-agent fetch-and-queue-all-feeds!
                      feed-refresh-period-sec
                      nil))

 (def feed-inserter
      (scheduled-agent post-queued-messages!
                       feed-insert-period-sec
                       nil))