diff options
Diffstat (limited to 'src/feed.clj')
| -rwxr-xr-x | src/feed.clj | 285 |
1 files changed, 187 insertions, 98 deletions
diff --git a/src/feed.clj b/src/feed.clj index 3a3cc35..8d27745 100755 --- a/src/feed.clj +++ b/src/feed.clj @@ -1,10 +1,16 @@ (ns feed - (:import java.util.Date) + (:import java.net.URL + java.util.Date + javax.imageio.ImageIO + com.sun.syndication.io.SyndFeedInput + com.sun.syndication.io.XmlReader + org.htmlcleaner.HtmlCleaner) (:require [clojure.contrib.str-utils2 :as s]) (:use clojure.contrib.condition clojure.contrib.duck-streams clojure.contrib.seq-utils clojure.contrib.sql + clojure.contrib.core compojure rooms scheduled-agent @@ -12,9 +18,15 @@ (def *feeds-path* "docs/feeds.csv") +(defn queue-image! [room-key img] + (do-insert :feed_images + [:feed_url :link :title :image_url :room] + [(:feed-url img) (:link img) (:title img) (:url img) room-key])) + + (defn parse-line [line] (let [r (s/split line #",")] - (zipmap [:room-key :desc :feed-link :site-link :contact] + (zipmap [:room-key :desc :feed-url :site-link :contact] (map #(.trim (.replaceAll % "\"" "")) r)))) @@ -22,137 +34,214 @@ (rest (map parse-line (read-lines *feeds-path*)))) (def *image-posted-qry* " -SELECT * +SELECT v FROM UNNEST(?) as v WHERE NOT EXISTS (SELECT 1 - FROM feed_images f + FROM feed_images f WHERE f.image_url = v - AND f.room_id = ?) + AND f.room = ?) +AND NOT EXISTS (SELECT 1 + FROM invalid_feed_images + WHERE image_url = v) ") -(defn filter-posted-images [urls room-id] +(defn filter-posted-urls [room-key urls] (if (empty? urls) - [] - (map :v - (do-select [*image-posted-qry* - (sql-array "text" urls) - room-id])))) + #{} + (set (map :v + (do-select [*image-posted-qry* + (sql-array "text" urls) + room-key]))))) (defn insert-feed-image-to-db! [room-id feed img user-id] (with-connection *db* (transaction - (let [acc (comp :message_id first) - m-id (acc - (do-select ["INSERT INTO messages (user_id, room_id, content, is_image) - VALUES (?, ?, ?, true) RETURNING message_id" - user-id room-id img]))] + (let [m-id (insert-message-into-db! user-id room-id img true)] (do-prepared "INSERT INTO feed_images (feed_url, image_url, room_id, message_id) VALUES (?, ?, ?, ?)" [feed img room-id m-id]) m-id)))) +(defn rome-feed [url] + (.build (new SyndFeedInput) (new XmlReader (new URL url)))) + +(defn html-clean [html] + (.clean (new HtmlCleaner) html)) + +(defn- filter-seq [s] + (seq (remove nil? s))) + ; http://stackoverflow.com/questions/169625/regex-to-check-if-valid-url-that-ends-in-jpg-png-or-gif -(def *image-regex* - #"(?i)https?://(?:[a-z0-9\-]+\.)+[a-z]{2,6}(?:/[^/#?]+)+\.(?:jpeg|jpg|gif|png)") +; TOOD: use site.clj regex +(def image-regex + #"(?i)https?://(?:[a-z0-9\-]+\.)+[a-z]{2,6}(?:/[^/#?]+)+\.(?:jpeg|jpg|gif|png|svg)") + +(defn extract-linked-images [node] + (filter-seq + (for [a (.getElementsByName node "a" true)] + (let [href (.getAttributeByName a "href")] + (when (and href (re-matches image-regex href)) + href))))) -(defn extract-images [text] - (re-seq *image-regex* text)) +(defn extract-inline-images [node] + (filter-seq + (for [a (.getElementsByName node "img" true)] + (.getAttributeByName a "src")))) -(defn is-thumbnail? [img] - (boolean (re-find #"(?i)[-._](thumb|small|thumbs)[-._]" img))) +(defn pull-images-from-html [html] + (let [node (.clean (new HtmlCleaner) html)] + (or (extract-linked-images node) + (extract-inline-images node)))) -(def image-filters [["THUMBNAIL" is-thumbnail?]]) +(defn extract-images-from-entry + "Parsing strategy is to first try to extract linked images, then try to + extract inline images. + The theory is that when a entry has linked images, they link to the full + versions of included thumbnails. When there are no linked images, then the + inline images are fullsize. + TODO: only extract a linked image if the anchor tag contains a child image tag + TODO: try extracting images from other content nodes besides the first + TODO: just download the suckers and test for image size as a last resort" + [e] + (or (-?> e .getDescription .getValue pull-images-from-html) + (-?> e .getContents first .getValue pull-images-from-html))) -(defn filter-image [img] - (or (some - (fn [[r f]] (if (f img) [img r])) - image-filters) - [img nil])) +(defn extract-feed-images [feed-url] + (let [feed (rome-feed feed-url)] + (for [e (.getEntries feed) + url (extract-images-from-entry e)] + {:url url + :feed-url feed-url + :feed-title (or (.getTitle feed) "") + :title (or (.getTitle e) "") + :link (.getLink e)}))) -(defn classify-images [imgs] - (let [good? (comp not boolean second) - res (group-by good? (map filter-image imgs))] - [(map first (res true)) - (res false)])) +(def min-image-dimensions {:height 400 :width 400}) +(def max-image-dimensions {:height 2000 :width 2000}) -(defn classify-images-from-feed [feed] - (let [[ms text] (with-timing (download-http-url feed)) - [g b] (classify-images (extract-images text))] - [g b ms])) +(defn fetch-and-validate-image [url] + (try + (println "fetching " url) + (let [img (ImageIO/read (new URL url)) + h (.getHeight img) + w (.getWidth img)] + (cond (and (< h (:height min-image-dimensions)) + (< w (:width min-image-dimensions))) (format "TOO SMALL (%sx%s)" h w) + (or (> h (:height max-image-dimensions)) + (> w (:width max-image-dimensions))) (format "TOO BIG (%sx%s)" h w) + :else nil)) + (catch Exception e (.getMessage e)))) -(defn process-feed [f] - (let [room-key (:room-key f) - room-id (get-or-create-room! room-key) - [bot-nick bot-id] (get-or-create-room-bot! room-key) - feed (:feed-link f) - [good bad time] (classify-images-from-feed feed) - filtered-good (filter-posted-images good room-id)] - (doseq [img filtered-good] - (println (format "Inserting %s into %s from %s" img room-key feed)) - (let [msg-id (insert-feed-image-to-db! room-id feed img bot-id)] - (dosync - (add-message (build-msg bot-nick img msg-id) - (lookup-room room-key))))))) +(defn fetch-and-queue-feed! [{feed-url :feed-url room-key :room-key}] + (let [images (extract-feed-images feed-url) + fresh-urls (filter-posted-urls room-key (map :url images)) + good-images (filter #(contains? fresh-urls (:url %)) images)] + (doseq [img good-images] + (if-let [reason (fetch-and-validate-image (:url img))] + (do-insert :invalid_feed_images [:image_url :reason] [(:url img) reason]) + (queue-image! room-key img))))) -(defn process-all-feeds! [] - (doseq [f (shuffle (read-feeds))] +(defn fetch-and-queue-all-feeds! [] + (doseq [f (read-feeds)] (try - (if (and (:room-key f) (:feed-link f)) - (process-feed f) - (println "Incomplete feed " f)) + (if (and (:room-key f) (:feed-url f)) + (fetch-and-queue-feed! f)) (catch Exception e (print-stack-trace e))))) -;; Feed download schedule +;; Image posting + +(defn- post-queued-message-db-txn! [bot-id room-key room-id image-url] + (with-connection *db* + (transaction + (let [msg-id (insert-message-into-db! bot-id room-id image-url true) + update-count (first + (update-values :feed_images + ["room = ? AND image_url = ? AND message_id IS NULL" + room-key image-url] + {:message_id msg-id}))] + (if (= update-count 1) + msg-id + (do (set-rollback-only) + false)))))) + + +(defn post-queued-messages! [] + (doseq [room-key (map :room (do-select ["SELECT DISTINCT(room) FROM feed_images + WHERE message_id IS NULL"]))] + (let [room-id (get-or-create-room! room-key) + [bot-nick bot-id] (get-or-create-room-bot! room-key)] + (if-let [msg (first + (do-select ["SELECT * FROM feed_images + WHERE room = ? AND message_id IS NULL + ORDER BY queued_on ASC LIMIT 1" room-key]))] + (if-let [msg-id (post-queued-message-db-txn! bot-id room-key room-id (:image_url msg))] + (do (dosync + (add-message (build-msg bot-nick (:image_url msg) msg-id) + (lookup-room room-key))) + (println "Inserted" (:image_url msg) "into" room-key)) + (println "error inserting" msg)))))) + + -(def *feed-refresh-period-sec* (* 20 60)) +;; Feed download and insertion schedule -(def *feed-downloader* - (scheduled-agent process-all-feeds! - *feed-refresh-period-sec* +(def feed-refresh-period-sec (* 60 60)) +(def feed-insert-period-sec 60) + +(def feed-downloader + (scheduled-agent fetch-and-queue-all-feeds! + feed-refresh-period-sec nil)) + (def feed-inserter + (scheduled-agent post-queued-messages! + feed-refresh-period-sec + nil)) + + ;; Testing -(defn feed-test-page [session] - (if-vip - (html [:body - [:h1 "Feed Test"] - [:form {:action "/feed-test" :method "post"} - [:input {:type "text" :name "url"}] - [:input {:type "submit" :value "Send"}]]]))) +;; (defn feed-test-page [session] +;; (if-vip +;; (html [:body +;; [:h1 "Feed Test"] +;; [:form {:action "/feed-test" :method "post"} +;; [:input {:type "text" :name "url"}] +;; [:input {:type "submit" :value "Send"}]]]))) -(defn show-bad-images [imgs] - (for [[img reason] imgs] - [:div - reason - [:a {:href img} - [:img {:src img}]]])) +;; (defn show-bad-images [imgs] +;; (for [[img reason] imgs] +;; [:div +;; reason +;; [:a {:href img} +;; [:img {:src img}]]])) -(defn show-good-images [imgs] - (for [img imgs] - [:div - [:a {:href img} - [:img {:src img}]]])) +;; (defn show-good-images [imgs] +;; (for [img imgs] +;; [:div +;; [:a {:href img} +;; [:img {:src img}]]])) -(defn feed-test [session params] - (if-vip - (if-let [feed (params :url)] - (let [[slurp-ms text] (with-timing (download-http-url feed)) - [process-ms imgs] (with-timing (extract-images text)) - [good-imgs bad-imgs] (classify-images imgs)] - (html [:body - [:h1 (str "Images for " feed)] - [:div (format "Downloaded in %s ms" slurp-ms)] - [:div (format "Processed in %s ms" process-ms)] - [:hr] - [:h2 "Good Images"] - (show-good-images good-imgs) - [:hr] - [:h2 "Filtered Out Images"] - (show-bad-images bad-imgs) - [:hr] - [:h2 "Raw Feed Contents"] - [:pre (escape-html text)]])) - (redirect-to "/feed-test")))) +;; (defn feed-test [session params] +;; (if-vip +;; (if-let [feed (params :url)] +;; (let [[slurp-ms text] (with-timing (download-http-url feed)) +;; [process-ms imgs] (with-timing (extract-images text)) +;; [good-imgs bad-imgs] (classify-images imgs)] +;; (html [:body +;; [:h1 (str "Images for " feed)] +;; [:div (format "Downloaded in %s ms" slurp-ms)] +;; [:div (format "Processed in %s ms" process-ms)] +;; [:hr] +;; [:h2 "Images"] +;; (show-good-images good-imgs) +;; [:hr] +;; [:h2 "Filtered Out Images"] +;; (show-bad-images bad-imgs) +;; [:hr] +;; [:h2 "Raw Feed Contents"] +;; [:pre (escape-html text)]])) +;; (redirect-to "/feed-test")))) +;; (redirect-to "/feed-test2"))))
\ No newline at end of file |
