(ns feed
  (:import java.net.URL
           java.util.Date
           javax.imageio.ImageIO
           com.sun.syndication.io.SyndFeedInput
           com.sun.syndication.io.XmlReader
           org.htmlcleaner.HtmlCleaner)
  (:require [clojure.contrib.str-utils2 :as s])
  (:use clojure.contrib.condition
        clojure.contrib.duck-streams
        clojure.contrib.seq-utils
        clojure.contrib.sql
        clojure.contrib.core
        compojure
        rooms
        scheduled-agent
        utils))

;; DEPRECATED
(def *feeds-path* "docs/feeds.csv")

(defn queue-image! [room-key img]
  (do-insert :feed_images
             [:feed_url :link :title :image_url :room]
             [(:feed-url img) (:link img) (:title img) (:url img) room-key]))

;; feeds.csv columns: room key, description, feed URL, site link, contact.
;; Values may be double-quoted; the first line is a header and is skipped.
(defn parse-line [line]
  (let [r (s/split line #",")]
    (zipmap [:room-key :desc :feed-url :site-link :contact]
            (map #(.trim (.replaceAll % "\"" "")) r))))

(defn read-feeds []
  (rest (map parse-line (read-lines *feeds-path*))))

;; Keeps only the URLs that are neither already queued/posted for this room
;; nor known to be invalid.
(def *image-posted-qry* "
  SELECT v FROM UNNEST(?) as v
  WHERE NOT EXISTS (SELECT 1 FROM feed_images f
                    WHERE f.image_url = v AND f.room = ?)
    AND NOT EXISTS (SELECT 1 FROM invalid_feed_images
                    WHERE image_url = v)")

(defn filter-posted-urls [room-key urls]
  (if (empty? urls)
    #{}
    (set (map :v (do-select [*image-posted-qry* (sql-array "text" urls) room-key])))))

(defn insert-feed-image-to-db! [room-id feed img user-id]
  (with-connection *db*
    (transaction
     (let [m-id (insert-message-into-db! user-id room-id img true)]
       (do-prepared "INSERT INTO feed_images (feed_url, image_url, room_id, message_id)
                     VALUES (?, ?, ?, ?)"
                    [feed img room-id m-id])
       m-id))))

(defn rome-feed [url]
  (.build (new SyndFeedInput) (new XmlReader (new URL url))))

(defn html-clean [html]
  (.clean (new HtmlCleaner) html))

(defn- filter-seq [s]
  (seq (remove nil? s)))

; http://stackoverflow.com/questions/169625/regex-to-check-if-valid-url-that-ends-in-jpg-png-or-gif
; TODO: use site.clj regex
(def image-regex #"(?i)https?://(?:[a-z0-9\-]+\.)+[a-z]{2,6}(?:/[^/#?]+)+\.(?:jpeg|jpg|gif|png|svg)")

(defn extract-linked-images [node]
  (filter-seq
   (for [a (.getElementsByName node "a" true)]
     (let [href (.getAttributeByName a "href")]
       (when (and href (re-matches image-regex href))
         href)))))

(defn extract-inline-images [node]
  (filter-seq
   (for [img (.getElementsByName node "img" true)]
     (.getAttributeByName img "src"))))

(defn pull-images-from-html [html]
  (let [node (html-clean html)]
    (or (extract-linked-images node)
        (extract-inline-images node))))
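;; A quick REPL sketch of the extraction strategy above. The HTML snippet and
;; URLs are made up; it illustrates why linked images win over inline ones:
;; the anchor carries the full-size URL while the img tag is only a thumbnail.
(comment
  (pull-images-from-html
   "<a href=\"http://example.com/pics/full.jpg\">
      <img src=\"http://example.com/pics/thumb.jpg\"/>
    </a>")
  ;; should yield ("http://example.com/pics/full.jpg")
  )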
(defn extract-images-from-entry
  "Parsing strategy is to first try to extract linked images, then try to
  extract inline images. The theory is that when an entry has linked images,
  they link to the full versions of included thumbnails. When there are no
  linked images, the inline images are full size.

  TODO: only extract a linked image if the anchor tag contains a child image tag
  TODO: try extracting images from other content nodes besides the first
  TODO: just download the suckers and test for image size as a last resort"
  [e]
  (or (-?> e .getDescription .getValue pull-images-from-html)
      (-?> e .getContents first .getValue pull-images-from-html)))

(defn extract-feed-images [feed-url]
  (let [feed (rome-feed feed-url)]
    (for [e (.getEntries feed)
          url (extract-images-from-entry e)]
      {:url url
       :feed-url feed-url
       :feed-title (or (.getTitle feed) "")
       :title (or (.getTitle e) "")
       :link (.getLink e)})))

(def min-image-dimensions {:height 400 :width 400})
(def max-image-dimensions {:height 2000 :width 2000})

(defn fetch-and-validate-image
  "Downloads the image at url and returns nil if it is postable, otherwise a
  string describing why it was rejected."
  [url]
  (try
    (println "fetching " url)
    (let [img (ImageIO/read (new URL url))
          h (.getHeight img)
          w (.getWidth img)]
      (cond
       (and (< h (:height min-image-dimensions))
            (< w (:width min-image-dimensions)))
       (format "TOO SMALL (%sx%s)" h w)

       (or (> h (:height max-image-dimensions))
           (> w (:width max-image-dimensions)))
       (format "TOO BIG (%sx%s)" h w)

       :else nil))
    (catch Exception e
      (.getMessage e))))

(defn fetch-and-queue-feed! [{feed-url :feed-url room-key :room-key}]
  (let [images (extract-feed-images feed-url)
        fresh-urls (filter-posted-urls room-key (map :url images))
        good-images (filter #(contains? fresh-urls (:url %)) images)]
    (doseq [img good-images]
      (if-let [reason (fetch-and-validate-image (:url img))]
        (do-insert :invalid_feed_images [:image_url :reason] [(:url img) reason])
        (queue-image! room-key img)))))

(defn fetch-and-queue-all-feeds! []
  (doseq [f (read-feeds)]
    (try
      (when (and (:room-key f) (:feed-url f))
        (fetch-and-queue-feed! f))
      (catch Exception e
        (print-stack-trace e)))))

;; Image posting

(defn- post-queued-message-db-txn! [bot-id room-key room-id image-url]
  (with-connection *db*
    (transaction
     (let [msg-id (insert-message-into-db! bot-id room-id image-url true)
           update-count (first (update-values :feed_images
                                              ["room = ? AND image_url = ? AND message_id IS NULL"
                                               room-key image-url]
                                              {:message_id msg-id}))]
       (if (= update-count 1)
         msg-id
         (do (set-rollback-only)
             false))))))

(defn post-queued-messages! []
  (doseq [room-key (map :room (do-select ["SELECT DISTINCT(room) FROM feed_images WHERE message_id IS NULL"]))]
    (let [room-id (get-or-create-room! room-key)
          [bot-nick bot-id] (get-or-create-room-bot! room-key)]
      (when-let [msg (first (do-select ["SELECT * FROM feed_images WHERE room = ? AND message_id IS NULL ORDER BY queued_on ASC LIMIT 1" room-key]))]
        (if-let [msg-id (post-queued-message-db-txn! bot-id room-key room-id (:image_url msg))]
          (do
            (dosync
             (add-message (build-msg bot-nick (:image_url msg) msg-id)
                          (lookup-room room-key)))
            (println "Inserted" (:image_url msg) "into" room-key))
          (println "error inserting" msg))))))

;; Feed download and insertion schedule

(def feed-refresh-period-sec (* 60 60))
(def feed-insert-period-sec (* 2 60))

(def feed-downloader (scheduled-agent fetch-and-queue-all-feeds! feed-refresh-period-sec nil))
(def feed-inserter (scheduled-agent post-queued-messages! feed-insert-period-sec nil))
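;; A REPL sketch for exercising the pipeline by hand rather than waiting on
;; the agents. The feed URL is a placeholder; everything else is defined above.
(comment
  ;; see what a single feed would contribute, without touching the database
  (extract-feed-images "http://example.com/feed.rss")
  ;; run one full download/validate/queue cycle across docs/feeds.csv
  (fetch-and-queue-all-feeds!)
  ;; drain one queued image per room into its chat room
  (post-queued-messages!))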