(ns feed
  (:import java.util.Date)
  (:require [clojure.contrib.str-utils2 :as s])
  (:use clojure.contrib.condition
        clojure.contrib.duck-streams
        clojure.contrib.seq-utils
        clojure.contrib.sql
        compojure
        rooms
        scheduled-agent
        utils))

;; Location of the CSV file that lists the feeds to poll.
(def *feeds-path* "docs/feeds.csv")

(defn parse-line
  "Splits one line of the feeds CSV on commas and returns a map with the keys
  :room-key, :desc, :feed-link, :site-link and :contact (in column order).
  Double quotes are stripped and every value is trimmed. Columns missing at
  the end of the line are simply absent from the map; an empty column in the
  middle of the line yields an empty string.
  NOTE(review): the split is a plain comma split, so quoted fields that
  contain commas are not supported."
  [line]
  (let [r (s/split line #",")]
    (zipmap [:room-key :desc :feed-link :site-link :contact]
            (map #(.trim (.replaceAll % "\"" "")) r))))

(defn read-feeds
  "Reads *feeds-path* and returns a seq of feed maps (see parse-line).
  The first line of the file is skipped -- presumably a header row."
  []
  (rest (map parse-line (read-lines *feeds-path*))))

;; Takes an array of image urls and a room id; returns (as column v) the urls
;; that have no row in feed_images for that room yet.
(def *image-posted-qry* " SELECT * FROM UNNEST(?) as v WHERE NOT EXISTS (SELECT 1 FROM feed_images f WHERE f.image_url = v AND f.room_id = ?) ")

(defn filter-posted-images
  "Returns the urls that have not been recorded in feed_images for room-id.
  Duplicate urls are collapsed first: the query only knows about images that
  are already stored, so a url occurring twice in the same feed download would
  otherwise be returned (and then posted) twice.
  Returns [] without touching the database when there is nothing to check."
  [urls room-id]
  (let [candidates (distinct urls)]
    (if (empty? candidates)
      []
      (map :v (do-select [*image-posted-qry*
                          (sql-array "text" candidates)
                          room-id])))))

(defn insert-feed-image-to-db!
  "Inside a single transaction, inserts img as an image message posted by
  user-id in room-id and records the (feed, img, room-id, message) tuple in
  feed_images. Returns the id of the new message."
  [room-id feed img user-id]
  (with-connection *db*
    (transaction
      (let [acc (comp :message_id first)
            m-id (acc (do-select ["INSERT INTO messages (user_id, room_id, content, is_image) VALUES (?, ?, ?, true) RETURNING message_id" user-id room-id img]))]
        (do-prepared "INSERT INTO feed_images (feed_url, image_url, room_id, message_id) VALUES (?, ?, ?, ?)"
                     [feed img room-id m-id])
        m-id))))

; http://stackoverflow.com/questions/169625/regex-to-check-if-valid-url-that-ends-in-jpg-png-or-gif
(def *image-regex* #"(?i)https?://(?:[a-z0-9\-]+\.)+[a-z]{2,6}(?:/[^/#?]+)+\.(?:jpeg|jpg|gif|png)")

(defn extract-images
  "Returns a seq of every substring of text that matches *image-regex*
  (http/https urls ending in jpeg, jpg, gif or png). Repeated urls are
  returned once per occurrence."
  [text]
  (re-seq *image-regex* text))

(defn is-thumbnail?
  "True when the url contains thumb, small or thumbs delimited on both sides
  by one of - . _ (case-insensitive)."
  [img]
  (boolean (re-find #"(?i)[-._](thumb|small|thumbs)[-._]" img)))

;; Pairs of [reason predicate]; an image is rejected with the reason of the
;; first predicate that returns true for it.
(def image-filters
  [["THUMBNAIL" is-thumbnail?]])

(defn filter-image
  "Returns [img reason] for the first entry of image-filters whose predicate
  matches img, or [img nil] when no filter rejects it."
  [img]
  (or (some (fn [[r f]] (if (f img) [img r])) image-filters)
      [img nil]))

(defn classify-images
  "Splits imgs into a pair [good bad]: good is a seq of urls that passed every
  filter, bad is a collection of [url reason] pairs for the rejected ones."
  [imgs]
  (let [good? (comp not boolean second)
        res (group-by good? (map filter-image imgs))]
    [(map first (res true)) (res false)]))

(defn classify-images-from-feed
  "Downloads feed, extracts and classifies its image urls, and returns
  [good bad ms], where ms is the timing value reported by with-timing for the
  download."
  [feed]
  (let [[ms text] (with-timing (download-http-url feed))
        [g b] (classify-images (extract-images text))]
    [g b ms]))

(defn process-feed
  "Polls the feed described by the feed map f (needs :room-key and
  :feed-link): every accepted image that has not been posted to the room yet
  is stored in the database and added to the room as a message from the
  room's bot."
  [f]
  (let [room-key (:room-key f)
        room-id (get-or-create-room! room-key)
        [bot-nick bot-id] (get-or-create-room-bot! room-key)
        feed (:feed-link f)
        ;; only the accepted images are needed here; the rejected images and
        ;; the download timing are ignored
        [good] (classify-images-from-feed feed)
        filtered-good (filter-posted-images good room-id)]
    (doseq [img filtered-good]
      (println (format "Inserting %s into room-id %s" img room-key))
      (let [msg-id (insert-feed-image-to-db! room-id feed img bot-id)
            msg {:msg_id msg-id
                 :nick bot-nick
                 :created_on (new Date)
                 :content img}]
        (dosync (add-message msg (lookup-room room-key)))))))

(defn process-all-feeds!
  "Processes every feed from the feeds file in random order. Feeds without a
  room key or feed link are reported and skipped; an exception raised while
  handling one feed is printed and does not stop the remaining feeds."
  []
  (doseq [f (shuffle (read-feeds))]
    (try
      ;; parse-line yields "" for an empty CSV cell, and "" is truthy, so use
      ;; seq to treat blank values the same as missing ones
      (if (and (seq (:room-key f)) (seq (:feed-link f)))
        (process-feed f)
        (println "Incomplete feed " f))
      (catch Exception e
        (print-stack-trace e)))))

;; Feed download schedule

(def *feed-refresh-period-sec* (* 30 60))

;(def *feed-downloader*
;  (scheduled-agent process-all-feeds!
;                   *feed-refresh-period-sec*
;                   nil))

;; Testing

(defn feed-test-page
  "Renders a form that posts a feed url to /feed-test. The page is wrapped in
  if-vip; session is not referenced directly here -- presumably if-vip reads
  it (TODO confirm against the macro definition)."
  [session]
  (if-vip
    (html
      [:body
       [:h1 "Feed Test"]
       [:form {:action "/feed-test" :method "post"}
        [:input {:type "text" :name "url"}]
        [:input {:type "submit" :value "Send"}]]])))

(defn show-bad-images
  "Builds one div per [img reason] pair showing the rejection reason and the
  image linked to its own url."
  [imgs]
  (for [[img reason] imgs]
    [:div reason
     [:a {:href img} [:img {:src img}]]]))

(defn show-good-images
  "Builds one div per image url showing the image linked to its own url."
  [imgs]
  (for [img imgs]
    [:div
     [:a {:href img} [:img {:src img}]]]))

(defn feed-test
  "Handler for the feed test form. Downloads the url given in (params :url),
  extracts and classifies its images, and renders the timings, the accepted
  images, the rejected images with their reasons, and the escaped raw feed
  text. Redirects back to /feed-test when the url parameter is missing or
  empty. Wrapped in if-vip like feed-test-page."
  [session params]
  (if-vip
    (let [feed (params :url)]
      ;; an empty form submission arrives as "", which is truthy; seq treats
      ;; it like a missing parameter instead of handing it to the downloader
      (if (seq feed)
        (let [[slurp-ms text] (with-timing (download-http-url feed))
              ;; extract-images is built on re-seq, which is lazy; force it so
              ;; the measured time covers the whole scan of the text
              [process-ms imgs] (with-timing (doall (extract-images text)))
              [good-imgs bad-imgs] (classify-images imgs)]
          (html
            [:body
             ;; feed is user input: escape it before putting it in the page
             [:h1 (str "Images for " (escape-html feed))]
             [:div (format "Downloaded in %s ms" slurp-ms)]
             [:div (format "Processed in %s ms" process-ms)]
             [:hr]
             [:h2 "Good Images"]
             (show-good-images good-imgs)
             [:hr]
             [:h2 "Filtered Out Images"]
             (show-bad-images bad-imgs)
             [:hr]
             [:h2 "Raw Feed Contents"]
             [:pre (escape-html text)]]))
        (redirect-to "/feed-test")))))