summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--lib/tagsoup-1.2.jarbin0 -> 90023 bytes
-rwxr-xr-xsrc/feed.clj285
-rw-r--r--src/rooms.clj38
3 files changed, 215 insertions, 108 deletions
diff --git a/lib/tagsoup-1.2.jar b/lib/tagsoup-1.2.jar
new file mode 100644
index 0000000..af27803
--- /dev/null
+++ b/lib/tagsoup-1.2.jar
Binary files differ
diff --git a/src/feed.clj b/src/feed.clj
index 3a3cc35..8d27745 100755
--- a/src/feed.clj
+++ b/src/feed.clj
@@ -1,10 +1,16 @@
(ns feed
- (:import java.util.Date)
+ (:import java.net.URL
+ java.util.Date
+ javax.imageio.ImageIO
+ com.sun.syndication.io.SyndFeedInput
+ com.sun.syndication.io.XmlReader
+ org.htmlcleaner.HtmlCleaner)
(:require [clojure.contrib.str-utils2 :as s])
(:use clojure.contrib.condition
clojure.contrib.duck-streams
clojure.contrib.seq-utils
clojure.contrib.sql
+ clojure.contrib.core
compojure
rooms
scheduled-agent
@@ -12,9 +18,15 @@
(def *feeds-path* "docs/feeds.csv")
+(defn queue-image! [room-key img]
+ (do-insert :feed_images
+ [:feed_url :link :title :image_url :room]
+ [(:feed-url img) (:link img) (:title img) (:url img) room-key]))
+
+
(defn parse-line [line]
(let [r (s/split line #",")]
- (zipmap [:room-key :desc :feed-link :site-link :contact]
+ (zipmap [:room-key :desc :feed-url :site-link :contact]
(map #(.trim (.replaceAll % "\"" ""))
r))))
@@ -22,137 +34,214 @@
(rest (map parse-line (read-lines *feeds-path*))))
(def *image-posted-qry* "
-SELECT *
+SELECT v
FROM UNNEST(?) as v
WHERE NOT EXISTS (SELECT 1
- FROM feed_images f
+ FROM feed_images f
WHERE f.image_url = v
- AND f.room_id = ?)
+ AND f.room = ?)
+AND NOT EXISTS (SELECT 1
+ FROM invalid_feed_images
+ WHERE image_url = v)
")
-(defn filter-posted-images [urls room-id]
+(defn filter-posted-urls [room-key urls]
(if (empty? urls)
- []
- (map :v
- (do-select [*image-posted-qry*
- (sql-array "text" urls)
- room-id]))))
+ #{}
+ (set (map :v
+ (do-select [*image-posted-qry*
+ (sql-array "text" urls)
+ room-key])))))
(defn insert-feed-image-to-db! [room-id feed img user-id]
(with-connection *db*
(transaction
- (let [acc (comp :message_id first)
- m-id (acc
- (do-select ["INSERT INTO messages (user_id, room_id, content, is_image)
- VALUES (?, ?, ?, true) RETURNING message_id"
- user-id room-id img]))]
+ (let [m-id (insert-message-into-db! user-id room-id img true)]
(do-prepared "INSERT INTO feed_images
(feed_url, image_url, room_id, message_id)
VALUES (?, ?, ?, ?)"
[feed img room-id m-id])
m-id))))
+(defn rome-feed [url]
+ (.build (new SyndFeedInput) (new XmlReader (new URL url))))
+
+(defn html-clean [html]
+ (.clean (new HtmlCleaner) html))
+
+(defn- filter-seq [s]
+ (seq (remove nil? s)))
+
; http://stackoverflow.com/questions/169625/regex-to-check-if-valid-url-that-ends-in-jpg-png-or-gif
-(def *image-regex*
- #"(?i)https?://(?:[a-z0-9\-]+\.)+[a-z]{2,6}(?:/[^/#?]+)+\.(?:jpeg|jpg|gif|png)")
; TODO: use site.clj regex
+(def image-regex
+ #"(?i)https?://(?:[a-z0-9\-]+\.)+[a-z]{2,6}(?:/[^/#?]+)+\.(?:jpeg|jpg|gif|png|svg)")
+
+(defn extract-linked-images [node]
+ (filter-seq
+ (for [a (.getElementsByName node "a" true)]
+ (let [href (.getAttributeByName a "href")]
+ (when (and href (re-matches image-regex href))
+ href)))))
-(defn extract-images [text]
- (re-seq *image-regex* text))
+(defn extract-inline-images [node]
+ (filter-seq
+ (for [a (.getElementsByName node "img" true)]
+ (.getAttributeByName a "src"))))
-(defn is-thumbnail? [img]
- (boolean (re-find #"(?i)[-._](thumb|small|thumbs)[-._]" img)))
+(defn pull-images-from-html [html]
+ (let [node (.clean (new HtmlCleaner) html)]
+ (or (extract-linked-images node)
+ (extract-inline-images node))))
-(def image-filters [["THUMBNAIL" is-thumbnail?]])
+(defn extract-images-from-entry
+ "Parsing strategy is to first try to extract linked images, then try to
+ extract inline images.
  The theory is that when an entry has linked images, they link to the full
+ versions of included thumbnails. When there are no linked images, then the
+ inline images are fullsize.
+ TODO: only extract a linked image if the anchor tag contains a child image tag
+ TODO: try extracting images from other content nodes besides the first
+ TODO: just download the suckers and test for image size as a last resort"
+ [e]
+ (or (-?> e .getDescription .getValue pull-images-from-html)
+ (-?> e .getContents first .getValue pull-images-from-html)))
-(defn filter-image [img]
- (or (some
- (fn [[r f]] (if (f img) [img r]))
- image-filters)
- [img nil]))
+(defn extract-feed-images [feed-url]
+ (let [feed (rome-feed feed-url)]
+ (for [e (.getEntries feed)
+ url (extract-images-from-entry e)]
+ {:url url
+ :feed-url feed-url
+ :feed-title (or (.getTitle feed) "")
+ :title (or (.getTitle e) "")
+ :link (.getLink e)})))
-(defn classify-images [imgs]
- (let [good? (comp not boolean second)
- res (group-by good? (map filter-image imgs))]
- [(map first (res true))
- (res false)]))
+(def min-image-dimensions {:height 400 :width 400})
+(def max-image-dimensions {:height 2000 :width 2000})
-(defn classify-images-from-feed [feed]
- (let [[ms text] (with-timing (download-http-url feed))
- [g b] (classify-images (extract-images text))]
- [g b ms]))
+(defn fetch-and-validate-image [url]
+ (try
+ (println "fetching " url)
+ (let [img (ImageIO/read (new URL url))
+ h (.getHeight img)
+ w (.getWidth img)]
+ (cond (and (< h (:height min-image-dimensions))
+ (< w (:width min-image-dimensions))) (format "TOO SMALL (%sx%s)" h w)
+ (or (> h (:height max-image-dimensions))
+ (> w (:width max-image-dimensions))) (format "TOO BIG (%sx%s)" h w)
+ :else nil))
+ (catch Exception e (.getMessage e))))
-(defn process-feed [f]
- (let [room-key (:room-key f)
- room-id (get-or-create-room! room-key)
- [bot-nick bot-id] (get-or-create-room-bot! room-key)
- feed (:feed-link f)
- [good bad time] (classify-images-from-feed feed)
- filtered-good (filter-posted-images good room-id)]
- (doseq [img filtered-good]
- (println (format "Inserting %s into %s from %s" img room-key feed))
- (let [msg-id (insert-feed-image-to-db! room-id feed img bot-id)]
- (dosync
- (add-message (build-msg bot-nick img msg-id)
- (lookup-room room-key)))))))
+(defn fetch-and-queue-feed! [{feed-url :feed-url room-key :room-key}]
+ (let [images (extract-feed-images feed-url)
+ fresh-urls (filter-posted-urls room-key (map :url images))
+ good-images (filter #(contains? fresh-urls (:url %)) images)]
+ (doseq [img good-images]
+ (if-let [reason (fetch-and-validate-image (:url img))]
+ (do-insert :invalid_feed_images [:image_url :reason] [(:url img) reason])
+ (queue-image! room-key img)))))
-(defn process-all-feeds! []
- (doseq [f (shuffle (read-feeds))]
+(defn fetch-and-queue-all-feeds! []
+ (doseq [f (read-feeds)]
(try
- (if (and (:room-key f) (:feed-link f))
- (process-feed f)
- (println "Incomplete feed " f))
+ (if (and (:room-key f) (:feed-url f))
+ (fetch-and-queue-feed! f))
(catch Exception e
(print-stack-trace e)))))
-;; Feed download schedule
+;; Image posting
+
+(defn- post-queued-message-db-txn! [bot-id room-key room-id image-url]
+ (with-connection *db*
+ (transaction
+ (let [msg-id (insert-message-into-db! bot-id room-id image-url true)
+ update-count (first
+ (update-values :feed_images
+ ["room = ? AND image_url = ? AND message_id IS NULL"
+ room-key image-url]
+ {:message_id msg-id}))]
+ (if (= update-count 1)
+ msg-id
+ (do (set-rollback-only)
+ false))))))
+
+
+(defn post-queued-messages! []
+ (doseq [room-key (map :room (do-select ["SELECT DISTINCT(room) FROM feed_images
+ WHERE message_id IS NULL"]))]
+ (let [room-id (get-or-create-room! room-key)
+ [bot-nick bot-id] (get-or-create-room-bot! room-key)]
+ (if-let [msg (first
+ (do-select ["SELECT * FROM feed_images
+ WHERE room = ? AND message_id IS NULL
+ ORDER BY queued_on ASC LIMIT 1" room-key]))]
+ (if-let [msg-id (post-queued-message-db-txn! bot-id room-key room-id (:image_url msg))]
+ (do (dosync
+ (add-message (build-msg bot-nick (:image_url msg) msg-id)
+ (lookup-room room-key)))
+ (println "Inserted" (:image_url msg) "into" room-key))
+ (println "error inserting" msg))))))
+
+
-(def *feed-refresh-period-sec* (* 20 60))
+;; Feed download and insertion schedule
-(def *feed-downloader*
- (scheduled-agent process-all-feeds!
- *feed-refresh-period-sec*
+(def feed-refresh-period-sec (* 60 60))
+(def feed-insert-period-sec 60)
+
+(def feed-downloader
+ (scheduled-agent fetch-and-queue-all-feeds!
+ feed-refresh-period-sec
nil))
+ (def feed-inserter
+ (scheduled-agent post-queued-messages!
+ feed-refresh-period-sec
+ nil))
+
+
;; Testing
-(defn feed-test-page [session]
- (if-vip
- (html [:body
- [:h1 "Feed Test"]
- [:form {:action "/feed-test" :method "post"}
- [:input {:type "text" :name "url"}]
- [:input {:type "submit" :value "Send"}]]])))
+;; (defn feed-test-page [session]
+;; (if-vip
+;; (html [:body
+;; [:h1 "Feed Test"]
+;; [:form {:action "/feed-test" :method "post"}
+;; [:input {:type "text" :name "url"}]
+;; [:input {:type "submit" :value "Send"}]]])))
-(defn show-bad-images [imgs]
- (for [[img reason] imgs]
- [:div
- reason
- [:a {:href img}
- [:img {:src img}]]]))
+;; (defn show-bad-images [imgs]
+;; (for [[img reason] imgs]
+;; [:div
+;; reason
+;; [:a {:href img}
+;; [:img {:src img}]]]))
-(defn show-good-images [imgs]
- (for [img imgs]
- [:div
- [:a {:href img}
- [:img {:src img}]]]))
+;; (defn show-good-images [imgs]
+;; (for [img imgs]
+;; [:div
+;; [:a {:href img}
+;; [:img {:src img}]]]))
-(defn feed-test [session params]
- (if-vip
- (if-let [feed (params :url)]
- (let [[slurp-ms text] (with-timing (download-http-url feed))
- [process-ms imgs] (with-timing (extract-images text))
- [good-imgs bad-imgs] (classify-images imgs)]
- (html [:body
- [:h1 (str "Images for " feed)]
- [:div (format "Downloaded in %s ms" slurp-ms)]
- [:div (format "Processed in %s ms" process-ms)]
- [:hr]
- [:h2 "Good Images"]
- (show-good-images good-imgs)
- [:hr]
- [:h2 "Filtered Out Images"]
- (show-bad-images bad-imgs)
- [:hr]
- [:h2 "Raw Feed Contents"]
- [:pre (escape-html text)]]))
- (redirect-to "/feed-test"))))
+;; (defn feed-test [session params]
+;; (if-vip
+;; (if-let [feed (params :url)]
+;; (let [[slurp-ms text] (with-timing (download-http-url feed))
+;; [process-ms imgs] (with-timing (extract-images text))
+;; [good-imgs bad-imgs] (classify-images imgs)]
+;; (html [:body
+;; [:h1 (str "Images for " feed)]
+;; [:div (format "Downloaded in %s ms" slurp-ms)]
+;; [:div (format "Processed in %s ms" process-ms)]
+;; [:hr]
+;; [:h2 "Images"]
+;; (show-good-images good-imgs)
+;; [:hr]
+;; [:h2 "Filtered Out Images"]
+;; (show-bad-images bad-imgs)
+;; [:hr]
+;; [:h2 "Raw Feed Contents"]
+;; [:pre (escape-html text)]]))
+;; (redirect-to "/feed-test"))))
+;; (redirect-to "/feed-test2")))) \ No newline at end of file
diff --git a/src/rooms.clj b/src/rooms.clj
index 09b81ba..dd02219 100644
--- a/src/rooms.clj
+++ b/src/rooms.clj
@@ -1,7 +1,8 @@
(ns rooms
(:import java.util.Date)
(:use clojure.contrib.str-utils
- utils))
+ utils
+ user))
(defstruct message-struct :nick :content :created_on :msg_id)
@@ -43,6 +44,11 @@
(defn fetch-rooms []
(do-select ["SELECT * FROM ROOMS WHERE active"]))
+(defn lookup-room-key [room-id]
+ (or (some #(and (= (:room_id %) room-id) (:room_key %))
+ (vals @rooms))
+ (:key (first (do-select ["SELECT key FROM rooms where room_id = ?" room-id])))))
+
(defn count-messages-by-room [room-id image-only]
(let [query (str "SELECT COUNT(*)
FROM messages m, users u
@@ -93,8 +99,14 @@
(defn add-message [msg room]
(alter (room :messages) (swap cons) msg))
+(defn insert-message-into-db! [user-id room-id content is-image]
+ (:message_id
+ (first
+ (do-select ["INSERT INTO messages (user_id, room_id, content, is_image)
+ VALUES (?, ?, ?, ?) RETURNING message_id"
+ user-id room-id content is-image]))))
+
(defn create-and-add-room! [key]
- (println "Creating room " key)
(do-select ["INSERT INTO rooms (key, name, description)
VALUES (?, ?, ?) RETURNING room_id"
key key key])
@@ -104,6 +116,7 @@
(build-room-map-from-db room-db))
room-db)))
+; TODO: cache
(defn get-or-create-room! [key]
(:room_id
(or (first (do-select ["SELECT room_id FROM rooms WHERE lower(key) = ?"
@@ -111,16 +124,21 @@
(create-and-add-room! key)
(throw (Exception. (str "Unable to create room " key))))))
-(defn- get-or-create-room-bot-id! [nick]
+(defn- fetch-or-create-bot-id! [nick]
((comp :user_id first)
- (or (do-select ["SELECT user_id FROM users WHERE lower(nick) = ?"
- (lower-case nick)])
+ (or [(fetch-nick nick)]
(do
- (println "Creating bot " nick)
(do-select ["INSERT INTO users (nick, hash, email)
- VALUES (?, ?, ?) RETURNING user_id"
+ VALUES (?, ?, ?) RETURNING user_id"
nick "GARBAGE" "info@dump.fm"])))))
-(defn get-or-create-room-bot! [key]
- (let [nick (str "~" key)]
- [nick (get-or-create-room-bot-id! nick)])) \ No newline at end of file
+(def room-bot-id-cache (ref {}))
+
+(defn get-or-create-room-bot! [room-key]
+ (let [nick (str "~" room-key)]
+ (or (get @room-bot-id-cache nick)
+ (let [id (fetch-or-create-bot-id! nick)
+ r [nick id]]
+ (dosync
+ (commute room-bot-id-cache assoc nick r))
+ r))))