(ns imgreplacer
  (:import java.net.URL
           java.io.File
           java.io.IOException
           java.io.ByteArrayInputStream
           javax.imageio.ImageIO
           org.htmlcleaner.HtmlCleaner)
  (:require [clojure.set :as r])
  (:use clojure.contrib.duck-streams
        clojure.contrib.str-utils
        clojure.contrib.command-line
        config
        feed
        utils))

;; Directory (relative path) under which mirrored images are stored; also used
;; as the path component of the public URL built by `image-url`.
(def save-root "images/replaced")

(defn file-path
  "Returns the local path `<save-root>/<date>/<fname>` for a mirrored image."
  [date fname]
  (str-join "/" [save-root date fname]))

(defn image-url
  "Returns the public URL `<*server-url*>/<save-root>/<date>/<fname>` for a
  mirrored image. `*server-url*` is defined outside this file."
  [date fname]
  (str-join "/" [*server-url* save-root date fname]))

(defn ins-substring?
  "Case-insensitive substring test: true when `ss` occurs anywhere in `s`."
  [ss s]
  (>= (.indexOf (lower-case s) (lower-case ss)) 0))

(defn fetch-bad-messages
  "Selects every row of `messages` whose content contains `url`
  (case-insensitive, via ILIKE).

  The pattern is passed as a bound parameter rather than spliced into the SQL
  text, so quotes or SQL fragments in `url` cannot alter the statement."
  [url]
  (println "fetching bad messages for" url)
  (do-select ["SELECT * FROM messages WHERE content ilike ?"
              (str "%" url "%")]))

(defn replace-grp-str
  "Applies every `[from to]` pair in `replacements` to `string`, replacing all
  occurrences of `from` with `to`, and returns the resulting string.

  Uses the literal `String.replace` rather than `replaceAll`: the keys are URLs,
  and characters such as `?`, `+`, `(` or `$` must not be interpreted as regex
  or replacement-template syntax."
  [replacements string]
  (reduce (fn [s [k v]] (.replace s k v))
          string
          replacements))

(defn image-name
  "Builds a local file name for `url`: the current time in milliseconds, a dash,
  and the last `/`-separated segment of the URL with every `%20` removed."
  [url]
  (let [fname (last (.split url "/"))]
    (format "%s-%s"
            (System/currentTimeMillis)
            ;; Hack: nginx doesn't like to serve images w/ spaces in them
            (.replaceAll fname "%20" ""))))

(defn mirror-image
  "Downloads `url`, writes the bytes to `<save-root>/<today>/<image-name>` and
  returns the public URL of the stored copy.

  Exceptions from opening/reading the URL, decoding, or writing the file
  propagate to the caller."
  [url]
  (println "fetching" url)
  (let [;; with-open guarantees the network stream is closed even when the
        ;; read fails; the original left it open.
        data  (with-open [in (.openStream (URL. url))]
                (to-byte-array in))
        date  (today)
        fname (image-name url)
        file  (File. (file-path date fname))
        ;; Decode attempt kept from the original; its result is not used.
        ;; NOTE(review): presumably a sanity check -- ImageIO/read returns nil
        ;; for data it cannot decode, so non-image content is still saved.
        _     (ImageIO/read (ByteArrayInputStream. data))]
    (make-parents file)
    (copy (ByteArrayInputStream. data) file)
    (image-url date fname)))

(defn take-safe-images
  "Returns the set of image URLs found in `m` by `take-images` and
  `pull-images-from-html` (both defined outside this file). Any exception from
  the HTML pass is deliberately swallowed and treated as \"no images\"."
  [m]
  (set (concat (take-images m)
               (try
                 (pull-images-from-html m)
                 (catch Exception _ [])))))

;; Cache of original image URL -> mirrored URL, so each image is fetched at
;; most once per run.
(def image-url-map (ref {}))

(defn mirror-message!
  "Mirrors every image in `msg`'s `:content` that passes the predicate
  `url-filter`, then rewrites the content to point at the mirrored copies.

  When `dryrun` is truthy the database row is not updated; images are still
  downloaded and stored. Prints the old and new content when they differ."
  [msg dryrun url-filter]
  (let [imgs (filter url-filter (take-safe-images (:content msg)))]
    (doseq [img imgs]
      (when-not (contains? @image-url-map img)
        ;; Do the download *outside* the transaction: dosync bodies may be
        ;; retried, which would repeat the network and file I/O.
        (let [mirrored (mirror-image img)]
          (dosync (alter image-url-map assoc img mirrored)))))
    (let [replace-map (zipmap imgs (map @image-url-map imgs))
          new-content (replace-grp-str replace-map (:content msg))]
      (if (= (:content msg) new-content)
        (println (format "Message %s: no change" (:message_id msg)))
        (do
          (println "\nupdating content of" (:message_id msg)
                   "from:\n" (:content msg)
                   "\nto:\n" new-content)
          (when-not dryrun
            (do-update :messages
                       ["message_id = ?" (:message_id msg)]
                       {:content new-content})))))))

(defn mirror-bad-host!
  "Runs `mirror-message!` on every message whose content mentions `url`,
  mirroring only the images whose URL contains `url` (case-insensitive)."
  [url dryrun]
  (doseq [m (fetch-bad-messages url)]
    (mirror-message! m dryrun #(ins-substring? url %))))

(defn mirror-message-id!
  "Runs `mirror-message!` on the message with id `msg-id`, mirroring every image
  whose URL does not start with `http://dump.fm`. Does nothing (returns nil)
  when no such message exists."
  [msg-id dryrun]
  (when-let [m (first (do-select ["SELECT * FROM messages WHERE message_id = ?"
                                  msg-id]))]
    ;; The dot is escaped so only the literal host prefix is excluded.
    (mirror-message! m dryrun #(not (re-find #"^http://dump\.fm" %)))))