--- /dev/null
+(defproject frak "0.1.0-SNAPSHOT"
+ :description "Transform collections of strings in to regular expressions."
+ :url "http://github.com/noprompt/frak"
+ :license {:name "Eclipse Public License"
+ :url "http://www.eclipse.org/legal/epl-v10.html"}
+ :dependencies [[org.clojure/clojure "1.4.0"]])
--- /dev/null
+(ns frak
+ (:require [clojure.string :as s]))
+
+(comment
+ ;; frak generates regular expressions from collections of strings.
+
+ ;; Usage:
+ (frak/pattern ["foo" "bar" "baz" "quux"])
+ ;; => #"(?:ba(?:z|r)|foo|quux)"
+ )
+
+;; Utilities
+
+(defn- prefixes
+ "Return a list of all prefixes for a given collection."
+ [coll]
+ (map-indexed
+ (fn [i _]
+ (take (inc i) coll))
+ coll))
+
+;; Trie construction
+
+(defn- grow [trie [_ & cs :as chars] terminal?]
+ (letfn [(visit [inner-trie]
+ (let [it (or inner-trie {})
+ lc (last chars)
+ it (if terminal?
+ (vary-meta it update-in [:terminals] conj lc)
+ it)]
+ (-> it
+ (vary-meta update-in [:visitors] conj lc)
+ (assoc lc (get-in trie chars)))))]
+ (if (seq cs)
+ (update-in trie (butlast chars) visit)
+ (visit trie))))
+
+(defn- trie-put
+ ([s]
+ {:pre [(string? s)]}
+ (trie-put {} s))
+ ([trie s]
+ {:pre [(map? trie) (string? s)]}
+ (if-not (seq s)
+ {}
+ (loop [t trie, ps (prefixes s)]
+ (if-let [cs (and (next ps) (first ps))]
+ (recur (grow t cs false) (next ps))
+ (grow t (first ps) true))))))
+
+
+(defn- build-trie [strs]
+ (reduce trie-put {} strs))
+
+;;;; Pattern rendering
+
+;; Characters to escape when rendering a regular expression.
+(def ^{:private true}
+ escape-chars
+ #{\\ \^ \$ \* \+ \? \. \| \( \) \{ \} \[ \]})
+
+(def ^{:private true :dynamic true} *capture* false)
+
+(defn- re-group-fmt []
+ (let [start (if *capture* "(" "(?:")]
+ (str start "%s)")))
+
+(defn- re-group [[s & more :as strs]]
+ (if (seq more)
+ (format (re-group-fmt) (s/join "|" strs))
+ s))
+
+(defn- render-trie [trie]
+ (let [{vs :visitors ts :terminals} (meta trie)
+ terminal? (set ts)
+ ks (->> (keys trie)
+ (sort-by (frequencies vs))
+ reverse )]
+ (re-group
+ (for [k ks]
+ (let [sk (if (escape-chars k) (str "\\" k) k)]
+ (if-let [branch (trie k)]
+ (if (terminal? k)
+ (->> (render-trie branch)
+ (format (str "%s" (re-group-fmt) "?") sk))
+ (->> (render-trie branch)
+ (format "%s%s" sk)))
+ sk))))))
+
+(defn pattern
+ "Construct a regular expression from a collection of strings."
+ ([strs]
+ (pattern strs false))
+ ([strs capture?]
+ {:pre [(every? string? strs)]}
+ (binding [*capture* capture?]
+ (-> strs
+ build-trie
+ render-trie
+ str
+ re-pattern))))
--- /dev/null
+(ns frak-test
+ (:use clojure.test
+ frak))
+
+(def trie-put #'frak/trie-put)
+(def build-trie #'frak/build-trie)
+
+(deftest trie-test
+ (is (= (-> (trie-put "a")
+ (trie-put "b"))
+ ^{:terminals '(\b \a)
+ :visitors '(\b \a)}
+ {\a nil
+ \b nil}))
+
+ (is (= (-> (trie-put "aaa")
+ (trie-put "ab"))
+ ^{:visitors '(\a\a)}
+ {\a
+ ^{:terminals '(\b)
+ :visitors '(\a \b)}
+ {\a
+ ^{:terminals '(\a)
+ :visitors '(\a)}
+ {\a nil}
+ \b nil}}))
+
+ (is (= (-> (trie-put "ab")
+ (trie-put "aaa"))
+ ^{:visitors '(\a\a)}
+ {\a
+ ^{:terminals '(\b)
+ :visitors '(\a \b)}
+ {\a
+ ^{:terminals '(\a)
+ :visitors '(\a)}
+ {\a nil}
+ \b nil}})))
+
+(deftest pattern-test
+ (let [strs1 ["foo" "bar" "baz"]
+ strs2 ["baz" "bar" "foo"]
+ pat1 (pattern strs1)
+ pat2 (pattern strs2)]
+ (is (every? #(re-matches pat1 %) strs1))
+
+ (is (every? #(re-matches pat2 %) strs1))
+
+ (is (every? #(re-matches pat1 %) strs2))
+
+ (is (every? #(re-matches pat2 %) strs2))
+
+ (is (not (or (re-matches pat1 "f")
+ (re-matches pat1 "b")
+ (re-matches pat1 "ba")
+ (re-matches pat1 "fo"))))))