From bc77dda48e8ce418aee6b75c85ae5273d3d15e69 Mon Sep 17 00:00:00 2001 From: guns Date: Tue, 18 Feb 2014 18:34:08 -0600 Subject: [PATCH] Allow user to specify own escape-chars MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The current set of metacharacters that are escaped when rendering are based on Java's Pattern implementation, which for the purposes of this library, is basically PCRE. However, different regular expression implementations sometimes have different pattern metacharacters. Vim's very-magic mode, for instance, reserves all [^a-zA-Z0-9_] ASCII characters, while POSIX basic regular expressions¹ do not use (, ), ?, +, or | (these must be preceded by a backslash to attain their now familiar meaning). This patch adds the :escape-chars option to pattern and string-pattern to allow the user to specify this set of metacharacters. The value of :escape-chars may also be a key in the (public) metacharacters map. The actual implementation is done through a dynamic var, following the precedent of *capture*. ¹ Note that we must also allow the user to control the characters used for capturing/non-capturing groups, alternation, optional matching, etc, in order to make frak work for POSIX BREs --- src/cljx/frak.cljx | 38 +++++++++++++++++++++++++++++--------- test/frak_test.clj | 6 ++++++ 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/src/cljx/frak.cljx b/src/cljx/frak.cljx index d435889..8104f4f 100644 --- a/src/cljx/frak.cljx +++ b/src/cljx/frak.cljx @@ -49,15 +49,22 @@ ;;;; Pattern rendering +(def ^{:doc "Special characters in various regular expression implementations."} + metacharacters + {:default #{\\ \^ \$ \* \+ \? \. \| \( \) \{ \} \[ \]} + ;; Vimscript "very-magic" mode + :vim (set (remove #(re-find #"\w" (str %)) (map char (range 0x21 0x7f))))}) + (def ^{:private true + :dynamic true :doc "Characters to escape when rendering a regular expression."} - escape-chars - #{\\ \^ \$ \* \+ \? \. 
\| \( \) \{ \} \[ \]}) + *escape-chars* + (:default metacharacters)) (defn- escape - "Escape a character if it is an element of `escape-chars`." + "Escape a character if it is an element of `*escape-chars*`." [c] - (str (when (escape-chars c) "\\") c)) + (str (when (contains? *escape-chars* c) "\\") c)) (def ^{:private true :dynamic true @@ -184,19 +191,32 @@ (string/replace #"\(\?:?(\[[^\]]+\])([^\|\)]+[^\?]?)\)([^\?])" "$1$2$3"))) +(defn- get* + "Map lookup. In CLJS, also does lookup by string representation of kw." + [map kw] + (or (get map kw) + #+cljs (get map (name kw)))) + +(def ^:private default-options + {:capture? false + :exact? false + :escape-chars (:default metacharacters)}) + (defn string-pattern "Construct a regular expression as a string from a collection of strings." ([strs] - (string-pattern strs {:capture? false, :exact? false})) + (string-pattern strs default-options)) ([strs opts] (let [#+cljs opts #+cljs (js->clj opts) - pattern (binding [*capture* (or (:capture? opts) - (get opts "capture?"))] + cs (or (get* opts :escape-chars) *escape-chars*) + cs (if (coll? cs) cs (get* metacharacters cs)) + pattern (binding [*capture* (get* opts :capture?) + *escape-chars* cs] (-> (build-trie strs) render-trie remove-unecessary-grouping))] - (if (or (:exact? opts) (get opts "exact?")) + (if (get* opts :exact?) (str "^" pattern "$") pattern)))) @@ -206,6 +226,6 @@ (defn ^:export pattern "Construct a regular expression from a collection of strings." ([strs] - (pattern strs {:capture? false, :exact? false})) + (pattern strs default-options)) ([strs opts] (re-pattern (string-pattern strs opts)))) diff --git a/test/frak_test.clj b/test/frak_test.clj index c0bf8e5..23a0f41 100644 --- a/test/frak_test.clj +++ b/test/frak_test.clj @@ -62,6 +62,12 @@ (is (= "foo\\??" (string-pattern ["foo" "foo?"]))) + (is (= "\\!\\\"\\#\\%\\&\\'\\,\\-\\/\\:\\;\\<\\=\\>\\@\\`\\~" + (string-pattern ["!\"#%&',-/:;<=>@`~"] {:escape-chars :vim}))) + + (is (= "foo\\★?" 
+ (string-pattern ["foo" "foo★"] {:escape-chars #{\★}}))) + (are [words] (every? #(re-matches (pattern words) %) words) ["achy" "achylia" "achylous" "achymia" "achymous"] ["aching" "achingly"])) -- 2.25.1