CRM-14765, CRM-14811 - FullText QueryFormatter - Attempt to normalize query text
[civicrm-core.git] / CRM / Utils / QueryFormatter.php
1 <?php
2
3 /*
4 +--------------------------------------------------------------------+
5 | CiviCRM version 4.5 |
6 +--------------------------------------------------------------------+
7 | Copyright CiviCRM LLC (c) 2004-2014 |
8 +--------------------------------------------------------------------+
9 | This file is a part of CiviCRM. |
10 | |
11 | CiviCRM is free software; you can copy, modify, and distribute it |
12 | under the terms of the GNU Affero General Public License |
13 | Version 3, 19 November 2007 and the CiviCRM Licensing Exception. |
14 | |
15 | CiviCRM is distributed in the hope that it will be useful, but |
16 | WITHOUT ANY WARRANTY; without even the implied warranty of |
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
18 | See the GNU Affero General Public License for more details. |
19 | |
20 | You should have received a copy of the GNU Affero General Public |
21 | License and the CiviCRM Licensing Exception along |
22 | with this program; if not, contact CiviCRM LLC |
23 | at info[AT]civicrm[DOT]org. If you have questions about the |
24 | GNU Affero General Public License or the licensing of CiviCRM, |
25 | see the CiviCRM license FAQ at http://civicrm.org/licensing |
26 +--------------------------------------------------------------------+
27 */
28
29 /**
30 * Class CRM_Utils_QueryFormatter
31 *
32 * This class is a bad idea. It exists for the unholy reason that a single installation
33 * may have up to three query engines (MySQL LIKE, MySQL FTS, Solr) processing the same
34 * query-text. It labors to take the user's search expression and provide similar search
35 * semantics in different contexts. It is unknown whether this labor will be fruitful
36 * or in vain.
37 */
class CRM_Utils_QueryFormatter {
  /**
   * Target query languages. LANG_SOLR and LANG_SQL_FTS share one formatter
   * (wildcard "*"); LANG_SQL_LIKE has its own (wildcard "%").
   */
  const LANG_SQL_LIKE = 'like';
  const LANG_SQL_FTS = 'fts';
  const LANG_SOLR = 'solr';

  /**
   * Attempt to leave the text as-is.
   */
  const MODE_NONE = 'simple';

  /**
   * Attempt to treat the input text as a phrase
   */
  const MODE_PHRASE = 'phrase';

  /**
   * Attempt to treat the input text as a phrase with
   * wildcards on each end.
   */
  const MODE_WILDPHRASE = 'wildphrase';

  /**
   * Attempt to treat individual word as if it
   * had wildcards at the start and end.
   */
  const MODE_WILDWORDS = 'wildwords';

  /**
   * Attempt to treat individual word as if it
   * had a wildcard at the end.
   */
  const MODE_WILDWORDS_SUFFIX = 'wildwords-suffix';

  /**
   * @var CRM_Utils_QueryFormatter|NULL shared instance, created on first use by singleton()
   */
  static protected $singleton;

  /**
   * Get the shared formatter, building it from the 'fts_query_mode' setting
   * when none exists yet or when a rebuild is requested.
   *
   * @param bool $fresh TRUE to discard any cached instance and re-read the setting
   * @return CRM_Utils_QueryFormatter
   */
  public static function singleton($fresh = FALSE) {
    if ($fresh || self::$singleton === NULL) {
      // NOTE(review): the 4th argument is presumably the default used when the setting is unset -- confirm against CRM_Core_BAO_Setting::getItem
      $mode = CRM_Core_BAO_Setting::getItem(CRM_Core_BAO_Setting::SEARCH_PREFERENCES_NAME, 'fts_query_mode', NULL, self::MODE_NONE);
      self::$singleton = new CRM_Utils_QueryFormatter($mode);
    }
    return self::$singleton;
  }

  /**
   * @var string eg MODE_NONE
   */
  protected $mode;

  /**
   * @param string $mode eg MODE_NONE
   */
  public function __construct($mode) {
    $this->mode = $mode;
  }

  /**
   * @param mixed $mode eg MODE_NONE
   */
  public function setMode($mode) {
    $this->mode = $mode;
  }

  /**
   * @return mixed the current mode, eg MODE_NONE
   */
  public function getMode() {
    return $this->mode;
  }

  /**
   * Rewrite a user's search text for the given query language, applying
   * the formatter's current mode.
   *
   * @param string $text user-supplied query string; surrounding whitespace is ignored
   * @param string $language eg LANG_SQL_LIKE, LANG_SQL_FTS, LANG_SOLR
   * @throws CRM_Core_Exception if the language or the current mode is not recognized
   * @return string
   */
  public function format($text, $language) {
    // Cast first so that a NULL input is handled as an empty query.
    $text = trim((string) $text);

    switch ($language) {
      case self::LANG_SOLR:
      case self::LANG_SQL_FTS:
        $text = $this->_formatFts($text, $this->mode);
        break;

      case self::LANG_SQL_LIKE:
        $text = $this->_formatLike($text, $this->mode);
        break;

      default:
        $text = NULL;
    }

    // The helpers return NULL for an unknown mode; an unknown language lands here too.
    if ($text === NULL) {
      throw new CRM_Core_Exception("Unrecognized combination: language=[{$language}] mode=[{$this->mode}]");
    }

    return $text;
  }

  /**
   * Format text for the "*"-wildcard languages (LANG_SQL_FTS, LANG_SOLR).
   *
   * @param string $text trimmed user-supplied query string
   * @param string $mode eg MODE_NONE
   * @return string|NULL formatted expression, or NULL if $mode is not recognized
   */
  protected function _formatFts($text, $mode) {
    // normalize user-inputted wildcards
    $text = str_replace('%', '*', (string) $text);

    // Compare against '' rather than using empty(): "0" is a legitimate search term.
    if ($text === '') {
      // match everything, using this language's own wildcard
      $result = '*';
    }
    elseif (strpos($text, '*') !== FALSE) {
      // if user supplies their own wildcards, then don't do any sophisticated changes
      $result = $text;
    }
    else {
      switch ($mode) {
        case self::MODE_NONE:
          $result = $text;
          break;

        case self::MODE_PHRASE:
          $result = '"' . $text . '"';
          break;

        case self::MODE_WILDPHRASE:
          $result = '"*' . $text . '*"';
          break;

        case self::MODE_WILDWORDS:
          $result = $this->mapWords($text, '*word*');
          break;

        case self::MODE_WILDWORDS_SUFFIX:
          $result = $this->mapWords($text, 'word*');
          break;

        default:
          $result = NULL;
      }
    }

    // Every "%" was converted to "*" above, so "*" is the wildcard to collapse.
    return $this->dedupeWildcards($result, '*');
  }

  /**
   * Format text as a SQL LIKE pattern (wildcard "%").
   *
   * @param string $text trimmed user-supplied query string
   * @param string $mode eg MODE_NONE
   * @return string|NULL formatted pattern, or NULL if $mode is not recognized
   */
  protected function _formatLike($text, $mode) {
    $text = (string) $text;

    // Compare against '' rather than using empty(): "0" is a legitimate search term.
    if ($text === '') {
      $result = '%';
    }
    elseif (strpos($text, '%') !== FALSE) {
      // if user supplies their own wildcards, then don't do any sophisticated changes
      $result = $text;
    }
    else {
      switch ($mode) {
        case self::MODE_NONE:
        case self::MODE_PHRASE:
        case self::MODE_WILDPHRASE:
          $result = '%' . $text . '%';
          break;

        case self::MODE_WILDWORDS:
        case self::MODE_WILDWORDS_SUFFIX:
          // Split with parseWords() so that word boundaries (including tabs)
          // are the same ones used by the FTS formatter.
          $result = '%' . implode('%', $this->parseWords($text)) . '%';
          break;

        default:
          $result = NULL;
      }
    }

    return $this->dedupeWildcards($result, '%');
  }

  /**
   * Apply a template to every word of the text and join the results with spaces.
   *
   * @param string $text user-supplied query string
   * @param string $template a prototypical description of each word, eg "word%" or "word*" or "*word*"
   * @return string
   */
  protected function mapWords($text, $template) {
    $result = array();
    foreach ($this->parseWords($text) as $word) {
      $result[] = str_replace('word', $word, $template);
    }
    return implode(' ', $result);
  }

  /**
   * Split text into words on runs of spaces, tabs, CRs and LFs.
   *
   * @param string $text
   * @return array list of words; a blank $text yields a single empty string
   */
  protected function parseWords($text) {
    return explode(' ', preg_replace('/[ \r\n\t]+/', ' ', trim($text)));
  }

  /**
   * Collapse every run of consecutive wildcards into a single wildcard.
   *
   * @param string|NULL $text
   * @param string $wildcard the wildcard character, eg "%" or "*"
   * @return string|NULL NULL is passed through unchanged
   */
  protected function dedupeWildcards($text, $wildcard) {
    if ($text === NULL) {
      return NULL;
    }

    // don't use preg_replace because $wildcard might be special char
    $pair = $wildcard . $wildcard;
    // One pass only halves a run (eg "%%%%" becomes "%%"), so repeat until no pair remains.
    while (strpos($text, $pair) !== FALSE) {
      $text = str_replace($pair, $wildcard, $text);
    }
    return $text;
  }

  /**
   * @return array list of all MODE_* values
   */
  public static function getModes() {
    return array(
      self::MODE_NONE,
      self::MODE_PHRASE,
      self::MODE_WILDPHRASE,
      self::MODE_WILDWORDS,
      self::MODE_WILDWORDS_SUFFIX,
    );
  }

  /**
   * @return array list of all LANG_* values
   */
  public static function getLanguages() {
    return array(
      self::LANG_SOLR,
      self::LANG_SQL_FTS,
      self::LANG_SQL_LIKE,
    );
  }

  /**
   * Print a plain-text table showing how the text is formatted for every
   * combination of mode (rows) and language (columns).
   *
   * @param string $text
   *
   * Ex: drush eval 'civicrm_initialize(); CRM_Utils_QueryFormatter::dumpExampleTable("firstword secondword");'
   */
  public static function dumpExampleTable($text) {
    // Pad each column to the raw text length plus some room for added quotes/wildcards.
    $width = strlen($text) + 8;
    $buf = '';

    // header row
    $buf .= sprintf("%-{$width}s", 'mode');
    foreach (self::getLanguages() as $lang) {
      $buf .= sprintf("%-{$width}s", $lang);
    }
    $buf .= "\n";

    // one row per mode
    foreach (self::getModes() as $mode) {
      $formatter = new CRM_Utils_QueryFormatter($mode);
      $buf .= sprintf("%-{$width}s", $mode);
      foreach (self::getLanguages() as $lang) {
        $buf .= sprintf("%-{$width}s", $formatter->format($text, $lang));
      }
      $buf .= "\n";
    }

    echo $buf;
  }
}