From ea74069cd21c5e09a1bf782057430ac1c2e08313 Mon Sep 17 00:00:00 2001 From: Tim Otten Date: Wed, 11 Jun 2014 17:28:52 -0700 Subject: [PATCH] CRM-14765, CRM14811 - FullText QueryFormatter - Attempt to normalize query text This introduces a class, CRM_Utils_QueryFormatter, which takes an inputted query string and converts it to a more specialized query dialect (such as SQL LIKE, MySQL FTS, or Solr). The exact conversion logic determined by a setting ("fts_query_mode"). --- .../Custom/FullText/AbstractPartialQuery.php | 44 +-- CRM/Utils/QueryFormatter.php | 295 ++++++++++++++++++ .../phpunit/CRM/Utils/QueryFormatterTest.php | 56 ++++ 3 files changed, 365 insertions(+), 30 deletions(-) create mode 100644 CRM/Utils/QueryFormatter.php create mode 100644 tests/phpunit/CRM/Utils/QueryFormatterTest.php diff --git a/CRM/Contact/Form/Search/Custom/FullText/AbstractPartialQuery.php b/CRM/Contact/Form/Search/Custom/FullText/AbstractPartialQuery.php index 501750ddb2..ff182d1c30 100644 --- a/CRM/Contact/Form/Search/Custom/FullText/AbstractPartialQuery.php +++ b/CRM/Contact/Form/Search/Custom/FullText/AbstractPartialQuery.php @@ -155,7 +155,8 @@ $sqlStatement } $query = $tableValues + array( - 'text' => $queryText, + 'text' => CRM_Utils_QueryFormatter::singleton() + ->format($queryText, CRM_Utils_QueryFormatter::LANG_SOLR), ); list($intLimit, $intOffset) = $this->parseLimitOffset($limit); $files = $searcher->search($query, $intLimit, $intOffset); @@ -236,9 +237,12 @@ GROUP BY {$tableValues['id']} * @return string SQL, eg "MATCH (col1) AGAINST (queryText)" or "col1 LIKE '%queryText%'" */ public function matchText($table, $fullTextFields, $queryText) { + $strtolower = function_exists('mb_strtolower') ? 'mb_strtolower' : 'strtolower'; + if (strpos($table, ' ') === FALSE) { $tableName = $tableAlias = $table; - } else { + } + else { list ($tableName, $tableAlias) = explode(' ', $table); } if (is_scalar($fullTextFields)) { @@ -247,7 +251,8 @@ GROUP BY {$tableValues['id']} $clauses = array(); if (CRM_Core_InnoDBIndexer::singleton()->hasDeclaredIndex($tableName, $fullTextFields)) { - $strtolower = function_exists('mb_strtolower') ? 'mb_strtolower' : 'strtolower'; + $formattedQuery = CRM_Utils_QueryFormatter::singleton() + ->format($queryText, CRM_Utils_QueryFormatter::LANG_SQL_FTS); $prefixedFieldNames = array(); foreach ($fullTextFields as $fieldName) { @@ -256,7 +261,7 @@ GROUP BY {$tableValues['id']} $clauses[] = sprintf("MATCH (%s) AGAINST ('%s')", implode(',', $prefixedFieldNames), - $strtolower(CRM_Core_DAO::escapeString($queryText)) + $strtolower(CRM_Core_DAO::escapeString($formattedQuery)) ); } else { @@ -264,8 +269,12 @@ GROUP BY {$tableValues['id']} // 1 => $table, // 2 => implode(', ', $fullTextFields), //))); + + $formattedQuery = CRM_Utils_QueryFormatter::singleton() + ->format($queryText, CRM_Utils_QueryFormatter::LANG_SQL_LIKE); + $escapedText = $strtolower(CRM_Core_DAO::escapeString($formattedQuery)); foreach ($fullTextFields as $fieldName) { - $clauses[] = "$tableAlias.$fieldName LIKE {$this->toSqlWildCard($queryText)}"; + $clauses[] = "$tableAlias.$fieldName LIKE '{$escapedText}'"; } } return implode(' OR ', $clauses); @@ -310,31 +319,6 @@ GROUP BY {$tableValues['id']} } } - /** - * Format text to include wild card characters at beginning and end - * - * @param string $text - * @return string - */ - public function toSqlWildCard($text) { - if ($text) { - $strtolower = function_exists('mb_strtolower') ? 'mb_strtolower' : 'strtolower'; - $text = $strtolower(CRM_Core_DAO::escapeString($text)); - if (strpos($text, '%') === FALSE) { - $text = "'%{$text}%'"; - return $text; - } - else { - $text = "'{$text}'"; - return $text; - } - } - else { - $text = "'%'"; - return $text; - } - } - /** * @param int|array $limit * @return string SQL diff --git a/CRM/Utils/QueryFormatter.php b/CRM/Utils/QueryFormatter.php new file mode 100644 index 0000000000..7da213dbb3 --- /dev/null +++ b/CRM/Utils/QueryFormatter.php @@ -0,0 +1,295 @@ +mode = $mode; + } + + /** + * @param mixed $mode + */ + public function setMode($mode) { + $this->mode = $mode; + } + + /** + * @return mixed + */ + public function getMode() { + return $this->mode; + } + + /** + * @param string $text + * @param string $language eg LANG_SQL_LIKE, LANG_SQL_FTS, LANG_SOLR + * @throws CRM_Core_Exception + * @return string + */ + public function format($text, $language) { + $text = trim($text); + + switch ($language) { + case self::LANG_SOLR: + case self::LANG_SQL_FTS: + $text = $this->_formatFts($text, $this->mode); + break; + case self::LANG_SQL_LIKE: + $text = $this->_formatLike($text, $this->mode); + break; + default: + $text = NULL; + } + + if ($text === NULL) { + throw new CRM_Core_Exception("Unrecognized combination: language=[{$language}] mode=[{$this->mode}]"); + } + + return $text; + } + + protected function _formatFts($text, $mode) { + $result = NULL; + + // normalize user-inputted wildcards + $text = str_replace('%', '*', $text); + + if (empty($text)) { + $result = '%'; + } + elseif (strpos($text, '*') !== FALSE) { + // if user supplies their own wildcards, then don't do any sophisticated changes + return $text; + } + else { + switch ($mode) { + case self::MODE_NONE: + $result = $text; + break; + + case self::MODE_PHRASE: + $result = '"' . $text . '"'; + break; + + case self::MODE_WILDPHRASE: + $result = '"*' . $text . '*"'; + break; + + case self::MODE_WILDWORDS: + $result = $this->mapWords($text, '*word*'); + break; + + case self::MODE_WILDWORDS_SUFFIX: + $result = $this->mapWords($text, 'word*'); + break; + + default: + $result = NULL; + } + } + + return $this->dedupeWildcards($result, '%'); + } + + protected function _formatLike($text, $mode) { + $result = NULL; + + if (empty($text)) { + $result = '%'; + } + elseif (strpos($text, '%') !== FALSE) { + // if user supplies their own wildcards, then don't do any sophisticated changes + $result = $text; + } + else { + switch ($mode) { + case self::MODE_NONE: + case self::MODE_PHRASE: + case self::MODE_WILDPHRASE: + $result = "%" . $text . "%"; + break; + + case self::MODE_WILDWORDS: + case self::MODE_WILDWORDS_SUFFIX: + $result = "%" . preg_replace('/[ \r\n]+/', '%', $text) . '%'; + break; + + default: + $result = NULL; + } + } + + return $this->dedupeWildcards($result, '%'); + } + + /** + * @param string $text user-supplied query string + * @param string $template a prototypical description of each word, eg "word%" or "word*" or "*word*" + * @return string + */ + protected function mapWords($text, $template) { + $result = array(); + foreach ($this->parseWords($text) as $word) { + $result[] = str_replace('word', $word, $template); + } + return implode(' ', $result); + } + + /** + * @param $text + * @return array + */ + protected function parseWords($text) { + return explode(' ', preg_replace('/[ \r\n\t]+/', ' ', trim($text))); + } + + /** + * @param $text + * @param $wildcard + * @return mixed + */ + protected function dedupeWildcards($text, $wildcard) { + if ($text === NULL) { + return NULL; + } + + // don't use preg_replace because $wildcard might be special char + while (strpos($text, "{$wildcard}{$wildcard}") !== FALSE) { + $text = str_replace("{$wildcard}{$wildcard}", "{$wildcard}", $text); + } + return $text; + } + + public static function getModes() { + return array( + self::MODE_NONE, + self::MODE_PHRASE, + self::MODE_WILDPHRASE, + self::MODE_WILDWORDS, + self::MODE_WILDWORDS_SUFFIX, + ); + } + + public static function getLanguages() { + return array( + self::LANG_SOLR, + self::LANG_SQL_FTS, + self::LANG_SQL_LIKE, + ); + } + + /** + * @param $text + * + * Ex: drush eval 'civicrm_initialize(); CRM_Utils_QueryFormatter::dumpExampleTable("firstword secondword");' + */ + public static function dumpExampleTable($text) { + $width = strlen($text) + 8; + $buf = ''; + + $buf .= sprintf("%-{$width}s", 'mode'); + foreach (self::getLanguages() as $lang) { + $buf .= sprintf("%-{$width}s", $lang); + } + $buf .= "\n"; + + foreach (self::getModes() as $mode) { + $formatter = new CRM_Utils_QueryFormatter($mode); + $buf .= sprintf("%-{$width}s", $mode); + foreach (self::getLanguages() as $lang) { + $buf .= sprintf("%-{$width}s", $formatter->format($text, $lang)); + } + $buf .= "\n"; + } + + echo $buf; + } +} \ No newline at end of file diff --git a/tests/phpunit/CRM/Utils/QueryFormatterTest.php b/tests/phpunit/CRM/Utils/QueryFormatterTest.php new file mode 100644 index 0000000000..ffe79a53e7 --- /dev/null +++ b/tests/phpunit/CRM/Utils/QueryFormatterTest.php @@ -0,0 +1,56 @@ +$inputText, 1=>$language, 2=>$options, 3=>$expectedText) + + $cases[] = array('first second', CRM_Utils_QueryFormatter::LANG_SQL_LIKE, CRM_Utils_QueryFormatter::MODE_NONE, '%first second%'); + $cases[] = array('first second', CRM_Utils_QueryFormatter::LANG_SQL_LIKE, CRM_Utils_QueryFormatter::MODE_PHRASE, '%first second%'); + $cases[] = array('first second', CRM_Utils_QueryFormatter::LANG_SQL_LIKE, CRM_Utils_QueryFormatter::MODE_WILDPHRASE, '%first second%'); + $cases[] = array('first second', CRM_Utils_QueryFormatter::LANG_SQL_LIKE, CRM_Utils_QueryFormatter::MODE_WILDWORDS, '%first%second%'); + $cases[] = array('first second', CRM_Utils_QueryFormatter::LANG_SQL_LIKE, CRM_Utils_QueryFormatter::MODE_WILDWORDS_SUFFIX, '%first%second%'); + + $cases[] = array('first second', CRM_Utils_QueryFormatter::LANG_SQL_FTS, CRM_Utils_QueryFormatter::MODE_NONE, 'first second'); + $cases[] = array('first second', CRM_Utils_QueryFormatter::LANG_SQL_FTS, CRM_Utils_QueryFormatter::MODE_PHRASE, '"first second"'); + $cases[] = array('first second', CRM_Utils_QueryFormatter::LANG_SQL_FTS, CRM_Utils_QueryFormatter::MODE_WILDPHRASE, '"*first second*"'); + $cases[] = array('first second', CRM_Utils_QueryFormatter::LANG_SQL_FTS, CRM_Utils_QueryFormatter::MODE_WILDWORDS, '*first* *second*'); + $cases[] = array('first second', CRM_Utils_QueryFormatter::LANG_SQL_FTS, CRM_Utils_QueryFormatter::MODE_WILDWORDS_SUFFIX, 'first* second*'); + + $cases[] = array('first second', CRM_Utils_QueryFormatter::LANG_SOLR, CRM_Utils_QueryFormatter::MODE_NONE, 'first second'); + $cases[] = array('first second', CRM_Utils_QueryFormatter::LANG_SOLR, CRM_Utils_QueryFormatter::MODE_PHRASE, '"first second"'); + $cases[] = array('first second', CRM_Utils_QueryFormatter::LANG_SOLR, CRM_Utils_QueryFormatter::MODE_WILDPHRASE, '"*first second*"'); + $cases[] = array('first second', CRM_Utils_QueryFormatter::LANG_SOLR, CRM_Utils_QueryFormatter::MODE_WILDWORDS, '*first* *second*'); + $cases[] = array('first second', CRM_Utils_QueryFormatter::LANG_SOLR, CRM_Utils_QueryFormatter::MODE_WILDWORDS_SUFFIX, 'first* second*'); + + // if user supplies wildcards, then ignore mode + foreach (array(CRM_Utils_QueryFormatter::MODE_NONE, CRM_Utils_QueryFormatter::MODE_WILDPHRASE, CRM_Utils_QueryFormatter::MODE_WILDWORDS, CRM_Utils_QueryFormatter::MODE_WILDWORDS_SUFFIX) as $mode) { + $cases[] = array('first% second', CRM_Utils_QueryFormatter::LANG_SQL_LIKE, $mode, 'first% second'); + $cases[] = array('first% second', CRM_Utils_QueryFormatter::LANG_SQL_FTS, $mode, 'first* second'); + $cases[] = array('first% second', CRM_Utils_QueryFormatter::LANG_SOLR, $mode, 'first* second'); + $cases[] = array('first second%', CRM_Utils_QueryFormatter::LANG_SQL_LIKE, $mode, 'first second%'); + $cases[] = array('first second%', CRM_Utils_QueryFormatter::LANG_SQL_FTS, $mode, 'first second*'); + $cases[] = array('first second%', CRM_Utils_QueryFormatter::LANG_SOLR, $mode, 'first second*'); + } + + return $cases; + } + + /** + * @param $text + * @param $language + * @param $mode + * @param $expectedText + * @dataProvider dataProvider + */ + function testFormat($text, $language, $mode, $expectedText) { + $formatter = new CRM_Utils_QueryFormatter($mode); + $actualText = $formatter->format($text, $language); + $this->assertEquals($expectedText, $actualText); + } +} \ No newline at end of file -- 2.25.1