Merge pull request #15208 from civicrm/5.17
[civicrm-core.git] / CRM / Dedupe / BAO / RuleGroup.php
1 <?php
2 /*
3 +--------------------------------------------------------------------+
4 | CiviCRM version 5 |
5 +--------------------------------------------------------------------+
6 | Copyright CiviCRM LLC (c) 2004-2019 |
7 +--------------------------------------------------------------------+
8 | This file is a part of CiviCRM. |
9 | |
10 | CiviCRM is free software; you can copy, modify, and distribute it |
11 | under the terms of the GNU Affero General Public License |
12 | Version 3, 19 November 2007 and the CiviCRM Licensing Exception. |
13 | |
14 | CiviCRM is distributed in the hope that it will be useful, but |
15 | WITHOUT ANY WARRANTY; without even the implied warranty of |
16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
17 | See the GNU Affero General Public License for more details. |
18 | |
19 | You should have received a copy of the GNU Affero General Public |
20 | License and the CiviCRM Licensing Exception along |
21 | with this program; if not, contact CiviCRM LLC |
22 | at info[AT]civicrm[DOT]org. If you have questions about the |
23 | GNU Affero General Public License or the licensing of CiviCRM, |
24 | see the CiviCRM license FAQ at http://civicrm.org/licensing |
25 +--------------------------------------------------------------------+
26 */
27
28 /**
29 *
30 * @package CRM
31 * @copyright CiviCRM LLC (c) 2004-2019
32 * $Id$
33 *
34 */
35
36 /**
37 * The CiviCRM duplicate discovery engine is based on an
38 * algorithm designed by David Strauss <david@fourkitchens.com>.
39 */
40 class CRM_Dedupe_BAO_RuleGroup extends CRM_Dedupe_DAO_RuleGroup {
41
42 /**
43 * Ids of the contacts to limit the SQL queries (whole-database queries otherwise)
44 * @var array
45 */
46 public $contactIds = [];
47
48 /**
49 * Set the contact IDs to restrict the dedupe to.
50 *
51 * @param array $contactIds
52 */
53 public function setContactIds($contactIds) {
54 $this->contactIds = $contactIds;
55 }
56
57 /**
58 * Params to dedupe against (queries against the whole contact set otherwise)
59 * @var array
60 */
61 public $params = [];
62
63 /**
64 * If there are no rules in rule group.
65 * @var bool
66 */
67 public $noRules = FALSE;
68
69 protected $temporaryTables = [];
70
71 /**
72 * Return a structure holding the supported tables, fields and their titles
73 *
74 * @param string $requestedType
75 * The requested contact type.
76 *
77 * @return array
78 * a table-keyed array of field-keyed arrays holding supported fields' titles
79 */
80 public static function supportedFields($requestedType) {
81 static $fields = NULL;
82 if (!$fields) {
83 // this is needed, as we're piggy-backing importableFields() below
84 $replacements = [
85 'civicrm_country.name' => 'civicrm_address.country_id',
86 'civicrm_county.name' => 'civicrm_address.county_id',
87 'civicrm_state_province.name' => 'civicrm_address.state_province_id',
88 'gender.label' => 'civicrm_contact.gender_id',
89 'individual_prefix.label' => 'civicrm_contact.prefix_id',
90 'individual_suffix.label' => 'civicrm_contact.suffix_id',
91 'addressee.label' => 'civicrm_contact.addressee_id',
92 'email_greeting.label' => 'civicrm_contact.email_greeting_id',
93 'postal_greeting.label' => 'civicrm_contact.postal_greeting_id',
94 'civicrm_phone.phone' => 'civicrm_phone.phone_numeric',
95 ];
96 // the table names we support in dedupe rules - a filter for importableFields()
97 $supportedTables = [
98 'civicrm_address',
99 'civicrm_contact',
100 'civicrm_email',
101 'civicrm_im',
102 'civicrm_note',
103 'civicrm_openid',
104 'civicrm_phone',
105 ];
106
107 foreach (['Individual', 'Organization', 'Household'] as $ctype) {
108 // take the table.field pairs and their titles from importableFields() if the table is supported
109 foreach (CRM_Contact_BAO_Contact::importableFields($ctype) as $iField) {
110 if (isset($iField['where'])) {
111 $where = $iField['where'];
112 if (isset($replacements[$where])) {
113 $where = $replacements[$where];
114 }
115 list($table, $field) = explode('.', $where);
116 if (!in_array($table, $supportedTables)) {
117 continue;
118 }
119 $fields[$ctype][$table][$field] = $iField['title'];
120 }
121 }
122 // Note that most of the fields available come from 'importable fields' -
123 // I thought about making this field 'importable' but it felt like there might be unknown consequences
124 // so I opted for just adding it in & securing it with a unit test.
125 // Example usage of sort_name - It is possible to alter sort name via hook so 2 organization names might differ as in
126 // Justice League vs The Justice League but these could have the same sort_name if 'the the'
127 // exension is installed (https://github.com/eileenmcnaughton/org.wikimedia.thethe)
128 $fields[$ctype]['civicrm_contact']['sort_name'] = ts('Sort Name');
129 // add custom data fields
130 foreach (CRM_Core_BAO_CustomGroup::getTree($ctype, NULL, NULL, -1) as $key => $cg) {
131 if (!is_int($key)) {
132 continue;
133 }
134 foreach ($cg['fields'] as $cf) {
135 $fields[$ctype][$cg['table_name']][$cf['column_name']] = $cf['label'];
136 }
137 }
138 }
139 }
140 CRM_Utils_Hook::dupeQuery(CRM_Core_DAO::$_nullObject, 'supportedFields', $fields);
141 return !empty($fields[$requestedType]) ? $fields[$requestedType] : [];
142 }
143
144 /**
145 * Return the SQL query for dropping the temporary table.
146 */
147 public function tableDropQuery() {
148 return 'DROP TEMPORARY TABLE IF EXISTS dedupe';
149 }
150
151 /**
152 * Return a set of SQL queries whose cummulative weights will mark matched
153 * records for the RuleGroup::threasholdQuery() to retrieve.
154 */
155 public function tableQuery() {
156 // make sure we've got a fetched dbrecord, not sure if this is enforced
157 if (!$this->name == NULL || $this->is_reserved == NULL) {
158 $this->find(TRUE);
159 }
160
161 // Reserved Rule Groups can optionally get special treatment by
162 // implementing an optimization class and returning a query array.
163 if ($this->is_reserved &&
164 CRM_Utils_File::isIncludable("CRM/Dedupe/BAO/QueryBuilder/{$this->name}.php")
165 ) {
166 $command = empty($this->params) ? 'internal' : 'record';
167 $queries = call_user_func(["CRM_Dedupe_BAO_QueryBuilder_{$this->name}", $command], $this);
168 }
169 else {
170 // All other rule groups have queries generated by the member dedupe
171 // rules defined in the administrative interface.
172
173 // Find all rules contained by this script sorted by weight so that
174 // their execution can be short circuited on RuleGroup::fillTable()
175 $bao = new CRM_Dedupe_BAO_Rule();
176 $bao->dedupe_rule_group_id = $this->id;
177 $bao->orderBy('rule_weight DESC');
178 $bao->find();
179
180 // Generate a SQL query for each rule in the rule group that is
181 // tailored to respect the param and contactId options provided.
182 $queries = [];
183 while ($bao->fetch()) {
184 $bao->contactIds = $this->contactIds;
185 $bao->params = $this->params;
186
187 // Skipping empty rules? Empty rules shouldn't exist; why check?
188 if ($query = $bao->sql()) {
189 $queries["{$bao->rule_table}.{$bao->rule_field}.{$bao->rule_weight}"] = $query;
190 }
191 }
192 }
193
194 // if there are no rules in this rule group
195 // add an empty query fulfilling the pattern
196 if (!$queries) {
197 $this->noRules = TRUE;
198 return [];
199 }
200
201 return $queries;
202 }
203
204 public function fillTable() {
205 // get the list of queries handy
206 $tableQueries = $this->tableQuery();
207
208 if ($this->params && !$this->noRules) {
209 $this->temporaryTables['dedupe'] = CRM_Utils_SQL_TempTable::build()
210 ->setCategory('dedupe')
211 ->createWithColumns("id1 int, weight int, UNIQUE UI_id1 (id1)")->getName();
212 $insertClause = "INSERT INTO {$this->temporaryTables['dedupe']} (id1, weight)";
213 $groupByClause = "GROUP BY id1, weight";
214 $dupeCopyJoin = " JOIN dedupe_copy ON dedupe_copy.id1 = t1.column WHERE ";
215 }
216 else {
217 $this->temporaryTables['dedupe'] = CRM_Utils_SQL_TempTable::build()
218 ->setCategory('dedupe')
219 ->createWithColumns("id1 int, id2 int, weight int, UNIQUE UI_id1_id2 (id1, id2)")->getName();
220 $insertClause = "INSERT INTO {$this->temporaryTables['dedupe']} (id1, id2, weight)";
221 $groupByClause = "GROUP BY id1, id2, weight";
222 $dupeCopyJoin = " JOIN dedupe_copy ON dedupe_copy.id1 = t1.column AND dedupe_copy.id2 = t2.column WHERE ";
223 }
224 $patternColumn = '/t1.(\w+)/';
225 $exclWeightSum = [];
226
227 $dao = new CRM_Core_DAO();
228 CRM_Utils_Hook::dupeQuery($this, 'table', $tableQueries);
229
230 while (!empty($tableQueries)) {
231 list($isInclusive, $isDie) = self::isQuerySetInclusive($tableQueries, $this->threshold, $exclWeightSum);
232
233 if ($isInclusive) {
234 // order queries by table count
235 self::orderByTableCount($tableQueries);
236
237 $weightSum = array_sum($exclWeightSum);
238 $searchWithinDupes = !empty($exclWeightSum) ? 1 : 0;
239
240 while (!empty($tableQueries)) {
241 // extract the next query ( and weight ) to be executed
242 $fieldWeight = array_keys($tableQueries);
243 $fieldWeight = $fieldWeight[0];
244 $query = array_shift($tableQueries);
245
246 if ($searchWithinDupes) {
247 // get prepared to search within already found dupes if $searchWithinDupes flag is set
248 $dao->query("DROP TEMPORARY TABLE IF EXISTS dedupe_copy");
249 $dao->query("CREATE TEMPORARY TABLE dedupe_copy SELECT * FROM {$this->temporaryTables['dedupe']} WHERE weight >= {$weightSum}");
250
251 preg_match($patternColumn, $query, $matches);
252 $query = str_replace(' WHERE ', str_replace('column', $matches[1], $dupeCopyJoin), $query);
253
254 // CRM-19612: If there's a union, there will be two WHEREs, and you
255 // can't use the temp table twice.
256 if (preg_match('/dedupe_copy[\S\s]*(union)[\S\s]*dedupe_copy/i', $query, $matches, PREG_OFFSET_CAPTURE)) {
257 // Make a second temp table:
258 $dao->query("DROP TEMPORARY TABLE IF EXISTS dedupe_copy_2");
259 $dao->query("CREATE TEMPORARY TABLE dedupe_copy_2 SELECT * FROM {$this->temporaryTables['dedupe']} WHERE weight >= {$weightSum}");
260 // After the union, use that new temp table:
261 $part1 = substr($query, 0, $matches[1][1]);
262 $query = $part1 . str_replace('dedupe_copy', 'dedupe_copy_2', substr($query, $matches[1][1]));
263 }
264 }
265 $searchWithinDupes = 1;
266
267 // construct and execute the intermediate query
268 $query = "{$insertClause} {$query} {$groupByClause} ON DUPLICATE KEY UPDATE weight = weight + VALUES(weight)";
269 $dao->query($query);
270
271 // FIXME: we need to be more acurate with affected rows, especially for insert vs duplicate insert.
272 // And that will help optimize further.
273 $affectedRows = $dao->affectedRows();
274
275 // In an inclusive situation, failure of any query means no further processing -
276 if ($affectedRows == 0) {
277 // reset to make sure no further execution is done.
278 $tableQueries = [];
279 break;
280 }
281 $weightSum = substr($fieldWeight, strrpos($fieldWeight, '.') + 1) + $weightSum;
282 }
283 // An exclusive situation -
284 }
285 elseif (!$isDie) {
286 // since queries are already sorted by weights, we can continue as is
287 $fieldWeight = array_keys($tableQueries);
288 $fieldWeight = $fieldWeight[0];
289 $query = array_shift($tableQueries);
290 $query = "{$insertClause} {$query} {$groupByClause} ON DUPLICATE KEY UPDATE weight = weight + VALUES(weight)";
291 $dao->query($query);
292 if ($dao->affectedRows() >= 1) {
293 $exclWeightSum[] = substr($fieldWeight, strrpos($fieldWeight, '.') + 1);
294 }
295 }
296 else {
297 // its a die situation
298 break;
299 }
300 }
301 }
302
303 /**
304 * Function to determine if a given query set contains inclusive or exclusive set of weights.
305 * The function assumes that the query set is already ordered by weight in desc order.
306 * @param $tableQueries
307 * @param $threshold
308 * @param array $exclWeightSum
309 *
310 * @return array
311 */
312 public static function isQuerySetInclusive($tableQueries, $threshold, $exclWeightSum = []) {
313 $input = [];
314 foreach ($tableQueries as $key => $query) {
315 $input[] = substr($key, strrpos($key, '.') + 1);
316 }
317
318 if (!empty($exclWeightSum)) {
319 $input = array_merge($input, $exclWeightSum);
320 rsort($input);
321 }
322
323 if (count($input) == 1) {
324 return [FALSE, $input[0] < $threshold];
325 }
326
327 $totalCombinations = 0;
328 for ($i = 0; $i < count($input); $i++) {
329 $combination = [$input[$i]];
330 if (array_sum($combination) >= $threshold) {
331 $totalCombinations++;
332 continue;
333 }
334 for ($j = $i + 1; $j < count($input); $j++) {
335 $combination[] = $input[$j];
336 if (array_sum($combination) >= $threshold) {
337 $totalCombinations++;
338 }
339 }
340 }
341 return [$totalCombinations == 1, $totalCombinations <= 0];
342 }
343
344 /**
345 * sort queries by number of records for the table associated with them.
346 * @param $tableQueries
347 */
348 public static function orderByTableCount(&$tableQueries) {
349 static $tableCount = [];
350
351 $tempArray = [];
352 foreach ($tableQueries as $key => $query) {
353 $table = explode(".", $key);
354 $table = $table[0];
355 if (!array_key_exists($table, $tableCount)) {
356 $query = "SELECT COUNT(*) FROM {$table}";
357 $tableCount[$table] = CRM_Core_DAO::singleValueQuery($query);
358 }
359 $tempArray[$key] = $tableCount[$table];
360 }
361
362 asort($tempArray);
363 foreach ($tempArray as $key => $count) {
364 $tempArray[$key] = $tableQueries[$key];
365 }
366 $tableQueries = $tempArray;
367 }
368
369 /**
370 * Return the SQL query for getting only the interesting results out of the dedupe table.
371 *
372 * @$checkPermission boolean $params a flag to indicate if permission should be considered.
373 * default is to always check permissioning but public pages for example might not want
374 * permission to be checked for anonymous users. Refer CRM-6211. We might be beaking
375 * Multi-Site dedupe for public pages.
376 *
377 * @param bool $checkPermission
378 *
379 * @return string
380 */
381 public function thresholdQuery($checkPermission = TRUE) {
382 $this->_aclFrom = '';
383 // CRM-6603: anonymous dupechecks side-step ACLs
384 $this->_aclWhere = ' AND is_deleted = 0 ';
385
386 if ($this->params && !$this->noRules) {
387 if ($checkPermission) {
388 list($this->_aclFrom, $this->_aclWhere) = CRM_Contact_BAO_Contact_Permission::cacheClause('civicrm_contact');
389 $this->_aclWhere = $this->_aclWhere ? "AND {$this->_aclWhere}" : '';
390 }
391 $query = "SELECT {$this->temporaryTables['dedupe']}.id1 as id
392 FROM {$this->temporaryTables['dedupe']} JOIN civicrm_contact ON {$this->temporaryTables['dedupe']}.id1 = civicrm_contact.id {$this->_aclFrom}
393 WHERE contact_type = '{$this->contact_type}' {$this->_aclWhere}
394 AND weight >= {$this->threshold}";
395 }
396 else {
397 $this->_aclWhere = ' AND c1.is_deleted = 0 AND c2.is_deleted = 0';
398 if ($checkPermission) {
399 list($this->_aclFrom, $this->_aclWhere) = CRM_Contact_BAO_Contact_Permission::cacheClause(['c1', 'c2']);
400 $this->_aclWhere = $this->_aclWhere ? "AND {$this->_aclWhere}" : '';
401 }
402 $query = "SELECT IF({$this->temporaryTables['dedupe']}.id1 < {$this->temporaryTables['dedupe']}.id2, {$this->temporaryTables['dedupe']}.id1, {$this->temporaryTables['dedupe']}.id2) as id1,
403 IF({$this->temporaryTables['dedupe']}.id1 < {$this->temporaryTables['dedupe']}.id2, {$this->temporaryTables['dedupe']}.id2, {$this->temporaryTables['dedupe']}.id1) as id2, {$this->temporaryTables['dedupe']}.weight
404 FROM {$this->temporaryTables['dedupe']} JOIN civicrm_contact c1 ON {$this->temporaryTables['dedupe']}.id1 = c1.id
405 JOIN civicrm_contact c2 ON {$this->temporaryTables['dedupe']}.id2 = c2.id {$this->_aclFrom}
406 LEFT JOIN civicrm_dedupe_exception exc ON {$this->temporaryTables['dedupe']}.id1 = exc.contact_id1 AND {$this->temporaryTables['dedupe']}.id2 = exc.contact_id2
407 WHERE c1.contact_type = '{$this->contact_type}' AND
408 c2.contact_type = '{$this->contact_type}' {$this->_aclWhere}
409 AND weight >= {$this->threshold} AND exc.contact_id1 IS NULL";
410 }
411
412 CRM_Utils_Hook::dupeQuery($this, 'threshold', $query);
413 return $query;
414 }
415
416 /**
417 * find fields related to a rule group.
418 *
419 * @param array $params
420 *
421 * @return array
422 * (rule field => weight) array and threshold associated to rule group
423 */
424 public static function dedupeRuleFieldsWeight($params) {
425 $rgBao = new CRM_Dedupe_BAO_RuleGroup();
426 $rgBao->contact_type = $params['contact_type'];
427 if (!empty($params['id'])) {
428 // accept an ID if provided
429 $rgBao->id = $params['id'];
430 }
431 else {
432 $rgBao->used = $params['used'];
433 }
434 $rgBao->find(TRUE);
435
436 $ruleBao = new CRM_Dedupe_BAO_Rule();
437 $ruleBao->dedupe_rule_group_id = $rgBao->id;
438 $ruleBao->find();
439 $ruleFields = [];
440 while ($ruleBao->fetch()) {
441 $ruleFields[$ruleBao->rule_field] = $ruleBao->rule_weight;
442 }
443
444 return [$ruleFields, $rgBao->threshold];
445 }
446
447 /**
448 * Get all of the combinations of fields that would work with a rule.
449 *
450 * @param array $rgFields
451 * @param int $threshold
452 * @param array $combos
453 * @param array $running
454 */
455 public static function combos($rgFields, $threshold, &$combos, $running = []) {
456 foreach ($rgFields as $rgField => $weight) {
457 unset($rgFields[$rgField]);
458 $diff = $threshold - $weight;
459 $runningnow = $running;
460 $runningnow[] = $rgField;
461 if ($diff > 0) {
462 self::combos($rgFields, $diff, $combos, $runningnow);
463 }
464 else {
465 $combos[] = $runningnow;
466 }
467 }
468 }
469
470 /**
471 * Get an array of rule group id to rule group name
472 * for all th groups for that contactType. If contactType
473 * not specified, do it for all
474 *
475 * @param string $contactType
476 * Individual, Household or Organization.
477 *
478 *
479 * @return array
480 * id => "nice name" of rule group
481 */
482 public static function getByType($contactType = NULL) {
483 $dao = new CRM_Dedupe_DAO_RuleGroup();
484
485 if ($contactType) {
486 $dao->contact_type = $contactType;
487 }
488
489 $dao->find();
490 $result = [];
491 while ($dao->fetch()) {
492 $title = !empty($dao->title) ? $dao->title : (!empty($dao->name) ? $dao->name : $dao->contact_type);
493
494 $name = "$title - {$dao->used}";
495 $result[$dao->id] = $name;
496 }
497 return $result;
498 }
499
500 /**
501 * Get the cached contact type for a particular rule group.
502 *
503 * @param int $rule_group_id
504 *
505 * @return string
506 */
507 public static function getContactTypeForRuleGroup($rule_group_id) {
508 if (!isset(\Civi::$statics[__CLASS__]) || !isset(\Civi::$statics[__CLASS__]['rule_groups'])) {
509 \Civi::$statics[__CLASS__]['rule_groups'] = [];
510 }
511 if (empty(\Civi::$statics[__CLASS__]['rule_groups'][$rule_group_id])) {
512 \Civi::$statics[__CLASS__]['rule_groups'][$rule_group_id]['contact_type'] = CRM_Core_DAO::getFieldValue(
513 'CRM_Dedupe_DAO_RuleGroup',
514 $rule_group_id,
515 'contact_type'
516 );
517 }
518
519 return \Civi::$statics[__CLASS__]['rule_groups'][$rule_group_id]['contact_type'];
520 }
521
522 }