Merge pull request #23575 from civicrm/5.50
[civicrm-core.git] / CRM / Dedupe / BAO / DedupeRuleGroup.php
1 <?php
2 /*
3 +--------------------------------------------------------------------+
4 | Copyright CiviCRM LLC. All rights reserved. |
5 | |
6 | This work is published under the GNU AGPLv3 license with some |
7 | permitted exceptions and without any warranty. For full license |
8 | and copyright information, see https://civicrm.org/licensing |
9 +--------------------------------------------------------------------+
10 */
11
12 /**
13 *
14 * @package CRM
15 * @copyright CiviCRM LLC https://civicrm.org/licensing
16 */
17
18 /**
19 * The CiviCRM duplicate discovery engine is based on an
20 * algorithm designed by David Strauss <david@fourkitchens.com>.
21 */
22 class CRM_Dedupe_BAO_DedupeRuleGroup extends CRM_Dedupe_DAO_DedupeRuleGroup {
23
24 /**
25 * Ids of the contacts to limit the SQL queries (whole-database queries otherwise)
26 * @var array
27 */
28 public $contactIds = [];
29
30 /**
31 * Set the contact IDs to restrict the dedupe to.
32 *
33 * @param array $contactIds
34 */
35 public function setContactIds($contactIds) {
36 $this->contactIds = $contactIds;
37 }
38
39 /**
40 * Params to dedupe against (queries against the whole contact set otherwise)
41 * @var array
42 */
43 public $params = [];
44
45 /**
46 * If there are no rules in rule group.
47 * @var bool
48 */
49 public $noRules = FALSE;
50
51 protected $temporaryTables = [];
52
53 /**
54 * Return a structure holding the supported tables, fields and their titles
55 *
56 * @param string $requestedType
57 * The requested contact type.
58 *
59 * @return array
60 * a table-keyed array of field-keyed arrays holding supported fields' titles
61 */
62 public static function supportedFields($requestedType) {
63 if (!isset(Civi::$statics[__CLASS__]['supportedFields'])) {
64 // this is needed, as we're piggy-backing importableFields() below
65 $replacements = [
66 'civicrm_country.name' => 'civicrm_address.country_id',
67 'civicrm_county.name' => 'civicrm_address.county_id',
68 'civicrm_state_province.name' => 'civicrm_address.state_province_id',
69 'gender.label' => 'civicrm_contact.gender_id',
70 'individual_prefix.label' => 'civicrm_contact.prefix_id',
71 'individual_suffix.label' => 'civicrm_contact.suffix_id',
72 'addressee.label' => 'civicrm_contact.addressee_id',
73 'email_greeting.label' => 'civicrm_contact.email_greeting_id',
74 'postal_greeting.label' => 'civicrm_contact.postal_greeting_id',
75 'civicrm_phone.phone' => 'civicrm_phone.phone_numeric',
76 ];
77 // the table names we support in dedupe rules - a filter for importableFields()
78 $supportedTables = [
79 'civicrm_address',
80 'civicrm_contact',
81 'civicrm_email',
82 'civicrm_im',
83 'civicrm_note',
84 'civicrm_openid',
85 'civicrm_phone',
86 'civicrm_website',
87 ];
88
89 foreach (CRM_Contact_BAO_ContactType::basicTypes() as $ctype) {
90 // take the table.field pairs and their titles from importableFields() if the table is supported
91 foreach (CRM_Contact_BAO_Contact::importableFields($ctype) as $iField) {
92 if (isset($iField['where'])) {
93 $where = $iField['where'];
94 if (isset($replacements[$where])) {
95 $where = $replacements[$where];
96 }
97 list($table, $field) = explode('.', $where);
98 if (!in_array($table, $supportedTables)) {
99 continue;
100 }
101 $fields[$ctype][$table][$field] = $iField['title'];
102 }
103 }
104 // Note that most of the fields available come from 'importable fields' -
105 // I thought about making this field 'importable' but it felt like there might be unknown consequences
106 // so I opted for just adding it in & securing it with a unit test.
107 /// Example usage of sort_name - It is possible to alter sort name via hook so 2 organization names might differ as in
108 // Justice League vs The Justice League but these could have the same sort_name if 'the the'
109 // exension is installed (https://github.com/eileenmcnaughton/org.wikimedia.thethe)
110 $fields[$ctype]['civicrm_contact']['sort_name'] = ts('Sort Name');
111
112 // add all custom data fields including those only for sub_types.
113 foreach (CRM_Core_BAO_CustomGroup::getTree($ctype, NULL, NULL, -1, [], NULL, TRUE, NULL, TRUE) as $key => $cg) {
114 if (!is_int($key)) {
115 continue;
116 }
117 foreach ($cg['fields'] as $cf) {
118 $fields[$ctype][$cg['table_name']][$cf['column_name']] = $cg['title'] . ' : ' . $cf['label'];
119 }
120 }
121 }
122 //Does this have to run outside of cache?
123 CRM_Utils_Hook::dupeQuery(CRM_Core_DAO::$_nullObject, 'supportedFields', $fields);
124 Civi::$statics[__CLASS__]['supportedFields'] = $fields;
125 }
126
127 return Civi::$statics[__CLASS__]['supportedFields'][$requestedType] ?? [];
128
129 }
130
131 /**
132 * Return the SQL query for dropping the temporary table.
133 */
134 public function tableDropQuery() {
135 return 'DROP TEMPORARY TABLE IF EXISTS dedupe';
136 }
137
138 /**
139 * Return a set of SQL queries whose cummulative weights will mark matched
140 * records for the RuleGroup::threasholdQuery() to retrieve.
141 */
142 public function tableQuery() {
143 // make sure we've got a fetched dbrecord, not sure if this is enforced
144 if (!$this->name == NULL || $this->is_reserved == NULL) {
145 $this->find(TRUE);
146 }
147
148 // Reserved Rule Groups can optionally get special treatment by
149 // implementing an optimization class and returning a query array.
150 if ($this->is_reserved &&
151 CRM_Utils_File::isIncludable("CRM/Dedupe/BAO/QueryBuilder/{$this->name}.php")
152 ) {
153 $command = empty($this->params) ? 'internal' : 'record';
154 $queries = call_user_func(["CRM_Dedupe_BAO_QueryBuilder_{$this->name}", $command], $this);
155 }
156 else {
157 // All other rule groups have queries generated by the member dedupe
158 // rules defined in the administrative interface.
159
160 // Find all rules contained by this script sorted by weight so that
161 // their execution can be short circuited on RuleGroup::fillTable()
162 $bao = new CRM_Dedupe_BAO_DedupeRule();
163 $bao->dedupe_rule_group_id = $this->id;
164 $bao->orderBy('rule_weight DESC');
165 $bao->find();
166
167 // Generate a SQL query for each rule in the rule group that is
168 // tailored to respect the param and contactId options provided.
169 $queries = [];
170 while ($bao->fetch()) {
171 $bao->contactIds = $this->contactIds;
172 $bao->params = $this->params;
173
174 // Skipping empty rules? Empty rules shouldn't exist; why check?
175 if ($query = $bao->sql()) {
176 $queries["{$bao->rule_table}.{$bao->rule_field}.{$bao->rule_weight}"] = $query;
177 }
178 }
179 }
180
181 // if there are no rules in this rule group
182 // add an empty query fulfilling the pattern
183 if (!$queries) {
184 $this->noRules = TRUE;
185 return [];
186 }
187
188 return $queries;
189 }
190
191 public function fillTable() {
192 // get the list of queries handy
193 $tableQueries = $this->tableQuery();
194
195 if ($this->params && !$this->noRules) {
196 $this->temporaryTables['dedupe'] = CRM_Utils_SQL_TempTable::build()
197 ->setCategory('dedupe')
198 ->createWithColumns("id1 int, weight int, UNIQUE UI_id1 (id1)")->getName();
199 $dedupeCopyTemporaryTableObject = CRM_Utils_SQL_TempTable::build()
200 ->setCategory('dedupe');
201 $this->temporaryTables['dedupe_copy'] = $dedupeCopyTemporaryTableObject->getName();
202 $insertClause = "INSERT INTO {$this->temporaryTables['dedupe']} (id1, weight)";
203 $groupByClause = "GROUP BY id1, weight";
204 $dupeCopyJoin = " JOIN {$this->temporaryTables['dedupe_copy']} ON {$this->temporaryTables['dedupe_copy']}.id1 = t1.column WHERE ";
205 }
206 else {
207 $this->temporaryTables['dedupe'] = CRM_Utils_SQL_TempTable::build()
208 ->setCategory('dedupe')
209 ->createWithColumns("id1 int, id2 int, weight int, UNIQUE UI_id1_id2 (id1, id2)")->getName();
210 $dedupeCopyTemporaryTableObject = CRM_Utils_SQL_TempTable::build()
211 ->setCategory('dedupe');
212 $this->temporaryTables['dedupe_copy'] = $dedupeCopyTemporaryTableObject->getName();
213 $insertClause = "INSERT INTO {$this->temporaryTables['dedupe']} (id1, id2, weight)";
214 $groupByClause = "GROUP BY id1, id2, weight";
215 $dupeCopyJoin = " JOIN {$this->temporaryTables['dedupe_copy']} ON {$this->temporaryTables['dedupe_copy']}.id1 = t1.column AND {$this->temporaryTables['dedupe_copy']}.id2 = t2.column WHERE ";
216 }
217 $patternColumn = '/t1.(\w+)/';
218 $exclWeightSum = [];
219
220 CRM_Utils_Hook::dupeQuery($this, 'table', $tableQueries);
221
222 while (!empty($tableQueries)) {
223 list($isInclusive, $isDie) = self::isQuerySetInclusive($tableQueries, $this->threshold, $exclWeightSum);
224
225 if ($isInclusive) {
226 // order queries by table count
227 self::orderByTableCount($tableQueries);
228
229 $weightSum = array_sum($exclWeightSum);
230 $searchWithinDupes = !empty($exclWeightSum) ? 1 : 0;
231
232 while (!empty($tableQueries)) {
233 // extract the next query ( and weight ) to be executed
234 $fieldWeight = array_keys($tableQueries);
235 $fieldWeight = $fieldWeight[0];
236 $query = array_shift($tableQueries);
237
238 if ($searchWithinDupes) {
239 // drop dedupe_copy table just in case if its already there.
240 $dedupeCopyTemporaryTableObject->drop();
241 // get prepared to search within already found dupes if $searchWithinDupes flag is set
242 $dedupeCopyTemporaryTableObject->createWithQuery("SELECT * FROM {$this->temporaryTables['dedupe']} WHERE weight >= {$weightSum}");
243
244 preg_match($patternColumn, $query, $matches);
245 $query = str_replace(' WHERE ', str_replace('column', $matches[1], $dupeCopyJoin), $query);
246
247 // CRM-19612: If there's a union, there will be two WHEREs, and you
248 // can't use the temp table twice.
249 if (preg_match('/' . $this->temporaryTables['dedupe_copy'] . '[\S\s]*(union)[\S\s]*' . $this->temporaryTables['dedupe_copy'] . '/i', $query, $matches, PREG_OFFSET_CAPTURE)) {
250 // Make a second temp table:
251 $this->temporaryTables['dedupe_copy_2'] = CRM_Utils_SQL_TempTable::build()
252 ->setCategory('dedupe')
253 ->createWithQuery("SELECT * FROM {$this->temporaryTables['dedupe']} WHERE weight >= {$weightSum}")
254 ->getName();
255 // After the union, use that new temp table:
256 $part1 = substr($query, 0, $matches[1][1]);
257 $query = $part1 . str_replace($this->temporaryTables['dedupe_copy'], $this->temporaryTables['dedupe_copy_2'], substr($query, $matches[1][1]));
258 }
259 }
260 $searchWithinDupes = 1;
261
262 // construct and execute the intermediate query
263 $query = "{$insertClause} {$query} {$groupByClause} ON DUPLICATE KEY UPDATE weight = weight + VALUES(weight)";
264 $dao = CRM_Core_DAO::executeQuery($query);
265
266 // FIXME: we need to be more acurate with affected rows, especially for insert vs duplicate insert.
267 // And that will help optimize further.
268 $affectedRows = $dao->affectedRows();
269
270 // In an inclusive situation, failure of any query means no further processing -
271 if ($affectedRows == 0) {
272 // reset to make sure no further execution is done.
273 $tableQueries = [];
274 break;
275 }
276 $weightSum = substr($fieldWeight, strrpos($fieldWeight, '.') + 1) + $weightSum;
277 }
278 // An exclusive situation -
279 }
280 elseif (!$isDie) {
281 // since queries are already sorted by weights, we can continue as is
282 $fieldWeight = array_keys($tableQueries);
283 $fieldWeight = $fieldWeight[0];
284 $query = array_shift($tableQueries);
285 $query = "{$insertClause} {$query} {$groupByClause} ON DUPLICATE KEY UPDATE weight = weight + VALUES(weight)";
286 $dao = CRM_Core_DAO::executeQuery($query);
287 if ($dao->affectedRows() >= 1) {
288 $exclWeightSum[] = substr($fieldWeight, strrpos($fieldWeight, '.') + 1);
289 }
290 }
291 else {
292 // its a die situation
293 break;
294 }
295 }
296 }
297
298 /**
299 * Function to determine if a given query set contains inclusive or exclusive set of weights.
300 * The function assumes that the query set is already ordered by weight in desc order.
301 * @param $tableQueries
302 * @param $threshold
303 * @param array $exclWeightSum
304 *
305 * @return array
306 */
307 public static function isQuerySetInclusive($tableQueries, $threshold, $exclWeightSum = []) {
308 $input = [];
309 foreach ($tableQueries as $key => $query) {
310 $input[] = substr($key, strrpos($key, '.') + 1);
311 }
312
313 if (!empty($exclWeightSum)) {
314 $input = array_merge($input, $exclWeightSum);
315 rsort($input);
316 }
317
318 if (count($input) == 1) {
319 return [FALSE, $input[0] < $threshold];
320 }
321
322 $totalCombinations = 0;
323 for ($i = 0; $i < count($input); $i++) {
324 $combination = [$input[$i]];
325 if (array_sum($combination) >= $threshold) {
326 $totalCombinations++;
327 continue;
328 }
329 for ($j = $i + 1; $j < count($input); $j++) {
330 $combination[] = $input[$j];
331 if (array_sum($combination) >= $threshold) {
332 $totalCombinations++;
333 }
334 }
335 }
336 return [$totalCombinations == 1, $totalCombinations <= 0];
337 }
338
339 /**
340 * sort queries by number of records for the table associated with them.
341 * @param $tableQueries
342 */
343 public static function orderByTableCount(&$tableQueries) {
344 static $tableCount = [];
345
346 $tempArray = [];
347 foreach ($tableQueries as $key => $query) {
348 $table = explode(".", $key);
349 $table = $table[0];
350 if (!array_key_exists($table, $tableCount)) {
351 $query = "SELECT COUNT(*) FROM {$table}";
352 $tableCount[$table] = CRM_Core_DAO::singleValueQuery($query);
353 }
354 $tempArray[$key] = $tableCount[$table];
355 }
356
357 asort($tempArray);
358 foreach ($tempArray as $key => $count) {
359 $tempArray[$key] = $tableQueries[$key];
360 }
361 $tableQueries = $tempArray;
362 }
363
364 /**
365 * Return the SQL query for getting only the interesting results out of the dedupe table.
366 *
367 * @$checkPermission boolean $params a flag to indicate if permission should be considered.
368 * default is to always check permissioning but public pages for example might not want
369 * permission to be checked for anonymous users. Refer CRM-6211. We might be beaking
370 * Multi-Site dedupe for public pages.
371 *
372 * @param bool $checkPermission
373 *
374 * @return string
375 */
376 public function thresholdQuery($checkPermission = TRUE) {
377 $this->_aclFrom = '';
378 $aclWhere = '';
379
380 if ($this->params && !$this->noRules) {
381 if ($checkPermission) {
382 list($this->_aclFrom, $aclWhere) = CRM_Contact_BAO_Contact_Permission::cacheClause('civicrm_contact');
383 $aclWhere = $aclWhere ? "AND {$aclWhere}" : '';
384 }
385 $query = "SELECT {$this->temporaryTables['dedupe']}.id1 as id
386 FROM {$this->temporaryTables['dedupe']} JOIN civicrm_contact ON {$this->temporaryTables['dedupe']}.id1 = civicrm_contact.id {$this->_aclFrom}
387 WHERE contact_type = '{$this->contact_type}' AND is_deleted = 0 $aclWhere
388 AND weight >= {$this->threshold}";
389 }
390 else {
391 $aclWhere = '';
392 if ($checkPermission) {
393 list($this->_aclFrom, $aclWhere) = CRM_Contact_BAO_Contact_Permission::cacheClause(['c1', 'c2']);
394 $aclWhere = $aclWhere ? "AND {$aclWhere}" : '';
395 }
396 $query = "SELECT IF({$this->temporaryTables['dedupe']}.id1 < {$this->temporaryTables['dedupe']}.id2, {$this->temporaryTables['dedupe']}.id1, {$this->temporaryTables['dedupe']}.id2) as id1,
397 IF({$this->temporaryTables['dedupe']}.id1 < {$this->temporaryTables['dedupe']}.id2, {$this->temporaryTables['dedupe']}.id2, {$this->temporaryTables['dedupe']}.id1) as id2, {$this->temporaryTables['dedupe']}.weight
398 FROM {$this->temporaryTables['dedupe']} JOIN civicrm_contact c1 ON {$this->temporaryTables['dedupe']}.id1 = c1.id
399 JOIN civicrm_contact c2 ON {$this->temporaryTables['dedupe']}.id2 = c2.id {$this->_aclFrom}
400 LEFT JOIN civicrm_dedupe_exception exc ON {$this->temporaryTables['dedupe']}.id1 = exc.contact_id1 AND {$this->temporaryTables['dedupe']}.id2 = exc.contact_id2
401 WHERE c1.contact_type = '{$this->contact_type}' AND
402 c2.contact_type = '{$this->contact_type}'
403 AND c1.is_deleted = 0 AND c2.is_deleted = 0
404 {$aclWhere}
405 AND weight >= {$this->threshold} AND exc.contact_id1 IS NULL";
406 }
407
408 CRM_Utils_Hook::dupeQuery($this, 'threshold', $query);
409 return $query;
410 }
411
412 /**
413 * find fields related to a rule group.
414 *
415 * @param array $params
416 *
417 * @return array
418 * (rule field => weight) array and threshold associated to rule group
419 */
420 public static function dedupeRuleFieldsWeight($params) {
421 $rgBao = new CRM_Dedupe_BAO_DedupeRuleGroup();
422 $rgBao->contact_type = $params['contact_type'];
423 if (!empty($params['id'])) {
424 // accept an ID if provided
425 $rgBao->id = $params['id'];
426 }
427 else {
428 $rgBao->used = $params['used'];
429 }
430 $rgBao->find(TRUE);
431
432 $ruleBao = new CRM_Dedupe_BAO_DedupeRule();
433 $ruleBao->dedupe_rule_group_id = $rgBao->id;
434 $ruleBao->find();
435 $ruleFields = [];
436 while ($ruleBao->fetch()) {
437 $field_name = $ruleBao->rule_field;
438 if ($field_name == 'phone_numeric') {
439 $field_name = 'phone';
440 }
441 $ruleFields[$field_name] = $ruleBao->rule_weight;
442 }
443
444 return [$ruleFields, $rgBao->threshold];
445 }
446
447 /**
448 * Get all of the combinations of fields that would work with a rule.
449 *
450 * @param array $rgFields
451 * @param int $threshold
452 * @param array $combos
453 * @param array $running
454 */
455 public static function combos($rgFields, $threshold, &$combos, $running = []) {
456 foreach ($rgFields as $rgField => $weight) {
457 unset($rgFields[$rgField]);
458 $diff = $threshold - $weight;
459 $runningnow = $running;
460 $runningnow[] = $rgField;
461 if ($diff > 0) {
462 self::combos($rgFields, $diff, $combos, $runningnow);
463 }
464 else {
465 $combos[] = $runningnow;
466 }
467 }
468 }
469
470 /**
471 * Get an array of rule group id to rule group name
472 * for all th groups for that contactType. If contactType
473 * not specified, do it for all
474 *
475 * @param string $contactType
476 * Individual, Household or Organization.
477 *
478 *
479 * @return array
480 * id => "nice name" of rule group
481 */
482 public static function getByType($contactType = NULL) {
483 $dao = new CRM_Dedupe_DAO_DedupeRuleGroup();
484
485 if ($contactType) {
486 $dao->contact_type = $contactType;
487 }
488
489 $dao->find();
490 $result = [];
491 while ($dao->fetch()) {
492 $title = !empty($dao->title) ? $dao->title : (!empty($dao->name) ? $dao->name : $dao->contact_type);
493
494 $name = "$title - {$dao->used}";
495 $result[$dao->id] = $name;
496 }
497 return $result;
498 }
499
500 /**
501 * Get the cached contact type for a particular rule group.
502 *
503 * @param int $rule_group_id
504 *
505 * @return string
506 */
507 public static function getContactTypeForRuleGroup($rule_group_id) {
508 if (!isset(\Civi::$statics[__CLASS__]) || !isset(\Civi::$statics[__CLASS__]['rule_groups'])) {
509 \Civi::$statics[__CLASS__]['rule_groups'] = [];
510 }
511 if (empty(\Civi::$statics[__CLASS__]['rule_groups'][$rule_group_id])) {
512 \Civi::$statics[__CLASS__]['rule_groups'][$rule_group_id]['contact_type'] = CRM_Core_DAO::getFieldValue(
513 'CRM_Dedupe_DAO_DedupeRuleGroup',
514 $rule_group_id,
515 'contact_type'
516 );
517 }
518
519 return \Civi::$statics[__CLASS__]['rule_groups'][$rule_group_id]['contact_type'];
520 }
521
522 }