Merge pull request #16009 from jmcclelland/import-match-phone
[civicrm-core.git] / CRM / Dedupe / BAO / RuleGroup.php
1 <?php
2 /*
3 +--------------------------------------------------------------------+
4 | Copyright CiviCRM LLC. All rights reserved. |
5 | |
6 | This work is published under the GNU AGPLv3 license with some |
7 | permitted exceptions and without any warranty. For full license |
8 | and copyright information, see https://civicrm.org/licensing |
9 +--------------------------------------------------------------------+
10 */
11
12 /**
13 *
14 * @package CRM
15 * @copyright CiviCRM LLC https://civicrm.org/licensing
16 * $Id$
17 *
18 */
19
20 /**
21 * The CiviCRM duplicate discovery engine is based on an
22 * algorithm designed by David Strauss <david@fourkitchens.com>.
23 */
24 class CRM_Dedupe_BAO_RuleGroup extends CRM_Dedupe_DAO_RuleGroup {
25
26 /**
27 * Ids of the contacts to limit the SQL queries (whole-database queries otherwise)
28 * @var array
29 */
30 public $contactIds = [];
31
32 /**
33 * Set the contact IDs to restrict the dedupe to.
34 *
35 * @param array $contactIds
36 */
37 public function setContactIds($contactIds) {
38 $this->contactIds = $contactIds;
39 }
40
41 /**
42 * Params to dedupe against (queries against the whole contact set otherwise)
43 * @var array
44 */
45 public $params = [];
46
47 /**
48 * If there are no rules in rule group.
49 * @var bool
50 */
51 public $noRules = FALSE;
52
53 protected $temporaryTables = [];
54
55 /**
56 * Return a structure holding the supported tables, fields and their titles
57 *
58 * @param string $requestedType
59 * The requested contact type.
60 *
61 * @return array
62 * a table-keyed array of field-keyed arrays holding supported fields' titles
63 */
64 public static function supportedFields($requestedType) {
65 static $fields = NULL;
66 if (!$fields) {
67 // this is needed, as we're piggy-backing importableFields() below
68 $replacements = [
69 'civicrm_country.name' => 'civicrm_address.country_id',
70 'civicrm_county.name' => 'civicrm_address.county_id',
71 'civicrm_state_province.name' => 'civicrm_address.state_province_id',
72 'gender.label' => 'civicrm_contact.gender_id',
73 'individual_prefix.label' => 'civicrm_contact.prefix_id',
74 'individual_suffix.label' => 'civicrm_contact.suffix_id',
75 'addressee.label' => 'civicrm_contact.addressee_id',
76 'email_greeting.label' => 'civicrm_contact.email_greeting_id',
77 'postal_greeting.label' => 'civicrm_contact.postal_greeting_id',
78 'civicrm_phone.phone' => 'civicrm_phone.phone_numeric',
79 ];
80 // the table names we support in dedupe rules - a filter for importableFields()
81 $supportedTables = [
82 'civicrm_address',
83 'civicrm_contact',
84 'civicrm_email',
85 'civicrm_im',
86 'civicrm_note',
87 'civicrm_openid',
88 'civicrm_phone',
89 ];
90
91 foreach (['Individual', 'Organization', 'Household'] as $ctype) {
92 // take the table.field pairs and their titles from importableFields() if the table is supported
93 foreach (CRM_Contact_BAO_Contact::importableFields($ctype) as $iField) {
94 if (isset($iField['where'])) {
95 $where = $iField['where'];
96 if (isset($replacements[$where])) {
97 $where = $replacements[$where];
98 }
99 list($table, $field) = explode('.', $where);
100 if (!in_array($table, $supportedTables)) {
101 continue;
102 }
103 $fields[$ctype][$table][$field] = $iField['title'];
104 }
105 }
106 // Note that most of the fields available come from 'importable fields' -
107 // I thought about making this field 'importable' but it felt like there might be unknown consequences
108 // so I opted for just adding it in & securing it with a unit test.
109 /// Example usage of sort_name - It is possible to alter sort name via hook so 2 organization names might differ as in
110 // Justice League vs The Justice League but these could have the same sort_name if 'the the'
111 // exension is installed (https://github.com/eileenmcnaughton/org.wikimedia.thethe)
112 $fields[$ctype]['civicrm_contact']['sort_name'] = ts('Sort Name');
113 // add custom data fields
114 foreach (CRM_Core_BAO_CustomGroup::getTree($ctype, NULL, NULL, -1) as $key => $cg) {
115 if (!is_int($key)) {
116 continue;
117 }
118 foreach ($cg['fields'] as $cf) {
119 $fields[$ctype][$cg['table_name']][$cf['column_name']] = $cf['label'];
120 }
121 }
122 }
123 }
124 CRM_Utils_Hook::dupeQuery(CRM_Core_DAO::$_nullObject, 'supportedFields', $fields);
125 return !empty($fields[$requestedType]) ? $fields[$requestedType] : [];
126 }
127
128 /**
129 * Return the SQL query for dropping the temporary table.
130 */
131 public function tableDropQuery() {
132 return 'DROP TEMPORARY TABLE IF EXISTS dedupe';
133 }
134
135 /**
136 * Return a set of SQL queries whose cummulative weights will mark matched
137 * records for the RuleGroup::threasholdQuery() to retrieve.
138 */
139 public function tableQuery() {
140 // make sure we've got a fetched dbrecord, not sure if this is enforced
141 if (!$this->name == NULL || $this->is_reserved == NULL) {
142 $this->find(TRUE);
143 }
144
145 // Reserved Rule Groups can optionally get special treatment by
146 // implementing an optimization class and returning a query array.
147 if ($this->is_reserved &&
148 CRM_Utils_File::isIncludable("CRM/Dedupe/BAO/QueryBuilder/{$this->name}.php")
149 ) {
150 $command = empty($this->params) ? 'internal' : 'record';
151 $queries = call_user_func(["CRM_Dedupe_BAO_QueryBuilder_{$this->name}", $command], $this);
152 }
153 else {
154 // All other rule groups have queries generated by the member dedupe
155 // rules defined in the administrative interface.
156
157 // Find all rules contained by this script sorted by weight so that
158 // their execution can be short circuited on RuleGroup::fillTable()
159 $bao = new CRM_Dedupe_BAO_Rule();
160 $bao->dedupe_rule_group_id = $this->id;
161 $bao->orderBy('rule_weight DESC');
162 $bao->find();
163
164 // Generate a SQL query for each rule in the rule group that is
165 // tailored to respect the param and contactId options provided.
166 $queries = [];
167 while ($bao->fetch()) {
168 $bao->contactIds = $this->contactIds;
169 $bao->params = $this->params;
170
171 // Skipping empty rules? Empty rules shouldn't exist; why check?
172 if ($query = $bao->sql()) {
173 $queries["{$bao->rule_table}.{$bao->rule_field}.{$bao->rule_weight}"] = $query;
174 }
175 }
176 }
177
178 // if there are no rules in this rule group
179 // add an empty query fulfilling the pattern
180 if (!$queries) {
181 $this->noRules = TRUE;
182 return [];
183 }
184
185 return $queries;
186 }
187
188 public function fillTable() {
189 // get the list of queries handy
190 $tableQueries = $this->tableQuery();
191
192 if ($this->params && !$this->noRules) {
193 $this->temporaryTables['dedupe'] = CRM_Utils_SQL_TempTable::build()
194 ->setCategory('dedupe')
195 ->createWithColumns("id1 int, weight int, UNIQUE UI_id1 (id1)")->getName();
196 $dedupeCopyTemporaryTableObject = CRM_Utils_SQL_TempTable::build()
197 ->setCategory('dedupe');
198 $this->temporaryTables['dedupe_copy'] = $dedupeCopyTemporaryTableObject->getName();
199 $insertClause = "INSERT INTO {$this->temporaryTables['dedupe']} (id1, weight)";
200 $groupByClause = "GROUP BY id1, weight";
201 $dupeCopyJoin = " JOIN {$this->temporaryTables['dedupe_copy']} ON {$this->temporaryTables['dedupe_copy']}.id1 = t1.column WHERE ";
202 }
203 else {
204 $this->temporaryTables['dedupe'] = CRM_Utils_SQL_TempTable::build()
205 ->setCategory('dedupe')
206 ->createWithColumns("id1 int, id2 int, weight int, UNIQUE UI_id1_id2 (id1, id2)")->getName();
207 $dedupeCopyTemporaryTableObject = CRM_Utils_SQL_TempTable::build()
208 ->setCategory('dedupe');
209 $this->temporaryTables['dedupe_copy'] = $dedupeCopyTemporaryTableObject->getName();
210 $insertClause = "INSERT INTO {$this->temporaryTables['dedupe']} (id1, id2, weight)";
211 $groupByClause = "GROUP BY id1, id2, weight";
212 $dupeCopyJoin = " JOIN {$this->temporaryTables['dedupe_copy']} ON {$this->temporaryTables['dedupe_copy']}.id1 = t1.column AND {$this->temporaryTables['dedupe_copy']}.id2 = t2.column WHERE ";
213 }
214 $patternColumn = '/t1.(\w+)/';
215 $exclWeightSum = [];
216
217 $dao = new CRM_Core_DAO();
218 CRM_Utils_Hook::dupeQuery($this, 'table', $tableQueries);
219
220 while (!empty($tableQueries)) {
221 list($isInclusive, $isDie) = self::isQuerySetInclusive($tableQueries, $this->threshold, $exclWeightSum);
222
223 if ($isInclusive) {
224 // order queries by table count
225 self::orderByTableCount($tableQueries);
226
227 $weightSum = array_sum($exclWeightSum);
228 $searchWithinDupes = !empty($exclWeightSum) ? 1 : 0;
229
230 while (!empty($tableQueries)) {
231 // extract the next query ( and weight ) to be executed
232 $fieldWeight = array_keys($tableQueries);
233 $fieldWeight = $fieldWeight[0];
234 $query = array_shift($tableQueries);
235
236 if ($searchWithinDupes) {
237 // drop dedupe_copy table just in case if its already there.
238 $dedupeCopyTemporaryTableObject->drop();
239 // get prepared to search within already found dupes if $searchWithinDupes flag is set
240 $dedupeCopyTemporaryTableObject->createWithQuery("SELECT * FROM {$this->temporaryTables['dedupe']} WHERE weight >= {$weightSum}");
241
242 preg_match($patternColumn, $query, $matches);
243 $query = str_replace(' WHERE ', str_replace('column', $matches[1], $dupeCopyJoin), $query);
244
245 // CRM-19612: If there's a union, there will be two WHEREs, and you
246 // can't use the temp table twice.
247 if (preg_match('/' . $this->temporaryTables['dedupe_copy'] . '[\S\s]*(union)[\S\s]*' . $this->temporaryTables['dedupe_copy'] . '/i', $query, $matches, PREG_OFFSET_CAPTURE)) {
248 // Make a second temp table:
249 $this->temporaryTables['dedupe_copy_2'] = CRM_Utils_SQL_TempTable::build()
250 ->setCategory('dedupe')
251 ->createWithQuery("SELECT * FROM {$this->temporaryTables['dedupe']} WHERE weight >= {$weightSum}")
252 ->getName();
253 // After the union, use that new temp table:
254 $part1 = substr($query, 0, $matches[1][1]);
255 $query = $part1 . str_replace($this->temporaryTables['dedupe_copy'], $this->temporaryTables['dedupe_copy_2'], substr($query, $matches[1][1]));
256 }
257 }
258 $searchWithinDupes = 1;
259
260 // construct and execute the intermediate query
261 $query = "{$insertClause} {$query} {$groupByClause} ON DUPLICATE KEY UPDATE weight = weight + VALUES(weight)";
262 $dao->query($query);
263
264 // FIXME: we need to be more acurate with affected rows, especially for insert vs duplicate insert.
265 // And that will help optimize further.
266 $affectedRows = $dao->affectedRows();
267
268 // In an inclusive situation, failure of any query means no further processing -
269 if ($affectedRows == 0) {
270 // reset to make sure no further execution is done.
271 $tableQueries = [];
272 break;
273 }
274 $weightSum = substr($fieldWeight, strrpos($fieldWeight, '.') + 1) + $weightSum;
275 }
276 // An exclusive situation -
277 }
278 elseif (!$isDie) {
279 // since queries are already sorted by weights, we can continue as is
280 $fieldWeight = array_keys($tableQueries);
281 $fieldWeight = $fieldWeight[0];
282 $query = array_shift($tableQueries);
283 $query = "{$insertClause} {$query} {$groupByClause} ON DUPLICATE KEY UPDATE weight = weight + VALUES(weight)";
284 $dao->query($query);
285 if ($dao->affectedRows() >= 1) {
286 $exclWeightSum[] = substr($fieldWeight, strrpos($fieldWeight, '.') + 1);
287 }
288 }
289 else {
290 // its a die situation
291 break;
292 }
293 }
294 }
295
296 /**
297 * Function to determine if a given query set contains inclusive or exclusive set of weights.
298 * The function assumes that the query set is already ordered by weight in desc order.
299 * @param $tableQueries
300 * @param $threshold
301 * @param array $exclWeightSum
302 *
303 * @return array
304 */
305 public static function isQuerySetInclusive($tableQueries, $threshold, $exclWeightSum = []) {
306 $input = [];
307 foreach ($tableQueries as $key => $query) {
308 $input[] = substr($key, strrpos($key, '.') + 1);
309 }
310
311 if (!empty($exclWeightSum)) {
312 $input = array_merge($input, $exclWeightSum);
313 rsort($input);
314 }
315
316 if (count($input) == 1) {
317 return [FALSE, $input[0] < $threshold];
318 }
319
320 $totalCombinations = 0;
321 for ($i = 0; $i < count($input); $i++) {
322 $combination = [$input[$i]];
323 if (array_sum($combination) >= $threshold) {
324 $totalCombinations++;
325 continue;
326 }
327 for ($j = $i + 1; $j < count($input); $j++) {
328 $combination[] = $input[$j];
329 if (array_sum($combination) >= $threshold) {
330 $totalCombinations++;
331 }
332 }
333 }
334 return [$totalCombinations == 1, $totalCombinations <= 0];
335 }
336
337 /**
338 * sort queries by number of records for the table associated with them.
339 * @param $tableQueries
340 */
341 public static function orderByTableCount(&$tableQueries) {
342 static $tableCount = [];
343
344 $tempArray = [];
345 foreach ($tableQueries as $key => $query) {
346 $table = explode(".", $key);
347 $table = $table[0];
348 if (!array_key_exists($table, $tableCount)) {
349 $query = "SELECT COUNT(*) FROM {$table}";
350 $tableCount[$table] = CRM_Core_DAO::singleValueQuery($query);
351 }
352 $tempArray[$key] = $tableCount[$table];
353 }
354
355 asort($tempArray);
356 foreach ($tempArray as $key => $count) {
357 $tempArray[$key] = $tableQueries[$key];
358 }
359 $tableQueries = $tempArray;
360 }
361
362 /**
363 * Return the SQL query for getting only the interesting results out of the dedupe table.
364 *
365 * @$checkPermission boolean $params a flag to indicate if permission should be considered.
366 * default is to always check permissioning but public pages for example might not want
367 * permission to be checked for anonymous users. Refer CRM-6211. We might be beaking
368 * Multi-Site dedupe for public pages.
369 *
370 * @param bool $checkPermission
371 *
372 * @return string
373 */
374 public function thresholdQuery($checkPermission = TRUE) {
375 $this->_aclFrom = '';
376 // CRM-6603: anonymous dupechecks side-step ACLs
377 $this->_aclWhere = ' AND is_deleted = 0 ';
378
379 if ($this->params && !$this->noRules) {
380 if ($checkPermission) {
381 list($this->_aclFrom, $this->_aclWhere) = CRM_Contact_BAO_Contact_Permission::cacheClause('civicrm_contact');
382 $this->_aclWhere = $this->_aclWhere ? "AND {$this->_aclWhere}" : '';
383 }
384 $query = "SELECT {$this->temporaryTables['dedupe']}.id1 as id
385 FROM {$this->temporaryTables['dedupe']} JOIN civicrm_contact ON {$this->temporaryTables['dedupe']}.id1 = civicrm_contact.id {$this->_aclFrom}
386 WHERE contact_type = '{$this->contact_type}' {$this->_aclWhere}
387 AND weight >= {$this->threshold}";
388 }
389 else {
390 $this->_aclWhere = ' AND c1.is_deleted = 0 AND c2.is_deleted = 0';
391 if ($checkPermission) {
392 list($this->_aclFrom, $this->_aclWhere) = CRM_Contact_BAO_Contact_Permission::cacheClause(['c1', 'c2']);
393 $this->_aclWhere = $this->_aclWhere ? "AND {$this->_aclWhere}" : '';
394 }
395 $query = "SELECT IF({$this->temporaryTables['dedupe']}.id1 < {$this->temporaryTables['dedupe']}.id2, {$this->temporaryTables['dedupe']}.id1, {$this->temporaryTables['dedupe']}.id2) as id1,
396 IF({$this->temporaryTables['dedupe']}.id1 < {$this->temporaryTables['dedupe']}.id2, {$this->temporaryTables['dedupe']}.id2, {$this->temporaryTables['dedupe']}.id1) as id2, {$this->temporaryTables['dedupe']}.weight
397 FROM {$this->temporaryTables['dedupe']} JOIN civicrm_contact c1 ON {$this->temporaryTables['dedupe']}.id1 = c1.id
398 JOIN civicrm_contact c2 ON {$this->temporaryTables['dedupe']}.id2 = c2.id {$this->_aclFrom}
399 LEFT JOIN civicrm_dedupe_exception exc ON {$this->temporaryTables['dedupe']}.id1 = exc.contact_id1 AND {$this->temporaryTables['dedupe']}.id2 = exc.contact_id2
400 WHERE c1.contact_type = '{$this->contact_type}' AND
401 c2.contact_type = '{$this->contact_type}' {$this->_aclWhere}
402 AND weight >= {$this->threshold} AND exc.contact_id1 IS NULL";
403 }
404
405 CRM_Utils_Hook::dupeQuery($this, 'threshold', $query);
406 return $query;
407 }
408
409 /**
410 * find fields related to a rule group.
411 *
412 * @param array $params
413 *
414 * @return array
415 * (rule field => weight) array and threshold associated to rule group
416 */
417 public static function dedupeRuleFieldsWeight($params) {
418 $rgBao = new CRM_Dedupe_BAO_RuleGroup();
419 $rgBao->contact_type = $params['contact_type'];
420 if (!empty($params['id'])) {
421 // accept an ID if provided
422 $rgBao->id = $params['id'];
423 }
424 else {
425 $rgBao->used = $params['used'];
426 }
427 $rgBao->find(TRUE);
428
429 $ruleBao = new CRM_Dedupe_BAO_Rule();
430 $ruleBao->dedupe_rule_group_id = $rgBao->id;
431 $ruleBao->find();
432 $ruleFields = [];
433 while ($ruleBao->fetch()) {
434 $field_name = $ruleBao->rule_field;
435 if ($field_name == 'phone_numeric') {
436 $field_name = 'phone';
437 }
438 $ruleFields[$field_name] = $ruleBao->rule_weight;
439 }
440
441 return [$ruleFields, $rgBao->threshold];
442 }
443
444 /**
445 * Get all of the combinations of fields that would work with a rule.
446 *
447 * @param array $rgFields
448 * @param int $threshold
449 * @param array $combos
450 * @param array $running
451 */
452 public static function combos($rgFields, $threshold, &$combos, $running = []) {
453 foreach ($rgFields as $rgField => $weight) {
454 unset($rgFields[$rgField]);
455 $diff = $threshold - $weight;
456 $runningnow = $running;
457 $runningnow[] = $rgField;
458 if ($diff > 0) {
459 self::combos($rgFields, $diff, $combos, $runningnow);
460 }
461 else {
462 $combos[] = $runningnow;
463 }
464 }
465 }
466
467 /**
468 * Get an array of rule group id to rule group name
469 * for all th groups for that contactType. If contactType
470 * not specified, do it for all
471 *
472 * @param string $contactType
473 * Individual, Household or Organization.
474 *
475 *
476 * @return array
477 * id => "nice name" of rule group
478 */
479 public static function getByType($contactType = NULL) {
480 $dao = new CRM_Dedupe_DAO_RuleGroup();
481
482 if ($contactType) {
483 $dao->contact_type = $contactType;
484 }
485
486 $dao->find();
487 $result = [];
488 while ($dao->fetch()) {
489 $title = !empty($dao->title) ? $dao->title : (!empty($dao->name) ? $dao->name : $dao->contact_type);
490
491 $name = "$title - {$dao->used}";
492 $result[$dao->id] = $name;
493 }
494 return $result;
495 }
496
497 /**
498 * Get the cached contact type for a particular rule group.
499 *
500 * @param int $rule_group_id
501 *
502 * @return string
503 */
504 public static function getContactTypeForRuleGroup($rule_group_id) {
505 if (!isset(\Civi::$statics[__CLASS__]) || !isset(\Civi::$statics[__CLASS__]['rule_groups'])) {
506 \Civi::$statics[__CLASS__]['rule_groups'] = [];
507 }
508 if (empty(\Civi::$statics[__CLASS__]['rule_groups'][$rule_group_id])) {
509 \Civi::$statics[__CLASS__]['rule_groups'][$rule_group_id]['contact_type'] = CRM_Core_DAO::getFieldValue(
510 'CRM_Dedupe_DAO_RuleGroup',
511 $rule_group_id,
512 'contact_type'
513 );
514 }
515
516 return \Civi::$statics[__CLASS__]['rule_groups'][$rule_group_id]['contact_type'];
517 }
518
519 }