Merge pull request #11495 from yashodha/CRM-21637
[civicrm-core.git] / CRM / Dedupe / BAO / RuleGroup.php
1 <?php
2 /*
3 +--------------------------------------------------------------------+
4 | CiviCRM version 5 |
5 +--------------------------------------------------------------------+
6 | Copyright CiviCRM LLC (c) 2004-2018 |
7 +--------------------------------------------------------------------+
8 | This file is a part of CiviCRM. |
9 | |
10 | CiviCRM is free software; you can copy, modify, and distribute it |
11 | under the terms of the GNU Affero General Public License |
12 | Version 3, 19 November 2007 and the CiviCRM Licensing Exception. |
13 | |
14 | CiviCRM is distributed in the hope that it will be useful, but |
15 | WITHOUT ANY WARRANTY; without even the implied warranty of |
16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
17 | See the GNU Affero General Public License for more details. |
18 | |
19 | You should have received a copy of the GNU Affero General Public |
20 | License and the CiviCRM Licensing Exception along |
21 | with this program; if not, contact CiviCRM LLC |
22 | at info[AT]civicrm[DOT]org. If you have questions about the |
23 | GNU Affero General Public License or the licensing of CiviCRM, |
24 | see the CiviCRM license FAQ at http://civicrm.org/licensing |
25 +--------------------------------------------------------------------+
26 */
27
28 /**
29 *
30 * @package CRM
31 * @copyright CiviCRM LLC (c) 2004-2018
32 * $Id$
33 *
34 */
35
36 /**
37 * The CiviCRM duplicate discovery engine is based on an
38 * algorithm designed by David Strauss <david@fourkitchens.com>.
39 */
40 class CRM_Dedupe_BAO_RuleGroup extends CRM_Dedupe_DAO_RuleGroup {
41
42 /**
43 * Ids of the contacts to limit the SQL queries (whole-database queries otherwise)
44 */
45 var $contactIds = array();
46
47 /**
48 * Set the contact IDs to restrict the dedupe to.
49 *
50 * @param array $contactIds
51 */
52 public function setContactIds($contactIds) {
53 $this->contactIds = $contactIds;
54 }
55
56 /**
57 * Params to dedupe against (queries against the whole contact set otherwise)
58 */
59 var $params = array();
60
61 /**
62 * If there are no rules in rule group.
63 */
64 var $noRules = FALSE;
65
66 /**
67 * Return a structure holding the supported tables, fields and their titles
68 *
69 * @param string $requestedType
70 * The requested contact type.
71 *
72 * @return array
73 * a table-keyed array of field-keyed arrays holding supported fields' titles
74 */
75 public static function &supportedFields($requestedType) {
76 static $fields = NULL;
77 if (!$fields) {
78 // this is needed, as we're piggy-backing importableFields() below
79 $replacements = array(
80 'civicrm_country.name' => 'civicrm_address.country_id',
81 'civicrm_county.name' => 'civicrm_address.county_id',
82 'civicrm_state_province.name' => 'civicrm_address.state_province_id',
83 'gender.label' => 'civicrm_contact.gender_id',
84 'individual_prefix.label' => 'civicrm_contact.prefix_id',
85 'individual_suffix.label' => 'civicrm_contact.suffix_id',
86 'addressee.label' => 'civicrm_contact.addressee_id',
87 'email_greeting.label' => 'civicrm_contact.email_greeting_id',
88 'postal_greeting.label' => 'civicrm_contact.postal_greeting_id',
89 'civicrm_phone.phone' => 'civicrm_phone.phone_numeric',
90 );
91 // the table names we support in dedupe rules - a filter for importableFields()
92 $supportedTables = array(
93 'civicrm_address',
94 'civicrm_contact',
95 'civicrm_email',
96 'civicrm_im',
97 'civicrm_note',
98 'civicrm_openid',
99 'civicrm_phone',
100 );
101
102 foreach (array('Individual', 'Organization', 'Household') as $ctype) {
103 // take the table.field pairs and their titles from importableFields() if the table is supported
104 foreach (CRM_Contact_BAO_Contact::importableFields($ctype) as $iField) {
105 if (isset($iField['where'])) {
106 $where = $iField['where'];
107 if (isset($replacements[$where])) {
108 $where = $replacements[$where];
109 }
110 list($table, $field) = explode('.', $where);
111 if (!in_array($table, $supportedTables)) {
112 continue;
113 }
114 $fields[$ctype][$table][$field] = $iField['title'];
115 }
116 }
117 // add custom data fields
118 foreach (CRM_Core_BAO_CustomGroup::getTree($ctype, NULL, NULL, -1) as $key => $cg) {
119 if (!is_int($key)) {
120 continue;
121 }
122 foreach ($cg['fields'] as $cf) {
123 $fields[$ctype][$cg['table_name']][$cf['column_name']] = $cf['label'];
124 }
125 }
126 }
127 }
128 CRM_Utils_Hook::dupeQuery(CRM_Core_DAO::$_nullObject, 'supportedFields', $fields);
129 return $fields[$requestedType];
130 }
131
132 /**
133 * Return the SQL query for dropping the temporary table.
134 */
135 public function tableDropQuery() {
136 return 'DROP TEMPORARY TABLE IF EXISTS dedupe';
137 }
138
139 /**
140 * Return a set of SQL queries whose cummulative weights will mark matched
141 * records for the RuleGroup::threasholdQuery() to retrieve.
142 */
143 public function tableQuery() {
144 // make sure we've got a fetched dbrecord, not sure if this is enforced
145 if (!$this->name == NULL || $this->is_reserved == NULL) {
146 $this->find(TRUE);
147 }
148
149 // Reserved Rule Groups can optionally get special treatment by
150 // implementing an optimization class and returning a query array.
151 if ($this->is_reserved &&
152 CRM_Utils_File::isIncludable("CRM/Dedupe/BAO/QueryBuilder/{$this->name}.php")
153 ) {
154 $command = empty($this->params) ? 'internal' : 'record';
155 $queries = call_user_func(array("CRM_Dedupe_BAO_QueryBuilder_{$this->name}", $command), $this);
156 }
157 else {
158 // All other rule groups have queries generated by the member dedupe
159 // rules defined in the administrative interface.
160
161 // Find all rules contained by this script sorted by weight so that
162 // their execution can be short circuited on RuleGroup::fillTable()
163 $bao = new CRM_Dedupe_BAO_Rule();
164 $bao->dedupe_rule_group_id = $this->id;
165 $bao->orderBy('rule_weight DESC');
166 $bao->find();
167
168 // Generate a SQL query for each rule in the rule group that is
169 // tailored to respect the param and contactId options provided.
170 $queries = array();
171 while ($bao->fetch()) {
172 $bao->contactIds = $this->contactIds;
173 $bao->params = $this->params;
174
175 // Skipping empty rules? Empty rules shouldn't exist; why check?
176 if ($query = $bao->sql()) {
177 $queries["{$bao->rule_table}.{$bao->rule_field}.{$bao->rule_weight}"] = $query;
178 }
179 }
180 }
181
182 // if there are no rules in this rule group
183 // add an empty query fulfilling the pattern
184 if (!$queries) {
185 $this->noRules = TRUE;
186 return array();
187 }
188
189 return $queries;
190 }
191
192 public function fillTable() {
193 // get the list of queries handy
194 $tableQueries = $this->tableQuery();
195
196 if ($this->params && !$this->noRules) {
197 $tempTableQuery = "CREATE TEMPORARY TABLE dedupe (id1 int, weight int, UNIQUE UI_id1 (id1)) ENGINE=InnoDB";
198 $insertClause = "INSERT INTO dedupe (id1, weight)";
199 $groupByClause = "GROUP BY id1, weight";
200 $dupeCopyJoin = " JOIN dedupe_copy ON dedupe_copy.id1 = t1.column WHERE ";
201 }
202 else {
203 $tempTableQuery = "CREATE TEMPORARY TABLE dedupe (id1 int, id2 int, weight int, UNIQUE UI_id1_id2 (id1, id2)) ENGINE=InnoDB";
204 $insertClause = "INSERT INTO dedupe (id1, id2, weight)";
205 $groupByClause = "GROUP BY id1, id2, weight";
206 $dupeCopyJoin = " JOIN dedupe_copy ON dedupe_copy.id1 = t1.column AND dedupe_copy.id2 = t2.column WHERE ";
207 }
208 $patternColumn = '/t1.(\w+)/';
209 $exclWeightSum = array();
210
211 // create temp table
212 $dao = new CRM_Core_DAO();
213 $dao->query($tempTableQuery);
214
215 CRM_Utils_Hook::dupeQuery($this, 'table', $tableQueries);
216
217 while (!empty($tableQueries)) {
218 list($isInclusive, $isDie) = self::isQuerySetInclusive($tableQueries, $this->threshold, $exclWeightSum);
219
220 if ($isInclusive) {
221 // order queries by table count
222 self::orderByTableCount($tableQueries);
223
224 $weightSum = array_sum($exclWeightSum);
225 $searchWithinDupes = !empty($exclWeightSum) ? 1 : 0;
226
227 while (!empty($tableQueries)) {
228 // extract the next query ( and weight ) to be executed
229 $fieldWeight = array_keys($tableQueries);
230 $fieldWeight = $fieldWeight[0];
231 $query = array_shift($tableQueries);
232
233 if ($searchWithinDupes) {
234 // get prepared to search within already found dupes if $searchWithinDupes flag is set
235 $dao->query("DROP TEMPORARY TABLE IF EXISTS dedupe_copy");
236 $dao->query("CREATE TEMPORARY TABLE dedupe_copy SELECT * FROM dedupe WHERE weight >= {$weightSum}");
237 $dao->free();
238
239 preg_match($patternColumn, $query, $matches);
240 $query = str_replace(' WHERE ', str_replace('column', $matches[1], $dupeCopyJoin), $query);
241
242 // CRM-19612: If there's a union, there will be two WHEREs, and you
243 // can't use the temp table twice.
244 if (preg_match('/dedupe_copy[\S\s]*(union)[\S\s]*dedupe_copy/i', $query, $matches, PREG_OFFSET_CAPTURE)) {
245 // Make a second temp table:
246 $dao->query("DROP TEMPORARY TABLE IF EXISTS dedupe_copy_2");
247 $dao->query("CREATE TEMPORARY TABLE dedupe_copy_2 SELECT * FROM dedupe WHERE weight >= {$weightSum}");
248 $dao->free();
249 // After the union, use that new temp table:
250 $part1 = substr($query, 0, $matches[1][1]);
251 $query = $part1 . str_replace('dedupe_copy', 'dedupe_copy_2', substr($query, $matches[1][1]));
252 }
253 }
254 $searchWithinDupes = 1;
255
256 // construct and execute the intermediate query
257 $query = "{$insertClause} {$query} {$groupByClause} ON DUPLICATE KEY UPDATE weight = weight + VALUES(weight)";
258 $dao->query($query);
259
260 // FIXME: we need to be more acurate with affected rows, especially for insert vs duplicate insert.
261 // And that will help optimize further.
262 $affectedRows = $dao->affectedRows();
263 $dao->free();
264
265 // In an inclusive situation, failure of any query means no further processing -
266 if ($affectedRows == 0) {
267 // reset to make sure no further execution is done.
268 $tableQueries = array();
269 break;
270 }
271 $weightSum = substr($fieldWeight, strrpos($fieldWeight, '.') + 1) + $weightSum;
272 }
273 // An exclusive situation -
274 }
275 elseif (!$isDie) {
276 // since queries are already sorted by weights, we can continue as is
277 $fieldWeight = array_keys($tableQueries);
278 $fieldWeight = $fieldWeight[0];
279 $query = array_shift($tableQueries);
280 $query = "{$insertClause} {$query} {$groupByClause} ON DUPLICATE KEY UPDATE weight = weight + VALUES(weight)";
281 $dao->query($query);
282 if ($dao->affectedRows() >= 1) {
283 $exclWeightSum[] = substr($fieldWeight, strrpos($fieldWeight, '.') + 1);
284 }
285 $dao->free();
286 }
287 else {
288 // its a die situation
289 break;
290 }
291 }
292 }
293
294 /**
295 * Function to determine if a given query set contains inclusive or exclusive set of weights.
296 * The function assumes that the query set is already ordered by weight in desc order.
297 * @param $tableQueries
298 * @param $threshold
299 * @param array $exclWeightSum
300 *
301 * @return array
302 */
303 public static function isQuerySetInclusive($tableQueries, $threshold, $exclWeightSum = array()) {
304 $input = array();
305 foreach ($tableQueries as $key => $query) {
306 $input[] = substr($key, strrpos($key, '.') + 1);
307 }
308
309 if (!empty($exclWeightSum)) {
310 $input = array_merge($input, $exclWeightSum);
311 rsort($input);
312 }
313
314 if (count($input) == 1) {
315 return array(FALSE, $input[0] < $threshold);
316 }
317
318 $totalCombinations = 0;
319 for ($i = 0; $i < count($input); $i++) {
320 $combination = array($input[$i]);
321 if (array_sum($combination) >= $threshold) {
322 $totalCombinations++;
323 continue;
324 }
325 for ($j = $i + 1; $j < count($input); $j++) {
326 $combination[] = $input[$j];
327 if (array_sum($combination) >= $threshold) {
328 $totalCombinations++;
329 }
330 }
331 }
332 return array($totalCombinations == 1, $totalCombinations <= 0);
333 }
334
335 /**
336 * sort queries by number of records for the table associated with them.
337 * @param $tableQueries
338 */
339 public static function orderByTableCount(&$tableQueries) {
340 static $tableCount = array();
341
342 $tempArray = array();
343 foreach ($tableQueries as $key => $query) {
344 $table = explode(".", $key);
345 $table = $table[0];
346 if (!array_key_exists($table, $tableCount)) {
347 $query = "SELECT COUNT(*) FROM {$table}";
348 $tableCount[$table] = CRM_Core_DAO::singleValueQuery($query);
349 }
350 $tempArray[$key] = $tableCount[$table];
351 }
352
353 asort($tempArray);
354 foreach ($tempArray as $key => $count) {
355 $tempArray[$key] = $tableQueries[$key];
356 }
357 $tableQueries = $tempArray;
358 }
359
360 /**
361 * Return the SQL query for getting only the interesting results out of the dedupe table.
362 *
363 * @$checkPermission boolean $params a flag to indicate if permission should be considered.
364 * default is to always check permissioning but public pages for example might not want
365 * permission to be checked for anonymous users. Refer CRM-6211. We might be beaking
366 * Multi-Site dedupe for public pages.
367 *
368 * @param bool $checkPermission
369 *
370 * @return string
371 */
372 public function thresholdQuery($checkPermission = TRUE) {
373 $this->_aclFrom = '';
374 // CRM-6603: anonymous dupechecks side-step ACLs
375 $this->_aclWhere = ' AND is_deleted = 0 ';
376
377 if ($this->params && !$this->noRules) {
378 if ($checkPermission) {
379 list($this->_aclFrom, $this->_aclWhere) = CRM_Contact_BAO_Contact_Permission::cacheClause('civicrm_contact');
380 $this->_aclWhere = $this->_aclWhere ? "AND {$this->_aclWhere}" : '';
381 }
382 $query = "SELECT dedupe.id1 as id
383 FROM dedupe JOIN civicrm_contact ON dedupe.id1 = civicrm_contact.id {$this->_aclFrom}
384 WHERE contact_type = '{$this->contact_type}' {$this->_aclWhere}
385 AND weight >= {$this->threshold}";
386 }
387 else {
388 $this->_aclWhere = ' AND c1.is_deleted = 0 AND c2.is_deleted = 0';
389 if ($checkPermission) {
390 list($this->_aclFrom, $this->_aclWhere) = CRM_Contact_BAO_Contact_Permission::cacheClause(array('c1', 'c2'));
391 $this->_aclWhere = $this->_aclWhere ? "AND {$this->_aclWhere}" : '';
392 }
393 $query = "SELECT IF(dedupe.id1 < dedupe.id2, dedupe.id1, dedupe.id2) as id1,
394 IF(dedupe.id1 < dedupe.id2, dedupe.id2, dedupe.id1) as id2, dedupe.weight
395 FROM dedupe JOIN civicrm_contact c1 ON dedupe.id1 = c1.id
396 JOIN civicrm_contact c2 ON dedupe.id2 = c2.id {$this->_aclFrom}
397 LEFT JOIN civicrm_dedupe_exception exc ON dedupe.id1 = exc.contact_id1 AND dedupe.id2 = exc.contact_id2
398 WHERE c1.contact_type = '{$this->contact_type}' AND
399 c2.contact_type = '{$this->contact_type}' {$this->_aclWhere}
400 AND weight >= {$this->threshold} AND exc.contact_id1 IS NULL";
401 }
402
403 CRM_Utils_Hook::dupeQuery($this, 'threshold', $query);
404 return $query;
405 }
406
407 /**
408 * find fields related to a rule group.
409 *
410 * @param array $params
411 *
412 * @return array
413 * (rule field => weight) array and threshold associated to rule group
414 */
415 public static function dedupeRuleFieldsWeight($params) {
416 $rgBao = new CRM_Dedupe_BAO_RuleGroup();
417 $rgBao->contact_type = $params['contact_type'];
418 if (!empty($params['id'])) {
419 // accept an ID if provided
420 $rgBao->id = $params['id'];
421 }
422 else {
423 $rgBao->used = $params['used'];
424 }
425 $rgBao->find(TRUE);
426
427 $ruleBao = new CRM_Dedupe_BAO_Rule();
428 $ruleBao->dedupe_rule_group_id = $rgBao->id;
429 $ruleBao->find();
430 $ruleFields = array();
431 while ($ruleBao->fetch()) {
432 $ruleFields[$ruleBao->rule_field] = $ruleBao->rule_weight;
433 }
434
435 return array($ruleFields, $rgBao->threshold);
436 }
437
438 /**
439 * Get all of the combinations of fields that would work with a rule.
440 *
441 * @param array $rgFields
442 * @param int $threshold
443 * @param array $combos
444 * @param array $running
445 */
446 public static function combos($rgFields, $threshold, &$combos, $running = array()) {
447 foreach ($rgFields as $rgField => $weight) {
448 unset($rgFields[$rgField]);
449 $diff = $threshold - $weight;
450 $runningnow = $running;
451 $runningnow[] = $rgField;
452 if ($diff > 0) {
453 self::combos($rgFields, $diff, $combos, $runningnow);
454 }
455 else {
456 $combos[] = $runningnow;
457 }
458 }
459 }
460
461 /**
462 * Get an array of rule group id to rule group name
463 * for all th groups for that contactType. If contactType
464 * not specified, do it for all
465 *
466 * @param string $contactType
467 * Individual, Household or Organization.
468 *
469 *
470 * @return array
471 * id => "nice name" of rule group
472 */
473 public static function getByType($contactType = NULL) {
474 $dao = new CRM_Dedupe_DAO_RuleGroup();
475
476 if ($contactType) {
477 $dao->contact_type = $contactType;
478 }
479
480 $dao->find();
481 $result = array();
482 while ($dao->fetch()) {
483 $title = !empty($dao->title) ? $dao->title : (!empty($dao->name) ? $dao->name : $dao->contact_type);
484
485 $name = "$title - {$dao->used}";
486 $result[$dao->id] = $name;
487 }
488 return $result;
489 }
490
491
492 /**
493 * Get the cached contact type for a particular rule group.
494 *
495 * @param int $rule_group_id
496 *
497 * @return string
498 */
499 public static function getContactTypeForRuleGroup($rule_group_id) {
500 if (!isset(\Civi::$statics[__CLASS__]) || !isset(\Civi::$statics[__CLASS__]['rule_groups'])) {
501 \Civi::$statics[__CLASS__]['rule_groups'] = array();
502 }
503 if (empty(\Civi::$statics[__CLASS__]['rule_groups'][$rule_group_id])) {
504 \Civi::$statics[__CLASS__]['rule_groups'][$rule_group_id]['contact_type'] = CRM_Core_DAO::getFieldValue(
505 'CRM_Dedupe_DAO_RuleGroup',
506 $rule_group_id,
507 'contact_type'
508 );
509 }
510
511 return \Civi::$statics[__CLASS__]['rule_groups'][$rule_group_id]['contact_type'];
512 }
513
514 }