Merge pull request #4893 from colemanw/INFRA-132
[civicrm-core.git] / CRM / Dedupe / BAO / RuleGroup.php
CommitLineData
6a488035
TO
1<?php
2/*
3 +--------------------------------------------------------------------+
39de6fd5 4 | CiviCRM version 4.6 |
6a488035 5 +--------------------------------------------------------------------+
06b69b18 6 | Copyright CiviCRM LLC (c) 2004-2014 |
6a488035
TO
7 +--------------------------------------------------------------------+
8 | This file is a part of CiviCRM. |
9 | |
10 | CiviCRM is free software; you can copy, modify, and distribute it |
11 | under the terms of the GNU Affero General Public License |
12 | Version 3, 19 November 2007 and the CiviCRM Licensing Exception. |
13 | |
14 | CiviCRM is distributed in the hope that it will be useful, but |
15 | WITHOUT ANY WARRANTY; without even the implied warranty of |
16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
17 | See the GNU Affero General Public License for more details. |
18 | |
19 | You should have received a copy of the GNU Affero General Public |
20 | License and the CiviCRM Licensing Exception along |
21 | with this program; if not, contact CiviCRM LLC |
22 | at info[AT]civicrm[DOT]org. If you have questions about the |
23 | GNU Affero General Public License or the licensing of CiviCRM, |
24 | see the CiviCRM license FAQ at http://civicrm.org/licensing |
25 +--------------------------------------------------------------------+
26*/
27
28/**
29 *
30 * @package CRM
06b69b18 31 * @copyright CiviCRM LLC (c) 2004-2014
6a488035
TO
32 * $Id$
33 *
34 */
35
36/**
37 * The CiviCRM duplicate discovery engine is based on an
38 * algorithm designed by David Strauss <david@fourkitchens.com>.
39 */
40class CRM_Dedupe_BAO_RuleGroup extends CRM_Dedupe_DAO_RuleGroup {
41
42 /**
100fef9d 43 * Ids of the contacts to limit the SQL queries (whole-database queries otherwise)
6a488035
TO
44 */
45 var $contactIds = array();
46
47 /**
100fef9d 48 * Params to dedupe against (queries against the whole contact set otherwise)
6a488035
TO
49 */
50 var $params = array();
51
52 /**
100fef9d 53 * If there are no rules in rule group
6a488035
TO
54 */
55 var $noRules = FALSE;
56
57 /**
58 * Return a structure holding the supported tables, fields and their titles
59 *
98997235
TO
60 * @param string $requestedType
61 * The requested contact type.
6a488035 62 *
a6c01b45
CW
63 * @return array
64 * a table-keyed array of field-keyed arrays holding supported fields' titles
6a488035 65 */
00be9182 66 public static function &supportedFields($requestedType) {
6a488035
TO
67 static $fields = NULL;
68 if (!$fields) {
69 // this is needed, as we're piggy-backing importableFields() below
70 $replacements = array(
71 'civicrm_country.name' => 'civicrm_address.country_id',
72 'civicrm_county.name' => 'civicrm_address.county_id',
73 'civicrm_state_province.name' => 'civicrm_address.state_province_id',
74 'gender.label' => 'civicrm_contact.gender_id',
75 'individual_prefix.label' => 'civicrm_contact.prefix_id',
76 'individual_suffix.label' => 'civicrm_contact.suffix_id',
77 'addressee.label' => 'civicrm_contact.addressee_id',
78 'email_greeting.label' => 'civicrm_contact.email_greeting_id',
79 'postal_greeting.label' => 'civicrm_contact.postal_greeting_id',
80 );
81 // the table names we support in dedupe rules - a filter for importableFields()
82 $supportedTables = array(
83 'civicrm_address', 'civicrm_contact', 'civicrm_email',
84 'civicrm_im', 'civicrm_note', 'civicrm_openid', 'civicrm_phone',
85 );
86
309a09df 87 foreach (array('Individual', 'Organization', 'Household') as $ctype) {
6a488035
TO
88 // take the table.field pairs and their titles from importableFields() if the table is supported
89 foreach (CRM_Contact_BAO_Contact::importableFields($ctype) as $iField) {
90 if (isset($iField['where'])) {
91 $where = $iField['where'];
92 if (isset($replacements[$where])) {
93 $where = $replacements[$where];
94 }
95 list($table, $field) = explode('.', $where);
96 if (!in_array($table, $supportedTables)) {
97 continue;
98 }
99 $fields[$ctype][$table][$field] = $iField['title'];
100 }
101 }
102 // add custom data fields
103 foreach (CRM_Core_BAO_CustomGroup::getTree($ctype, CRM_Core_DAO::$_nullObject, NULL, -1) as $key => $cg) {
104 if (!is_int($key)) {
105 continue;
106 }
107 foreach ($cg['fields'] as $cf) {
108 $fields[$ctype][$cg['table_name']][$cf['column_name']] = $cf['label'];
109 }
110 }
111 }
112 }
1cf05c3e 113 CRM_Utils_Hook::dupeQuery(CRM_Core_DAO::$_nullObject, 'supportedFields', $fields);
6a488035
TO
114 return $fields[$requestedType];
115 }
116
117 /**
118 * Return the SQL query for dropping the temporary table.
119 */
00be9182 120 public function tableDropQuery() {
6a488035
TO
121 return 'DROP TEMPORARY TABLE IF EXISTS dedupe';
122 }
123
124 /**
125 * Return a set of SQL queries whose cummulative weights will mark matched
126 * records for the RuleGroup::threasholdQuery() to retrieve.
127 */
00be9182 128 public function tableQuery() {
6a488035
TO
129 // make sure we've got a fetched dbrecord, not sure if this is enforced
130 if (!$this->name == NULL || $this->is_reserved == NULL) {
131 $this->find(TRUE);
132 }
133
134 // Reserved Rule Groups can optionally get special treatment by
135 // implementing an optimization class and returning a query array.
136 if ($this->is_reserved &&
137 CRM_Utils_File::isIncludable("CRM/Dedupe/BAO/QueryBuilder/{$this->name}.php")
138 ) {
139 include_once "CRM/Dedupe/BAO/QueryBuilder/{$this->name}.php";
140 $class = "CRM_Dedupe_BAO_QueryBuilder_{$this->name}";
141 $command = empty($this->params) ? 'internal' : 'record';
142 $queries = call_user_func(array($class, $command), $this);
143 }
144 else {
145 // All other rule groups have queries generated by the member dedupe
146 // rules defined in the administrative interface.
147
148 // Find all rules contained by this script sorted by weight so that
149 // their execution can be short circuited on RuleGroup::fillTable()
150 $bao = new CRM_Dedupe_BAO_Rule();
151 $bao->dedupe_rule_group_id = $this->id;
152 $bao->orderBy('rule_weight DESC');
153 $bao->find();
154
155 // Generate a SQL query for each rule in the rule group that is
156 // tailored to respect the param and contactId options provided.
157 $queries = array();
158 while ($bao->fetch()) {
159 $bao->contactIds = $this->contactIds;
160 $bao->params = $this->params;
161
162 // Skipping empty rules? Empty rules shouldn't exist; why check?
163 if ($query = $bao->sql()) {
164 $queries["{$bao->rule_table}.{$bao->rule_field}.{$bao->rule_weight}"] = $query;
165 }
166 }
167 }
168
169 // if there are no rules in this rule group
170 // add an empty query fulfilling the pattern
171 if (!$queries) {
172 $queries = array('SELECT 0 id1, 0 id2, 0 weight LIMIT 0');
173 $this->noRules = TRUE;
174 }
175
176 return $queries;
177 }
178
00be9182 179 public function fillTable() {
6a488035
TO
180 // get the list of queries handy
181 $tableQueries = $this->tableQuery();
182
183 if ($this->params && !$this->noRules) {
184 $tempTableQuery = "CREATE TEMPORARY TABLE dedupe (id1 int, weight int, UNIQUE UI_id1 (id1)) ENGINE=MyISAM";
185 $insertClause = "INSERT INTO dedupe (id1, weight)";
186 $groupByClause = "GROUP BY id1";
187 $dupeCopyJoin = " JOIN dedupe_copy ON dedupe_copy.id1 = t1.column WHERE ";
188 }
189 else {
190 $tempTableQuery = "CREATE TEMPORARY TABLE dedupe (id1 int, id2 int, weight int, UNIQUE UI_id1_id2 (id1, id2)) ENGINE=MyISAM";
191 $insertClause = "INSERT INTO dedupe (id1, id2, weight)";
192 $groupByClause = "GROUP BY id1, id2";
193 $dupeCopyJoin = " JOIN dedupe_copy ON dedupe_copy.id1 = t1.column AND dedupe_copy.id2 = t2.column WHERE ";
194 }
195 $patternColumn = '/t1.(\w+)/';
196 $exclWeightSum = array();
197
198 // create temp table
199 $dao = new CRM_Core_DAO();
200 $dao->query($tempTableQuery);
201
6a488035
TO
202 CRM_Utils_Hook::dupeQuery($this, 'table', $tableQueries);
203
204 while (!empty($tableQueries)) {
205 list($isInclusive, $isDie) = self::isQuerySetInclusive($tableQueries, $this->threshold, $exclWeightSum);
206
207 if ($isInclusive) {
208 // order queries by table count
209 self::orderByTableCount($tableQueries);
210
211 $weightSum = array_sum($exclWeightSum);
212 $searchWithinDupes = !empty($exclWeightSum) ? 1 : 0;
213
214 while (!empty($tableQueries)) {
215 // extract the next query ( and weight ) to be executed
216 $fieldWeight = array_keys($tableQueries);
217 $fieldWeight = $fieldWeight[0];
218 $query = array_shift($tableQueries);
219
220 if ($searchWithinDupes) {
221 // get prepared to search within already found dupes if $searchWithinDupes flag is set
222 $dao->query("DROP TEMPORARY TABLE IF EXISTS dedupe_copy");
223 $dao->query("CREATE TEMPORARY TABLE dedupe_copy SELECT * FROM dedupe WHERE weight >= {$weightSum}");
224 $dao->free();
225
226 preg_match($patternColumn, $query, $matches);
227 $query = str_replace(' WHERE ', str_replace('column', $matches[1], $dupeCopyJoin), $query);
228 }
229 $searchWithinDupes = 1;
230
231 // construct and execute the intermediate query
232 $query = "{$insertClause} {$query} {$groupByClause} ON DUPLICATE KEY UPDATE weight = weight + VALUES(weight)";
233 $dao->query($query);
234
235 // FIXME: we need to be more acurate with affected rows, especially for insert vs duplicate insert.
236 // And that will help optimize further.
237 $affectedRows = $dao->affectedRows();
238 $dao->free();
239
240 // In an inclusive situation, failure of any query means no further processing -
241 if ($affectedRows == 0) {
242 // reset to make sure no further execution is done.
243 $tableQueries = array();
244 break;
245 }
246 $weightSum = substr($fieldWeight, strrpos($fieldWeight, '.') + 1) + $weightSum;
247 }
248 // An exclusive situation -
249 }
250 elseif (!$isDie) {
251 // since queries are already sorted by weights, we can continue as is
252 $fieldWeight = array_keys($tableQueries);
253 $fieldWeight = $fieldWeight[0];
254 $query = array_shift($tableQueries);
255 $query = "{$insertClause} {$query} {$groupByClause} ON DUPLICATE KEY UPDATE weight = weight + VALUES(weight)";
256 $dao->query($query);
257 if ($dao->affectedRows() >= 1) {
258 $exclWeightSum[] = substr($fieldWeight, strrpos($fieldWeight, '.') + 1);
259 }
260 $dao->free();
261 }
262 else {
263 // its a die situation
264 break;
265 }
266 }
267 }
268
269 // Function to determine if a given query set contains inclusive or exclusive set of weights.
270 // The function assumes that the query set is already ordered by weight in desc order.
e0ef6999
EM
271 /**
272 * @param $tableQueries
273 * @param $threshold
274 * @param array $exclWeightSum
275 *
276 * @return array
277 */
00be9182 278 public static function isQuerySetInclusive($tableQueries, $threshold, $exclWeightSum = array()) {
6a488035
TO
279 $input = array();
280 foreach ($tableQueries as $key => $query) {
281 $input[] = substr($key, strrpos($key, '.') + 1);
282 }
283
284 if (!empty($exclWeightSum)) {
285 $input = array_merge($input, $exclWeightSum);
286 rsort($input);
287 }
288
289 if (count($input) == 1) {
290 return array(FALSE, $input[0] < $threshold);
291 }
292
293 $totalCombinations = 0;
294 for ($i = 0; $i < count($input); $i++) {
295 $combination = array($input[$i]);
296 if (array_sum($combination) >= $threshold) {
297 $totalCombinations++;
298 continue;
299 }
300 for ($j = $i + 1; $j < count($input); $j++) {
301 $combination[] = $input[$j];
302 if (array_sum($combination) >= $threshold) {
303 $totalCombinations++;
304 }
305 }
306 }
307 return array($totalCombinations == 1, $totalCombinations <= 0);
308 }
309
310 // sort queries by number of records for the table associated with them
e0ef6999
EM
311 /**
312 * @param $tableQueries
313 */
00be9182 314 public static function orderByTableCount(&$tableQueries) {
6a488035
TO
315 static $tableCount = array();
316
317 $tempArray = array();
318 foreach ($tableQueries as $key => $query) {
319 $table = explode(".", $key);
320 $table = $table[0];
321 if (!array_key_exists($table, $tableCount)) {
322 $query = "SELECT COUNT(*) FROM {$table}";
323 $tableCount[$table] = CRM_Core_DAO::singleValueQuery($query);
324 }
325 $tempArray[$key] = $tableCount[$table];
326 }
327
328 asort($tempArray);
329 foreach ($tempArray as $key => $count) {
330 $tempArray[$key] = $tableQueries[$key];
331 }
332 $tableQueries = $tempArray;
333 }
334
335 /**
336 * Return the SQL query for getting only the interesting results out of the dedupe table.
337 *
338 * @$checkPermission boolean $params a flag to indicate if permission should be considered.
339 * default is to always check permissioning but public pages for example might not want
340 * permission to be checked for anonymous users. Refer CRM-6211. We might be beaking
341 * Multi-Site dedupe for public pages.
6a488035 342 */
00be9182 343 public function thresholdQuery($checkPermission = TRUE) {
6a488035
TO
344 $this->_aclFrom = '';
345 // CRM-6603: anonymous dupechecks side-step ACLs
346 $this->_aclWhere = ' AND is_deleted = 0 ';
347
348 if ($this->params && !$this->noRules) {
349 if ($checkPermission) {
350 list($this->_aclFrom, $this->_aclWhere) = CRM_Contact_BAO_Contact_Permission::cacheClause('civicrm_contact');
351 $this->_aclWhere = $this->_aclWhere ? "AND {$this->_aclWhere}" : '';
352 }
353 $query = "SELECT dedupe.id1 as id
354 FROM dedupe JOIN civicrm_contact ON dedupe.id1 = civicrm_contact.id {$this->_aclFrom}
355 WHERE contact_type = '{$this->contact_type}' {$this->_aclWhere}
356 AND weight >= {$this->threshold}";
357 }
358 else {
359 $this->_aclWhere = ' AND c1.is_deleted = 0 AND c2.is_deleted = 0';
360 if ($checkPermission) {
361 list($this->_aclFrom, $this->_aclWhere) = CRM_Contact_BAO_Contact_Permission::cacheClause(array('c1', 'c2'));
362 $this->_aclWhere = $this->_aclWhere ? "AND {$this->_aclWhere}" : '';
363 }
364 $query = "SELECT dedupe.id1, dedupe.id2, dedupe.weight
d4c8a770 365 FROM dedupe JOIN civicrm_contact c1 ON dedupe.id1 = c1.id
6a488035
TO
366 JOIN civicrm_contact c2 ON dedupe.id2 = c2.id {$this->_aclFrom}
367 LEFT JOIN civicrm_dedupe_exception exc ON dedupe.id1 = exc.contact_id1 AND dedupe.id2 = exc.contact_id2
d4c8a770 368 WHERE c1.contact_type = '{$this->contact_type}' AND
6a488035
TO
369 c2.contact_type = '{$this->contact_type}' {$this->_aclWhere}
370 AND weight >= {$this->threshold} AND exc.contact_id1 IS NULL";
371 }
372
373 CRM_Utils_Hook::dupeQuery($this, 'threshold', $query);
374 return $query;
375 }
376
377 /**
dc195289 378 * find fields related to a rule group.
6a488035 379 *
c490a46a 380 * @param array $params
77b97be7 381 *
a6c01b45
CW
382 * @return array
383 * (rule field => weight) array and threshold associated to rule group
e0b82b44 384 * @access public
6a488035 385 */
00be9182 386 public static function dedupeRuleFieldsWeight($params) {
6a488035 387 $rgBao = new CRM_Dedupe_BAO_RuleGroup();
6a488035 388 $rgBao->contact_type = $params['contact_type'];
03390e26 389 if (CRM_Utils_Array::value('id', $params)) {
390 // accept an ID if provided
391 $rgBao->id = $params['id'];
392 }
393 else {
394 $rgBao->used = $params['used'];
395 }
6a488035
TO
396 $rgBao->find(TRUE);
397
398 $ruleBao = new CRM_Dedupe_BAO_Rule();
399 $ruleBao->dedupe_rule_group_id = $rgBao->id;
400 $ruleBao->find();
401 $ruleFields = array();
402 while ($ruleBao->fetch()) {
403 $ruleFields[$ruleBao->rule_field] = $ruleBao->rule_weight;
404 }
405
406 return array($ruleFields, $rgBao->threshold);
407 }
408
03390e26 409 /**
410 * Get all of the combinations of fields that would work with a rule
411 */
412
00be9182 413 public static function combos($rgFields, $threshold, &$combos, $running = array()) {
03390e26 414 foreach ($rgFields as $rgField => $weight) {
415 unset($rgFields[$rgField]);
416 $diff = $threshold - $weight;
417 $runningnow = $running;
418 $runningnow[] = $rgField;
419 if ($diff > 0) {
420 self::combos($rgFields, $diff, $combos, $runningnow);
421 }
422 else {
423 $combos[] = $runningnow;
424 }
425 }
426 }
427
6a488035
TO
428 /**
429 * Get an array of rule group id to rule group name
430 * for all th groups for that contactType. If contactType
431 * not specified, do it for all
432 *
98997235
TO
433 * @param string $contactType
434 * Individual, Household or Organization.
6a488035
TO
435 *
436 * @static
437 *
a6c01b45
CW
438 * @return array
439 * id => "nice name" of rule group
6a488035 440 */
00be9182 441 public static function getByType($contactType = NULL) {
6a488035
TO
442 $dao = new CRM_Dedupe_DAO_RuleGroup();
443
444 if ($contactType) {
445 $dao->contact_type = $contactType;
446 }
447
448 $dao->find();
449 $result = array();
450 while ($dao->fetch()) {
01a93ecd 451 $title =
d58a19a1 452 !empty($dao->title) ? $dao->title : (!empty($dao->name) ? $dao->name : $dao->contact_type);
01a93ecd
DL
453
454 $name = "$title - {$dao->used}";
6a488035
TO
455 $result[$dao->id] = $name;
456 }
457 return $result;
458 }
459}