Commit | Line | Data |
---|---|---|
6a488035 TO |
1 | <?php |
2 | /* | |
3 | +--------------------------------------------------------------------+ | |
39de6fd5 | 4 | | CiviCRM version 4.6 | |
6a488035 | 5 | +--------------------------------------------------------------------+ |
06b69b18 | 6 | | Copyright CiviCRM LLC (c) 2004-2014 | |
6a488035 TO |
7 | +--------------------------------------------------------------------+ |
8 | | This file is a part of CiviCRM. | | |
9 | | | | |
10 | | CiviCRM is free software; you can copy, modify, and distribute it | | |
11 | | under the terms of the GNU Affero General Public License | | |
12 | | Version 3, 19 November 2007 and the CiviCRM Licensing Exception. | | |
13 | | | | |
14 | | CiviCRM is distributed in the hope that it will be useful, but | | |
15 | | WITHOUT ANY WARRANTY; without even the implied warranty of | | |
16 | | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. | | |
17 | | See the GNU Affero General Public License for more details. | | |
18 | | | | |
19 | | You should have received a copy of the GNU Affero General Public | | |
20 | | License and the CiviCRM Licensing Exception along | | |
21 | | with this program; if not, contact CiviCRM LLC | | |
22 | | at info[AT]civicrm[DOT]org. If you have questions about the | | |
23 | | GNU Affero General Public License or the licensing of CiviCRM, | | |
24 | | see the CiviCRM license FAQ at http://civicrm.org/licensing | | |
25 | +--------------------------------------------------------------------+ | |
26 | */ | |
27 | ||
28 | /** | |
29 | * | |
30 | * @package CRM | |
06b69b18 | 31 | * @copyright CiviCRM LLC (c) 2004-2014 |
6a488035 TO |
32 | * $Id$ |
33 | * | |
34 | */ | |
35 | ||
36 | /** | |
37 | * The CiviCRM duplicate discovery engine is based on an | |
38 | * algorithm designed by David Strauss <david@fourkitchens.com>. | |
39 | */ | |
40 | class CRM_Dedupe_BAO_RuleGroup extends CRM_Dedupe_DAO_RuleGroup { | |
41 | ||
42 | /** | |
100fef9d | 43 | * Ids of the contacts to limit the SQL queries (whole-database queries otherwise) |
6a488035 TO |
44 | */ |
45 | var $contactIds = array(); | |
46 | ||
47 | /** | |
100fef9d | 48 | * Params to dedupe against (queries against the whole contact set otherwise) |
6a488035 TO |
49 | */ |
50 | var $params = array(); | |
51 | ||
52 | /** | |
100fef9d | 53 | * If there are no rules in rule group |
6a488035 TO |
54 | */ |
55 | var $noRules = FALSE; | |
56 | ||
57 | /** | |
58 | * Return a structure holding the supported tables, fields and their titles | |
59 | * | |
98997235 TO |
60 | * @param string $requestedType |
61 | * The requested contact type. | |
6a488035 | 62 | * |
a6c01b45 CW |
63 | * @return array |
64 | * a table-keyed array of field-keyed arrays holding supported fields' titles | |
6a488035 | 65 | */ |
00be9182 | 66 | public static function &supportedFields($requestedType) { |
6a488035 TO |
67 | static $fields = NULL; |
68 | if (!$fields) { | |
69 | // this is needed, as we're piggy-backing importableFields() below | |
70 | $replacements = array( | |
71 | 'civicrm_country.name' => 'civicrm_address.country_id', | |
72 | 'civicrm_county.name' => 'civicrm_address.county_id', | |
73 | 'civicrm_state_province.name' => 'civicrm_address.state_province_id', | |
74 | 'gender.label' => 'civicrm_contact.gender_id', | |
75 | 'individual_prefix.label' => 'civicrm_contact.prefix_id', | |
76 | 'individual_suffix.label' => 'civicrm_contact.suffix_id', | |
77 | 'addressee.label' => 'civicrm_contact.addressee_id', | |
78 | 'email_greeting.label' => 'civicrm_contact.email_greeting_id', | |
79 | 'postal_greeting.label' => 'civicrm_contact.postal_greeting_id', | |
80 | ); | |
81 | // the table names we support in dedupe rules - a filter for importableFields() | |
82 | $supportedTables = array( | |
353ffa53 TO |
83 | 'civicrm_address', |
84 | 'civicrm_contact', | |
85 | 'civicrm_email', | |
86 | 'civicrm_im', | |
87 | 'civicrm_note', | |
88 | 'civicrm_openid', | |
89 | 'civicrm_phone', | |
6a488035 TO |
90 | ); |
91 | ||
309a09df | 92 | foreach (array('Individual', 'Organization', 'Household') as $ctype) { |
6a488035 TO |
93 | // take the table.field pairs and their titles from importableFields() if the table is supported |
94 | foreach (CRM_Contact_BAO_Contact::importableFields($ctype) as $iField) { | |
95 | if (isset($iField['where'])) { | |
96 | $where = $iField['where']; | |
97 | if (isset($replacements[$where])) { | |
98 | $where = $replacements[$where]; | |
99 | } | |
100 | list($table, $field) = explode('.', $where); | |
101 | if (!in_array($table, $supportedTables)) { | |
102 | continue; | |
103 | } | |
104 | $fields[$ctype][$table][$field] = $iField['title']; | |
105 | } | |
106 | } | |
107 | // add custom data fields | |
108 | foreach (CRM_Core_BAO_CustomGroup::getTree($ctype, CRM_Core_DAO::$_nullObject, NULL, -1) as $key => $cg) { | |
109 | if (!is_int($key)) { | |
110 | continue; | |
111 | } | |
112 | foreach ($cg['fields'] as $cf) { | |
113 | $fields[$ctype][$cg['table_name']][$cf['column_name']] = $cf['label']; | |
114 | } | |
115 | } | |
116 | } | |
117 | } | |
1cf05c3e | 118 | CRM_Utils_Hook::dupeQuery(CRM_Core_DAO::$_nullObject, 'supportedFields', $fields); |
6a488035 TO |
119 | return $fields[$requestedType]; |
120 | } | |
121 | ||
122 | /** | |
123 | * Return the SQL query for dropping the temporary table. | |
124 | */ | |
00be9182 | 125 | public function tableDropQuery() { |
6a488035 TO |
126 | return 'DROP TEMPORARY TABLE IF EXISTS dedupe'; |
127 | } | |
128 | ||
129 | /** | |
130 | * Return a set of SQL queries whose cummulative weights will mark matched | |
131 | * records for the RuleGroup::threasholdQuery() to retrieve. | |
132 | */ | |
00be9182 | 133 | public function tableQuery() { |
6a488035 TO |
134 | // make sure we've got a fetched dbrecord, not sure if this is enforced |
135 | if (!$this->name == NULL || $this->is_reserved == NULL) { | |
136 | $this->find(TRUE); | |
137 | } | |
138 | ||
139 | // Reserved Rule Groups can optionally get special treatment by | |
140 | // implementing an optimization class and returning a query array. | |
141 | if ($this->is_reserved && | |
142 | CRM_Utils_File::isIncludable("CRM/Dedupe/BAO/QueryBuilder/{$this->name}.php") | |
143 | ) { | |
144 | include_once "CRM/Dedupe/BAO/QueryBuilder/{$this->name}.php"; | |
353ffa53 | 145 | $class = "CRM_Dedupe_BAO_QueryBuilder_{$this->name}"; |
6a488035 TO |
146 | $command = empty($this->params) ? 'internal' : 'record'; |
147 | $queries = call_user_func(array($class, $command), $this); | |
148 | } | |
149 | else { | |
150 | // All other rule groups have queries generated by the member dedupe | |
151 | // rules defined in the administrative interface. | |
152 | ||
153 | // Find all rules contained by this script sorted by weight so that | |
154 | // their execution can be short circuited on RuleGroup::fillTable() | |
155 | $bao = new CRM_Dedupe_BAO_Rule(); | |
156 | $bao->dedupe_rule_group_id = $this->id; | |
157 | $bao->orderBy('rule_weight DESC'); | |
158 | $bao->find(); | |
159 | ||
160 | // Generate a SQL query for each rule in the rule group that is | |
161 | // tailored to respect the param and contactId options provided. | |
162 | $queries = array(); | |
163 | while ($bao->fetch()) { | |
164 | $bao->contactIds = $this->contactIds; | |
165 | $bao->params = $this->params; | |
166 | ||
167 | // Skipping empty rules? Empty rules shouldn't exist; why check? | |
168 | if ($query = $bao->sql()) { | |
169 | $queries["{$bao->rule_table}.{$bao->rule_field}.{$bao->rule_weight}"] = $query; | |
170 | } | |
171 | } | |
172 | } | |
173 | ||
174 | // if there are no rules in this rule group | |
175 | // add an empty query fulfilling the pattern | |
176 | if (!$queries) { | |
177 | $queries = array('SELECT 0 id1, 0 id2, 0 weight LIMIT 0'); | |
178 | $this->noRules = TRUE; | |
179 | } | |
180 | ||
181 | return $queries; | |
182 | } | |
183 | ||
00be9182 | 184 | public function fillTable() { |
6a488035 TO |
185 | // get the list of queries handy |
186 | $tableQueries = $this->tableQuery(); | |
187 | ||
188 | if ($this->params && !$this->noRules) { | |
189 | $tempTableQuery = "CREATE TEMPORARY TABLE dedupe (id1 int, weight int, UNIQUE UI_id1 (id1)) ENGINE=MyISAM"; | |
353ffa53 TO |
190 | $insertClause = "INSERT INTO dedupe (id1, weight)"; |
191 | $groupByClause = "GROUP BY id1"; | |
192 | $dupeCopyJoin = " JOIN dedupe_copy ON dedupe_copy.id1 = t1.column WHERE "; | |
6a488035 TO |
193 | } |
194 | else { | |
195 | $tempTableQuery = "CREATE TEMPORARY TABLE dedupe (id1 int, id2 int, weight int, UNIQUE UI_id1_id2 (id1, id2)) ENGINE=MyISAM"; | |
353ffa53 TO |
196 | $insertClause = "INSERT INTO dedupe (id1, id2, weight)"; |
197 | $groupByClause = "GROUP BY id1, id2"; | |
198 | $dupeCopyJoin = " JOIN dedupe_copy ON dedupe_copy.id1 = t1.column AND dedupe_copy.id2 = t2.column WHERE "; | |
6a488035 TO |
199 | } |
200 | $patternColumn = '/t1.(\w+)/'; | |
201 | $exclWeightSum = array(); | |
202 | ||
203 | // create temp table | |
204 | $dao = new CRM_Core_DAO(); | |
205 | $dao->query($tempTableQuery); | |
206 | ||
6a488035 TO |
207 | CRM_Utils_Hook::dupeQuery($this, 'table', $tableQueries); |
208 | ||
209 | while (!empty($tableQueries)) { | |
210 | list($isInclusive, $isDie) = self::isQuerySetInclusive($tableQueries, $this->threshold, $exclWeightSum); | |
211 | ||
212 | if ($isInclusive) { | |
213 | // order queries by table count | |
214 | self::orderByTableCount($tableQueries); | |
215 | ||
216 | $weightSum = array_sum($exclWeightSum); | |
217 | $searchWithinDupes = !empty($exclWeightSum) ? 1 : 0; | |
218 | ||
219 | while (!empty($tableQueries)) { | |
220 | // extract the next query ( and weight ) to be executed | |
221 | $fieldWeight = array_keys($tableQueries); | |
222 | $fieldWeight = $fieldWeight[0]; | |
353ffa53 | 223 | $query = array_shift($tableQueries); |
6a488035 TO |
224 | |
225 | if ($searchWithinDupes) { | |
226 | // get prepared to search within already found dupes if $searchWithinDupes flag is set | |
227 | $dao->query("DROP TEMPORARY TABLE IF EXISTS dedupe_copy"); | |
228 | $dao->query("CREATE TEMPORARY TABLE dedupe_copy SELECT * FROM dedupe WHERE weight >= {$weightSum}"); | |
229 | $dao->free(); | |
230 | ||
231 | preg_match($patternColumn, $query, $matches); | |
232 | $query = str_replace(' WHERE ', str_replace('column', $matches[1], $dupeCopyJoin), $query); | |
233 | } | |
234 | $searchWithinDupes = 1; | |
235 | ||
236 | // construct and execute the intermediate query | |
237 | $query = "{$insertClause} {$query} {$groupByClause} ON DUPLICATE KEY UPDATE weight = weight + VALUES(weight)"; | |
238 | $dao->query($query); | |
239 | ||
240 | // FIXME: we need to be more acurate with affected rows, especially for insert vs duplicate insert. | |
241 | // And that will help optimize further. | |
242 | $affectedRows = $dao->affectedRows(); | |
243 | $dao->free(); | |
244 | ||
245 | // In an inclusive situation, failure of any query means no further processing - | |
246 | if ($affectedRows == 0) { | |
247 | // reset to make sure no further execution is done. | |
248 | $tableQueries = array(); | |
249 | break; | |
250 | } | |
251 | $weightSum = substr($fieldWeight, strrpos($fieldWeight, '.') + 1) + $weightSum; | |
252 | } | |
253 | // An exclusive situation - | |
254 | } | |
255 | elseif (!$isDie) { | |
256 | // since queries are already sorted by weights, we can continue as is | |
257 | $fieldWeight = array_keys($tableQueries); | |
258 | $fieldWeight = $fieldWeight[0]; | |
353ffa53 TO |
259 | $query = array_shift($tableQueries); |
260 | $query = "{$insertClause} {$query} {$groupByClause} ON DUPLICATE KEY UPDATE weight = weight + VALUES(weight)"; | |
6a488035 TO |
261 | $dao->query($query); |
262 | if ($dao->affectedRows() >= 1) { | |
263 | $exclWeightSum[] = substr($fieldWeight, strrpos($fieldWeight, '.') + 1); | |
264 | } | |
265 | $dao->free(); | |
266 | } | |
267 | else { | |
268 | // its a die situation | |
269 | break; | |
270 | } | |
271 | } | |
272 | } | |
273 | ||
e0ef6999 | 274 | /** |
4f1f1f2a CW |
275 | * Function to determine if a given query set contains inclusive or exclusive set of weights. |
276 | * The function assumes that the query set is already ordered by weight in desc order. | |
e0ef6999 EM |
277 | * @param $tableQueries |
278 | * @param $threshold | |
279 | * @param array $exclWeightSum | |
280 | * | |
281 | * @return array | |
282 | */ | |
00be9182 | 283 | public static function isQuerySetInclusive($tableQueries, $threshold, $exclWeightSum = array()) { |
6a488035 TO |
284 | $input = array(); |
285 | foreach ($tableQueries as $key => $query) { | |
286 | $input[] = substr($key, strrpos($key, '.') + 1); | |
287 | } | |
288 | ||
289 | if (!empty($exclWeightSum)) { | |
290 | $input = array_merge($input, $exclWeightSum); | |
291 | rsort($input); | |
292 | } | |
293 | ||
294 | if (count($input) == 1) { | |
295 | return array(FALSE, $input[0] < $threshold); | |
296 | } | |
297 | ||
298 | $totalCombinations = 0; | |
299 | for ($i = 0; $i < count($input); $i++) { | |
300 | $combination = array($input[$i]); | |
301 | if (array_sum($combination) >= $threshold) { | |
302 | $totalCombinations++; | |
303 | continue; | |
304 | } | |
305 | for ($j = $i + 1; $j < count($input); $j++) { | |
306 | $combination[] = $input[$j]; | |
307 | if (array_sum($combination) >= $threshold) { | |
308 | $totalCombinations++; | |
309 | } | |
310 | } | |
311 | } | |
312 | return array($totalCombinations == 1, $totalCombinations <= 0); | |
313 | } | |
314 | ||
e0ef6999 | 315 | /** |
4f1f1f2a | 316 | * sort queries by number of records for the table associated with them |
e0ef6999 EM |
317 | * @param $tableQueries |
318 | */ | |
00be9182 | 319 | public static function orderByTableCount(&$tableQueries) { |
6a488035 TO |
320 | static $tableCount = array(); |
321 | ||
322 | $tempArray = array(); | |
323 | foreach ($tableQueries as $key => $query) { | |
324 | $table = explode(".", $key); | |
325 | $table = $table[0]; | |
326 | if (!array_key_exists($table, $tableCount)) { | |
327 | $query = "SELECT COUNT(*) FROM {$table}"; | |
328 | $tableCount[$table] = CRM_Core_DAO::singleValueQuery($query); | |
329 | } | |
330 | $tempArray[$key] = $tableCount[$table]; | |
331 | } | |
332 | ||
333 | asort($tempArray); | |
334 | foreach ($tempArray as $key => $count) { | |
335 | $tempArray[$key] = $tableQueries[$key]; | |
336 | } | |
337 | $tableQueries = $tempArray; | |
338 | } | |
339 | ||
340 | /** | |
341 | * Return the SQL query for getting only the interesting results out of the dedupe table. | |
342 | * | |
343 | * @$checkPermission boolean $params a flag to indicate if permission should be considered. | |
344 | * default is to always check permissioning but public pages for example might not want | |
345 | * permission to be checked for anonymous users. Refer CRM-6211. We might be beaking | |
346 | * Multi-Site dedupe for public pages. | |
6a488035 | 347 | */ |
00be9182 | 348 | public function thresholdQuery($checkPermission = TRUE) { |
6a488035 TO |
349 | $this->_aclFrom = ''; |
350 | // CRM-6603: anonymous dupechecks side-step ACLs | |
351 | $this->_aclWhere = ' AND is_deleted = 0 '; | |
352 | ||
353 | if ($this->params && !$this->noRules) { | |
354 | if ($checkPermission) { | |
355 | list($this->_aclFrom, $this->_aclWhere) = CRM_Contact_BAO_Contact_Permission::cacheClause('civicrm_contact'); | |
356 | $this->_aclWhere = $this->_aclWhere ? "AND {$this->_aclWhere}" : ''; | |
357 | } | |
358 | $query = "SELECT dedupe.id1 as id | |
359 | FROM dedupe JOIN civicrm_contact ON dedupe.id1 = civicrm_contact.id {$this->_aclFrom} | |
360 | WHERE contact_type = '{$this->contact_type}' {$this->_aclWhere} | |
361 | AND weight >= {$this->threshold}"; | |
362 | } | |
363 | else { | |
364 | $this->_aclWhere = ' AND c1.is_deleted = 0 AND c2.is_deleted = 0'; | |
365 | if ($checkPermission) { | |
366 | list($this->_aclFrom, $this->_aclWhere) = CRM_Contact_BAO_Contact_Permission::cacheClause(array('c1', 'c2')); | |
367 | $this->_aclWhere = $this->_aclWhere ? "AND {$this->_aclWhere}" : ''; | |
368 | } | |
369 | $query = "SELECT dedupe.id1, dedupe.id2, dedupe.weight | |
d4c8a770 | 370 | FROM dedupe JOIN civicrm_contact c1 ON dedupe.id1 = c1.id |
6a488035 TO |
371 | JOIN civicrm_contact c2 ON dedupe.id2 = c2.id {$this->_aclFrom} |
372 | LEFT JOIN civicrm_dedupe_exception exc ON dedupe.id1 = exc.contact_id1 AND dedupe.id2 = exc.contact_id2 | |
d4c8a770 | 373 | WHERE c1.contact_type = '{$this->contact_type}' AND |
6a488035 TO |
374 | c2.contact_type = '{$this->contact_type}' {$this->_aclWhere} |
375 | AND weight >= {$this->threshold} AND exc.contact_id1 IS NULL"; | |
376 | } | |
377 | ||
378 | CRM_Utils_Hook::dupeQuery($this, 'threshold', $query); | |
379 | return $query; | |
380 | } | |
381 | ||
382 | /** | |
dc195289 | 383 | * find fields related to a rule group. |
6a488035 | 384 | * |
c490a46a | 385 | * @param array $params |
77b97be7 | 386 | * |
a6c01b45 CW |
387 | * @return array |
388 | * (rule field => weight) array and threshold associated to rule group | |
6a488035 | 389 | */ |
00be9182 | 390 | public static function dedupeRuleFieldsWeight($params) { |
353ffa53 | 391 | $rgBao = new CRM_Dedupe_BAO_RuleGroup(); |
6a488035 | 392 | $rgBao->contact_type = $params['contact_type']; |
03390e26 | 393 | if (CRM_Utils_Array::value('id', $params)) { |
394 | // accept an ID if provided | |
395 | $rgBao->id = $params['id']; | |
396 | } | |
397 | else { | |
398 | $rgBao->used = $params['used']; | |
399 | } | |
6a488035 TO |
400 | $rgBao->find(TRUE); |
401 | ||
402 | $ruleBao = new CRM_Dedupe_BAO_Rule(); | |
403 | $ruleBao->dedupe_rule_group_id = $rgBao->id; | |
404 | $ruleBao->find(); | |
405 | $ruleFields = array(); | |
406 | while ($ruleBao->fetch()) { | |
407 | $ruleFields[$ruleBao->rule_field] = $ruleBao->rule_weight; | |
408 | } | |
409 | ||
410 | return array($ruleFields, $rgBao->threshold); | |
411 | } | |
412 | ||
03390e26 | 413 | /** |
414 | * Get all of the combinations of fields that would work with a rule | |
415 | */ | |
00be9182 | 416 | public static function combos($rgFields, $threshold, &$combos, $running = array()) { |
03390e26 | 417 | foreach ($rgFields as $rgField => $weight) { |
418 | unset($rgFields[$rgField]); | |
419 | $diff = $threshold - $weight; | |
420 | $runningnow = $running; | |
421 | $runningnow[] = $rgField; | |
422 | if ($diff > 0) { | |
423 | self::combos($rgFields, $diff, $combos, $runningnow); | |
424 | } | |
425 | else { | |
426 | $combos[] = $runningnow; | |
427 | } | |
428 | } | |
429 | } | |
430 | ||
6a488035 TO |
431 | /** |
432 | * Get an array of rule group id to rule group name | |
433 | * for all th groups for that contactType. If contactType | |
434 | * not specified, do it for all | |
435 | * | |
98997235 TO |
436 | * @param string $contactType |
437 | * Individual, Household or Organization. | |
6a488035 | 438 | * |
6a488035 | 439 | * |
a6c01b45 CW |
440 | * @return array |
441 | * id => "nice name" of rule group | |
6a488035 | 442 | */ |
00be9182 | 443 | public static function getByType($contactType = NULL) { |
6a488035 TO |
444 | $dao = new CRM_Dedupe_DAO_RuleGroup(); |
445 | ||
446 | if ($contactType) { | |
447 | $dao->contact_type = $contactType; | |
448 | } | |
449 | ||
450 | $dao->find(); | |
451 | $result = array(); | |
452 | while ($dao->fetch()) { | |
389bcebf | 453 | $title = !empty($dao->title) ? $dao->title : (!empty($dao->name) ? $dao->name : $dao->contact_type); |
01a93ecd DL |
454 | |
455 | $name = "$title - {$dao->used}"; | |
6a488035 TO |
456 | $result[$dao->id] = $name; |
457 | } | |
458 | return $result; | |
459 | } | |
460 | } |