Merge pull request #12306 from JO0st/dev-core-166
[civicrm-core.git] / CRM / Dedupe / Finder.php
CommitLineData
6a488035
TO
1<?php
2/*
3 +--------------------------------------------------------------------+
fee14197 4 | CiviCRM version 5 |
6a488035 5 +--------------------------------------------------------------------+
8c9251b3 6 | Copyright CiviCRM LLC (c) 2004-2018 |
6a488035
TO
7 +--------------------------------------------------------------------+
8 | This file is a part of CiviCRM. |
9 | |
10 | CiviCRM is free software; you can copy, modify, and distribute it |
11 | under the terms of the GNU Affero General Public License |
12 | Version 3, 19 November 2007 and the CiviCRM Licensing Exception. |
13 | |
14 | CiviCRM is distributed in the hope that it will be useful, but |
15 | WITHOUT ANY WARRANTY; without even the implied warranty of |
16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
17 | See the GNU Affero General Public License for more details. |
18 | |
19 | You should have received a copy of the GNU Affero General Public |
20 | License and the CiviCRM Licensing Exception along |
21 | with this program; if not, contact CiviCRM LLC |
22 | at info[AT]civicrm[DOT]org. If you have questions about the |
23 | GNU Affero General Public License or the licensing of CiviCRM, |
24 | see the CiviCRM license FAQ at http://civicrm.org/licensing |
25 +--------------------------------------------------------------------+
d25dd0ee 26 */
6a488035
TO
27
28/**
29 *
30 * @package CRM
8c9251b3 31 * @copyright CiviCRM LLC (c) 2004-2018
6a488035
TO
32 * $Id$
33 *
34 */
35
36/**
37 * The CiviCRM duplicate discovery engine is based on an
38 * algorithm designed by David Strauss <david@fourkitchens.com>.
39 */
40class CRM_Dedupe_Finder {
41
/**
 * Find duplicate contact pairs using the given dedupe rule group.
 *
 * When no contact ids are supplied and a non-empty $searchLimit is given,
 * the set of contacts to search is loaded through the Contact.get API
 * (restricted to the rule group's contact type and capped at $searchLimit).
 *
 * @param int $rgid
 *   Rule group id.
 * @param array $cids
 *   Contact ids to limit the search to.
 * @param bool $checkPermissions
 *   Respect logged in user permissions.
 * @param int $searchLimit
 *   Limit for the number of contacts to be used for comparison.
 *   This caps the number of searched contacts, not the number of matches found.
 *
 * @return array
 *   Array of (cid1, cid2, weight) dupe triples.
 *
 * @throws CiviCRM_API3_Exception
 * @throws Exception
 */
public static function dupes($rgid, $cids = array(), $checkPermissions = TRUE, $searchLimit = 0) {
  $ruleGroup = new CRM_Dedupe_BAO_RuleGroup();
  $ruleGroup->id = $rgid;
  $ruleGroup->contactIds = $cids;
  if (!$ruleGroup->find(TRUE)) {
    CRM_Core_Error::fatal("Dedupe rule not found for selected contacts");
  }

  // Only fall back to an API-selected contact set when the caller gave no ids but did give a limit.
  $useLimitedSet = empty($ruleGroup->contactIds) && !empty($searchLimit);
  if ($useLimitedSet) {
    $apiParams = array(
      'return' => 'id',
      'contact_type' => $ruleGroup->contact_type,
      'options' => array('limit' => $searchLimit),
    );
    $apiResult = civicrm_api3('Contact', 'get', $apiParams);
    $ruleGroup->contactIds = array_keys($apiResult['values']);
  }

  $ruleGroup->fillTable();

  $query = new CRM_Core_DAO();
  $query->query($ruleGroup->thresholdQuery($checkPermissions));
  $pairs = array();
  while ($query->fetch()) {
    $pairs[] = array($query->id1, $query->id2, $query->weight);
  }

  // Clean up the table populated by fillTable().
  $query->query($ruleGroup->tableDropQuery());

  return $pairs;
}
92
/**
 * Look up possible duplicates of a not-yet-saved contact described by $params.
 *
 * The rule group is taken from $ruleGroupID when it is given and exists for
 * the contact type; otherwise the rule group matching $ctype and $used is used.
 *
 * $params['check_permission'] is a boolean flag indicating whether permissions
 * should be considered. It defaults to TRUE, but public pages, for example,
 * might not want permissions checked for anonymous users. Refer CRM-6211.
 * We might be breaking Multi-Site dedupe for public pages.
 *
 * @param array $params
 *   Array of params of the form $params[$table][$field] == $value.
 * @param string $ctype
 *   Contact type to match against.
 * @param string $used
 *   Dedupe rule group usage ('Unsupervised' or 'Supervised' or 'General').
 * @param array $except
 *   Array of contacts that shouldn't be considered dupes.
 * @param int $ruleGroupID
 *   The id of the dedupe rule we should be using.
 *
 * @return array
 *   Matching contact ids.
 */
public static function dupesByParams(
  $params,
  $ctype,
  $used = 'Unsupervised',
  $except = array(),
  $ruleGroupID = NULL
) {
  // If $params is empty there is zero reason to proceed.
  if (!$params) {
    return array();
  }

  // Prefer the explicitly requested rule group, when it can be loaded.
  $rgBao = NULL;
  if ($ruleGroupID) {
    $requested = new CRM_Dedupe_BAO_RuleGroup();
    $requested->id = $ruleGroupID;
    $requested->contact_type = $ctype;
    if ($requested->find(TRUE)) {
      $rgBao = $requested;
    }
  }

  // Otherwise fall back to the rule group for this contact type and usage.
  if ($rgBao === NULL) {
    $rgBao = new CRM_Dedupe_BAO_RuleGroup();
    $rgBao->contact_type = $ctype;
    $rgBao->used = $used;
    if (!$rgBao->find(TRUE)) {
      CRM_Core_Error::fatal("$used rule for $ctype does not exist");
    }
  }

  $params['check_permission'] = CRM_Utils_Array::value('check_permission', $params, TRUE);

  // Strip every non-digit character from the numeric phone before matching.
  if (isset($params['civicrm_phone']['phone_numeric'])) {
    $params['civicrm_phone']['phone_numeric'] = preg_replace(
      '/[^\d]/',
      '',
      $params['civicrm_phone']['phone_numeric']
    );
  }

  $rgBao->params = $params;
  $rgBao->fillTable();

  $query = new CRM_Core_DAO();
  $query->query($rgBao->thresholdQuery($params['check_permission']));
  $matches = array();
  while ($query->fetch()) {
    if (!empty($query->id)) {
      $matches[] = $query->id;
    }
  }
  $query->query($rgBao->tableDropQuery());

  return array_diff($matches, $except);
}
166
/**
 * Find duplicate contact pairs among the members of a contact group.
 *
 * @param int $rgid
 *   Rule group id.
 * @param int $gid
 *   Contact group id.
 * @param int $searchLimit
 *   Limit for the number of contacts to be used for comparison.
 *   This caps the number of searched contacts, not the number of matches found.
 *
 * @return array
 *   Array of (cid1, cid2, weight) dupe triples; empty when the group yields no members.
 */
public static function dupesInGroup($rgid, $gid, $searchLimit = 0) {
  $members = CRM_Contact_BAO_Group::getMember($gid, TRUE, $searchLimit);
  $memberIds = array_keys($members);

  // No members means nothing to compare.
  if (empty($memberIds)) {
    return array();
  }

  return self::dupes($rgid, $memberIds);
}
190
6a488035
TO
/**
 * A hackish function needed to massage CRM_Contact_Form_$ctype::formRule()
 * object into a valid $params array for dedupe.
 *
 * The input is flattened, a handful of legacy field names are renamed, dates
 * and custom data are normalised, dotted keys and location postfixes
 * ("-<digits>", "-Primary", "-Primary-<digits>") are stripped, and finally only
 * the fields supported for dedupe on $ctype are copied into the result,
 * grouped by table.
 *
 * @param array $fields
 *   Contact structure from formRule().
 * @param string $ctype
 *   Contact type of the given contact.
 *
 * @return array
 *   Valid $params array for dedupe, of the form $params[$table][$field] = $value.
 */
public static function formatParams($fields, $ctype) {
  $flat = array();
  CRM_Utils_Array::flatten($fields, $flat);

  // FIXME: This may no longer be necessary - check inputs
  $replace_these = array(
    'individual_prefix' => 'prefix_id',
    'individual_suffix' => 'suffix_id',
    'gender' => 'gender_id',
  );
  foreach (array('individual_suffix', 'individual_prefix', 'gender') as $name) {
    if (!empty($fields[$name])) {
      $flat[$replace_these[$name]] = $flat[$name];
      unset($flat[$name]);
    }
  }

  // handle {birth,deceased}_date
  foreach (array(
    'birth_date',
    'deceased_date',
  ) as $date) {
    if (!empty($fields[$date])) {
      $flat[$date] = $fields[$date];
      if (is_array($flat[$date])) {
        $flat[$date] = CRM_Utils_Date::format($flat[$date]);
      }
      $flat[$date] = CRM_Utils_Date::processDate($flat[$date]);
    }
  }

  if (!empty($flat['contact_source'])) {
    $flat['source'] = $flat['contact_source'];
    unset($flat['contact_source']);
  }

  // handle preferred_communication_method: keep the keys whose value is '1',
  // sorted, and wrap them in the DAO value separator.
  if (!empty($fields['preferred_communication_method'])) {
    $methods = array_intersect($fields['preferred_communication_method'], array('1'));
    $methods = array_keys($methods);
    sort($methods);
    if ($methods) {
      $flat['preferred_communication_method'] = CRM_Core_DAO::VALUE_SEPARATOR . implode(CRM_Core_DAO::VALUE_SEPARATOR, $methods) . CRM_Core_DAO::VALUE_SEPARATOR;
    }
  }

  // handle custom data
  $tree = CRM_Core_BAO_CustomGroup::getTree($ctype, NULL, NULL, -1);
  CRM_Core_BAO_CustomGroup::postProcess($tree, $fields, TRUE);
  foreach ($tree as $key => $cg) {
    // Only integer-keyed entries are treated as custom groups here.
    if (!is_int($key)) {
      continue;
    }
    foreach ($cg['fields'] as $cf) {
      $flat[$cf['column_name']] = CRM_Utils_Array::value('data', $cf['customValue']);
    }
  }

  // if the key is dotted, keep just the last part of it
  foreach ($flat as $key => $value) {
    if (substr_count($key, '.')) {
      $last = explode('.', $key);
      $last = array_pop($last);
      // make sure the first occurrence is kept, not the last
      if (!isset($flat[$last])) {
        $flat[$last] = $value;
      }
      unset($flat[$key]);
    }
  }

  // drop the -digit (and -Primary, for CRM-3902) postfixes (so event registration's $flat['email-5'] becomes $flat['email'])
  // FIXME: CRM-5026 should be fixed here; the below clobbers all address info; we should split off address fields and match
  // the -digit to civicrm_address.location_type_id and -Primary to civicrm_address.is_primary
  foreach ($flat as $key => $value) {
    $matches = array();
    // The base name is captured lazily so that the longest postfix wins
    // ("phone-Primary-12" => "phone", not "phone-Primary"). The previous
    // pattern used the character class [\d+], which only accepted a single
    // digit (or a literal "+") after "Primary-".
    if (preg_match('/^(.+?)-(?:Primary-\d+|\d+|Primary)$/', $key, $matches)) {
      $flat[$matches[1]] = $value;
      unset($flat[$key]);
    }
  }

  $params = array();
  $supportedFields = CRM_Dedupe_BAO_RuleGroup::supportedFields($ctype);
  if (is_array($supportedFields)) {
    // $tableFields is a local copy; it deliberately does not reuse the $fields parameter name.
    foreach ($supportedFields as $table => $tableFields) {
      if ($table == 'civicrm_address') {
        // for matching on civicrm_address fields, we also need the location_type_id
        $tableFields['location_type_id'] = '';
        // FIXME: we also need to do some hacking for id and name fields, see CRM-3902's comments
        $fixes = array(
          'address_name' => 'name',
          'country' => 'country_id',
          'state_province' => 'state_province_id',
          'county' => 'county_id',
        );
        foreach ($fixes as $orig => $target) {
          if (!empty($flat[$orig])) {
            $params[$table][$target] = $flat[$orig];
          }
        }
      }
      if ($table == 'civicrm_phone') {
        $fixes = array(
          'phone' => 'phone_numeric',
        );
        foreach ($fixes as $orig => $target) {
          if (!empty($flat[$orig])) {
            $params[$table][$target] = $flat[$orig];
          }
        }
      }
      foreach ($tableFields as $field => $title) {
        if (!empty($flat[$field])) {
          $params[$table][$field] = $flat[$field];
        }
      }
    }
  }
  return $params;
}
96025800 325
/**
 * Parse duplicate pairs into a standardised array and store in the prev_next_cache.
 *
 * @param array $foundDupes
 *   List of dupe triples, each indexed as (0 => contact id, 1 => contact id, 2 => weight).
 * @param string $cacheKeyString
 *   Cache key under which the pairs are stored.
 *
 * @return array
 *   Dupe pairs, each with the keys
 *   -srcID
 *   -srcName
 *   -dstID
 *   -dstName
 *   -weight
 *   -canMerge
 *   An empty array is returned (and nothing is stored) when $foundDupes is empty.
 *
 * @throws CRM_Core_Exception
 */
public static function parseAndStoreDupePairs($foundDupes, $cacheKeyString) {
  // With no pairs there is nothing to look up or cache; returning early also
  // avoids building an invalid "IN ()" clause below.
  if (empty($foundDupes)) {
    return array();
  }

  // Collect the distinct contact ids appearing on either side of any pair.
  $cids = array();
  foreach ($foundDupes as $dupe) {
    $cids[$dupe[0]] = 1;
    $cids[$dupe[1]] = 1;
  }
  $cidString = implode(', ', array_keys($cids));

  $dao = CRM_Core_DAO::executeQuery("SELECT id, display_name FROM civicrm_contact WHERE id IN ($cidString) ORDER BY sort_name");
  $displayNames = array();
  while ($dao->fetch()) {
    $displayNames[$dao->id] = $dao->display_name;
  }

  $userId = CRM_Core_Session::getLoggedInContactID();
  $mainContacts = array();
  $values = array();
  foreach ($foundDupes as $dupes) {
    $srcID = $dupes[1];
    $dstID = $dupes[0];
    // The logged in user should never be the src (ie. the contact to be removed).
    if ($srcID == $userId) {
      $srcID = $dstID;
      $dstID = $userId;
    }

    // A contact id with no row in the lookup yields a NULL name rather than an undefined-index notice.
    $row = array(
      'dstID' => $dstID,
      'dstName' => isset($displayNames[$dstID]) ? $displayNames[$dstID] : NULL,
      'srcID' => $srcID,
      'srcName' => isset($displayNames[$srcID]) ? $displayNames[$srcID] : NULL,
      'weight' => $dupes[2],
      'canMerge' => TRUE,
    );
    $mainContacts[] = $row;

    $data = CRM_Core_DAO::escapeString(serialize($row));
    $values[] = " ( 'civicrm_contact', $dstID, $srcID, '$cacheKeyString', '$data' ) ";
  }
  CRM_Core_BAO_PrevNextCache::setItem($values);
  return $mainContacts;
}
382
6a488035 383}