Merge pull request #15185 from eileenmcnaughton/dedupe10
[civicrm-core.git] / CRM / Dedupe / Finder.php
1 <?php
2 /*
3 +--------------------------------------------------------------------+
4 | CiviCRM version 5 |
5 +--------------------------------------------------------------------+
6 | Copyright CiviCRM LLC (c) 2004-2019 |
7 +--------------------------------------------------------------------+
8 | This file is a part of CiviCRM. |
9 | |
10 | CiviCRM is free software; you can copy, modify, and distribute it |
11 | under the terms of the GNU Affero General Public License |
12 | Version 3, 19 November 2007 and the CiviCRM Licensing Exception. |
13 | |
14 | CiviCRM is distributed in the hope that it will be useful, but |
15 | WITHOUT ANY WARRANTY; without even the implied warranty of |
16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
17 | See the GNU Affero General Public License for more details. |
18 | |
19 | You should have received a copy of the GNU Affero General Public |
20 | License and the CiviCRM Licensing Exception along |
21 | with this program; if not, contact CiviCRM LLC |
22 | at info[AT]civicrm[DOT]org. If you have questions about the |
23 | GNU Affero General Public License or the licensing of CiviCRM, |
24 | see the CiviCRM license FAQ at http://civicrm.org/licensing |
25 +--------------------------------------------------------------------+
26 */
27
28 /**
29 *
30 * @package CRM
31 * @copyright CiviCRM LLC (c) 2004-2019
32 * $Id$
33 *
34 */
35
36 /**
37 * The CiviCRM duplicate discovery engine is based on an
38 * algorithm designed by David Strauss <david@fourkitchens.com>.
39 */
class CRM_Dedupe_Finder {

  /**
   * Find possible duplicate contacts using a specific dedupe rule group.
   *
   * Loads the rule group, asks it to build its working table (fillTable()),
   * runs its threshold query and finally runs its table-drop query.
   *
   * @param int $rgid
   *   Rule group id.
   * @param array $cids
   *   Contact ids to limit the search to (handed to the rule group as
   *   its contactIds property).
   * @param bool $checkPermissions
   *   Respect logged in user permissions (passed to thresholdQuery()).
   *
   * @return array
   *   List of [cid1, cid2, weight] dupe triples (NOT keyed by contact id).
   *
   * @throws Exception
   */
  public static function dupes($rgid, $cids = [], $checkPermissions = TRUE) {
    $rgBao = new CRM_Dedupe_BAO_RuleGroup();
    $rgBao->id = $rgid;
    $rgBao->contactIds = $cids;
    if (!$rgBao->find(TRUE)) {
      CRM_Core_Error::fatal("Dedupe rule not found for selected contacts");
    }

    $rgBao->fillTable();
    $dao = new CRM_Core_DAO();
    $dao->query($rgBao->thresholdQuery($checkPermissions));
    $dupes = [];
    while ($dao->fetch()) {
      $dupes[] = [$dao->id1, $dao->id2, $dao->weight];
    }
    // Clean up the rule group's working table once the results are read.
    $dao->query($rgBao->tableDropQuery());

    return $dupes;
  }

  /**
   * Return an array of possible dupes, based on the provided array of
   * params, using the default rule group for the given contact type and
   * usage (or the rule group identified by $ruleGroupID when it exists for
   * that contact type).
   *
   * $params['check_permission'] is a boolean flag to indicate if permission
   * should be considered. The default is to always check permissioning but
   * public pages for example might not want permission to be checked for
   * anonymous users. Refer CRM-6211. We might be breaking Multi-Site dedupe
   * for public pages.
   *
   * @param array $params
   *   Array of params of the form $params[$table][$field] == $value.
   * @param string $ctype
   *   Contact type to match against.
   * @param string $used
   *   Dedupe rule group usage ('Unsupervised' or 'Supervised' or 'General').
   * @param array $except
   *   Array of contacts that shouldn't be considered dupes.
   * @param int $ruleGroupID
   *   The id of the dedupe rule we should be using.
   *
   * @return array
   *   Matching contact ids, minus those listed in $except. Keys are those
   *   left over by array_diff() and are therefore not guaranteed sequential.
   */
  public static function dupesByParams(
    $params,
    $ctype,
    $used = 'Unsupervised',
    $except = [],
    $ruleGroupID = NULL
  ) {
    // If $params is empty there is zero reason to proceed.
    if (!$params) {
      return [];
    }
    $checkPermission = CRM_Utils_Array::value('check_permission', $params, TRUE);
    // This may no longer be required - see https://github.com/civicrm/civicrm-core/pull/13176
    $params = array_filter($params);

    // Prefer the explicitly requested rule group, provided it exists for
    // this contact type.
    $foundByID = FALSE;
    if ($ruleGroupID) {
      $rgBao = new CRM_Dedupe_BAO_RuleGroup();
      $rgBao->id = $ruleGroupID;
      $rgBao->contact_type = $ctype;
      if ($rgBao->find(TRUE)) {
        $foundByID = TRUE;
      }
    }

    // Otherwise fall back to the rule group for the contact type + usage.
    if (!$foundByID) {
      $rgBao = new CRM_Dedupe_BAO_RuleGroup();
      $rgBao->contact_type = $ctype;
      $rgBao->used = $used;
      if (!$rgBao->find(TRUE)) {
        CRM_Core_Error::fatal("$used rule for $ctype does not exist");
      }
    }

    // phone_numeric is compared as digits only, so strip everything else.
    if (isset($params['civicrm_phone']['phone_numeric'])) {
      $orig = $params['civicrm_phone']['phone_numeric'];
      $params['civicrm_phone']['phone_numeric'] = preg_replace('/[^\d]/', '', $orig);
    }
    $rgBao->params = $params;
    $rgBao->fillTable();
    $dao = new CRM_Core_DAO();
    $dao->query($rgBao->thresholdQuery($checkPermission));
    $dupes = [];
    while ($dao->fetch()) {
      // Skip rows without a usable contact id.
      if (isset($dao->id) && $dao->id) {
        $dupes[] = $dao->id;
      }
    }
    $dao->query($rgBao->tableDropQuery());
    return array_diff($dupes, $except);
  }

  /**
   * Find possible dupes among the members of the given contact group.
   *
   * @param int $rgid
   *   Rule group id.
   * @param int $gid
   *   Contact group id.
   * @param int $searchLimit
   *   Limit for the number of contacts to be used for comparison.
   *   The search methodology finds all matches for the searchedContacts so this limits
   *   the number of searched contacts, not the matches found.
   *
   * @return array
   *   List of [cid1, cid2, weight] dupe triples; empty when the group
   *   lookup returns no members.
   */
  public static function dupesInGroup($rgid, $gid, $searchLimit = 0) {
    $cids = array_keys(CRM_Contact_BAO_Group::getMember($gid, TRUE, $searchLimit));
    if (!empty($cids)) {
      return self::dupes($rgid, $cids);
    }
    return [];
  }

  /**
   * A hackish function needed to massage CRM_Contact_Form_$ctype::formRule()
   * object into a valid $params array for dedupe.
   *
   * The submitted values are flattened, a handful of legacy field names are
   * mapped to their column names, location/"Primary" postfixes are stripped
   * from the keys, and the result is regrouped per table using the fields
   * reported by CRM_Dedupe_BAO_RuleGroup::supportedFields().
   *
   * @param array $fields
   *   Contact structure from formRule().
   * @param string $ctype
   *   Contact type of the given contact.
   *
   * @return array
   *   Valid $params array for dedupe, of the form $params[$table][$field].
   */
  public static function formatParams($fields, $ctype) {
    $flat = [];
    CRM_Utils_Array::flatten($fields, $flat);

    // FIXME: This may no longer be necessary - check inputs
    $replace_these = [
      'individual_prefix' => 'prefix_id',
      'individual_suffix' => 'suffix_id',
      'gender' => 'gender_id',
    ];
    foreach (['individual_suffix', 'individual_prefix', 'gender'] as $name) {
      if (!empty($fields[$name])) {
        $flat[$replace_these[$name]] = $flat[$name];
        unset($flat[$name]);
      }
    }

    // handle {birth,deceased}_date
    foreach (['birth_date', 'deceased_date'] as $date) {
      if (!empty($fields[$date])) {
        $flat[$date] = $fields[$date];
        if (is_array($flat[$date])) {
          $flat[$date] = CRM_Utils_Date::format($flat[$date]);
        }
        $flat[$date] = CRM_Utils_Date::processDate($flat[$date]);
      }
    }

    if (!empty($flat['contact_source'])) {
      $flat['source'] = $flat['contact_source'];
      unset($flat['contact_source']);
    }

    // handle preferred_communication_method: keep the keys of the entries
    // whose value is '1' and store them as a separator-wrapped string.
    if (!empty($fields['preferred_communication_method'])) {
      $methods = array_intersect($fields['preferred_communication_method'], ['1']);
      $methods = array_keys($methods);
      sort($methods);
      if ($methods) {
        $flat['preferred_communication_method'] = CRM_Core_DAO::VALUE_SEPARATOR . implode(CRM_Core_DAO::VALUE_SEPARATOR, $methods) . CRM_Core_DAO::VALUE_SEPARATOR;
      }
    }

    // handle custom data
    $tree = CRM_Core_BAO_CustomGroup::getTree($ctype, NULL, NULL, -1);
    CRM_Core_BAO_CustomGroup::postProcess($tree, $fields, TRUE);
    foreach ($tree as $key => $cg) {
      // Only integer-keyed entries are treated as custom groups here.
      if (!is_int($key)) {
        continue;
      }
      foreach ($cg['fields'] as $cf) {
        $flat[$cf['column_name']] = CRM_Utils_Array::value('data', $cf['customValue']);
      }
    }

    // if the key is dotted, keep just the last part of it
    foreach ($flat as $key => $value) {
      if (substr_count($key, '.')) {
        $last = explode('.', $key);
        $last = array_pop($last);
        // make sure the first occurrence is kept, not the last
        if (!isset($flat[$last])) {
          $flat[$last] = $value;
        }
        unset($flat[$key]);
      }
    }

    // drop the -digit (and -Primary, for CRM-3902) postfixes (so event registration's $flat['email-5'] becomes $flat['email'])
    // FIXME: CRM-5026 should be fixed here; the below clobbers all address info; we should split off address fields and match
    // the -digit to civicrm_address.location_type_id and -Primary to civicrm_address.is_primary
    foreach ($flat as $key => $value) {
      $matches = [];
      // The first alternative uses \d+ (one or more digits). The previous
      // [\d+] was a character class matching a single digit or a literal
      // '+', so a key like 'phone-Primary-12' was reduced to 'phone-Primary'
      // rather than 'phone'.
      if (preg_match('/(.*)-(Primary-\d+)$|(.*)-(\d+|Primary)$/', $key, $matches)) {
        // Drop the empty groups of the alternative that did not match, so
        // index 1 is always the key without its postfix.
        $return = array_values(array_filter($matches));
        // make sure the first occurrence is kept, not the last
        $flat[$return[1]] = empty($flat[$return[1]]) ? $value : $flat[$return[1]];
        unset($flat[$key]);
      }
    }

    $params = [];
    $supportedFields = CRM_Dedupe_BAO_RuleGroup::supportedFields($ctype);
    if (is_array($supportedFields)) {
      foreach ($supportedFields as $table => $fields) {
        if ($table == 'civicrm_address') {
          // for matching on civicrm_address fields, we also need the location_type_id
          $fields['location_type_id'] = '';
          // FIXME: we also need to do some hacking for id and name fields, see CRM-3902's comments
          $fixes = [
            'address_name' => 'name',
            'country' => 'country_id',
            'state_province' => 'state_province_id',
            'county' => 'county_id',
          ];
          foreach ($fixes as $orig => $target) {
            if (!empty($flat[$orig])) {
              $params[$table][$target] = $flat[$orig];
            }
          }
        }
        if ($table == 'civicrm_phone') {
          $fixes = [
            'phone' => 'phone_numeric',
          ];
          foreach ($fixes as $orig => $target) {
            if (!empty($flat[$orig])) {
              $params[$table][$target] = $flat[$orig];
            }
          }
        }
        foreach ($fields as $field => $title) {
          if (!empty($flat[$field])) {
            $params[$table][$field] = $flat[$field];
          }
        }
      }
    }
    return $params;
  }

  /**
   * Parse duplicate pairs into a standardised array and store in the prev_next_cache.
   *
   * @param array $foundDupes
   *   List of [cid1, cid2, weight] triples, as returned by dupes().
   * @param string $cacheKeyString
   *   Cache key the pairs are stored under.
   *
   * @return array
   *   Dupe pairs with the keys
   *   -srcID
   *   -srcName
   *   -dstID
   *   -dstName
   *   -weight
   *   -canMerge
   *   An empty array is returned when $foundDupes is empty.
   *
   * @throws CRM_Core_Exception
   */
  public static function parseAndStoreDupePairs($foundDupes, $cacheKeyString) {
    $mainContacts = [];
    // Nothing to look up or store; bailing out here also avoids building
    // an invalid "IN ()" SQL clause.
    if (empty($foundDupes)) {
      return $mainContacts;
    }

    // Collect the distinct contact ids; the int cast keeps anything
    // non-numeric out of the SQL string built below.
    $cids = [];
    foreach ($foundDupes as $dupe) {
      $cids[(int) $dupe[0]] = 1;
      $cids[(int) $dupe[1]] = 1;
    }
    $cidString = implode(', ', array_keys($cids));

    $dao = CRM_Core_DAO::executeQuery("SELECT id, display_name FROM civicrm_contact WHERE id IN ($cidString) ORDER BY sort_name");
    $displayNames = [];
    while ($dao->fetch()) {
      $displayNames[$dao->id] = $dao->display_name;
    }

    $userId = CRM_Core_Session::getLoggedInContactID();
    foreach ($foundDupes as $dupes) {
      $srcID = $dupes[1];
      $dstID = $dupes[0];
      // The logged in user should never be the src (ie. the contact to be removed).
      // Loose comparison on purpose: the two ids may differ in type.
      if ($srcID == $userId) {
        $srcID = $dstID;
        $dstID = $userId;
      }

      $row = [
        'dstID' => $dstID,
        // A contact missing from the lookup gets a NULL name instead of
        // triggering an undefined index notice.
        'dstName' => isset($displayNames[$dstID]) ? $displayNames[$dstID] : NULL,
        'srcID' => $srcID,
        'srcName' => isset($displayNames[$srcID]) ? $displayNames[$srcID] : NULL,
        'weight' => $dupes[2],
        'canMerge' => TRUE,
      ];
      $mainContacts[] = $row;

      $data = CRM_Core_DAO::escapeString(serialize($row));
      CRM_Core_BAO_PrevNextCache::setItem('civicrm_contact', $dstID, $srcID, $cacheKeyString, $data);
    }
    return $mainContacts;
  }

}