INFRA-132 - CRM/Dedupe - phpcbf
[civicrm-core.git] / CRM / Dedupe / Finder.php
CommitLineData
6a488035
TO
1<?php
2/*
3 +--------------------------------------------------------------------+
39de6fd5 4 | CiviCRM version 4.6 |
6a488035 5 +--------------------------------------------------------------------+
06b69b18 6 | Copyright CiviCRM LLC (c) 2004-2014 |
6a488035
TO
7 +--------------------------------------------------------------------+
8 | This file is a part of CiviCRM. |
9 | |
10 | CiviCRM is free software; you can copy, modify, and distribute it |
11 | under the terms of the GNU Affero General Public License |
12 | Version 3, 19 November 2007 and the CiviCRM Licensing Exception. |
13 | |
14 | CiviCRM is distributed in the hope that it will be useful, but |
15 | WITHOUT ANY WARRANTY; without even the implied warranty of |
16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
17 | See the GNU Affero General Public License for more details. |
18 | |
19 | You should have received a copy of the GNU Affero General Public |
20 | License and the CiviCRM Licensing Exception along |
21 | with this program; if not, contact CiviCRM LLC |
22 | at info[AT]civicrm[DOT]org. If you have questions about the |
23 | GNU Affero General Public License or the licensing of CiviCRM, |
24 | see the CiviCRM license FAQ at http://civicrm.org/licensing |
25 +--------------------------------------------------------------------+
26*/
27
28/**
29 *
30 * @package CRM
06b69b18 31 * @copyright CiviCRM LLC (c) 2004-2014
6a488035
TO
32 * $Id$
33 *
34 */
35
/**
 * The CiviCRM duplicate discovery engine is based on an
 * algorithm designed by David Strauss <david@fourkitchens.com>.
 *
 * This class is a collection of static entry points that load a dedupe rule
 * group (CRM_Dedupe_BAO_RuleGroup), ask it to run its queries, and collect
 * the matching contact ids.
 */
class CRM_Dedupe_Finder {

  /**
   * Return the possible dupes found by a rule group, optionally limited to
   * dupes of the given contacts.
   *
   * @param int $rgid
   *   Rule group id.
   * @param array $cids
   *   Contact ids to limit the search to.
   *
   * @return array
   *   List of array(id1, id2, weight) dupe triples, in the order the
   *   threshold query returned them.
   */
  public static function dupes($rgid, $cids = array()) {
    $rgBao = new CRM_Dedupe_BAO_RuleGroup();
    $rgBao->id = $rgid;
    $rgBao->contactIds = $cids;
    if (!$rgBao->find(TRUE)) {
      CRM_Core_Error::fatal("Dedupe rule not found for selected contacts");
    }

    // NOTE(review): fillTable() presumably populates a working table that
    // thresholdQuery() reads and tableDropQuery() removes - confirm in
    // CRM_Dedupe_BAO_RuleGroup.
    $rgBao->fillTable();
    $dao = new CRM_Core_DAO();
    $dao->query($rgBao->thresholdQuery());
    $dupes = array();
    while ($dao->fetch()) {
      $dupes[] = array($dao->id1, $dao->id2, $dao->weight);
    }
    $dao->query($rgBao->tableDropQuery());

    return $dupes;
  }

  /**
   * Return an array of possible dupes, based on the provided array of
   * params, using the default rule group for the given contact type and
   * usage.
   *
   * check_permission is a boolean flag to indicate if permission should be considered.
   * default is to always check permissioning but public pages for example might not want
   * permission to be checked for anonymous users. Refer CRM-6211. We might be breaking
   * Multi-Site dedupe for public pages.
   *
   * @param array $params
   *   Array of params of the form $params[$table][$field] == $value.
   *   An optional $params['check_permission'] flag (default TRUE) is passed
   *   on to the rule group's threshold query.
   * @param string $ctype
   *   Contact type to match against.
   * @param string $used
   *   Dedupe rule group usage ('Unsupervised' or 'Supervised' or 'General').
   *   Only consulted when no rule group is found via $ruleGroupID.
   * @param array $except
   *   Array of contacts that shouldn't be considered dupes.
   * @param int $ruleGroupID
   *   The id of the dedupe rule we should be using.
   *
   * @return array
   *   Matching contact ids, re-indexed from 0 after $except is removed.
   */
  public static function dupesByParams(
    $params,
    $ctype,
    $used = 'Unsupervised',
    $except = array(),
    $ruleGroupID = NULL
  ) {
    // If $params is empty there is zero reason to proceed.
    if (!$params) {
      return array();
    }

    // Prefer the explicitly requested rule group, provided it exists for
    // this contact type.
    $foundByID = FALSE;
    if ($ruleGroupID) {
      $rgBao = new CRM_Dedupe_BAO_RuleGroup();
      $rgBao->id = $ruleGroupID;
      $rgBao->contact_type = $ctype;
      if ($rgBao->find(TRUE)) {
        $foundByID = TRUE;
      }
    }

    // Otherwise fall back to the rule group registered for this contact
    // type and usage.
    if (!$foundByID) {
      $rgBao = new CRM_Dedupe_BAO_RuleGroup();
      $rgBao->contact_type = $ctype;
      $rgBao->used = $used;
      if (!$rgBao->find(TRUE)) {
        CRM_Core_Error::fatal("$used rule for $ctype does not exist");
      }
    }
    $params['check_permission'] = CRM_Utils_Array::value('check_permission', $params, TRUE);

    $rgBao->params = $params;
    $rgBao->fillTable();
    $dao = new CRM_Core_DAO();
    $dao->query($rgBao->thresholdQuery($params['check_permission']));
    $dupes = array();
    while ($dao->fetch()) {
      if (isset($dao->id) && $dao->id) {
        $dupes[] = $dao->id;
      }
    }
    $dao->query($rgBao->tableDropQuery());

    // array_diff() preserves keys; re-index so the first remaining match is
    // always at index 0 even when $except removed earlier entries.
    return array_values(array_diff($dupes, $except));
  }

  /**
   * Return the possible dupes among the members of the given group.
   *
   * @param int $rgid
   *   Rule group id.
   * @param int $gid
   *   Contact group id (currently, works only with non-smart groups).
   *
   * @return array
   *   List of array(id1, id2, weight) dupe triples; empty when the group
   *   has no members.
   */
  public static function dupesInGroup($rgid, $gid) {
    $cids = array_keys(CRM_Contact_BAO_Group::getMember($gid));
    if (empty($cids)) {
      // Do not call dupes() with an empty contact list.
      // NOTE(review): presumably an empty list would leave the search
      // unrestricted - confirm in CRM_Dedupe_BAO_RuleGroup.
      return array();
    }
    return self::dupes($rgid, $cids);
  }

  /**
   * Return dupes of a given contact, using the default rule group (of a provided usage).
   *
   * @param int $cid
   *   Contact id of the given contact.
   * @param string $used
   *   Dedupe rule group usage ('Unsupervised' or 'Supervised' or 'General').
   * @param string $ctype
   *   Contact type of the given contact; looked up from the contact record
   *   when not provided.
   *
   * @return array
   *   Array of dupe contact_ids.
   */
  public static function dupesOfContact($cid, $used = 'Unsupervised', $ctype = NULL) {
    // if not provided, fetch the contact type from the database
    if (!$ctype) {
      $dao = new CRM_Contact_DAO_Contact();
      $dao->id = $cid;
      if (!$dao->find(TRUE)) {
        CRM_Core_Error::fatal("contact id of $cid does not exist");
      }
      $ctype = $dao->contact_type;
    }
    $rgBao = new CRM_Dedupe_BAO_RuleGroup();
    $rgBao->used = $used;
    $rgBao->contact_type = $ctype;
    if (!$rgBao->find(TRUE)) {
      CRM_Core_Error::fatal("$used rule for $ctype does not exist");
    }
    $dupes = self::dupes($rgBao->id, array($cid));

    // Each triple holds the pair in either order; pick the id that is not
    // $cid. Loose comparison is deliberate so a numeric-string id matches
    // an integer $cid.
    $result = array();
    foreach ($dupes as $dupe) {
      if ($dupe[0] == $cid) {
        $result[] = $dupe[1];
      }
      elseif ($dupe[1] == $cid) {
        $result[] = $dupe[0];
      }
    }
    return $result;
  }

  /**
   * A hackish function needed to massage CRM_Contact_Form_$ctype::formRule()
   * object into a valid $params array for dedupe
   *
   * The submitted values are flattened, a few legacy field names are mapped
   * to their column names, key suffixes are stripped, and finally only the
   * fields reported by CRM_Dedupe_BAO_RuleGroup::supportedFields() are kept,
   * grouped by table.
   *
   * @param array $fields
   *   Contact structure from formRule().
   * @param string $ctype
   *   Contact type of the given contact.
   *
   * @return array
   *   Valid $params array for dedupe, of the form $params[$table][$field].
   */
  public static function formatParams($fields, $ctype) {
    $flat = array();
    CRM_Utils_Array::flatten($fields, $flat);

    // FIXME: This may no longer be necessary - check inputs
    $replace_these = array(
      'individual_prefix' => 'prefix_id',
      'individual_suffix' => 'suffix_id',
      'gender' => 'gender_id',
    );
    foreach ($replace_these as $name => $target) {
      // Guard on $flat (the array actually read) rather than $fields, so an
      // array-valued submission cannot trigger an undefined index.
      if (!empty($flat[$name])) {
        $flat[$target] = $flat[$name];
        unset($flat[$name]);
      }
    }

    // handle {birth,deceased}_date
    foreach (array('birth_date', 'deceased_date') as $date) {
      if (!empty($fields[$date])) {
        $flat[$date] = $fields[$date];
        if (is_array($flat[$date])) {
          $flat[$date] = CRM_Utils_Date::format($flat[$date]);
        }
        $flat[$date] = CRM_Utils_Date::processDate($flat[$date]);
      }
    }

    if (!empty($flat['contact_source'])) {
      $flat['source'] = $flat['contact_source'];
      unset($flat['contact_source']);
    }

    // handle preferred_communication_method: keep the keys whose value is
    // '1' and join them, sorted, wrapped in the DAO value separator.
    if (!empty($fields['preferred_communication_method'])
      && is_array($fields['preferred_communication_method'])
    ) {
      $methods = array_keys(array_intersect($fields['preferred_communication_method'], array('1')));
      sort($methods);
      if ($methods) {
        $flat['preferred_communication_method'] = CRM_Core_DAO::VALUE_SEPARATOR . implode(CRM_Core_DAO::VALUE_SEPARATOR, $methods) . CRM_Core_DAO::VALUE_SEPARATOR;
      }
    }

    // handle custom data
    $tree = CRM_Core_BAO_CustomGroup::getTree($ctype, CRM_Core_DAO::$_nullObject, NULL, -1);
    CRM_Core_BAO_CustomGroup::postProcess($tree, $fields, TRUE);
    foreach ($tree as $key => $cg) {
      // Only integer-keyed entries are processed.
      // NOTE(review): presumably these are the custom groups and the
      // others are metadata - confirm against getTree().
      if (!is_int($key)) {
        continue;
      }
      foreach ($cg['fields'] as $cf) {
        $flat[$cf['column_name']] = CRM_Utils_Array::value('data', $cf['customValue']);
      }
    }

    // if the key is dotted, keep just the last part of it
    foreach ($flat as $key => $value) {
      if (substr_count($key, '.')) {
        $last = explode('.', $key);
        $last = array_pop($last);
        // make sure the first occurrence is kept, not the last
        if (!isset($flat[$last])) {
          $flat[$last] = $value;
        }
        unset($flat[$key]);
      }
    }

    // drop the -digit (and -Primary, for CRM-3902) postfixes (so event registration's $flat['email-5'] becomes $flat['email'])
    // FIXME: CRM-5026 should be fixed here; the below clobbers all address info; we should split off address fields and match
    // the -digit to civicrm_address.location_type_id and -Primary to civicrm_address.is_primary
    // NOTE(review): unlike the dotted-key pass above, the LAST suffixed key wins here.
    foreach ($flat as $key => $value) {
      $matches = array();
      if (preg_match('/(.*)-(\d+|Primary)$/', $key, $matches)) {
        $flat[$matches[1]] = $value;
        unset($flat[$key]);
      }
    }

    $params = array();
    $supportedFields = CRM_Dedupe_BAO_RuleGroup::supportedFields($ctype);
    if (is_array($supportedFields)) {
      // $tableFields, not $fields: avoid shadowing the method parameter.
      foreach ($supportedFields as $table => $tableFields) {
        if ($table === 'civicrm_address') {
          // for matching on civicrm_address fields, we also need the location_type_id
          $tableFields['location_type_id'] = '';
          // FIXME: we also need to do some hacking for id and name fields, see CRM-3902's comments
          $fixes = array(
            'address_name' => 'name',
            'country' => 'country_id',
            'state_province' => 'state_province_id',
            'county' => 'county_id',
          );
          foreach ($fixes as $orig => $target) {
            if (!empty($flat[$orig])) {
              $params[$table][$target] = $flat[$orig];
            }
          }
        }
        foreach ($tableFields as $field => $title) {
          if (!empty($flat[$field])) {
            $params[$table][$field] = $flat[$field];
          }
        }
      }
    }
    return $params;
  }

}