From: eileen Date: Wed, 27 Apr 2016 05:20:43 +0000 (+1200) Subject: CRM-18842 Dedupe query: remove OR join in favour of more performant UNION X-Git-Url: https://vcs.fsf.org/?a=commitdiff_plain;h=c232f0051e2d0bb26a4d0f7d07ba0b3f68dedf7a;p=civicrm-core.git CRM-18842 Dedupe query: remove OR join in favour of more performant UNION Unions are much faster than OR joins. This change took the length of the query to get the dedupes on a large database from 'as long as it took for the server to fall over' to less than one second on a small group of contacts This query is only affecting one path - ie Individuals - at the moment as I can only extend that as fast as I can write tests. --- diff --git a/CRM/Dedupe/BAO/QueryBuilder.php b/CRM/Dedupe/BAO/QueryBuilder.php index 484ce391c6..3daff16951 100644 --- a/CRM/Dedupe/BAO/QueryBuilder.php +++ b/CRM/Dedupe/BAO/QueryBuilder.php @@ -22,4 +22,28 @@ class CRM_Dedupe_BAO_QueryBuilder { } } + /** + * Append the contact-pair ordering filter to a dedupe query and, if a contact list is specified, adjust the query to ensure at least one contact of each pair is in that list. + * + * Doing an OR join here will lead to a server-killing unindexed query. However, a union will + * perform better, so a non-empty list yields two copies of $query joined by UNION: one requiring $strID1 to be in the list, the other requiring $strID2 to be in the list and $strID1 not to be. NOTE(review): the empty-list branch orders pairs with $strID1 < $strID2 while both UNION halves use $strID1 > $strID2 - confirm this difference is intentional. + * + * @param array $contactList Contact IDs to restrict to; joined with commas and interpolated into the SQL IN () lists without escaping. NOTE(review): presumably validated as integers by the caller - confirm. + * @param string $query Query text to which the "AND ..." conditions are appended, so it is expected to end inside its WHERE clause. + * @param string $strID1 SQL reference to the first contact's ID column. + * @param string $strID2 SQL reference to the second contact's ID column. + * + * @return string The query with the filter appended: a single statement when $contactList is empty, otherwise a UNION of two filtered copies of $query. + */ + protected static function filterQueryByContactList(array $contactList, $query, $strID1 = 'contact1.id', $strID2 = 'contact2.id') { + if (empty($contactList)) { + return $query . 
" AND ($strID1 < $strID2)"; + } + $contactIDs = implode(',', $contactList); + return "$query AND $strID1 IN ($contactIDs) AND $strID1 > $strID2 + UNION $query AND $strID1 > $strID2 AND $strID2 IN ($contactIDs) AND $strID1 NOT IN ($contactIDs) + "; + + } + } diff --git a/CRM/Dedupe/BAO/QueryBuilder/IndividualSupervised.php b/CRM/Dedupe/BAO/QueryBuilder/IndividualSupervised.php index 806f023e6c..e150f44fd0 100644 --- a/CRM/Dedupe/BAO/QueryBuilder/IndividualSupervised.php +++ b/CRM/Dedupe/BAO/QueryBuilder/IndividualSupervised.php @@ -53,7 +53,7 @@ class CRM_Dedupe_BAO_QueryBuilder_IndividualSupervised extends CRM_Dedupe_BAO_Qu * @return array */ public static function internal($rg) { - $query = " + $query = self::filterQueryByContactList($rg->contactIds, " SELECT contact1.id as id1, contact2.id as id2, {$rg->threshold} as weight FROM civicrm_contact as contact1 JOIN civicrm_email as email1 ON email1.contact_id=contact1.id @@ -63,8 +63,8 @@ class CRM_Dedupe_BAO_QueryBuilder_IndividualSupervised extends CRM_Dedupe_BAO_Qu JOIN civicrm_email as email2 ON email2.contact_id=contact2.id AND email1.email=email2.email - WHERE contact1.contact_type = 'Individual' - AND " . self::internalFilters($rg); + WHERE contact1.contact_type = 'Individual'"); + return array( "civicrm_contact.{$rg->name}.{$rg->threshold}" => $query, );