Use a custom character map instead of name normalizer

Instead of normalizing names during caching, add a function that that maps accented characters to their alphabetic equivalents using switch statements. This character map is used in the on-the-fly matching algorithm. This speeds up the caching process(11k contacts) from 800-1500ms to about 600-1000ms since we no longer perform the normalizing step during caching. Bug: 6977981 Change-Id: I98dfc3cba00258bb7ff03b346eab7ca7dc1065be
author: Yorke Lee <yorkelee@google.com> 2012-11-08 13:13:24 -0800
committer: Yorke Lee <yorkelee@google.com> 2012-11-09 14:05:51 -0800
commit: 678aa6530062e4db2afc5eadea82681e4ba49a1a (patch)
tree: 639715708219ee67f6c03fd2cb3a1bf8bb2fe8bc
parent: ac834156f56a4d4edcf207865f8235647fa26980 (diff)
3 files changed, 337 insertions, 5 deletions
diff --git a/src/com/android/dialer/dialpad/SmartDialLoaderTask.java b/src/com/android/dialer/dialpad/SmartDialLoaderTask.java
index ec99d8a14..74f2e5e05 100644
--- a/src/com/android/dialer/dialpad/SmartDialLoaderTask.java
+++ b/src/com/android/dialer/dialpad/SmartDialLoaderTask.java
@@ -50,13 +50,11 @@ public class SmartDialLoaderTask extends AsyncTask<String, Integer, List<SmartDi
 
     private class Contact {
         final String mDisplayName;
-        final String mStrippedDisplayName;
         final String mLookupKey;
         final long mId;
 
         public Contact(long id, String displayName, String lookupKey) {
             mDisplayName = displayName;
-            mStrippedDisplayName = SmartDialNameMatcher.stripDiacritics(displayName);
             mLookupKey = lookupKey;
             mId = id;
         }
@@ -223,9 +221,9 @@ public class SmartDialLoaderTask extends AsyncTask<String, Integer, List<SmartDi
         int count = 0;
         for (int i = 0; i < sContactsCache.size(); i++) {
             final Contact contact = sContactsCache.get(i);
-            final String strippedDisplayName = contact.mStrippedDisplayName;
+            final String displayName = contact.mDisplayName;
 
-            if (!mNameMatcher.matches(strippedDisplayName)) {
+            if (!mNameMatcher.matches(displayName)) {
                 continue;
             }
             // Matched; create SmartDialEntry.
diff --git a/src/com/android/dialer/dialpad/SmartDialNameMatcher.java b/src/com/android/dialer/dialpad/SmartDialNameMatcher.java
index 4a340f987..1253a566d 100644
--- a/src/com/android/dialer/dialpad/SmartDialNameMatcher.java
+++ b/src/com/android/dialer/dialpad/SmartDialNameMatcher.java
@@ -46,6 +46,339 @@ public class SmartDialNameMatcher {
         '9', '9', '9', '9' // W,X,Y,Z -> 9
     };
 
+    /*
+     * The switch statement in this function was generated using the python code:
+     * from unidecode import unidecode
+     * for i in range(192, 564):
+     *     char = unichr(i)
+     *     decoded = unidecode(char)
+     *     # Unicode characters that decompose into multiple characters i.e.
+     *     #  into ss are not supported for now
+     *     if (len(decoded) == 1 and decoded.isalpha()):
+     *         print "case '" + char + "': return '" + unidecode(char) +  "';"
+     *
+     * This gives us a way to map characters containing accents/diacritics to their
+     * alphabetic equivalents. The unidecode library can be found at:
+     * http://pypi.python.org/pypi/Unidecode/0.04.1
+     */
+    private static char remapAccentedChars(char c) {
+        switch (c) {
+            case 'À': return 'A';
+            case 'Á': return 'A';
+            case 'Â': return 'A';
+            case 'Ã': return 'A';
+            case 'Ä': return 'A';
+            case 'Å': return 'A';
+            case 'Ç': return 'C';
+            case 'È': return 'E';
+            case 'É': return 'E';
+            case 'Ê': return 'E';
+            case 'Ë': return 'E';
+            case 'Ì': return 'I';
+            case 'Í': return 'I';
+            case 'Î': return 'I';
+            case 'Ï': return 'I';
+            case 'Ð': return 'D';
+            case 'Ñ': return 'N';
+            case 'Ò': return 'O';
+            case 'Ó': return 'O';
+            case 'Ô': return 'O';
+            case 'Õ': return 'O';
+            case 'Ö': return 'O';
+            case '×': return 'x';
+            case 'Ø': return 'O';
+            case 'Ù': return 'U';
+            case 'Ú': return 'U';
+            case 'Û': return 'U';
+            case 'Ü': return 'U';
+            case 'Ý': return 'U';
+            case 'à': return 'a';
+            case 'á': return 'a';
+            case 'â': return 'a';
+            case 'ã': return 'a';
+            case 'ä': return 'a';
+            case 'å': return 'a';
+            case 'ç': return 'c';
+            case 'è': return 'e';
+            case 'é': return 'e';
+            case 'ê': return 'e';
+            case 'ë': return 'e';
+            case 'ì': return 'i';
+            case 'í': return 'i';
+            case 'î': return 'i';
+            case 'ï': return 'i';
+            case 'ð': return 'd';
+            case 'ñ': return 'n';
+            case 'ò': return 'o';
+            case 'ó': return 'o';
+            case 'ô': return 'o';
+            case 'õ': return 'o';
+            case 'ö': return 'o';
+            case 'ø': return 'o';
+            case 'ù': return 'u';
+            case 'ú': return 'u';
+            case 'û': return 'u';
+            case 'ü': return 'u';
+            case 'ý': return 'y';
+            case 'ÿ': return 'y';
+            case 'Ā': return 'A';
+            case 'ā': return 'a';
+            case 'Ă': return 'A';
+            case 'ă': return 'a';
+            case 'Ą': return 'A';
+            case 'ą': return 'a';
+            case 'Ć': return 'C';
+            case 'ć': return 'c';
+            case 'Ĉ': return 'C';
+            case 'ĉ': return 'c';
+            case 'Ċ': return 'C';
+            case 'ċ': return 'c';
+            case 'Č': return 'C';
+            case 'č': return 'c';
+            case 'Ď': return 'D';
+            case 'ď': return 'd';
+            case 'Đ': return 'D';
+            case 'đ': return 'd';
+            case 'Ē': return 'E';
+            case 'ē': return 'e';
+            case 'Ĕ': return 'E';
+            case 'ĕ': return 'e';
+            case 'Ė': return 'E';
+            case 'ė': return 'e';
+            case 'Ę': return 'E';
+            case 'ę': return 'e';
+            case 'Ě': return 'E';
+            case 'ě': return 'e';
+            case 'Ĝ': return 'G';
+            case 'ĝ': return 'g';
+            case 'Ğ': return 'G';
+            case 'ğ': return 'g';
+            case 'Ġ': return 'G';
+            case 'ġ': return 'g';
+            case 'Ģ': return 'G';
+            case 'ģ': return 'g';
+            case 'Ĥ': return 'H';
+            case 'ĥ': return 'h';
+            case 'Ħ': return 'H';
+            case 'ħ': return 'h';
+            case 'Ĩ': return 'I';
+            case 'ĩ': return 'i';
+            case 'Ī': return 'I';
+            case 'ī': return 'i';
+            case 'Ĭ': return 'I';
+            case 'ĭ': return 'i';
+            case 'Į': return 'I';
+            case 'į': return 'i';
+            case 'İ': return 'I';
+            case 'ı': return 'i';
+            case 'Ĵ': return 'J';
+            case 'ĵ': return 'j';
+            case 'Ķ': return 'K';
+            case 'ķ': return 'k';
+            case 'ĸ': return 'k';
+            case 'Ĺ': return 'L';
+            case 'ĺ': return 'l';
+            case 'Ļ': return 'L';
+            case 'ļ': return 'l';
+            case 'Ľ': return 'L';
+            case 'ľ': return 'l';
+            case 'Ŀ': return 'L';
+            case 'ŀ': return 'l';
+            case 'Ł': return 'L';
+            case 'ł': return 'l';
+            case 'Ń': return 'N';
+            case 'ń': return 'n';
+            case 'Ņ': return 'N';
+            case 'ņ': return 'n';
+            case 'Ň': return 'N';
+            case 'ň': return 'n';
+            case 'Ō': return 'O';
+            case 'ō': return 'o';
+            case 'Ŏ': return 'O';
+            case 'ŏ': return 'o';
+            case 'Ő': return 'O';
+            case 'ő': return 'o';
+            case 'Ŕ': return 'R';
+            case 'ŕ': return 'r';
+            case 'Ŗ': return 'R';
+            case 'ŗ': return 'r';
+            case 'Ř': return 'R';
+            case 'ř': return 'r';
+            case 'Ś': return 'S';
+            case 'ś': return 's';
+            case 'Ŝ': return 'S';
+            case 'ŝ': return 's';
+            case 'Ş': return 'S';
+            case 'ş': return 's';
+            case 'Š': return 'S';
+            case 'š': return 's';
+            case 'Ţ': return 'T';
+            case 'ţ': return 't';
+            case 'Ť': return 'T';
+            case 'ť': return 't';
+            case 'Ŧ': return 'T';
+            case 'ŧ': return 't';
+            case 'Ũ': return 'U';
+            case 'ũ': return 'u';
+            case 'Ū': return 'U';
+            case 'ū': return 'u';
+            case 'Ŭ': return 'U';
+            case 'ŭ': return 'u';
+            case 'Ů': return 'U';
+            case 'ů': return 'u';
+            case 'Ű': return 'U';
+            case 'ű': return 'u';
+            case 'Ų': return 'U';
+            case 'ų': return 'u';
+            case 'Ŵ': return 'W';
+            case 'ŵ': return 'w';
+            case 'Ŷ': return 'Y';
+            case 'ŷ': return 'y';
+            case 'Ÿ': return 'Y';
+            case 'Ź': return 'Z';
+            case 'ź': return 'z';
+            case 'Ż': return 'Z';
+            case 'ż': return 'z';
+            case 'Ž': return 'Z';
+            case 'ž': return 'z';
+            case 'ſ': return 's';
+            case 'ƀ': return 'b';
+            case 'Ɓ': return 'B';
+            case 'Ƃ': return 'B';
+            case 'ƃ': return 'b';
+            case 'Ɔ': return 'O';
+            case 'Ƈ': return 'C';
+            case 'ƈ': return 'c';
+            case 'Ɖ': return 'D';
+            case 'Ɗ': return 'D';
+            case 'Ƌ': return 'D';
+            case 'ƌ': return 'd';
+            case 'ƍ': return 'd';
+            case 'Ɛ': return 'E';
+            case 'Ƒ': return 'F';
+            case 'ƒ': return 'f';
+            case 'Ɠ': return 'G';
+            case 'Ɣ': return 'G';
+            case 'Ɩ': return 'I';
+            case 'Ɨ': return 'I';
+            case 'Ƙ': return 'K';
+            case 'ƙ': return 'k';
+            case 'ƚ': return 'l';
+            case 'ƛ': return 'l';
+            case 'Ɯ': return 'W';
+            case 'Ɲ': return 'N';
+            case 'ƞ': return 'n';
+            case 'Ɵ': return 'O';
+            case 'Ơ': return 'O';
+            case 'ơ': return 'o';
+            case 'Ƥ': return 'P';
+            case 'ƥ': return 'p';
+            case 'ƫ': return 't';
+            case 'Ƭ': return 'T';
+            case 'ƭ': return 't';
+            case 'Ʈ': return 'T';
+            case 'Ư': return 'U';
+            case 'ư': return 'u';
+            case 'Ʊ': return 'Y';
+            case 'Ʋ': return 'V';
+            case 'Ƴ': return 'Y';
+            case 'ƴ': return 'y';
+            case 'Ƶ': return 'Z';
+            case 'ƶ': return 'z';
+            case 'ƿ': return 'w';
+            case 'Ǎ': return 'A';
+            case 'ǎ': return 'a';
+            case 'Ǐ': return 'I';
+            case 'ǐ': return 'i';
+            case 'Ǒ': return 'O';
+            case 'ǒ': return 'o';
+            case 'Ǔ': return 'U';
+            case 'ǔ': return 'u';
+            case 'Ǖ': return 'U';
+            case 'ǖ': return 'u';
+            case 'Ǘ': return 'U';
+            case 'ǘ': return 'u';
+            case 'Ǚ': return 'U';
+            case 'ǚ': return 'u';
+            case 'Ǜ': return 'U';
+            case 'ǜ': return 'u';
+            case 'Ǟ': return 'A';
+            case 'ǟ': return 'a';
+            case 'Ǡ': return 'A';
+            case 'ǡ': return 'a';
+            case 'Ǥ': return 'G';
+            case 'ǥ': return 'g';
+            case 'Ǧ': return 'G';
+            case 'ǧ': return 'g';
+            case 'Ǩ': return 'K';
+            case 'ǩ': return 'k';
+            case 'Ǫ': return 'O';
+            case 'ǫ': return 'o';
+            case 'Ǭ': return 'O';
+            case 'ǭ': return 'o';
+            case 'ǰ': return 'j';
+            case 'ǲ': return 'D';
+            case 'Ǵ': return 'G';
+            case 'ǵ': return 'g';
+            case 'Ƿ': return 'W';
+            case 'Ǹ': return 'N';
+            case 'ǹ': return 'n';
+            case 'Ǻ': return 'A';
+            case 'ǻ': return 'a';
+            case 'Ǿ': return 'O';
+            case 'ǿ': return 'o';
+            case 'Ȁ': return 'A';
+            case 'ȁ': return 'a';
+            case 'Ȃ': return 'A';
+            case 'ȃ': return 'a';
+            case 'Ȅ': return 'E';
+            case 'ȅ': return 'e';
+            case 'Ȇ': return 'E';
+            case 'ȇ': return 'e';
+            case 'Ȉ': return 'I';
+            case 'ȉ': return 'i';
+            case 'Ȋ': return 'I';
+            case 'ȋ': return 'i';
+            case 'Ȍ': return 'O';
+            case 'ȍ': return 'o';
+            case 'Ȏ': return 'O';
+            case 'ȏ': return 'o';
+            case 'Ȑ': return 'R';
+            case 'ȑ': return 'r';
+            case 'Ȓ': return 'R';
+            case 'ȓ': return 'r';
+            case 'Ȕ': return 'U';
+            case 'ȕ': return 'u';
+            case 'Ȗ': return 'U';
+            case 'ȗ': return 'u';
+            case 'Ș': return 'S';
+            case 'ș': return 's';
+            case 'Ț': return 'T';
+            case 'ț': return 't';
+            case 'Ȝ': return 'Y';
+            case 'ȝ': return 'y';
+            case 'Ȟ': return 'H';
+            case 'ȟ': return 'h';
+            case 'Ȥ': return 'Z';
+            case 'ȥ': return 'z';
+            case 'Ȧ': return 'A';
+            case 'ȧ': return 'a';
+            case 'Ȩ': return 'E';
+            case 'ȩ': return 'e';
+            case 'Ȫ': return 'O';
+            case 'ȫ': return 'o';
+            case 'Ȭ': return 'O';
+            case 'ȭ': return 'o';
+            case 'Ȯ': return 'O';
+            case 'ȯ': return 'o';
+            case 'Ȱ': return 'O';
+            case 'ȱ': return 'o';
+            case 'Ȳ': return 'Y';
+            case 'ȳ': return 'y';
+            default: return c;
+        }
+    }
+
     private final ArrayList<SmartDialMatchPosition> mMatchPositions = Lists.newArrayList();
 
     public SmartDialNameMatcher(String query) {
@@ -153,6 +486,8 @@ public class SmartDialNameMatcher {
         // Keep going until we reach the end of displayName
         while (nameStart < nameLength && queryStart < queryLength) {
             char ch = displayName.charAt(nameStart);
+            // Strip diacritics from accented characters if any
+            ch = remapAccentedChars(ch);
             if ((ch >= 'A') && (ch <= 'Z')) {
                 // Simply change the ascii code to the lower case version instead of using
                 // toLowerCase for efficiency
diff --git a/tests/src/com/android/dialer/SmartDialNameMatcherTest.java b/tests/src/com/android/dialer/SmartDialNameMatcherTest.java
index babae55fa..492e5b422 100644
--- a/tests/src/com/android/dialer/SmartDialNameMatcherTest.java
+++ b/tests/src/com/android/dialer/SmartDialNameMatcherTest.java
@@ -106,7 +106,6 @@ public class SmartDialNameMatcherTest extends TestCase {
         final SmartDialNameMatcher matcher = new SmartDialNameMatcher(query);
         final ArrayList<SmartDialMatchPosition> matchPositions =
                 new ArrayList<SmartDialMatchPosition>();
-        displayName = SmartDialNameMatcher.stripDiacritics(displayName);
         final boolean matches = matcher.matchesCombination(
                 displayName, query, matchPositions);
         Log.d(TAG, "query=" + query + "  text=" + displayName
author	Yorke Lee <yorkelee@google.com>	2012-11-08 13:13:24 -0800
committer	Yorke Lee <yorkelee@google.com>	2012-11-09 14:05:51 -0800
commit	678aa6530062e4db2afc5eadea82681e4ba49a1a (patch)
tree	639715708219ee67f6c03fd2cb3a1bf8bb2fe8bc
parent	ac834156f56a4d4edcf207865f8235647fa26980 (diff)