001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.codec.language; 019 020import org.apache.commons.codec.EncoderException; 021import org.apache.commons.codec.StringEncoder; 022 023/** 024 * Encodes a string into a Metaphone value. 025 * <p> 026 * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>. 027 * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere. 028 * </p> 029 * <p> 030 * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990, 031 * p 39.</CITE> 032 * </p> 033 * <p> 034 * Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl implementations: 035 * </p> 036 * <ul> 037 * <li><a href="http://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a> 038 * (broken link 4/30/2013) </li> 039 * <li><a href="https://metacpan.org/source/MSCHWERN/Text-Metaphone-1.96//Metaphone.pm">Text:Metaphone-1.96</a> 040 * (link checked 4/30/2013) </li> 041 * </ul> 042 * <p> 043 * They have had undocumented changes from the originally published algorithm. 044 * For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>. 045 * </p> 046 * <p> 047 * This class is conditionally thread-safe. 048 * The instance field for maximum code length is mutable {@link #setMaxCodeLen(int)} 049 * but is not volatile, and accesses are not synchronized. 050 * If an instance of the class is shared between threads, the caller needs to ensure that suitable synchronization 051 * is used to ensure safe publication of the value between threads, and must not invoke {@link #setMaxCodeLen(int)} 052 * after initial setup. 053 * </p> 054 */ 055public class Metaphone implements StringEncoder { 056 057 /** 058 * Five values in the English language 059 */ 060 private static final String VOWELS = "AEIOU"; 061 062 /** 063 * Variable used in Metaphone algorithm 064 */ 065 private static final String FRONTV = "EIY"; 066 067 /** 068 * Variable used in Metaphone algorithm 069 */ 070 private static final String VARSON = "CSPTG"; 071 072 /** 073 * The max code length for metaphone is 4 074 */ 075 private int maxCodeLen = 4; 076 077 /** 078 * Find the metaphone value of a String. This is similar to the 079 * soundex algorithm, but better at finding similar sounding words. 080 * All input is converted to upper case. 081 * Limitations: Input format is expected to be a single ASCII word 082 * with only characters in the A - Z range, no punctuation or numbers. 083 * 084 * @param txt String to find the metaphone code for 085 * @return A metaphone code corresponding to the String supplied 086 */ 087 public String metaphone(final String txt) { 088 boolean hard = false; 089 final int txtLength; 090 if (txt == null || (txtLength = txt.length()) == 0) { 091 return ""; 092 } 093 // single character is itself 094 if (txtLength == 1) { 095 return txt.toUpperCase(java.util.Locale.ENGLISH); 096 } 097 098 final char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray(); 099 100 final StringBuilder local = new StringBuilder(40); // manipulate 101 final StringBuilder code = new StringBuilder(10); // output 102 // handle initial 2 characters exceptions 103 switch(inwd[0]) { 104 case 'K': 105 case 'G': 106 case 'P': /* looking for KN, etc*/ 107 if (inwd[1] == 'N') { 108 local.append(inwd, 1, inwd.length - 1); 109 } else { 110 local.append(inwd); 111 } 112 break; 113 case 'A': /* looking for AE */ 114 if (inwd[1] == 'E') { 115 local.append(inwd, 1, inwd.length - 1); 116 } else { 117 local.append(inwd); 118 } 119 break; 120 case 'W': /* looking for WR or WH */ 121 if (inwd[1] == 'R') { // WR -> R 122 local.append(inwd, 1, inwd.length - 1); 123 break; 124 } 125 if (inwd[1] == 'H') { 126 local.append(inwd, 1, inwd.length - 1); 127 local.setCharAt(0, 'W'); // WH -> W 128 } else { 129 local.append(inwd); 130 } 131 break; 132 case 'X': /* initial X becomes S */ 133 inwd[0] = 'S'; 134 local.append(inwd); 135 break; 136 default: 137 local.append(inwd); 138 } // now local has working string with initials fixed 139 140 final int wdsz = local.length(); 141 int n = 0; 142 143 while (code.length() < this.getMaxCodeLen() && 144 n < wdsz ) { // max code size of 4 works well 145 final char symb = local.charAt(n); 146 // remove duplicate letters except C 147 if (symb != 'C' && isPreviousChar( local, n, symb ) ) { 148 n++; 149 } else { // not dup 150 switch(symb) { 151 case 'A': 152 case 'E': 153 case 'I': 154 case 'O': 155 case 'U': 156 if (n == 0) { 157 code.append(symb); 158 } 159 break; // only use vowel if leading char 160 case 'B': 161 if ( isPreviousChar(local, n, 'M') && 162 isLastChar(wdsz, n) ) { // B is silent if word ends in MB 163 break; 164 } 165 code.append(symb); 166 break; 167 case 'C': // lots of C special cases 168 /* discard if SCI, SCE or SCY */ 169 if ( isPreviousChar(local, n, 'S') && 170 !isLastChar(wdsz, n) && 171 FRONTV.indexOf(local.charAt(n + 1)) >= 0 ) { 172 break; 173 } 174 if (regionMatch(local, n, "CIA")) { // "CIA" -> X 175 code.append('X'); 176 break; 177 } 178 if (!isLastChar(wdsz, n) && 179 FRONTV.indexOf(local.charAt(n + 1)) >= 0) { 180 code.append('S'); 181 break; // CI,CE,CY -> S 182 } 183 if (isPreviousChar(local, n, 'S') && 184 isNextChar(local, n, 'H') ) { // SCH->sk 185 code.append('K'); 186 break; 187 } 188 if (isNextChar(local, n, 'H')) { // detect CH 189 if (n == 0 && 190 wdsz >= 3 && 191 isVowel(local,2) ) { // CH consonant -> K consonant 192 code.append('K'); 193 } else { 194 code.append('X'); // CHvowel -> X 195 } 196 } else { 197 code.append('K'); 198 } 199 break; 200 case 'D': 201 if (!isLastChar(wdsz, n + 1) && 202 isNextChar(local, n, 'G') && 203 FRONTV.indexOf(local.charAt(n + 2)) >= 0) { // DGE DGI DGY -> J 204 code.append('J'); n += 2; 205 } else { 206 code.append('T'); 207 } 208 break; 209 case 'G': // GH silent at end or before consonant 210 if (isLastChar(wdsz, n + 1) && 211 isNextChar(local, n, 'H')) { 212 break; 213 } 214 if (!isLastChar(wdsz, n + 1) && 215 isNextChar(local,n,'H') && 216 !isVowel(local,n+2)) { 217 break; 218 } 219 if (n > 0 && 220 ( regionMatch(local, n, "GN") || 221 regionMatch(local, n, "GNED") ) ) { 222 break; // silent G 223 } 224 // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true 225 hard = isPreviousChar(local, n, 'G'); 226 if (!isLastChar(wdsz, n) && 227 FRONTV.indexOf(local.charAt(n + 1)) >= 0 && 228 !hard) { 229 code.append('J'); 230 } else { 231 code.append('K'); 232 } 233 break; 234 case 'H': 235 if (isLastChar(wdsz, n)) { 236 break; // terminal H 237 } 238 if (n > 0 && 239 VARSON.indexOf(local.charAt(n - 1)) >= 0) { 240 break; 241 } 242 if (isVowel(local,n+1)) { 243 code.append('H'); // Hvowel 244 } 245 break; 246 case 'F': 247 case 'J': 248 case 'L': 249 case 'M': 250 case 'N': 251 case 'R': 252 code.append(symb); 253 break; 254 case 'K': 255 if (n > 0) { // not initial 256 if (!isPreviousChar(local, n, 'C')) { 257 code.append(symb); 258 } 259 } else { 260 code.append(symb); // initial K 261 } 262 break; 263 case 'P': 264 if (isNextChar(local,n,'H')) { 265 // PH -> F 266 code.append('F'); 267 } else { 268 code.append(symb); 269 } 270 break; 271 case 'Q': 272 code.append('K'); 273 break; 274 case 'S': 275 if (regionMatch(local,n,"SH") || 276 regionMatch(local,n,"SIO") || 277 regionMatch(local,n,"SIA")) { 278 code.append('X'); 279 } else { 280 code.append('S'); 281 } 282 break; 283 case 'T': 284 if (regionMatch(local,n,"TIA") || 285 regionMatch(local,n,"TIO")) { 286 code.append('X'); 287 break; 288 } 289 if (regionMatch(local,n,"TCH")) { 290 // Silent if in "TCH" 291 break; 292 } 293 // substitute numeral 0 for TH (resembles theta after all) 294 if (regionMatch(local,n,"TH")) { 295 code.append('0'); 296 } else { 297 code.append('T'); 298 } 299 break; 300 case 'V': 301 code.append('F'); break; 302 case 'W': 303 case 'Y': // silent if not followed by vowel 304 if (!isLastChar(wdsz,n) && 305 isVowel(local,n+1)) { 306 code.append(symb); 307 } 308 break; 309 case 'X': 310 code.append('K'); 311 code.append('S'); 312 break; 313 case 'Z': 314 code.append('S'); 315 break; 316 default: 317 // do nothing 318 break; 319 } // end switch 320 n++; 321 } // end else from symb != 'C' 322 if (code.length() > this.getMaxCodeLen()) { 323 code.setLength(this.getMaxCodeLen()); 324 } 325 } 326 return code.toString(); 327 } 328 329 private boolean isVowel(final StringBuilder string, final int index) { 330 return VOWELS.indexOf(string.charAt(index)) >= 0; 331 } 332 333 private boolean isPreviousChar(final StringBuilder string, final int index, final char c) { 334 boolean matches = false; 335 if( index > 0 && 336 index < string.length() ) { 337 matches = string.charAt(index - 1) == c; 338 } 339 return matches; 340 } 341 342 private boolean isNextChar(final StringBuilder string, final int index, final char c) { 343 boolean matches = false; 344 if( index >= 0 && 345 index < string.length() - 1 ) { 346 matches = string.charAt(index + 1) == c; 347 } 348 return matches; 349 } 350 351 private boolean regionMatch(final StringBuilder string, final int index, final String test) { 352 boolean matches = false; 353 if (index >= 0 && index + test.length() - 1 < string.length()) { 354 final String substring = string.substring(index, index + test.length()); 355 matches = substring.equals(test); 356 } 357 return matches; 358 } 359 360 private boolean isLastChar(final int wdsz, final int n) { 361 return n + 1 == wdsz; 362 } 363 364 /** 365 * Encodes an Object using the metaphone algorithm. This method 366 * is provided in order to satisfy the requirements of the 367 * Encoder interface, and will throw an EncoderException if the 368 * supplied object is not of type java.lang.String. 369 * 370 * @param obj Object to encode 371 * @return An object (or type java.lang.String) containing the 372 * metaphone code which corresponds to the String supplied. 373 * @throws EncoderException if the parameter supplied is not 374 * of type java.lang.String 375 */ 376 @Override 377 public Object encode(final Object obj) throws EncoderException { 378 if (!(obj instanceof String)) { 379 throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String"); 380 } 381 return metaphone((String) obj); 382 } 383 384 /** 385 * Encodes a String using the Metaphone algorithm. 386 * 387 * @param str String object to encode 388 * @return The metaphone code corresponding to the String supplied 389 */ 390 @Override 391 public String encode(final String str) { 392 return metaphone(str); 393 } 394 395 /** 396 * Tests is the metaphones of two strings are identical. 397 * 398 * @param str1 First of two strings to compare 399 * @param str2 Second of two strings to compare 400 * @return {@code true} if the metaphones of these strings are identical, 401 * {@code false} otherwise. 402 */ 403 public boolean isMetaphoneEqual(final String str1, final String str2) { 404 return metaphone(str1).equals(metaphone(str2)); 405 } 406 407 /** 408 * Returns the maxCodeLen. 409 * @return int 410 */ 411 public int getMaxCodeLen() { return this.maxCodeLen; } 412 413 /** 414 * Sets the maxCodeLen. 415 * @param maxCodeLen The maxCodeLen to set 416 */ 417 public void setMaxCodeLen(final int maxCodeLen) { this.maxCodeLen = maxCodeLen; } 418 419}