001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.codec.language; 019 020import org.apache.commons.codec.EncoderException; 021import org.apache.commons.codec.StringEncoder; 022import org.apache.commons.codec.binary.StringUtils; 023 024/** 025 * Encodes a string into a double metaphone value. This Implementation is based on the algorithm by <CITE>Lawrence 026 * Philips</CITE>. 027 * <p> 028 * This class is conditionally thread-safe. The instance field for the maximum code length is mutable 029 * {@link #setMaxCodeLen(int)} but is not volatile, and accesses are not synchronized. If an instance of the class is 030 * shared between threads, the caller needs to ensure that suitable synchronization is used to ensure safe publication 031 * of the value between threads, and must not invoke {@link #setMaxCodeLen(int)} after initial setup. 032 * </p> 033 * 034 * @see <a href="http://drdobbs.com/184401251?pgno=2">Original Article</a> 035 * @see <a href="http://en.wikipedia.org/wiki/Metaphone">http://en.wikipedia.org/wiki/Metaphone</a> 036 */ 037public class DoubleMetaphone implements StringEncoder { 038 039 /** 040 * "Vowels" to test for 041 */ 042 private static final String VOWELS = "AEIOUY"; 043 044 /** 045 * Prefixes when present which are not pronounced 046 */ 047 private static final String[] SILENT_START = 048 { "GN", "KN", "PN", "WR", "PS" }; 049 private static final String[] L_R_N_M_B_H_F_V_W_SPACE = 050 { "L", "R", "N", "M", "B", "H", "F", "V", "W", " " }; 051 private static final String[] ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER = 052 { "ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER" }; 053 private static final String[] L_T_K_S_N_M_B_Z = 054 { "L", "T", "K", "S", "N", "M", "B", "Z" }; 055 056 /** 057 * Maximum length of an encoding, default is 4 058 */ 059 private int maxCodeLen = 4; 060 061 /** 062 * Encode a value with Double Metaphone. 063 * 064 * @param value String to encode 065 * @return an encoded string 066 */ 067 public String doubleMetaphone(final String value) { 068 return doubleMetaphone(value, false); 069 } 070 071 /** 072 * Encode a value with Double Metaphone, optionally using the alternate encoding. 073 * 074 * @param value String to encode 075 * @param alternate use alternate encode 076 * @return an encoded string 077 */ 078 public String doubleMetaphone(String value, final boolean alternate) { 079 value = cleanInput(value); 080 if (value == null) { 081 return null; 082 } 083 084 final boolean slavoGermanic = isSlavoGermanic(value); 085 int index = isSilentStart(value) ? 1 : 0; 086 087 final DoubleMetaphoneResult result = new DoubleMetaphoneResult(this.getMaxCodeLen()); 088 089 while (!result.isComplete() && index <= value.length() - 1) { 090 switch (value.charAt(index)) { 091 case 'A': 092 case 'E': 093 case 'I': 094 case 'O': 095 case 'U': 096 case 'Y': 097 index = handleAEIOUY(result, index); 098 break; 099 case 'B': 100 result.append('P'); 101 index = charAt(value, index + 1) == 'B' ? index + 2 : index + 1; 102 break; 103 case '\u00C7': 104 // A C with a Cedilla 105 result.append('S'); 106 index++; 107 break; 108 case 'C': 109 index = handleC(value, result, index); 110 break; 111 case 'D': 112 index = handleD(value, result, index); 113 break; 114 case 'F': 115 result.append('F'); 116 index = charAt(value, index + 1) == 'F' ? index + 2 : index + 1; 117 break; 118 case 'G': 119 index = handleG(value, result, index, slavoGermanic); 120 break; 121 case 'H': 122 index = handleH(value, result, index); 123 break; 124 case 'J': 125 index = handleJ(value, result, index, slavoGermanic); 126 break; 127 case 'K': 128 result.append('K'); 129 index = charAt(value, index + 1) == 'K' ? index + 2 : index + 1; 130 break; 131 case 'L': 132 index = handleL(value, result, index); 133 break; 134 case 'M': 135 result.append('M'); 136 index = conditionM0(value, index) ? index + 2 : index + 1; 137 break; 138 case 'N': 139 result.append('N'); 140 index = charAt(value, index + 1) == 'N' ? index + 2 : index + 1; 141 break; 142 case '\u00D1': 143 // N with a tilde (spanish ene) 144 result.append('N'); 145 index++; 146 break; 147 case 'P': 148 index = handleP(value, result, index); 149 break; 150 case 'Q': 151 result.append('K'); 152 index = charAt(value, index + 1) == 'Q' ? index + 2 : index + 1; 153 break; 154 case 'R': 155 index = handleR(value, result, index, slavoGermanic); 156 break; 157 case 'S': 158 index = handleS(value, result, index, slavoGermanic); 159 break; 160 case 'T': 161 index = handleT(value, result, index); 162 break; 163 case 'V': 164 result.append('F'); 165 index = charAt(value, index + 1) == 'V' ? index + 2 : index + 1; 166 break; 167 case 'W': 168 index = handleW(value, result, index); 169 break; 170 case 'X': 171 index = handleX(value, result, index); 172 break; 173 case 'Z': 174 index = handleZ(value, result, index, slavoGermanic); 175 break; 176 default: 177 index++; 178 break; 179 } 180 } 181 182 return alternate ? result.getAlternate() : result.getPrimary(); 183 } 184 185 /** 186 * Encode the value using DoubleMetaphone. It will only work if 187 * {@code obj} is a {@code String} (like {@code Metaphone}). 188 * 189 * @param obj Object to encode (should be of type String) 190 * @return An encoded Object (will be of type String) 191 * @throws EncoderException encode parameter is not of type String 192 */ 193 @Override 194 public Object encode(final Object obj) throws EncoderException { 195 if (!(obj instanceof String)) { 196 throw new EncoderException("DoubleMetaphone encode parameter is not of type String"); 197 } 198 return doubleMetaphone((String) obj); 199 } 200 201 /** 202 * Encode the value using DoubleMetaphone. 203 * 204 * @param value String to encode 205 * @return An encoded String 206 */ 207 @Override 208 public String encode(final String value) { 209 return doubleMetaphone(value); 210 } 211 212 /** 213 * Check if the Double Metaphone values of two {@code String} values 214 * are equal. 215 * 216 * @param value1 The left-hand side of the encoded {@link String#equals(Object)}. 217 * @param value2 The right-hand side of the encoded {@link String#equals(Object)}. 218 * @return {@code true} if the encoded {@code String}s are equal; 219 * {@code false} otherwise. 220 * @see #isDoubleMetaphoneEqual(String,String,boolean) 221 */ 222 public boolean isDoubleMetaphoneEqual(final String value1, final String value2) { 223 return isDoubleMetaphoneEqual(value1, value2, false); 224 } 225 226 /** 227 * Check if the Double Metaphone values of two {@code String} values 228 * are equal, optionally using the alternate value. 229 * 230 * @param value1 The left-hand side of the encoded {@link String#equals(Object)}. 231 * @param value2 The right-hand side of the encoded {@link String#equals(Object)}. 232 * @param alternate use the alternate value if {@code true}. 233 * @return {@code true} if the encoded {@code String}s are equal; 234 * {@code false} otherwise. 235 */ 236 public boolean isDoubleMetaphoneEqual(final String value1, final String value2, final boolean alternate) { 237 return StringUtils.equals(doubleMetaphone(value1, alternate), doubleMetaphone(value2, alternate)); 238 } 239 240 /** 241 * Returns the maxCodeLen. 242 * @return int 243 */ 244 public int getMaxCodeLen() { 245 return this.maxCodeLen; 246 } 247 248 /** 249 * Sets the maxCodeLen. 250 * @param maxCodeLen The maxCodeLen to set 251 */ 252 public void setMaxCodeLen(final int maxCodeLen) { 253 this.maxCodeLen = maxCodeLen; 254 } 255 256 //-- BEGIN HANDLERS --// 257 258 /** 259 * Handles 'A', 'E', 'I', 'O', 'U', and 'Y' cases. 260 */ 261 private int handleAEIOUY(final DoubleMetaphoneResult result, final int index) { 262 if (index == 0) { 263 result.append('A'); 264 } 265 return index + 1; 266 } 267 268 /** 269 * Handles 'C' cases. 270 */ 271 private int handleC(final String value, final DoubleMetaphoneResult result, int index) { 272 if (conditionC0(value, index)) { // very confusing, moved out 273 result.append('K'); 274 index += 2; 275 } else if (index == 0 && contains(value, index, 6, "CAESAR")) { 276 result.append('S'); 277 index += 2; 278 } else if (contains(value, index, 2, "CH")) { 279 index = handleCH(value, result, index); 280 } else if (contains(value, index, 2, "CZ") && 281 !contains(value, index - 2, 4, "WICZ")) { 282 //-- "Czerny" --// 283 result.append('S', 'X'); 284 index += 2; 285 } else if (contains(value, index + 1, 3, "CIA")) { 286 //-- "focaccia" --// 287 result.append('X'); 288 index += 3; 289 } else if (contains(value, index, 2, "CC") && 290 !(index == 1 && charAt(value, 0) == 'M')) { 291 //-- double "cc" but not "McClelland" --// 292 return handleCC(value, result, index); 293 } else if (contains(value, index, 2, "CK", "CG", "CQ")) { 294 result.append('K'); 295 index += 2; 296 } else if (contains(value, index, 2, "CI", "CE", "CY")) { 297 //-- Italian vs. English --// 298 if (contains(value, index, 3, "CIO", "CIE", "CIA")) { 299 result.append('S', 'X'); 300 } else { 301 result.append('S'); 302 } 303 index += 2; 304 } else { 305 result.append('K'); 306 if (contains(value, index + 1, 2, " C", " Q", " G")) { 307 //-- Mac Caffrey, Mac Gregor --// 308 index += 3; 309 } else if (contains(value, index + 1, 1, "C", "K", "Q") && 310 !contains(value, index + 1, 2, "CE", "CI")) { 311 index += 2; 312 } else { 313 index++; 314 } 315 } 316 317 return index; 318 } 319 320 /** 321 * Handles 'CC' cases. 322 */ 323 private int handleCC(final String value, final DoubleMetaphoneResult result, int index) { 324 if (contains(value, index + 2, 1, "I", "E", "H") && 325 !contains(value, index + 2, 2, "HU")) { 326 //-- "bellocchio" but not "bacchus" --// 327 if ((index == 1 && charAt(value, index - 1) == 'A') || 328 contains(value, index - 1, 5, "UCCEE", "UCCES")) { 329 //-- "accident", "accede", "succeed" --// 330 result.append("KS"); 331 } else { 332 //-- "bacci", "bertucci", other Italian --// 333 result.append('X'); 334 } 335 index += 3; 336 } else { // Pierce's rule 337 result.append('K'); 338 index += 2; 339 } 340 341 return index; 342 } 343 344 /** 345 * Handles 'CH' cases. 346 */ 347 private int handleCH(final String value, final DoubleMetaphoneResult result, final int index) { 348 if (index > 0 && contains(value, index, 4, "CHAE")) { // Michael 349 result.append('K', 'X'); 350 return index + 2; 351 } 352 if (conditionCH0(value, index)) { 353 //-- Greek roots ("chemistry", "chorus", etc.) --// 354 result.append('K'); 355 return index + 2; 356 } 357 if (conditionCH1(value, index)) { 358 //-- Germanic, Greek, or otherwise 'ch' for 'kh' sound --// 359 result.append('K'); 360 return index + 2; 361 } 362 if (index > 0) { 363 if (contains(value, 0, 2, "MC")) { 364 result.append('K'); 365 } else { 366 result.append('X', 'K'); 367 } 368 } else { 369 result.append('X'); 370 } 371 return index + 2; 372 } 373 374 /** 375 * Handles 'D' cases. 376 */ 377 private int handleD(final String value, final DoubleMetaphoneResult result, int index) { 378 if (contains(value, index, 2, "DG")) { 379 //-- "Edge" --// 380 if (contains(value, index + 2, 1, "I", "E", "Y")) { 381 result.append('J'); 382 index += 3; 383 //-- "Edgar" --// 384 } else { 385 result.append("TK"); 386 index += 2; 387 } 388 } else if (contains(value, index, 2, "DT", "DD")) { 389 result.append('T'); 390 index += 2; 391 } else { 392 result.append('T'); 393 index++; 394 } 395 return index; 396 } 397 398 /** 399 * Handles 'G' cases. 400 */ 401 private int handleG(final String value, final DoubleMetaphoneResult result, int index, 402 final boolean slavoGermanic) { 403 if (charAt(value, index + 1) == 'H') { 404 index = handleGH(value, result, index); 405 } else if (charAt(value, index + 1) == 'N') { 406 if (index == 1 && isVowel(charAt(value, 0)) && !slavoGermanic) { 407 result.append("KN", "N"); 408 } else if (!contains(value, index + 2, 2, "EY") && 409 charAt(value, index + 1) != 'Y' && !slavoGermanic) { 410 result.append("N", "KN"); 411 } else { 412 result.append("KN"); 413 } 414 index = index + 2; 415 } else if (contains(value, index + 1, 2, "LI") && !slavoGermanic) { 416 result.append("KL", "L"); 417 index += 2; 418 } else if (index == 0 && 419 (charAt(value, index + 1) == 'Y' || 420 contains(value, index + 1, 2, ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER))) { 421 //-- -ges-, -gep-, -gel-, -gie- at beginning --// 422 result.append('K', 'J'); 423 index += 2; 424 } else if ((contains(value, index + 1, 2, "ER") || 425 charAt(value, index + 1) == 'Y') && 426 !contains(value, 0, 6, "DANGER", "RANGER", "MANGER") && 427 !contains(value, index - 1, 1, "E", "I") && 428 !contains(value, index - 1, 3, "RGY", "OGY")) { 429 //-- -ger-, -gy- --// 430 result.append('K', 'J'); 431 index += 2; 432 } else if (contains(value, index + 1, 1, "E", "I", "Y") || 433 contains(value, index - 1, 4, "AGGI", "OGGI")) { 434 //-- Italian "biaggi" --// 435 if (contains(value, 0 ,4, "VAN ", "VON ") || 436 contains(value, 0, 3, "SCH") || 437 contains(value, index + 1, 2, "ET")) { 438 //-- obvious germanic --// 439 result.append('K'); 440 } else if (contains(value, index + 1, 3, "IER")) { 441 result.append('J'); 442 } else { 443 result.append('J', 'K'); 444 } 445 index += 2; 446 } else if (charAt(value, index + 1) == 'G') { 447 index += 2; 448 result.append('K'); 449 } else { 450 index++; 451 result.append('K'); 452 } 453 return index; 454 } 455 456 /** 457 * Handles 'GH' cases. 458 */ 459 private int handleGH(final String value, final DoubleMetaphoneResult result, int index) { 460 if (index > 0 && !isVowel(charAt(value, index - 1))) { 461 result.append('K'); 462 index += 2; 463 } else if (index == 0) { 464 if (charAt(value, index + 2) == 'I') { 465 result.append('J'); 466 } else { 467 result.append('K'); 468 } 469 index += 2; 470 } else if ((index > 1 && contains(value, index - 2, 1, "B", "H", "D")) || 471 (index > 2 && contains(value, index - 3, 1, "B", "H", "D")) || 472 (index > 3 && contains(value, index - 4, 1, "B", "H"))) { 473 //-- Parker's rule (with some further refinements) - "hugh" 474 index += 2; 475 } else { 476 if (index > 2 && charAt(value, index - 1) == 'U' && 477 contains(value, index - 3, 1, "C", "G", "L", "R", "T")) { 478 //-- "laugh", "McLaughlin", "cough", "gough", "rough", "tough" 479 result.append('F'); 480 } else if (index > 0 && charAt(value, index - 1) != 'I') { 481 result.append('K'); 482 } 483 index += 2; 484 } 485 return index; 486 } 487 488 /** 489 * Handles 'H' cases. 490 */ 491 private int handleH(final String value, final DoubleMetaphoneResult result, int index) { 492 //-- only keep if first & before vowel or between 2 vowels --// 493 if ((index == 0 || isVowel(charAt(value, index - 1))) && 494 isVowel(charAt(value, index + 1))) { 495 result.append('H'); 496 index += 2; 497 //-- also takes car of "HH" --// 498 } else { 499 index++; 500 } 501 return index; 502 } 503 504 /** 505 * Handles 'J' cases. 506 */ 507 private int handleJ(final String value, final DoubleMetaphoneResult result, int index, 508 final boolean slavoGermanic) { 509 if (contains(value, index, 4, "JOSE") || contains(value, 0, 4, "SAN ")) { 510 //-- obvious Spanish, "Jose", "San Jacinto" --// 511 if ((index == 0 && (charAt(value, index + 4) == ' ') || 512 value.length() == 4) || contains(value, 0, 4, "SAN ")) { 513 result.append('H'); 514 } else { 515 result.append('J', 'H'); 516 } 517 index++; 518 } else { 519 if (index == 0 && !contains(value, index, 4, "JOSE")) { 520 result.append('J', 'A'); 521 } else if (isVowel(charAt(value, index - 1)) && !slavoGermanic && 522 (charAt(value, index + 1) == 'A' || charAt(value, index + 1) == 'O')) { 523 result.append('J', 'H'); 524 } else if (index == value.length() - 1) { 525 result.append('J', ' '); 526 } else if (!contains(value, index + 1, 1, L_T_K_S_N_M_B_Z) && 527 !contains(value, index - 1, 1, "S", "K", "L")) { 528 result.append('J'); 529 } 530 531 if (charAt(value, index + 1) == 'J') { 532 index += 2; 533 } else { 534 index++; 535 } 536 } 537 return index; 538 } 539 540 /** 541 * Handles 'L' cases. 542 */ 543 private int handleL(final String value, final DoubleMetaphoneResult result, int index) { 544 if (charAt(value, index + 1) == 'L') { 545 if (conditionL0(value, index)) { 546 result.appendPrimary('L'); 547 } else { 548 result.append('L'); 549 } 550 index += 2; 551 } else { 552 index++; 553 result.append('L'); 554 } 555 return index; 556 } 557 558 /** 559 * Handles 'P' cases. 560 */ 561 private int handleP(final String value, final DoubleMetaphoneResult result, int index) { 562 if (charAt(value, index + 1) == 'H') { 563 result.append('F'); 564 index += 2; 565 } else { 566 result.append('P'); 567 index = contains(value, index + 1, 1, "P", "B") ? index + 2 : index + 1; 568 } 569 return index; 570 } 571 572 /** 573 * Handles 'R' cases. 574 */ 575 private int handleR(final String value, final DoubleMetaphoneResult result, final int index, 576 final boolean slavoGermanic) { 577 if (index == value.length() - 1 && !slavoGermanic && 578 contains(value, index - 2, 2, "IE") && 579 !contains(value, index - 4, 2, "ME", "MA")) { 580 result.appendAlternate('R'); 581 } else { 582 result.append('R'); 583 } 584 return charAt(value, index + 1) == 'R' ? index + 2 : index + 1; 585 } 586 587 /** 588 * Handles 'S' cases. 589 */ 590 private int handleS(final String value, final DoubleMetaphoneResult result, int index, 591 final boolean slavoGermanic) { 592 if (contains(value, index - 1, 3, "ISL", "YSL")) { 593 //-- special cases "island", "isle", "carlisle", "carlysle" --// 594 index++; 595 } else if (index == 0 && contains(value, index, 5, "SUGAR")) { 596 //-- special case "sugar-" --// 597 result.append('X', 'S'); 598 index++; 599 } else if (contains(value, index, 2, "SH")) { 600 if (contains(value, index + 1, 4, "HEIM", "HOEK", "HOLM", "HOLZ")) { 601 //-- germanic --// 602 result.append('S'); 603 } else { 604 result.append('X'); 605 } 606 index += 2; 607 } else if (contains(value, index, 3, "SIO", "SIA") || contains(value, index, 4, "SIAN")) { 608 //-- Italian and Armenian --// 609 if (slavoGermanic) { 610 result.append('S'); 611 } else { 612 result.append('S', 'X'); 613 } 614 index += 3; 615 } else if ((index == 0 && contains(value, index + 1, 1, "M", "N", "L", "W")) || 616 contains(value, index + 1, 1, "Z")) { 617 //-- german & anglicisations, e.g. "smith" match "schmidt" // 618 // "snider" match "schneider" --// 619 //-- also, -sz- in slavic language although in hungarian it // 620 // is pronounced "s" --// 621 result.append('S', 'X'); 622 index = contains(value, index + 1, 1, "Z") ? index + 2 : index + 1; 623 } else if (contains(value, index, 2, "SC")) { 624 index = handleSC(value, result, index); 625 } else { 626 if (index == value.length() - 1 && contains(value, index - 2, 2, "AI", "OI")) { 627 //-- french e.g. "resnais", "artois" --// 628 result.appendAlternate('S'); 629 } else { 630 result.append('S'); 631 } 632 index = contains(value, index + 1, 1, "S", "Z") ? index + 2 : index + 1; 633 } 634 return index; 635 } 636 637 /** 638 * Handles 'SC' cases. 639 */ 640 private int handleSC(final String value, final DoubleMetaphoneResult result, final int index) { 641 if (charAt(value, index + 2) == 'H') { 642 //-- Schlesinger's rule --// 643 if (contains(value, index + 3, 2, "OO", "ER", "EN", "UY", "ED", "EM")) { 644 //-- Dutch origin, e.g. "school", "schooner" --// 645 if (contains(value, index + 3, 2, "ER", "EN")) { 646 //-- "schermerhorn", "schenker" --// 647 result.append("X", "SK"); 648 } else { 649 result.append("SK"); 650 } 651 } else if (index == 0 && !isVowel(charAt(value, 3)) && charAt(value, 3) != 'W') { 652 result.append('X', 'S'); 653 } else { 654 result.append('X'); 655 } 656 } else if (contains(value, index + 2, 1, "I", "E", "Y")) { 657 result.append('S'); 658 } else { 659 result.append("SK"); 660 } 661 return index + 3; 662 } 663 664 /** 665 * Handles 'T' cases. 666 */ 667 private int handleT(final String value, final DoubleMetaphoneResult result, int index) { 668 if (contains(value, index, 4, "TION") || contains(value, index, 3, "TIA", "TCH")) { 669 result.append('X'); 670 index += 3; 671 } else if (contains(value, index, 2, "TH") || contains(value, index, 3, "TTH")) { 672 if (contains(value, index + 2, 2, "OM", "AM") || 673 //-- special case "thomas", "thames" or germanic --// 674 contains(value, 0, 4, "VAN ", "VON ") || 675 contains(value, 0, 3, "SCH")) { 676 result.append('T'); 677 } else { 678 result.append('0', 'T'); 679 } 680 index += 2; 681 } else { 682 result.append('T'); 683 index = contains(value, index + 1, 1, "T", "D") ? index + 2 : index + 1; 684 } 685 return index; 686 } 687 688 /** 689 * Handles 'W' cases. 690 */ 691 private int handleW(final String value, final DoubleMetaphoneResult result, int index) { 692 if (contains(value, index, 2, "WR")) { 693 //-- can also be in middle of word --// 694 result.append('R'); 695 index += 2; 696 } else if (index == 0 && (isVowel(charAt(value, index + 1)) || 697 contains(value, index, 2, "WH"))) { 698 if (isVowel(charAt(value, index + 1))) { 699 //-- Wasserman should match Vasserman --// 700 result.append('A', 'F'); 701 } else { 702 //-- need Uomo to match Womo --// 703 result.append('A'); 704 } 705 index++; 706 } else if ((index == value.length() - 1 && isVowel(charAt(value, index - 1))) || 707 contains(value, index - 1, 5, "EWSKI", "EWSKY", "OWSKI", "OWSKY") || 708 contains(value, 0, 3, "SCH")) { 709 //-- Arnow should match Arnoff --// 710 result.appendAlternate('F'); 711 index++; 712 } else if (contains(value, index, 4, "WICZ", "WITZ")) { 713 //-- Polish e.g. "filipowicz" --// 714 result.append("TS", "FX"); 715 index += 4; 716 } else { 717 index++; 718 } 719 return index; 720 } 721 722 /** 723 * Handles 'X' cases. 724 */ 725 private int handleX(final String value, final DoubleMetaphoneResult result, int index) { 726 if (index == 0) { 727 result.append('S'); 728 index++; 729 } else { 730 if (!((index == value.length() - 1) && 731 (contains(value, index - 3, 3, "IAU", "EAU") || 732 contains(value, index - 2, 2, "AU", "OU")))) { 733 //-- French e.g. breaux --// 734 result.append("KS"); 735 } 736 index = contains(value, index + 1, 1, "C", "X") ? index + 2 : index + 1; 737 } 738 return index; 739 } 740 741 /** 742 * Handles 'Z' cases. 743 */ 744 private int handleZ(final String value, final DoubleMetaphoneResult result, int index, 745 final boolean slavoGermanic) { 746 if (charAt(value, index + 1) == 'H') { 747 //-- Chinese pinyin e.g. "zhao" or Angelina "Zhang" --// 748 result.append('J'); 749 index += 2; 750 } else { 751 if (contains(value, index + 1, 2, "ZO", "ZI", "ZA") || 752 (slavoGermanic && (index > 0 && charAt(value, index - 1) != 'T'))) { 753 result.append("S", "TS"); 754 } else { 755 result.append('S'); 756 } 757 index = charAt(value, index + 1) == 'Z' ? index + 2 : index + 1; 758 } 759 return index; 760 } 761 762 //-- BEGIN CONDITIONS --// 763 764 /** 765 * Complex condition 0 for 'C'. 766 */ 767 private boolean conditionC0(final String value, final int index) { 768 if (contains(value, index, 4, "CHIA")) { 769 return true; 770 } 771 if (index <= 1) { 772 return false; 773 } 774 if (isVowel(charAt(value, index - 2))) { 775 return false; 776 } 777 if (!contains(value, index - 1, 3, "ACH")) { 778 return false; 779 } 780 final char c = charAt(value, index + 2); 781 return (c != 'I' && c != 'E') || 782 contains(value, index - 2, 6, "BACHER", "MACHER"); 783 } 784 785 /** 786 * Complex condition 0 for 'CH'. 787 */ 788 private boolean conditionCH0(final String value, final int index) { 789 if (index != 0) { 790 return false; 791 } 792 if (!contains(value, index + 1, 5, "HARAC", "HARIS") && 793 !contains(value, index + 1, 3, "HOR", "HYM", "HIA", "HEM")) { 794 return false; 795 } 796 return !contains(value, 0, 5, "CHORE"); 797 } 798 799 /** 800 * Complex condition 1 for 'CH'. 801 */ 802 private boolean conditionCH1(final String value, final int index) { 803 return ((contains(value, 0, 4, "VAN ", "VON ") || contains(value, 0, 3, "SCH")) || 804 contains(value, index - 2, 6, "ORCHES", "ARCHIT", "ORCHID") || 805 contains(value, index + 2, 1, "T", "S") || 806 ((contains(value, index - 1, 1, "A", "O", "U", "E") || index == 0) && 807 (contains(value, index + 2, 1, L_R_N_M_B_H_F_V_W_SPACE) || index + 1 == value.length() - 1))); 808 } 809 810 /** 811 * Complex condition 0 for 'L'. 812 */ 813 private boolean conditionL0(final String value, final int index) { 814 if (index == value.length() - 3 && 815 contains(value, index - 1, 4, "ILLO", "ILLA", "ALLE")) { 816 return true; 817 } 818 return (contains(value, value.length() - 2, 2, "AS", "OS") || 819 contains(value, value.length() - 1, 1, "A", "O")) && 820 contains(value, index - 1, 4, "ALLE"); 821 } 822 823 /** 824 * Complex condition 0 for 'M'. 825 */ 826 private boolean conditionM0(final String value, final int index) { 827 if (charAt(value, index + 1) == 'M') { 828 return true; 829 } 830 return contains(value, index - 1, 3, "UMB") && 831 ((index + 1) == value.length() - 1 || contains(value, index + 2, 2, "ER")); 832 } 833 834 //-- BEGIN HELPER FUNCTIONS --// 835 836 /** 837 * Determines whether or not a value is of slavo-germanic origin. A value is 838 * of slavo-germanic origin if it contians any of 'W', 'K', 'CZ', or 'WITZ'. 839 */ 840 private boolean isSlavoGermanic(final String value) { 841 return value.indexOf('W') > -1 || value.indexOf('K') > -1 || 842 value.contains("CZ") || value.contains("WITZ"); 843 } 844 845 /** 846 * Determines whether or not a character is a vowel or not 847 */ 848 private boolean isVowel(final char ch) { 849 return VOWELS.indexOf(ch) != -1; 850 } 851 852 /** 853 * Determines whether or not the value starts with a silent letter. It will 854 * return {@code true} if the value starts with any of 'GN', 'KN', 855 * 'PN', 'WR' or 'PS'. 856 */ 857 private boolean isSilentStart(final String value) { 858 boolean result = false; 859 for (final String element : SILENT_START) { 860 if (value.startsWith(element)) { 861 result = true; 862 break; 863 } 864 } 865 return result; 866 } 867 868 /** 869 * Cleans the input. 870 */ 871 private String cleanInput(String input) { 872 if (input == null) { 873 return null; 874 } 875 input = input.trim(); 876 if (input.isEmpty()) { 877 return null; 878 } 879 return input.toUpperCase(java.util.Locale.ENGLISH); 880 } 881 882 /* 883 * Gets the character at index {@code index} if available, otherwise 884 * it returns {@code Character.MIN_VALUE} so that there is some sort 885 * of default. 886 */ 887 protected char charAt(final String value, final int index) { 888 if (index < 0 || index >= value.length()) { 889 return Character.MIN_VALUE; 890 } 891 return value.charAt(index); 892 } 893 894 /* 895 * Determines whether {@code value} contains any of the criteria starting at index {@code start} and 896 * matching up to length {@code length}. 897 */ 898 protected static boolean contains(final String value, final int start, final int length, 899 final String... criteria) { 900 boolean result = false; 901 if (start >= 0 && start + length <= value.length()) { 902 final String target = value.substring(start, start + length); 903 904 for (final String element : criteria) { 905 if (target.equals(element)) { 906 result = true; 907 break; 908 } 909 } 910 } 911 return result; 912 } 913 914 //-- BEGIN INNER CLASSES --// 915 916 /** 917 * Inner class for storing results, since there is the optional alternate encoding. 918 */ 919 public class DoubleMetaphoneResult { 920 921 private final StringBuilder primary = new StringBuilder(getMaxCodeLen()); 922 private final StringBuilder alternate = new StringBuilder(getMaxCodeLen()); 923 private final int maxLength; 924 925 public DoubleMetaphoneResult(final int maxLength) { 926 this.maxLength = maxLength; 927 } 928 929 public void append(final char value) { 930 appendPrimary(value); 931 appendAlternate(value); 932 } 933 934 public void append(final char primary, final char alternate) { 935 appendPrimary(primary); 936 appendAlternate(alternate); 937 } 938 939 public void appendPrimary(final char value) { 940 if (this.primary.length() < this.maxLength) { 941 this.primary.append(value); 942 } 943 } 944 945 public void appendAlternate(final char value) { 946 if (this.alternate.length() < this.maxLength) { 947 this.alternate.append(value); 948 } 949 } 950 951 public void append(final String value) { 952 appendPrimary(value); 953 appendAlternate(value); 954 } 955 956 public void append(final String primary, final String alternate) { 957 appendPrimary(primary); 958 appendAlternate(alternate); 959 } 960 961 public void appendPrimary(final String value) { 962 final int addChars = this.maxLength - this.primary.length(); 963 if (value.length() <= addChars) { 964 this.primary.append(value); 965 } else { 966 this.primary.append(value, 0, addChars); 967 } 968 } 969 970 public void appendAlternate(final String value) { 971 final int addChars = this.maxLength - this.alternate.length(); 972 if (value.length() <= addChars) { 973 this.alternate.append(value); 974 } else { 975 this.alternate.append(value, 0, addChars); 976 } 977 } 978 979 public String getPrimary() { 980 return this.primary.toString(); 981 } 982 983 public String getAlternate() { 984 return this.alternate.toString(); 985 } 986 987 public boolean isComplete() { 988 return this.primary.length() >= this.maxLength && 989 this.alternate.length() >= this.maxLength; 990 } 991 } 992}