001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.codec.language;
019
020import org.apache.commons.codec.EncoderException;
021import org.apache.commons.codec.StringEncoder;
022
023/**
024 * Encodes a string into a Metaphone value.
025 * <p>
026 * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>.
027 * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
028 * </p>
029 * <p>
030 * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990,
031 * p 39.</CITE>
032 * </p>
033 * <p>
034 * Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl implementations:
035 * </p>
036 * <ul>
037 * <li><a href="http://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a>
038 *  (broken link 4/30/2013) </li>
039 * <li><a href="https://metacpan.org/source/MSCHWERN/Text-Metaphone-1.96//Metaphone.pm">Text:Metaphone-1.96</a>
040 *  (link checked 4/30/2013) </li>
041 * </ul>
042 * <p>
043 * They have had undocumented changes from the originally published algorithm.
044 * For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>.
045 * </p>
046 * <p>
047 * This class is conditionally thread-safe.
048 * The instance field for maximum code length is mutable {@link #setMaxCodeLen(int)}
049 * but is not volatile, and accesses are not synchronized.
050 * If an instance of the class is shared between threads, the caller needs to ensure that suitable synchronization
051 * is used to ensure safe publication of the value between threads, and must not invoke {@link #setMaxCodeLen(int)}
052 * after initial setup.
053 * </p>
054 */
055public class Metaphone implements StringEncoder {
056
057    /**
058     * Five values in the English language
059     */
060    private static final String VOWELS = "AEIOU";
061
062    /**
063     * Variable used in Metaphone algorithm
064     */
065    private static final String FRONTV = "EIY";
066
067    /**
068     * Variable used in Metaphone algorithm
069     */
070    private static final String VARSON = "CSPTG";
071
072    /**
073     * The max code length for metaphone is 4
074     */
075    private int maxCodeLen = 4;
076
077    /**
078     * Find the metaphone value of a String. This is similar to the
079     * soundex algorithm, but better at finding similar sounding words.
080     * All input is converted to upper case.
081     * Limitations: Input format is expected to be a single ASCII word
082     * with only characters in the A - Z range, no punctuation or numbers.
083     *
084     * @param txt String to find the metaphone code for
085     * @return A metaphone code corresponding to the String supplied
086     */
087    public String metaphone(final String txt) {
088        boolean hard = false;
089        final int txtLength;
090        if (txt == null || (txtLength = txt.length()) == 0) {
091            return "";
092        }
093        // single character is itself
094        if (txtLength == 1) {
095            return txt.toUpperCase(java.util.Locale.ENGLISH);
096        }
097
098        final char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray();
099
100        final StringBuilder local = new StringBuilder(40); // manipulate
101        final StringBuilder code = new StringBuilder(10); //   output
102        // handle initial 2 characters exceptions
103        switch(inwd[0]) {
104        case 'K':
105        case 'G':
106        case 'P': /* looking for KN, etc*/
107            if (inwd[1] == 'N') {
108                local.append(inwd, 1, inwd.length - 1);
109            } else {
110                local.append(inwd);
111            }
112            break;
113        case 'A': /* looking for AE */
114            if (inwd[1] == 'E') {
115                local.append(inwd, 1, inwd.length - 1);
116            } else {
117                local.append(inwd);
118            }
119            break;
120        case 'W': /* looking for WR or WH */
121            if (inwd[1] == 'R') {   // WR -> R
122                local.append(inwd, 1, inwd.length - 1);
123                break;
124            }
125            if (inwd[1] == 'H') {
126                local.append(inwd, 1, inwd.length - 1);
127                local.setCharAt(0, 'W'); // WH -> W
128            } else {
129                local.append(inwd);
130            }
131            break;
132        case 'X': /* initial X becomes S */
133            inwd[0] = 'S';
134            local.append(inwd);
135            break;
136        default:
137            local.append(inwd);
138        } // now local has working string with initials fixed
139
140        final int wdsz = local.length();
141        int n = 0;
142
143        while (code.length() < this.getMaxCodeLen() &&
144               n < wdsz ) { // max code size of 4 works well
145            final char symb = local.charAt(n);
146            // remove duplicate letters except C
147            if (symb != 'C' && isPreviousChar( local, n, symb ) ) {
148                n++;
149            } else { // not dup
150                switch(symb) {
151                case 'A':
152                case 'E':
153                case 'I':
154                case 'O':
155                case 'U':
156                    if (n == 0) {
157                        code.append(symb);
158                    }
159                    break; // only use vowel if leading char
160                case 'B':
161                    if ( isPreviousChar(local, n, 'M') &&
162                         isLastChar(wdsz, n) ) { // B is silent if word ends in MB
163                        break;
164                    }
165                    code.append(symb);
166                    break;
167                case 'C': // lots of C special cases
168                    /* discard if SCI, SCE or SCY */
169                    if ( isPreviousChar(local, n, 'S') &&
170                         !isLastChar(wdsz, n) &&
171                         FRONTV.indexOf(local.charAt(n + 1)) >= 0 ) {
172                        break;
173                    }
174                    if (regionMatch(local, n, "CIA")) { // "CIA" -> X
175                        code.append('X');
176                        break;
177                    }
178                    if (!isLastChar(wdsz, n) &&
179                        FRONTV.indexOf(local.charAt(n + 1)) >= 0) {
180                        code.append('S');
181                        break; // CI,CE,CY -> S
182                    }
183                    if (isPreviousChar(local, n, 'S') &&
184                        isNextChar(local, n, 'H') ) { // SCH->sk
185                        code.append('K');
186                        break;
187                    }
188                    if (isNextChar(local, n, 'H')) { // detect CH
189                        if (n == 0 &&
190                            wdsz >= 3 &&
191                            isVowel(local,2) ) { // CH consonant -> K consonant
192                            code.append('K');
193                        } else {
194                            code.append('X'); // CHvowel -> X
195                        }
196                    } else {
197                        code.append('K');
198                    }
199                    break;
200                case 'D':
201                    if (!isLastChar(wdsz, n + 1) &&
202                        isNextChar(local, n, 'G') &&
203                        FRONTV.indexOf(local.charAt(n + 2)) >= 0) { // DGE DGI DGY -> J
204                        code.append('J'); n += 2;
205                    } else {
206                        code.append('T');
207                    }
208                    break;
209                case 'G': // GH silent at end or before consonant
210                    if (isLastChar(wdsz, n + 1) &&
211                        isNextChar(local, n, 'H')) {
212                        break;
213                    }
214                    if (!isLastChar(wdsz, n + 1) &&
215                        isNextChar(local,n,'H') &&
216                        !isVowel(local,n+2)) {
217                        break;
218                    }
219                    if (n > 0 &&
220                        ( regionMatch(local, n, "GN") ||
221                          regionMatch(local, n, "GNED") ) ) {
222                        break; // silent G
223                    }
224                    // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true
225                    hard = isPreviousChar(local, n, 'G');
226                    if (!isLastChar(wdsz, n) &&
227                        FRONTV.indexOf(local.charAt(n + 1)) >= 0 &&
228                        !hard) {
229                        code.append('J');
230                    } else {
231                        code.append('K');
232                    }
233                    break;
234                case 'H':
235                    if (isLastChar(wdsz, n)) {
236                        break; // terminal H
237                    }
238                    if (n > 0 &&
239                        VARSON.indexOf(local.charAt(n - 1)) >= 0) {
240                        break;
241                    }
242                    if (isVowel(local,n+1)) {
243                        code.append('H'); // Hvowel
244                    }
245                    break;
246                case 'F':
247                case 'J':
248                case 'L':
249                case 'M':
250                case 'N':
251                case 'R':
252                    code.append(symb);
253                    break;
254                case 'K':
255                    if (n > 0) { // not initial
256                        if (!isPreviousChar(local, n, 'C')) {
257                            code.append(symb);
258                        }
259                    } else {
260                        code.append(symb); // initial K
261                    }
262                    break;
263                case 'P':
264                    if (isNextChar(local,n,'H')) {
265                        // PH -> F
266                        code.append('F');
267                    } else {
268                        code.append(symb);
269                    }
270                    break;
271                case 'Q':
272                    code.append('K');
273                    break;
274                case 'S':
275                    if (regionMatch(local,n,"SH") ||
276                        regionMatch(local,n,"SIO") ||
277                        regionMatch(local,n,"SIA")) {
278                        code.append('X');
279                    } else {
280                        code.append('S');
281                    }
282                    break;
283                case 'T':
284                    if (regionMatch(local,n,"TIA") ||
285                        regionMatch(local,n,"TIO")) {
286                        code.append('X');
287                        break;
288                    }
289                    if (regionMatch(local,n,"TCH")) {
290                        // Silent if in "TCH"
291                        break;
292                    }
293                    // substitute numeral 0 for TH (resembles theta after all)
294                    if (regionMatch(local,n,"TH")) {
295                        code.append('0');
296                    } else {
297                        code.append('T');
298                    }
299                    break;
300                case 'V':
301                    code.append('F'); break;
302                case 'W':
303                case 'Y': // silent if not followed by vowel
304                    if (!isLastChar(wdsz,n) &&
305                        isVowel(local,n+1)) {
306                        code.append(symb);
307                    }
308                    break;
309                case 'X':
310                    code.append('K');
311                    code.append('S');
312                    break;
313                case 'Z':
314                    code.append('S');
315                    break;
316                default:
317                    // do nothing
318                    break;
319                } // end switch
320                n++;
321            } // end else from symb != 'C'
322            if (code.length() > this.getMaxCodeLen()) {
323                code.setLength(this.getMaxCodeLen());
324            }
325        }
326        return code.toString();
327    }
328
329    private boolean isVowel(final StringBuilder string, final int index) {
330        return VOWELS.indexOf(string.charAt(index)) >= 0;
331    }
332
333    private boolean isPreviousChar(final StringBuilder string, final int index, final char c) {
334        boolean matches = false;
335        if( index > 0 &&
336            index < string.length() ) {
337            matches = string.charAt(index - 1) == c;
338        }
339        return matches;
340    }
341
342    private boolean isNextChar(final StringBuilder string, final int index, final char c) {
343        boolean matches = false;
344        if( index >= 0 &&
345            index < string.length() - 1 ) {
346            matches = string.charAt(index + 1) == c;
347        }
348        return matches;
349    }
350
351    private boolean regionMatch(final StringBuilder string, final int index, final String test) {
352        boolean matches = false;
353        if (index >= 0 && index + test.length() - 1 < string.length()) {
354            final String substring = string.substring(index, index + test.length());
355            matches = substring.equals(test);
356        }
357        return matches;
358    }
359
360    private boolean isLastChar(final int wdsz, final int n) {
361        return n + 1 == wdsz;
362    }
363
364    /**
365     * Encodes an Object using the metaphone algorithm.  This method
366     * is provided in order to satisfy the requirements of the
367     * Encoder interface, and will throw an EncoderException if the
368     * supplied object is not of type java.lang.String.
369     *
370     * @param obj Object to encode
371     * @return An object (or type java.lang.String) containing the
372     *         metaphone code which corresponds to the String supplied.
373     * @throws EncoderException if the parameter supplied is not
374     *                          of type java.lang.String
375     */
376    @Override
377    public Object encode(final Object obj) throws EncoderException {
378        if (!(obj instanceof String)) {
379            throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
380        }
381        return metaphone((String) obj);
382    }
383
384    /**
385     * Encodes a String using the Metaphone algorithm.
386     *
387     * @param str String object to encode
388     * @return The metaphone code corresponding to the String supplied
389     */
390    @Override
391    public String encode(final String str) {
392        return metaphone(str);
393    }
394
395    /**
396     * Tests is the metaphones of two strings are identical.
397     *
398     * @param str1 First of two strings to compare
399     * @param str2 Second of two strings to compare
400     * @return {@code true} if the metaphones of these strings are identical,
401     *        {@code false} otherwise.
402     */
403    public boolean isMetaphoneEqual(final String str1, final String str2) {
404        return metaphone(str1).equals(metaphone(str2));
405    }
406
407    /**
408     * Returns the maxCodeLen.
409     * @return int
410     */
411    public int getMaxCodeLen() { return this.maxCodeLen; }
412
413    /**
414     * Sets the maxCodeLen.
415     * @param maxCodeLen The maxCodeLen to set
416     */
417    public void setMaxCodeLen(final int maxCodeLen) { this.maxCodeLen = maxCodeLen; }
418
419}