001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.codec.language;
019
020import java.util.regex.Pattern;
021
022import org.apache.commons.codec.EncoderException;
023import org.apache.commons.codec.StringEncoder;
024
025/**
026 * Encodes a string into a NYSIIS value. NYSIIS is an encoding used to relate similar names, but can also be used as a
027 * general purpose scheme to find word with similar phonemes.
028 * <p>
029 * NYSIIS features an accuracy increase of 2.7% over the traditional Soundex algorithm.
030 * <p>
031 * Algorithm description:
032 * <pre>
033 * 1. Transcode first characters of name
034 *   1a. MAC ->   MCC
035 *   1b. KN  ->   NN
036 *   1c. K   ->   C
037 *   1d. PH  ->   FF
038 *   1e. PF  ->   FF
039 *   1f. SCH ->   SSS
040 * 2. Transcode last characters of name
041 *   2a. EE, IE          ->   Y
042 *   2b. DT,RT,RD,NT,ND  ->   D
043 * 3. First character of key = first character of name
044 * 4. Transcode remaining characters by following these rules, incrementing by one character each time
045 *   4a. EV  ->   AF  else A,E,I,O,U -> A
046 *   4b. Q   ->   G
047 *   4c. Z   ->   S
048 *   4d. M   ->   N
049 *   4e. KN  ->   N   else K -> C
050 *   4f. SCH ->   SSS
051 *   4g. PH  ->   FF
052 *   4h. H   ->   If previous or next is nonvowel, previous
053 *   4i. W   ->   If previous is vowel, previous
054 *   4j. Add current to key if current != last key character
055 * 5. If last character is S, remove it
056 * 6. If last characters are AY, replace with Y
057 * 7. If last character is A, remove it
058 * 8. Collapse all strings of repeated characters
059 * 9. Add original first character of name as first character of key
060 * </pre>
061 * <p>
062 * This class is immutable and thread-safe.
063 *
064 * @see <a href="http://en.wikipedia.org/wiki/NYSIIS">NYSIIS on Wikipedia</a>
065 * @see <a href="http://www.dropby.com/NYSIIS.html">NYSIIS on dropby.com</a>
066 * @see Soundex
067 * @since 1.7
068 * @version $Id: Nysiis.java 1429868 2013-01-07 16:08:05Z ggregory $
069 */
070public class Nysiis implements StringEncoder {
071
072    private static final char[] CHARS_A   = new char[] { 'A' };
073    private static final char[] CHARS_AF  = new char[] { 'A', 'F' };
074    private static final char[] CHARS_C   = new char[] { 'C' };
075    private static final char[] CHARS_FF  = new char[] { 'F', 'F' };
076    private static final char[] CHARS_G   = new char[] { 'G' };
077    private static final char[] CHARS_N   = new char[] { 'N' };
078    private static final char[] CHARS_NN  = new char[] { 'N', 'N' };
079    private static final char[] CHARS_S   = new char[] { 'S' };
080    private static final char[] CHARS_SSS = new char[] { 'S', 'S', 'S' };
081
082    private static final Pattern PAT_MAC    = Pattern.compile("^MAC");
083    private static final Pattern PAT_KN     = Pattern.compile("^KN");
084    private static final Pattern PAT_K      = Pattern.compile("^K");
085    private static final Pattern PAT_PH_PF  = Pattern.compile("^(PH|PF)");
086    private static final Pattern PAT_SCH    = Pattern.compile("^SCH");
087    private static final Pattern PAT_EE_IE  = Pattern.compile("(EE|IE)$");
088    private static final Pattern PAT_DT_ETC = Pattern.compile("(DT|RT|RD|NT|ND)$");
089
090    private static final char SPACE = ' ';
091    private static final int TRUE_LENGTH = 6;
092
093    /**
094     * Tests if the given character is a vowel.
095     *
096     * @param c
097     *            the character to test
098     * @return {@code true} if the character is a vowel, {@code false} otherwise
099     */
100    private static boolean isVowel(final char c) {
101        return c == 'A' || c == 'E' || c == 'I' || c == 'O' || c == 'U';
102    }
103
104    /**
105     * Transcodes the remaining parts of the String. The method operates on a sliding window, looking at 4 characters at
106     * a time: [i-1, i, i+1, i+2].
107     *
108     * @param prev
109     *            the previous character
110     * @param curr
111     *            the current character
112     * @param next
113     *            the next character
114     * @param aNext
115     *            the after next character
116     * @return a transcoded array of characters, starting from the current position
117     */
118    private static char[] transcodeRemaining(final char prev, final char curr, final char next, final char aNext) {
119        // 1. EV -> AF
120        if (curr == 'E' && next == 'V') {
121            return CHARS_AF;
122        }
123
124        // A, E, I, O, U -> A
125        if (isVowel(curr)) {
126            return CHARS_A;
127        }
128
129        // 2. Q -> G, Z -> S, M -> N
130        if (curr == 'Q') {
131            return CHARS_G;
132        } else if (curr == 'Z') {
133            return CHARS_S;
134        } else if (curr == 'M') {
135            return CHARS_N;
136        }
137
138        // 3. KN -> NN else K -> C
139        if (curr == 'K') {
140            if (next == 'N') {
141                return CHARS_NN;
142            } else {
143                return CHARS_C;
144            }
145        }
146
147        // 4. SCH -> SSS
148        if (curr == 'S' && next == 'C' && aNext == 'H') {
149            return CHARS_SSS;
150        }
151
152        // PH -> FF
153        if (curr == 'P' && next == 'H') {
154            return CHARS_FF;
155        }
156
157        // 5. H -> If previous or next is a non vowel, previous.
158        if (curr == 'H' && (!isVowel(prev) || !isVowel(next))) {
159            return new char[] { prev };
160        }
161
162        // 6. W -> If previous is vowel, previous.
163        if (curr == 'W' && isVowel(prev)) {
164            return new char[] { prev };
165        }
166
167        return new char[] { curr };
168    }
169
170    /** Indicates the strict mode. */
171    private final boolean strict;
172
173    /**
174     * Creates an instance of the {@link Nysiis} encoder with strict mode (original form),
175     * i.e. encoded strings have a maximum length of 6.
176     */
177    public Nysiis() {
178        this(true);
179    }
180
181    /**
182     * Create an instance of the {@link Nysiis} encoder with the specified strict mode:
183     *
184     * <ul>
185     *  <li>{@code true}: encoded strings have a maximum length of 6</li>
186     *  <li>{@code false}: encoded strings may have arbitrary length</li>
187     * </ul>
188     *
189     * @param strict
190     *            the strict mode
191     */
192    public Nysiis(final boolean strict) {
193        this.strict = strict;
194    }
195
196    /**
197     * Encodes an Object using the NYSIIS algorithm. This method is provided in order to satisfy the requirements of the
198     * Encoder interface, and will throw an {@link EncoderException} if the supplied object is not of type
199     * {@link String}.
200     *
201     * @param obj
202     *            Object to encode
203     * @return An object (or a {@link String}) containing the NYSIIS code which corresponds to the given String.
204     * @throws EncoderException
205     *            if the parameter supplied is not of a {@link String}
206     * @throws IllegalArgumentException
207     *            if a character is not mapped
208     */
209    @Override
210    public Object encode(final Object obj) throws EncoderException {
211        if (!(obj instanceof String)) {
212            throw new EncoderException("Parameter supplied to Nysiis encode is not of type java.lang.String");
213        }
214        return this.nysiis((String) obj);
215    }
216
217    /**
218     * Encodes a String using the NYSIIS algorithm.
219     *
220     * @param str
221     *            A String object to encode
222     * @return A Nysiis code corresponding to the String supplied
223     * @throws IllegalArgumentException
224     *            if a character is not mapped
225     */
226    @Override
227    public String encode(final String str) {
228        return this.nysiis(str);
229    }
230
231    /**
232     * Indicates the strict mode for this {@link Nysiis} encoder.
233     *
234     * @return {@code true} if the encoder is configured for strict mode, {@code false} otherwise
235     */
236    public boolean isStrict() {
237        return this.strict;
238    }
239
240    /**
241     * Retrieves the NYSIIS code for a given String object.
242     *
243     * @param str
244     *            String to encode using the NYSIIS algorithm
245     * @return A NYSIIS code for the String supplied
246     */
247    public String nysiis(String str) {
248        if (str == null) {
249            return null;
250        }
251
252        // Use the same clean rules as Soundex
253        str = SoundexUtils.clean(str);
254
255        if (str.length() == 0) {
256            return str;
257        }
258
259        // Translate first characters of name:
260        // MAC -> MCC, KN -> NN, K -> C, PH | PF -> FF, SCH -> SSS
261        str = PAT_MAC.matcher(str).replaceFirst("MCC");
262        str = PAT_KN.matcher(str).replaceFirst("NN");
263        str = PAT_K.matcher(str).replaceFirst("C");
264        str = PAT_PH_PF.matcher(str).replaceFirst("FF");
265        str = PAT_SCH.matcher(str).replaceFirst("SSS");
266
267        // Translate last characters of name:
268        // EE -> Y, IE -> Y, DT | RT | RD | NT | ND -> D
269        str = PAT_EE_IE.matcher(str).replaceFirst("Y");
270        str = PAT_DT_ETC.matcher(str).replaceFirst("D");
271
272        // First character of key = first character of name.
273        final StringBuilder key = new StringBuilder(str.length());
274        key.append(str.charAt(0));
275
276        // Transcode remaining characters, incrementing by one character each time
277        final char[] chars = str.toCharArray();
278        final int len = chars.length;
279
280        for (int i = 1; i < len; i++) {
281            final char next = i < len - 1 ? chars[i + 1] : SPACE;
282            final char aNext = i < len - 2 ? chars[i + 2] : SPACE;
283            final char[] transcoded = transcodeRemaining(chars[i - 1], chars[i], next, aNext);
284            System.arraycopy(transcoded, 0, chars, i, transcoded.length);
285
286            // only append the current char to the key if it is different from the last one
287            if (chars[i] != chars[i - 1]) {
288                key.append(chars[i]);
289            }
290        }
291
292        if (key.length() > 1) {
293            char lastChar = key.charAt(key.length() - 1);
294
295            // If last character is S, remove it.
296            if (lastChar == 'S') {
297                key.deleteCharAt(key.length() - 1);
298                lastChar = key.charAt(key.length() - 1);
299            }
300
301            if (key.length() > 2) {
302                final char last2Char = key.charAt(key.length() - 2);
303                // If last characters are AY, replace with Y.
304                if (last2Char == 'A' && lastChar == 'Y') {
305                    key.deleteCharAt(key.length() - 2);
306                }
307            }
308
309            // If last character is A, remove it.
310            if (lastChar == 'A') {
311                key.deleteCharAt(key.length() - 1);
312            }
313        }
314
315        final String string = key.toString();
316        return this.isStrict() ? string.substring(0, Math.min(TRUE_LENGTH, string.length())) : string;
317    }
318
319}