001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.codec.language;
019
020import org.apache.commons.codec.EncoderException;
021import org.apache.commons.codec.StringEncoder;
022
023/**
024 * Utility methods for {@link Soundex} and {@link RefinedSoundex} classes.
025 *
026 * <p>This class is immutable and thread-safe.</p>
027 *
028 * @version $Id: SoundexUtils.java 1429868 2013-01-07 16:08:05Z ggregory $
029 * @since 1.3
030 */
031final class SoundexUtils {
032
033    /**
034     * Cleans up the input string before Soundex processing by only returning
035     * upper case letters.
036     *
037     * @param str
038     *                  The String to clean.
039     * @return A clean String.
040     */
041    static String clean(final String str) {
042        if (str == null || str.length() == 0) {
043            return str;
044        }
045        final int len = str.length();
046        final char[] chars = new char[len];
047        int count = 0;
048        for (int i = 0; i < len; i++) {
049            if (Character.isLetter(str.charAt(i))) {
050                chars[count++] = str.charAt(i);
051            }
052        }
053        if (count == len) {
054            return str.toUpperCase(java.util.Locale.ENGLISH);
055        }
056        return new String(chars, 0, count).toUpperCase(java.util.Locale.ENGLISH);
057    }
058
059    /**
060     * Encodes the Strings and returns the number of characters in the two
061     * encoded Strings that are the same.
062     * <ul>
063     * <li>For Soundex, this return value ranges from 0 through 4: 0 indicates
064     * little or no similarity, and 4 indicates strong similarity or identical
065     * values.</li>
066     * <li>For refined Soundex, the return value can be greater than 4.</li>
067     * </ul>
068     *
069     * @param encoder
070     *                  The encoder to use to encode the Strings.
071     * @param s1
072     *                  A String that will be encoded and compared.
073     * @param s2
074     *                  A String that will be encoded and compared.
075     * @return The number of characters in the two Soundex encoded Strings that
076     *             are the same.
077     *
078     * @see #differenceEncoded(String,String)
079     * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
080     *          MS T-SQL DIFFERENCE</a>
081     *
082     * @throws EncoderException
083     *                  if an error occurs encoding one of the strings
084     */
085    static int difference(final StringEncoder encoder, final String s1, final String s2) throws EncoderException {
086        return differenceEncoded(encoder.encode(s1), encoder.encode(s2));
087    }
088
089    /**
090     * Returns the number of characters in the two Soundex encoded Strings that
091     * are the same.
092     * <ul>
093     * <li>For Soundex, this return value ranges from 0 through 4: 0 indicates
094     * little or no similarity, and 4 indicates strong similarity or identical
095     * values.</li>
096     * <li>For refined Soundex, the return value can be greater than 4.</li>
097     * </ul>
098     *
099     * @param es1
100     *                  An encoded String.
101     * @param es2
102     *                  An encoded String.
103     * @return The number of characters in the two Soundex encoded Strings that
104     *             are the same.
105     *
106     * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
107     *          MS T-SQL DIFFERENCE</a>
108     */
109    static int differenceEncoded(final String es1, final String es2) {
110
111        if (es1 == null || es2 == null) {
112            return 0;
113        }
114        final int lengthToMatch = Math.min(es1.length(), es2.length());
115        int diff = 0;
116        for (int i = 0; i < lengthToMatch; i++) {
117            if (es1.charAt(i) == es2.charAt(i)) {
118                diff++;
119            }
120        }
121        return diff;
122    }
123
124}