001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.codec.language; 019 020import org.apache.commons.codec.EncoderException; 021import org.apache.commons.codec.StringEncoder; 022 023/** 024 * Utility methods for {@link Soundex} and {@link RefinedSoundex} classes. 025 * 026 * <p>This class is immutable and thread-safe.</p> 027 * 028 * @version $Id: SoundexUtils.java 1429868 2013-01-07 16:08:05Z ggregory $ 029 * @since 1.3 030 */ 031final class SoundexUtils { 032 033 /** 034 * Cleans up the input string before Soundex processing by only returning 035 * upper case letters. 036 * 037 * @param str 038 * The String to clean. 039 * @return A clean String. 040 */ 041 static String clean(final String str) { 042 if (str == null || str.length() == 0) { 043 return str; 044 } 045 final int len = str.length(); 046 final char[] chars = new char[len]; 047 int count = 0; 048 for (int i = 0; i < len; i++) { 049 if (Character.isLetter(str.charAt(i))) { 050 chars[count++] = str.charAt(i); 051 } 052 } 053 if (count == len) { 054 return str.toUpperCase(java.util.Locale.ENGLISH); 055 } 056 return new String(chars, 0, count).toUpperCase(java.util.Locale.ENGLISH); 057 } 058 059 /** 060 * Encodes the Strings and returns the number of characters in the two 061 * encoded Strings that are the same. 062 * <ul> 063 * <li>For Soundex, this return value ranges from 0 through 4: 0 indicates 064 * little or no similarity, and 4 indicates strong similarity or identical 065 * values.</li> 066 * <li>For refined Soundex, the return value can be greater than 4.</li> 067 * </ul> 068 * 069 * @param encoder 070 * The encoder to use to encode the Strings. 071 * @param s1 072 * A String that will be encoded and compared. 073 * @param s2 074 * A String that will be encoded and compared. 075 * @return The number of characters in the two Soundex encoded Strings that 076 * are the same. 077 * 078 * @see #differenceEncoded(String,String) 079 * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> 080 * MS T-SQL DIFFERENCE</a> 081 * 082 * @throws EncoderException 083 * if an error occurs encoding one of the strings 084 */ 085 static int difference(final StringEncoder encoder, final String s1, final String s2) throws EncoderException { 086 return differenceEncoded(encoder.encode(s1), encoder.encode(s2)); 087 } 088 089 /** 090 * Returns the number of characters in the two Soundex encoded Strings that 091 * are the same. 092 * <ul> 093 * <li>For Soundex, this return value ranges from 0 through 4: 0 indicates 094 * little or no similarity, and 4 indicates strong similarity or identical 095 * values.</li> 096 * <li>For refined Soundex, the return value can be greater than 4.</li> 097 * </ul> 098 * 099 * @param es1 100 * An encoded String. 101 * @param es2 102 * An encoded String. 103 * @return The number of characters in the two Soundex encoded Strings that 104 * are the same. 105 * 106 * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> 107 * MS T-SQL DIFFERENCE</a> 108 */ 109 static int differenceEncoded(final String es1, final String es2) { 110 111 if (es1 == null || es2 == null) { 112 return 0; 113 } 114 final int lengthToMatch = Math.min(es1.length(), es2.length()); 115 int diff = 0; 116 for (int i = 0; i < lengthToMatch; i++) { 117 if (es1.charAt(i) == es2.charAt(i)) { 118 diff++; 119 } 120 } 121 return diff; 122 } 123 124}