【问题标题】:Jaro–Winkler distance algorithm in C#C# 中的 Jaro-Winkler 距离算法
【发布时间】:2013-10-01 18:39:55
【问题描述】:

如何在 C# 中实现 Jaro-Winkler 距离字符串比较算法?

【问题讨论】:

    标签: string comparison distance jaro-winkler


    【解决方案1】:
    public static class JaroWinklerDistance
    {
        /* The Winkler modification will not be applied unless the 
         * percent match was at or above the mWeightThreshold percent 
         * without the modification. 
         * Winkler's paper used a default value of 0.7
         */
        private static readonly double mWeightThreshold = 0.7;
    
        /* Size of the prefix to be concidered by the Winkler modification. 
         * Winkler's paper used a default value of 4
         */
        private static readonly int mNumChars = 4;
    
    
        /// <summary>
        /// Returns the Jaro-Winkler distance between the specified  
        /// strings. The distance is symmetric and will fall in the 
        /// range 0 (perfect match) to 1 (no match). 
        /// </summary>
        /// <param name="aString1">First String</param>
        /// <param name="aString2">Second String</param>
        /// <returns></returns>
        public static double distance(string aString1, string aString2) {
            return 1.0 - proximity(aString1,aString2);
        }
    
    
        /// <summary>
        /// Returns the Jaro-Winkler distance between the specified  
        /// strings. The distance is symmetric and will fall in the 
        /// range 0 (no match) to 1 (perfect match). 
        /// </summary>
        /// <param name="aString1">First String</param>
        /// <param name="aString2">Second String</param>
        /// <returns></returns>
        public static double proximity(string aString1, string aString2)
        {
            int lLen1 = aString1.Length;
            int lLen2 = aString2.Length;
            if (lLen1 == 0)
                return lLen2 == 0 ? 1.0 : 0.0;
    
            int  lSearchRange = Math.Max(0,Math.Max(lLen1,lLen2)/2 - 1);
    
            // default initialized to false
            bool[] lMatched1 = new bool[lLen1];
            bool[] lMatched2 = new bool[lLen2];
    
            int lNumCommon = 0;
            for (int i = 0; i < lLen1; ++i) {
                int lStart = Math.Max(0,i-lSearchRange);
                int lEnd = Math.Min(i+lSearchRange+1,lLen2);
                for (int j = lStart; j < lEnd; ++j) {
                    if (lMatched2[j]) continue;
                    if (aString1[i] != aString2[j])
                        continue;
                    lMatched1[i] = true;
                    lMatched2[j] = true;
                    ++lNumCommon;
                    break;
                }
            }
            if (lNumCommon == 0) return 0.0;
    
            int lNumHalfTransposed = 0;
            int k = 0;
            for (int i = 0; i < lLen1; ++i) {
                if (!lMatched1[i]) continue;
                while (!lMatched2[k]) ++k;
                if (aString1[i] != aString2[k])
                    ++lNumHalfTransposed;
                ++k;
            }
            // System.Diagnostics.Debug.WriteLine("numHalfTransposed=" + numHalfTransposed);
            int lNumTransposed = lNumHalfTransposed/2;
    
            // System.Diagnostics.Debug.WriteLine("numCommon=" + numCommon + " numTransposed=" + numTransposed);
            double lNumCommonD = lNumCommon;
            double lWeight = (lNumCommonD/lLen1
                             + lNumCommonD/lLen2
                             + (lNumCommon - lNumTransposed)/lNumCommonD)/3.0;
    
            if (lWeight <= mWeightThreshold) return lWeight;
            int lMax = Math.Min(mNumChars,Math.Min(aString1.Length,aString2.Length));
            int lPos = 0;
            while (lPos < lMax && aString1[lPos] == aString2[lPos])
                ++lPos;
            if (lPos == 0) return lWeight;
            return lWeight + 0.1 * lPos * (1.0 - lWeight);
    
        }
    
    
    }
    

    【讨论】:

    • 戴夫,这段代码实际上支持两者。调用proximity(str1, str2) 返回0-1,其中1 是完美匹配。调用 distance(str1, str2) 返回 0-1,其中 0 为完美匹配。
    • 我对您的实现做了一些(有用的)修改:gist.github.com/ronnieoverby/2aa19724199df4ec8af6 最有用的是指定字符比较器的能力。
    • 这匹配 mdq Microsoft.MasterDataServices.DataQuality 程序集 Jaro-Winkler 的分数比任何可用的也适用 Jaro-Winkler 的 Nuget 包都要好。
    • 我也冒昧地将它移植到 TSQL,如果有人感兴趣,您可以在 stackoverflow.com/a/34879177/1856391 找到代码 :)
    • 是的。一些 Nuget 包存在错误,这些错误会导致错误的 Jaro Winkler “相似性”并存在无限循环问题。 fuzzystring.codeplex.com/workitem/list/basic
    【解决方案2】:

    你可以去Lucene.Net看看,

    它实现了 Jaro-Winkler 距离算法,

    而且它的分数和哪个leebickmtu post不一样,

    可以参考

    网址如下:

    http://lucenenet.apache.org/docs/3.0.3/db/d12/_jaro_winkler_distance_8cs_source.html

    【讨论】:

    • 我在 Lucene.Net 上尝试了代码,它给出的结果与我发布的解决方案相同。它与我的代码的接近()调用相匹配。 distance() 调用只是反转间隔。
    • 你是如何实现这段代码的,任何例子都会对我非常有帮助。
    【解决方案3】:

    您可以使用下面的代码,它对所有类型的字符串都非常有效。得到结果后,您需要乘以 100 以获得相似度百分比。希望能解决你的问题。

        public class JaroWinkler
    {
        private const double defaultMismatchScore = 0.0;
        private const double defaultMatchScore = 1.0;
    
        /// <summary>
        /// Gets the similarity between two strings by using the Jaro-Winkler algorithm.
        /// A value of 1 means perfect match. A value of zero represents an absolute no match
        /// </summary>
        /// <param name="_firstWord"></param>
        /// <param name="_secondWord"></param>
        /// <returns>a value between 0-1 of the similarity</returns>
        /// 
        public static double RateSimilarity(string _firstWord, string _secondWord)
        {
            // Converting to lower case is not part of the original Jaro-Winkler implementation
            // But we don't really care about case sensitivity in DIAMOND and wouldn't decrease security names similarity rate just because
            // of Case sensitivity
            _firstWord = _firstWord.ToLower();
            _secondWord = _secondWord.ToLower();
    
            if ((_firstWord != null) && (_secondWord != null))
            {
                if (_firstWord == _secondWord)
                    //return (SqlDouble)defaultMatchScore;
                    return defaultMatchScore;
                else
                {
                    // Get half the length of the string rounded up - (this is the distance used for acceptable transpositions)
                    int halfLength = Math.Min(_firstWord.Length, _secondWord.Length) / 2 + 1;
    
                    // Get common characters
                    StringBuilder common1 = GetCommonCharacters(_firstWord, _secondWord, halfLength);
                    int commonMatches = common1.Length;
    
                    // Check for zero in common
                    if (commonMatches == 0)
                        //return (SqlDouble)defaultMismatchScore;
                        return defaultMismatchScore;
    
                    StringBuilder common2 = GetCommonCharacters(_secondWord, _firstWord, halfLength);
    
                    // Check for same length common strings returning 0 if is not the same
                    if (commonMatches != common2.Length)
                        //return (SqlDouble)defaultMismatchScore;
                        return defaultMismatchScore;
    
                    // Get the number of transpositions
                    int transpositions = 0;
                    for (int i = 0; i < commonMatches; i++)
                    {
                        if (common1[i] != common2[i])
                            transpositions++;
                    }
    
                    int j = 0;
                    j += 1;
    
                    // Calculate Jaro metric
                    transpositions /= 2;
                    double jaroMetric = commonMatches / (3.0 * _firstWord.Length) + commonMatches / (3.0 * _secondWord.Length) + (commonMatches - transpositions) / (3.0 * commonMatches);
                    //return (SqlDouble)jaroMetric;
                    return jaroMetric;
                }
            }
    
            //return (SqlDouble)defaultMismatchScore;
            return defaultMismatchScore;
        }
    
        /// <summary>
        /// Returns a string buffer of characters from string1 within string2 if they are of a given
        /// distance seperation from the position in string1.
        /// </summary>
        /// <param name="firstWord">string one</param>
        /// <param name="secondWord">string two</param>
        /// <param name="separationDistance">separation distance</param>
        /// <returns>A string buffer of characters from string1 within string2 if they are of a given
        /// distance seperation from the position in string1</returns>
        private static StringBuilder GetCommonCharacters(string firstWord, string secondWord, int separationDistance)
        {
            if ((firstWord != null) && (secondWord != null))
            {
                StringBuilder returnCommons = new StringBuilder(20);
                StringBuilder copy = new StringBuilder(secondWord);
                int firstWordLength = firstWord.Length;
                int secondWordLength = secondWord.Length;
    
                for (int i = 0; i < firstWordLength; i++)
                {
                    char character = firstWord[i];
                    bool found = false;
    
                    for (int j = Math.Max(0, i - separationDistance); !found && j < Math.Min(i + separationDistance, secondWordLength); j++)
                    {
                        if (copy[j] == character)
                        {
                            found = true;
                            returnCommons.Append(character);
                            copy[j] = '#';
                        }
                    }
                }
                return returnCommons;
            }
            return null;
        }
    }
    

    【讨论】:

      【解决方案4】:

      使用下面的类来使用 jaro winkler。 我已经自定义了算法 jaro 和 jaro-winkler。

      访问Github 获取 DLL。

      using System;
      using System.Linq;
      
      namespace Search
      {
          public static class EditDistance
          {
              private struct JaroMetrics
              {
                  public int Matches;
      
                  public int Transpositions;
              }
      
              private static EditDistance.JaroMetrics Matches(string s1, string s2)
              {
                  string text;
                  string text2;
                  if (s1.Length > s2.Length)
                  {
                      text = s1;
                      text2 = s2;
                  }
                  else
                  {
                      text = s2;
                      text2 = s1;
                  }
                  int num = Math.Max(text.Length / 2 - 1, 0);
                  int[] array = new int[text2.Length];
                  int i;
                  for (i = 0; i < array.Length; i++)
                  {
                      array[i] = -1;
                  }
                  bool[] array2 = new bool[text.Length];
                  int num2 = 0;
                  for (int j = 0; j < text2.Length; j++)
                  {
                      char c = text2[j];
                      int k = Math.Max(j - num, 0);
                      int num3 = Math.Min(j + num + 1, text.Length);
                      while (k < num3)
                      {
                          if (!array2[k] && c == text[k])
                          {
                              array[j] = k;
                              array2[k] = true;
                              num2++;
                              break;
                          }
                          k++;
                      }
                  }
                  char[] array3 = new char[num2];
                  char[] ms2 = new char[num2];
                  i = 0;
                  int num4 = 0;
                  while (i < text2.Length)
                  {
                      if (array[i] != -1)
                      {
                          array3[num4] = text2[i];
                          num4++;
                      }
                      i++;
                  }
                  i = 0;
                  num4 = 0;
                  while (i < text.Length)
                  {
                      if (array2[i])
                      {
                          ms2[num4] = text[i];
                          num4++;
                      }
                      i++;
                  }
                  int num5 = array3.Where((char t, int mi) => t != ms2[mi]).Count<char>();
                  EditDistance.JaroMetrics result;
                  result.Matches = num2;
                  result.Transpositions = num5 / 2;
                  return result;
              }
      
              
      
              public static float JaroWinkler(this string s1, string s2, float prefixScale, float boostThreshold)
              {
                  prefixScale = ((prefixScale > 0.25f) ? 0.25f : prefixScale);
                  prefixScale = ((prefixScale < 0f) ? 0f : prefixScale);
                  float num = s1.Jaro(s2);
                  int num2 = 0;
                  for (int i = 0; i < Math.Min(s1.Length, s2.Length); i++)
                  {
                      if (s1[i] != s2[i])
                      {
                          break;
                      }
                      num2++;
                  }
                  return (num < boostThreshold) ? num : (num + prefixScale * (float)num2 * (1f - num));
              }
      
              public static float JaroWinkler(this string s1, string s2, float prefixScale)
              {
                  return s1.JaroWinkler(s2, prefixScale, 0.7f);
              }
      
              public static float JaroWinkler(this string s1, string s2)
              {
                  return s1.JaroWinkler(s2, 0.1f, 0.7f);
              }
      
              public static float Jaro(this string s1, string s2)
              {
                  EditDistance.JaroMetrics jaroMetrics = EditDistance.Matches(s1, s2);
                  float num = (float)jaroMetrics.Matches;
                  int transpositions = jaroMetrics.Transpositions;
                  float result;
                  if (num == 0f)
                  {
                      result = 0f;
                  }
                  else
                  {
                      float num2 = (num / (float)s1.Length + num / (float)s2.Length + (num - (float)transpositions) / num) / 3f;
                      result = num2;
                  }
                  return result;
              }
      
              public static int LevenshteinDistance(this string source, string target)
              {
                  int result;
                  if (string.IsNullOrEmpty(source))
                  {
                      if (string.IsNullOrEmpty(target))
                      {
                          result = 0;
                      }
                      else
                      {
                          result = target.Length;
                      }
                  }
                  else if (string.IsNullOrEmpty(target))
                  {
                      result = source.Length;
                  }
                  else
                  {
                      if (source.Length > target.Length)
                      {
                          string text = target;
                          target = source;
                          source = text;
                      }
                      int length = target.Length;
                      int length2 = source.Length;
                      int[,] array = new int[2, length + 1];
                      for (int i = 1; i <= length; i++)
                      {
                          array[0, i] = i;
                      }
                      int num = 0;
                      for (int j = 1; j <= length2; j++)
                      {
                          num = (j & 1);
                          array[num, 0] = j;
                          int num2 = num ^ 1;
                          for (int i = 1; i <= length; i++)
                          {
                              int num3 = (target[i - 1] == source[j - 1]) ? 0 : 1;
                              array[num, i] = Math.Min(Math.Min(array[num2, i] + 1, array[num, i - 1] + 1), array[num2, i - 1] + num3);
                          }
                      }
                      result = array[num, length];
                  }
                  return result;
              }
          }
      }
      

      【讨论】:

        猜你喜欢
        • 1970-01-01
        • 2019-07-20
        • 2021-12-05
        • 2016-01-01
        • 2011-02-20
        • 1970-01-01
        • 1970-01-01
        • 1970-01-01
        • 2013-04-05
        相关资源
        最近更新 更多