Skip to content

Commit 564df1a

Browse files
authored
Merge pull request #16 from paulirwin/issue/15
Catch up to upstream 1.2.0 release
2 parents cf07b35 + 43bd2ad commit 564df1a

File tree

9 files changed

+214
-27
lines changed

9 files changed

+214
-27
lines changed
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
<wpf:ResourceDictionary xml:space="preserve" xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml" xmlns:s="clr-namespace:System;assembly=mscorlib" xmlns:ss="urn:shemas-jetbrains-com:settings-storage-xaml" xmlns:wpf="http://schemas.microsoft.com/winfx/2006/xaml/presentation">
2+
<s:Boolean x:Key="/Default/UserDictionary/Words/=Levenshtein/@EntryIndexedValue">True</s:Boolean></wpf:ResourceDictionary>
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
namespace F23.StringSimilarity
2+
{
3+
/// <summary>
4+
/// As an adjunct to <see cref="ICharacterSubstitution"/>, this interface
5+
/// allows you to specify the cost of deletion or insertion of a
6+
/// character.
7+
/// </summary>
/// <remarks>
/// <see cref="WeightedLevenshtein"/> takes an implementation through its
/// two-argument constructor. When none is supplied, it charges 1.0 for
/// every insertion and deletion. The values returned here are used as-is:
/// the visible <see cref="WeightedLevenshtein"/> helpers do not clamp or
/// validate them, so staying inside the documented range is the
/// implementer's responsibility.
/// </remarks>
8+
public interface ICharacterInsDel
9+
{
10+
/// <summary>
11+
/// Computes the deletion cost. <see cref="WeightedLevenshtein"/> asks for
/// this cost once per character of the first string (s1).
12+
/// </summary>
13+
/// <param name="c">The character being deleted.</param>
14+
/// <returns>The cost to be allocated to deleting the given character,
15+
/// in the range [0, 1].</returns>
16+
double DeletionCost(char c);
17+
18+
/// <summary>
19+
/// Computes the insertion cost. <see cref="WeightedLevenshtein"/> asks for
/// this cost for characters of the second string (s2).
20+
/// </summary>
21+
/// <param name="c">The character being inserted.</param>
22+
/// <returns>The cost to be allocated to inserting the given character,
23+
/// in the range [0, 1].</returns>
24+
double InsertionCost(char c);
25+
}
26+
}

src/F23.StringSimilarity/Levenshtein.cs

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,17 @@ namespace F23.StringSimilarity
3434
/// change one string into the other.
3535
public class Levenshtein : IMetricStringDistance
3636
{
37+
/// <summary>
38+
/// Computes the Levenshtein distance without an early-termination limit.
/// Equivalent to Distance(s1, s2, Int32.MaxValue).
39+
/// </summary>
40+
/// <param name="s1">The first string to compare.</param>
41+
/// <param name="s2">The second string to compare.</param>
42+
/// <returns>The Levenshtein distance between strings</returns>
/// <exception cref="ArgumentNullException">If s1 or s2 is null (thrown by the
/// limit-taking overload this method delegates to).</exception>
43+
public double Distance(string s1, string s2)
44+
{
45+
return Distance(s1, s2, int.MaxValue);
46+
}
47+
3748
/// <summary>
3849
/// The Levenshtein distance, or edit distance, between two words is the
3950
/// Minimum number of single-character edits (insertions, deletions or
@@ -56,9 +67,14 @@ public class Levenshtein : IMetricStringDistance
5667
/// </summary>
5768
/// <param name="s1">The first string to compare.</param>
5869
/// <param name="s2">The second string to compare.</param>
70+
/// <param name="limit">The maximum result to compute before stopping. This
71+
/// means that the calculation can terminate early if you
72+
/// only care about strings with a certain similarity.
73+
/// Set this to Int32.MaxValue if you want to run the
74+
/// calculation to completion in every case.</param>
5975
/// <returns>The Levenshtein distance between strings</returns>
6076
/// <exception cref="ArgumentNullException">If s1 or s2 is null.</exception>
61-
public double Distance(string s1, string s2)
77+
public double Distance(string s1, string s2, int limit)
6278
{
6379
if (s1 == null)
6480
{
@@ -105,6 +121,8 @@ public double Distance(string s1, string s2)
105121
// edit distance is delete (i+1) chars from s to match empty t
106122
v1[0] = i + 1;
107123

124+
int minv1 = v1[0];
125+
108126
// use formula to fill in the rest of the row
109127
for (int j = 0; j < s2.Length; j++)
110128
{
@@ -118,10 +136,17 @@ public double Distance(string s1, string s2)
118136
Math.Min(
119137
v0[j + 1] + 1, // Cost of remove
120138
v0[j] + cost)); // Cost of substitution
139+
140+
minv1 = Math.Min(minv1, v1[j + 1]);
141+
}
142+
143+
if (minv1 >= limit)
144+
{
145+
return limit;
121146
}
122147

123148
// copy v1 (current row) to v0 (previous row) for next iteration
124-
//System.arraycopy(v1, 0, v0, 0, v0.length);
149+
// System.arraycopy(v1, 0, v0, 0, v0.length);
125150

126151
// Flip references to current and previous row
127152
vtemp = v0;

src/F23.StringSimilarity/WeightedLevenshtein.cs

Lines changed: 74 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
using F23.StringSimilarity.Interfaces;
2727
// ReSharper disable SuggestVarOrType_Elsewhere
2828
// ReSharper disable TooWideLocalVariableScope
29+
// ReSharper disable IntroduceOptionalParameters.Global
2930

3031
namespace F23.StringSimilarity
3132
{
@@ -34,24 +35,54 @@ namespace F23.StringSimilarity
3435
public class WeightedLevenshtein : IStringDistance
3536
{
3637
private readonly ICharacterSubstitution _characterSubstitution;
38+
private readonly ICharacterInsDel _characterInsDel;
3739

3840
/// <summary>
39-
/// Create a new instance with provided character substitution.
41+
/// Instantiate with provided character substitution.
4042
/// </summary>
4143
/// <param name="characterSubstitution">The strategy to determine character substitution weights.</param>
/// <remarks>
/// Delegates to the two-argument constructor with a null insertion/deletion
/// strategy, so every insertion and deletion costs 1.0.
/// </remarks>
4244
public WeightedLevenshtein(ICharacterSubstitution characterSubstitution)
45+
: this(characterSubstitution, null)
46+
{
47+
}
48+
49+
/// <summary>
50+
/// Instantiate with provided character substitution, insertion, and
51+
/// deletion weights.
52+
/// </summary>
53+
/// <param name="characterSubstitution">The strategy to determine character substitution weights.</param>
54+
/// <param name="characterInsDel">The strategy to determine character insertion/deletion weights.
/// May be null, in which case every insertion and deletion costs 1.0.</param>
55+
public WeightedLevenshtein(ICharacterSubstitution characterSubstitution,
56+
ICharacterInsDel characterInsDel)
4357
{
// NOTE(review): characterSubstitution is stored without a null check, yet
// Distance calls _characterSubstitution.Cost whenever two characters differ.
4458
_characterSubstitution = characterSubstitution;
59+
_characterInsDel = characterInsDel;
60+
}
61+
62+
/// <summary>
63+
/// Computes the weighted Levenshtein distance without an early-termination
/// limit. Equivalent to Distance(s1, s2, Double.MaxValue).
64+
/// </summary>
65+
/// <param name="s1">The first string to compare.</param>
66+
/// <param name="s2">The second string to compare.</param>
67+
/// <returns>The computed weighted Levenshtein distance.</returns>
/// <exception cref="ArgumentNullException">If s1 or s2 is null (thrown by the
/// limit-taking overload this method delegates to).</exception>
68+
public double Distance(string s1, string s2)
69+
{
70+
return Distance(s1, s2, double.MaxValue);
4571
}
4672

4773
/// <summary>
4874
/// Compute the weighted Levenshtein distance using the provided weights for
/// substitution and, when supplied, for insertion and deletion.
4975
/// </summary>
5076
/// <param name="s1">The first string to compare.</param>
5177
/// <param name="s2">The second string to compare.</param>
78+
/// <param name="limit">The maximum result to compute before stopping. This
79+
/// means that the calculation can terminate early if you
80+
/// only care about strings with a certain similarity.
81+
/// Set this to Double.MaxValue if you want to run the
82+
/// calculation to completion in every case.</param>
5283
/// <returns>The computed weighted Levenshtein distance.</returns>
5384
/// <exception cref="ArgumentNullException">If s1 or s2 is null.</exception>
54-
public double Distance(string s1, string s2)
85+
public double Distance(string s1, string s2, double limit)
5586
{
5687
if (s1 == null)
5788
{
@@ -78,51 +109,79 @@ public double Distance(string s1, string s2)
78109
return s1.Length;
79110
}
80111

81-
// create two work vectors of integer distances
112+
// create two work vectors of floating point (i.e. weighted) distances
82113
double[] v0 = new double[s2.Length + 1];
83114
double[] v1 = new double[s2.Length + 1];
84115
double[] vtemp;
85116

86117
// initialize v0 (the previous row of distances)
87-
// this row is A[0][i]: edit distance for an empty s
88-
// the distance is just the number of characters to delete from t
89-
for (int i = 0; i < v0.Length; i++)
118+
// this row is A[0][i]: edit distance for an empty s1
119+
// the distance is the cost of inserting each character of s2
120+
v0[0] = 0;
121+
for (int i = 1; i < v0.Length; i++)
90122
{
91-
v0[i] = i;
123+
v0[i] = v0[i - 1] + InsertionCost(s2[i - 1]);
92124
}
93125

94126
for (int i = 0; i < s1.Length; i++)
95127
{
128+
char s1i = s1[i];
129+
double deletionCost = DeletionCost(s1i);
130+
96131
// calculate v1 (current row distances) from the previous row v0
97132
// first element of v1 is A[i+1][0]
98-
// edit distance is delete (i+1) chars from s to match empty t
99-
v1[0] = i + 1;
133+
// Edit distance is the cost of deleting characters from s1
134+
// to match an empty s2.
135+
v1[0] = v0[0] + deletionCost;
136+
137+
double minv1 = v1[0];
100138

101139
// use formula to fill in the rest of the row
102140
for (int j = 0; j < s2.Length; j++)
103141
{
142+
char s2j = s2[j];
104143
double cost = 0;
105-
if (s1[i] != s2[j])
144+
145+
if (s1i != s2j)
106146
{
107-
cost = _characterSubstitution.Cost(s1[i], s2[j]);
147+
cost = _characterSubstitution.Cost(s1i, s2j);
108148
}
149+
150+
double insertionCost = InsertionCost(s2j);
151+
109152
v1[j + 1] = Math.Min(
110-
v1[j] + 1, // Cost of insertion
153+
v1[j] + insertionCost, // Cost of insertion
111154
Math.Min(
112-
v0[j + 1] + 1, // Cost of remove
155+
v0[j + 1] + deletionCost, // Cost of deletion
113156
v0[j] + cost)); // Cost of substitution
157+
158+
minv1 = Math.Min(minv1, v1[j + 1]);
159+
}
160+
161+
if (minv1 >= limit)
162+
{
163+
return limit;
114164
}
115165

116166
// copy v1 (current row) to v0 (previous row) for next iteration
117-
//System.arraycopy(v1, 0, v0, 0, v0.length);
167+
// System.arraycopy(v1, 0, v0, 0, v0.length);
118168
// Flip references to current and previous row
119169
vtemp = v0;
120170
v0 = v1;
121171
v1 = vtemp;
122-
123172
}
124173

125174
return v0[s2.Length];
126175
}
176+
177+
/// <summary>
/// Returns the cost of inserting <paramref name="c"/>: the value supplied by
/// the configured <see cref="ICharacterInsDel"/>, or 1.0 when none was given.
/// </summary>
/// <param name="c">The character being inserted.</param>
/// <returns>The insertion cost for the character.</returns>
private double InsertionCost(char c)
178+
{
179+
return _characterInsDel?.InsertionCost(c) ?? 1.0;
180+
}
181+
182+
/// <summary>
/// Returns the cost of deleting <paramref name="c"/>: the value supplied by
/// the configured <see cref="ICharacterInsDel"/>, or 1.0 when none was given.
/// </summary>
/// <param name="c">The character being deleted.</param>
/// <returns>The deletion cost for the character.</returns>
private double DeletionCost(char c)
183+
{
184+
return _characterInsDel?.DeletionCost(c) ?? 1.0;
185+
}
127186
}
128187
}

test/F23.StringSimilarity.Tests/F23.StringSimilarity.Tests.csproj

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,12 @@
1515
</ItemGroup>
1616

1717
<ItemGroup>
18-
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="15.3.0" />
19-
<PackageReference Include="xunit" Version="2.2.0" />
20-
<PackageReference Include="xunit.runner.visualstudio" Version="2.2.0" />
18+
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="15.8.0" />
19+
<PackageReference Include="xunit" Version="2.4.0" />
20+
<PackageReference Include="xunit.runner.visualstudio" Version="2.4.0">
21+
<PrivateAssets>all</PrivateAssets>
22+
<IncludeAssets>runtime; build; native; contentfiles; analyzers</IncludeAssets>
23+
</PackageReference>
2124
</ItemGroup>
2225

2326
<ItemGroup>

test/F23.StringSimilarity.Tests/LevenshteinTest.cs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,11 @@ public void TestDistance()
4242
Assert.Equal(expected: 2.0, actual: instance.Distance("My string", "M string2"));
4343
Assert.Equal(expected: 1.0, actual: instance.Distance("My string", "My $tring"));
4444

45+
// With limits.
46+
Assert.Equal(2.0, instance.Distance("My string", "M string2", 4));
47+
Assert.Equal(2.0, instance.Distance("My string", "M string2", 2));
48+
Assert.Equal(1.0, instance.Distance("My string", "M string2", 1));
49+
4550
NullEmptyTests.TestDistance(instance);
4651
}
4752
}

test/F23.StringSimilarity.Tests/Support/ArrayExtensionsTest.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ public void TestWithPadding()
4040

4141
var padded = source.WithPadding(1200);
4242

43-
Assert.Equal(actual: 1200, expected: padded.Length);
43+
Assert.Equal(expected: 1200, actual: padded.Length);
4444

4545
Assert.True(padded.Take(1000).All(x => x == 42));
4646
Assert.True(padded.Skip(1000).Take(200).All(x => x == 0));

test/F23.StringSimilarity.Tests/TestUtil/NullEmptyTests.cs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -54,16 +54,16 @@ public static void TestSimilarity(INormalizedStringSimilarity instance)
5454
Assert.Equal(0.0, instance.Similarity("", "foo"), 1);
5555
Assert.Equal(0.0, instance.Similarity("foo", ""), 1);
5656

57-
Assert.Throws(typeof(ArgumentNullException), () => instance.Similarity(null, null));
58-
Assert.Throws(typeof(ArgumentNullException), () => instance.Similarity(null, ""));
59-
Assert.Throws(typeof(ArgumentNullException), () => instance.Similarity("", null));
57+
Assert.Throws<ArgumentNullException>(() => instance.Similarity(null, null));
58+
Assert.Throws<ArgumentNullException>(() => instance.Similarity(null, ""));
59+
Assert.Throws<ArgumentNullException>(() => instance.Similarity("", null));
6060
}
6161

6262
public static void AssertArgumentNullExceptions(IStringDistance instance)
6363
{
64-
Assert.Throws(typeof(ArgumentNullException), () => instance.Distance(null, null));
65-
Assert.Throws(typeof(ArgumentNullException), () => instance.Distance(null, ""));
66-
Assert.Throws(typeof(ArgumentNullException), () => instance.Distance("", null));
64+
Assert.Throws<ArgumentNullException>(() => instance.Distance(null, null));
65+
Assert.Throws<ArgumentNullException>(() => instance.Distance(null, ""));
66+
Assert.Throws<ArgumentNullException>(() => instance.Distance("", null));
6767
}
6868
}
6969
}

test/F23.StringSimilarity.Tests/WeightedLevenshteinTest.cs

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,50 @@ public void TestDistance()
3838
Assert.Equal(0.5, instance.Distance("String1", "Srring1"), 1);
3939
Assert.Equal(1.5, instance.Distance("String1", "Srring2"), 1);
4040

41+
// One insert or delete.
42+
Assert.Equal(1.0, instance.Distance("Strng", "String"), 1);
43+
Assert.Equal(1.0, instance.Distance("String", "Strng"), 1);
44+
45+
// With limits.
46+
Assert.Equal(0.0, instance.Distance("String1", "String1", double.MaxValue), 1);
47+
Assert.Equal(0.0, instance.Distance("String1", "String1", 2.0), 1);
48+
Assert.Equal(1.5, instance.Distance("String1", "Srring2", double.MaxValue), 1);
49+
Assert.Equal(1.5, instance.Distance("String1", "Srring2", 2.0), 1);
50+
Assert.Equal(1.5, instance.Distance("String1", "Srring2", 1.5), 1);
51+
Assert.Equal(1.0, instance.Distance("String1", "Srring2", 1.0), 1);
52+
Assert.Equal(4.0, instance.Distance("String1", "Potato", 4.0), 1);
53+
54+
NullEmptyTests.TestDistance(instance);
55+
}
56+
57+
// Exercises WeightedLevenshtein when an ICharacterInsDel strategy is supplied
// alongside the substitution strategy. The trailing "1" in each assertion is
// xUnit's precision argument (number of decimal places compared).
[Fact]
58+
public void TestDistanceCharacterInsDelInterface()
59+
{
60+
var instance = new WeightedLevenshtein(new ExampleCharSub(), new ExampleInsDel());
61+
62+
// Same as TestDistance above.
63+
Assert.Equal(0.0, instance.Distance("String1", "String1"), 1);
64+
Assert.Equal(0.5, instance.Distance("String1", "Srring1"), 1);
65+
Assert.Equal(1.5, instance.Distance("String1", "Srring2"), 1);
66+
67+
// Cost of insert of 'i' is less than normal, so these scores are
68+
// different than TestDistance above. Note that the cost of delete
69+
// has been set differently than the cost of insert, so the distance
70+
// call is not symmetric in its arguments if an 'i' has changed.
// ExampleInsDel charges 0.5 to insert 'i' and 0.8 to delete it; every
// other character (such as the 'n' below) costs 1.0 either way.
71+
Assert.Equal(0.5, instance.Distance("Strng", "String"), 1);
72+
Assert.Equal(0.8, instance.Distance("String", "Strng"), 1);
73+
Assert.Equal(1.0, instance.Distance("Strig", "String"), 1);
74+
Assert.Equal(1.0, instance.Distance("String", "Strig"), 1);
75+
76+
// Same as above with limits.
77+
Assert.Equal(0.0, instance.Distance("String1", "String1", double.MaxValue), 1);
78+
Assert.Equal(0.0, instance.Distance("String1", "String1", 2.0), 1);
79+
Assert.Equal(1.5, instance.Distance("String1", "Srring2", double.MaxValue), 1);
80+
Assert.Equal(1.5, instance.Distance("String1", "Srring2", 2.0), 1);
81+
Assert.Equal(1.5, instance.Distance("String1", "Srring2", 1.5), 1);
82+
Assert.Equal(1.0, instance.Distance("String1", "Srring2", 1.0), 1);
83+
Assert.Equal(4.0, instance.Distance("String1", "Potato", 4.0), 1);
84+
4185
NullEmptyTests.TestDistance(instance);
4286
}
4387

@@ -58,5 +102,28 @@ public double Cost(char c1, char c2)
58102
return 1.0;
59103
}
60104
}
105+
106+
// Test strategy with deliberately asymmetric costs for the letter 'i':
// deleting it costs 0.8, inserting it costs 0.5. All other characters cost 1.0.
private class ExampleInsDel : ICharacterInsDel
107+
{
108+
public double DeletionCost(char c)
109+
{
110+
if (c == 'i')
111+
{
112+
return 0.8;
113+
}
114+
115+
return 1.0;
116+
}
117+
118+
public double InsertionCost(char c)
119+
{
120+
if (c == 'i')
121+
{
122+
return 0.5;
123+
}
124+
125+
return 1.0;
126+
}
127+
}
61128
}
62129
}

0 commit comments

Comments
 (0)