26
26
using F23 . StringSimilarity . Interfaces ;
27
27
// ReSharper disable SuggestVarOrType_Elsewhere
28
28
// ReSharper disable TooWideLocalVariableScope
29
+ // ReSharper disable IntroduceOptionalParameters.Global
29
30
30
31
namespace F23 . StringSimilarity
31
32
{
@@ -34,24 +35,54 @@ namespace F23.StringSimilarity
34
35
public class WeightedLevenshtein : IStringDistance
35
36
{
36
37
private readonly ICharacterSubstitution _characterSubstitution ;
38
+ private readonly ICharacterInsDel _characterInsDel ;
37
39
38
40
/// <summary>
/// Creates a weighted Levenshtein calculator that is configured with a
/// character substitution strategy only.
/// </summary>
/// <param name="characterSubstitution">Strategy supplying the cost of substituting one character for another.</param>
/// <remarks>
/// Delegates to the two-argument constructor, passing null as the
/// insertion/deletion strategy.
/// </remarks>
public WeightedLevenshtein(ICharacterSubstitution characterSubstitution)
    : this(characterSubstitution, characterInsDel: null)
{
}
48
+
49
/// <summary>
/// Creates a weighted Levenshtein calculator configured with both a
/// substitution strategy and an insertion/deletion strategy.
/// </summary>
/// <param name="characterSubstitution">Strategy supplying the cost of substituting one character for another.</param>
/// <param name="characterInsDel">Strategy supplying per-character insertion and deletion costs.
/// When null, every insertion and deletion costs 1.0.</param>
public WeightedLevenshtein(
    ICharacterSubstitution characterSubstitution,
    ICharacterInsDel characterInsDel)
{
    // The two fields are independent, so assignment order is irrelevant.
    _characterInsDel = characterInsDel;
    _characterSubstitution = characterSubstitution;
}
61
+
62
/// <summary>
/// Computes the weighted Levenshtein distance between two strings, using
/// Double.MaxValue as the limit. Equivalent to
/// Distance(s1, s2, Double.MaxValue).
/// </summary>
/// <param name="s1">The first string to compare.</param>
/// <param name="s2">The second string to compare.</param>
/// <returns>The computed weighted Levenshtein distance.</returns>
public double Distance(string s1, string s2)
    => Distance(s1, s2, double.MaxValue);
46
72
47
73
/// <summary>
48
74
/// Compute the weighted Levenshtein distance using the provided substitution, insertion, and deletion weights.
49
75
/// </summary>
50
76
/// <param name="s1">The first string to compare.</param>
51
77
/// <param name="s2">The second string to compare.</param>
78
+ /// <param name="limit">The maximum result to compute before stopping. This
79
+ /// means that the calculation can terminate early if you
80
+ /// only care about strings with a certain similarity.
81
+ /// Set this to Double.MaxValue if you want to run the
82
+ /// calculation to completion in every case.</param>
52
83
/// <returns>The computed weighted Levenshtein distance.</returns>
53
84
/// <exception cref="ArgumentNullException">If s1 or s2 is null.</exception>
54
- public double Distance ( string s1 , string s2 )
85
+ public double Distance ( string s1 , string s2 , double limit )
55
86
{
56
87
if ( s1 == null )
57
88
{
@@ -78,51 +109,79 @@ public double Distance(string s1, string s2)
78
109
return s1 . Length ;
79
110
}
80
111
81
- // create two work vectors of integer distances
112
+ // create two work vectors of floating point (i.e. weighted) distances
82
113
double [ ] v0 = new double [ s2 . Length + 1 ] ;
83
114
double [ ] v1 = new double [ s2 . Length + 1 ] ;
84
115
double [ ] vtemp ;
85
116
86
117
// initialize v0 (the previous row of distances)
87
- // this row is A[0][i]: edit distance for an empty s
88
- // the distance is just the number of characters to delete from t
89
- for ( int i = 0 ; i < v0 . Length ; i ++ )
118
+ // this row is A[0][i]: edit distance for an empty s1
119
+ // the distance is the cost of inserting each character of s2
120
+ v0 [ 0 ] = 0 ;
121
+ for ( int i = 1 ; i < v0 . Length ; i ++ )
90
122
{
91
- v0 [ i ] = i ;
123
+ v0 [ i ] = v0 [ i - 1 ] + InsertionCost ( s2 [ i - 1 ] ) ;
92
124
}
93
125
94
126
for ( int i = 0 ; i < s1 . Length ; i ++ )
95
127
{
128
+ char s1i = s1 [ i ] ;
129
+ double deletionCost = DeletionCost ( s1i ) ;
130
+
96
131
// calculate v1 (current row distances) from the previous row v0
97
132
// first element of v1 is A[i+1][0]
98
- // edit distance is delete (i+1) chars from s to match empty t
99
- v1 [ 0 ] = i + 1 ;
133
+ // Edit distance is the cost of deleting characters from s1
134
+ // to match empty s2.
135
+ v1 [ 0 ] = v0 [ 0 ] + deletionCost ;
136
+
137
+ double minv1 = v1 [ 0 ] ;
100
138
101
139
// use formula to fill in the rest of the row
102
140
for ( int j = 0 ; j < s2 . Length ; j ++ )
103
141
{
142
+ char s2j = s2 [ j ] ;
104
143
double cost = 0 ;
105
- if ( s1 [ i ] != s2 [ j ] )
144
+
145
+ if ( s1i != s2j )
106
146
{
107
- cost = _characterSubstitution . Cost ( s1 [ i ] , s2 [ j ] ) ;
147
+ cost = _characterSubstitution . Cost ( s1i , s2j ) ;
108
148
}
149
+
150
+ double insertionCost = InsertionCost ( s2j ) ;
151
+
109
152
v1 [ j + 1 ] = Math . Min (
110
- v1 [ j ] + 1 , // Cost of insertion
153
+ v1 [ j ] + insertionCost , // Cost of insertion
111
154
Math . Min (
112
- v0 [ j + 1 ] + 1 , // Cost of remove
155
+ v0 [ j + 1 ] + deletionCost , // Cost of deletion
113
156
v0 [ j ] + cost ) ) ; // Cost of substitution
157
+
158
+ minv1 = Math . Min ( minv1 , v1 [ j + 1 ] ) ;
159
+ }
160
+
161
+ if ( minv1 >= limit )
162
+ {
163
+ return limit ;
114
164
}
115
165
116
166
// copy v1 (current row) to v0 (previous row) for next iteration
117
- //System.arraycopy(v1, 0, v0, 0, v0.length);
167
+ // System.arraycopy(v1, 0, v0, 0, v0.length);
118
168
// Flip references to current and previous row
119
169
vtemp = v0 ;
120
170
v0 = v1 ;
121
171
v1 = vtemp ;
122
-
123
172
}
124
173
125
174
return v0 [ s2 . Length ] ;
126
175
}
176
+
177
/// <summary>
/// Looks up the cost of inserting a single character.
/// </summary>
/// <param name="c">The character being inserted.</param>
/// <returns>
/// The cost reported by the configured insertion/deletion strategy, or 1.0
/// when no strategy was supplied.
/// </returns>
private double InsertionCost(char c)
{
    const double defaultCost = 1.0;

    // Null when no insertion/deletion strategy is configured.
    double? configuredCost = _characterInsDel?.InsertionCost(c);

    return configuredCost ?? defaultCost;
}
181
+
182
/// <summary>
/// Looks up the cost of deleting a single character.
/// </summary>
/// <param name="c">The character being deleted.</param>
/// <returns>
/// The cost reported by the configured insertion/deletion strategy, or 1.0
/// when no strategy was supplied.
/// </returns>
private double DeletionCost(char c)
{
    const double defaultCost = 1.0;

    // Null when no insertion/deletion strategy is configured.
    double? configuredCost = _characterInsDel?.DeletionCost(c);

    return configuredCost ?? defaultCost;
}
127
186
}
128
187
}
0 commit comments