Skip to content

Commit

Permalink
fix potentially incorrect results of JaroWinkler when using high pref…
Browse files Browse the repository at this point in the history
…ix weights
  • Loading branch information
maxbachmann committed Apr 6, 2024
1 parent e6a33b1 commit a781dd8
Show file tree
Hide file tree
Showing 5 changed files with 43 additions and 2 deletions.
13 changes: 12 additions & 1 deletion CHANGELOG.rst
Original file line number Diff line number Diff line change
@@ -1,9 +1,20 @@
Changelog
---------

[3.7.0] - 2024-03-21
[3.8.0] - 2024-04-06
^^^^^^^^^^^^^^^^^^^^
Added
~~~~~
* added ``process.cpdist`` which allows pairwise comparison of two collections of inputs

Fixed
~~~~~
- fix some minor errors in the type hints
- fix potentially incorrect results of JaroWinkler when using high prefix weights


[3.7.0] - 2024-03-21
^^^^^^^^^^^^^^^^^^^^
Changed
~~~~~~~
* reduce importtime
Expand Down
4 changes: 4 additions & 0 deletions src/rapidfuzz/distance/JaroWinkler_py.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@ def similarity(
if score_cutoff is None:
score_cutoff = 0

if prefix_weight > 1.0 or prefix_weight < 0.0:
raise ValueError("prefix_weight has to be in the range 0.0 - 1.0")

s1, s2 = conv_sequences(s1, s2)
P_len = len(s1)
T_len = len(s2)
Expand All @@ -83,6 +86,7 @@ def similarity(
Sim = Jaro.similarity(s1, s2, score_cutoff=jaro_score_cutoff)
if Sim > 0.7:
Sim += prefix * prefix_weight * (1.0 - Sim)
Sim = min(Sim, 1.0)

return Sim if Sim >= score_cutoff else 0

Expand Down
16 changes: 16 additions & 0 deletions src/rapidfuzz/distance/metrics_cpp.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -947,6 +947,9 @@ def jaro_winkler_distance(s1, s2, *, double prefix_weight=0.1, processor=None, s
if is_none(s1) or is_none(s2):
return 1.0

if prefix_weight > 1.0 or prefix_weight < 0.0:
raise ValueError("prefix_weight has to be in the range 0.0 - 1.0")

cdef double c_score_cutoff = get_score_cutoff_f64(score_cutoff, 1.0, 0.0)
preprocess_strings(s1, s2, processor, &s1_proc, &s2_proc)
return jaro_winkler_distance_func(s1_proc.string, s2_proc.string, prefix_weight, c_score_cutoff)
Expand All @@ -957,6 +960,9 @@ def jaro_winkler_similarity(s1, s2, *, double prefix_weight=0.1, processor=None,
if is_none(s1) or is_none(s2):
return 0.0

if prefix_weight > 1.0 or prefix_weight < 0.0:
raise ValueError("prefix_weight has to be in the range 0.0 - 1.0")

cdef double c_score_cutoff = get_score_cutoff_f64(score_cutoff, 0.0, 1.0)
preprocess_strings(s1, s2, processor, &s1_proc, &s2_proc)
return jaro_winkler_similarity_func(s1_proc.string, s2_proc.string, prefix_weight, c_score_cutoff)
Expand All @@ -967,6 +973,9 @@ def jaro_winkler_normalized_distance(s1, s2, *, double prefix_weight=0.1, proces
if is_none(s1) or is_none(s2):
return 1.0

if prefix_weight > 1.0 or prefix_weight < 0.0:
raise ValueError("prefix_weight has to be in the range 0.0 - 1.0")

cdef double c_score_cutoff = get_score_cutoff_f64(score_cutoff, 1.0, 0.0)
preprocess_strings(s1, s2, processor, &s1_proc, &s2_proc)
return jaro_winkler_normalized_distance_func(s1_proc.string, s2_proc.string, prefix_weight, c_score_cutoff)
Expand All @@ -977,6 +986,9 @@ def jaro_winkler_normalized_similarity(s1, s2, *, double prefix_weight=0.1, proc
if is_none(s1) or is_none(s2):
return 0.0

if prefix_weight > 1.0 or prefix_weight < 0.0:
raise ValueError("prefix_weight has to be in the range 0.0 - 1.0")

cdef double c_score_cutoff = get_score_cutoff_f64(score_cutoff, 0.0, 1.0)
preprocess_strings(s1, s2, processor, &s1_proc, &s2_proc)
return jaro_winkler_normalized_similarity_func(s1_proc.string, s2_proc.string, prefix_weight, c_score_cutoff)
Expand All @@ -988,6 +1000,10 @@ cdef bool JaroWinklerKwargsInit(RF_Kwargs * self, dict kwargs) except False:
raise MemoryError

prefix_weight[0] = kwargs.get("prefix_weight", 0.1)
if prefix_weight[0] > 1.0 or prefix_weight[0] < 0.0:
free(prefix_weight)
raise ValueError("prefix_weight has to be in the range 0.0 - 1.0")

self.context = prefix_weight
self.dtor = KwargsDeinit
return True
Expand Down
10 changes: 10 additions & 0 deletions tests/distance/test_JaroWinkler.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,16 @@
def test_hash_special_case():
    """Compare two integer sequences that differ only in a negative element."""
    score = JaroWinkler.similarity([0, -1], [0, -2])
    assert pytest.approx(score) == 0.666666

def test_large_prefix_weight():
    """Check that prefix weights of 0.5 and 1.0 both give a similarity of ~1.0.

    Both weights are exercised on the same pair of strings, where the
    first string is a prefix of the second.
    """
    for weight in (0.5, 1.0):
        score = JaroWinkler.similarity("milyarder", "milyarderlik", prefix_weight=weight)
        assert pytest.approx(score) == 1.0

def test_invalid_prefix_weight():
    """Check that a prefix_weight of -0.1 or 1.1 raises ValueError.

    Each call must fail with the range error message rather than
    returning a score.
    """
    expected_message = "prefix_weight has to be in the range 0.0 - 1.0"
    for bad_weight in (-0.1, 1.1):
        with pytest.raises(ValueError, match=expected_message):
            JaroWinkler.similarity("milyarder", "milyarderlik", prefix_weight=bad_weight)

def test_edge_case_lengths():
"""
Expand Down

0 comments on commit a781dd8

Please sign in to comment.