Skip to content

Commit

Permalink
Improved String Grouping (#6)
Browse files Browse the repository at this point in the history
* Improved clustering of strings 
* Update workflow
* Additional tests
* Update Flair dependency
  • Loading branch information
MaartenGr authored Dec 7, 2020
1 parent bbcfe3d commit 74945f8
Show file tree
Hide file tree
Showing 6 changed files with 52 additions and 19 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/testing.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.6, 3.7, 3.8]
python-version: [3.7, 3.8]

steps:
- uses: actions/checkout@v2
Expand Down
3 changes: 3 additions & 0 deletions docs/releases.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
v0.2.2
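- Update grouping: when a list of strings is compared with itself, all strings can now be clustered by setting `group_all_strings=True`; otherwise only the strings mapped `To` are clustered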

v0.2.0
- Update naming convention matcher --> model
- Update documentation
Expand Down
2 changes: 1 addition & 1 deletion polyfuzz/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
from .polyfuzz import PolyFuzz
__version__ = "0.2.1"
__version__ = "0.2.2"
44 changes: 30 additions & 14 deletions polyfuzz/polyfuzz.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,13 +189,17 @@ def visualize_precision_recall(self,

def group(self,
model: Union[str, BaseMatcher] = None,
link_min_similarity: float = 0.75):
link_min_similarity: float = 0.75,
group_all_strings: bool = False):
""" From the matches, group the `To` matches together using single linkage
Arguments:
model: you can choose one of the models in `polyfuzz.models` to be used as a grouper
link_min_similarity: the minimum similarity between strings before they are grouped
in a single linkage fashion
group_all_strings: if you want to compare a list of strings with itself and then cluster
those strings, set this to True. Otherwise, only the strings that
were mapped To are clustered.
Updates:
self.matches: Adds a column `Group` that is the grouped version of the `To` column
Expand Down Expand Up @@ -223,13 +227,9 @@ def group(self,
elif not model:
model = TFIDF(n_gram_range=(3, 3), min_similarity=link_min_similarity)

# Group per model
for name, match in self.matches.items():
strings = list(self.matches[name].To.dropna().unique())
matches = model.match(strings, strings)
clusters, cluster_id_map, cluster_name_map = single_linkage(matches, link_min_similarity)
self._map_groups(name, cluster_name_map)
self.clusters[name] = clusters
self.cluster_mappings[name] = cluster_id_map
self._create_groups(name, model, link_min_similarity, group_all_strings)

def get_ids(self) -> Union[str, List[str], None]:
""" Get all model ids for easier access """
Expand Down Expand Up @@ -285,17 +285,33 @@ def get_cluster_mappings(self, name: str = None) -> Mapping[str, int]:

return self.cluster_mappings

def _map_groups(self, name: str, cluster_name_map: Mapping[str, str]):
""" Map the 'to' list to groups """
def _create_groups(self,
                   name: str,
                   model: BaseMatcher,
                   link_min_similarity: float,
                   group_all_strings: bool):
    """ Cluster the strings of one set of matches and store the result as a `Group` column

    Arguments:
        name: key into `self.matches` selecting the matches to group
        model: matcher used to compare the selected strings with themselves
        link_min_similarity: similarity threshold handed to `single_linkage`
        group_all_strings: if True, the unique `From` strings are clustered,
                           otherwise the unique `To` strings are clustered

    Updates:
        self.matches[name]: gains (or overwrites) the column `Group`
        self.clusters[name]: the clusters returned by `single_linkage`
        self.cluster_mappings[name]: the cluster id map returned by `single_linkage`
    """
    # Select which side of the matches supplies the strings to cluster
    source_column = "From" if group_all_strings else "To"
    strings = list(self.matches[name][source_column].dropna().unique())

    # Compare the strings with themselves and link them into clusters
    self_matches = model.match(strings, strings)
    clusters, cluster_id_map, cluster_name_map = single_linkage(self_matches, link_min_similarity)

    # Replace every `To` value by its cluster name; values without a cluster keep their `To` value
    df = self.matches[name]
    df["Group"] = df['To'].map(cluster_name_map).fillna(df['To'])

    # Fix that some mappings from "From" end up in "Group":
    # such rows fall back to their own `To` value
    from_leaked_into_group = (df.From != df.To) & (df.From == df.Group)
    df.loc[from_leaked_into_group, "Group"] = df.loc[from_leaked_into_group, "To"]
    self.matches[name] = df

    # Track clusters and their ids
    self.clusters[name] = clusters
    self.cluster_mappings[name] = cluster_id_map

def _update_model_ids(self):
""" Update model ids such that there is no overlap between ids """
# Give models a model_id if it didn't already exist
Expand Down
6 changes: 3 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
]

base_packages = [
"numpy>= 1.18.5",
"numpy>= 1.18.5,<=1.19.4",
"scipy>= 1.3.1",
"pandas>= 0.25.3",
"tqdm>=4.41.1",
Expand All @@ -25,7 +25,7 @@
]

fast_cosine = ["sparse_dot_topn>=0.2.9"]
embeddings_packages = ["flair>= 0.6.1.post1"]
embeddings_packages = ["torch>=1.2.0", "flair>= 0.7"]

extra_packages = embeddings_packages + fast_cosine

Expand All @@ -37,7 +37,7 @@
setup(
name="polyfuzz",
packages=find_packages(exclude=["notebooks", "docs"]),
version="0.2.1",
version="0.2.2",
author="Maarten Grootendorst",
author_email="maartengrootendorst@gmail.com",
description="PolyFuzz performs fuzzy string matching, grouping, and evaluation.",
Expand Down
14 changes: 14 additions & 0 deletions tests/test_polyfuzz.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,20 @@ def test_grouper(method):
assert model.get_cluster_mappings() == {'apples': 1, 'apple': 1}


def test_grouper_same_list():
    """ Check grouping when a list is matched against itself with `group_all_strings=True` """
    fuzzer = PolyFuzz("TF-IDF").match(from_list, from_list)
    fuzzer.group(link_min_similarity=0.75, group_all_strings=True)
    result = fuzzer.get_matches()

    # Shape and content of the match table, including the added `Group` column
    assert isinstance(result, pd.DataFrame)
    assert result["Similarity"].mean() > 0.3
    assert len(result) == 6
    assert list(result.columns) == ['From', 'To', 'Similarity', 'Group']

    # Expected single cluster and its id mapping
    expected_clusters = {1: ['apples', 'apple', 'appl']}
    expected_mappings = {'apples': 1, 'apple': 1, 'appl': 1}
    assert fuzzer.get_clusters() == expected_clusters
    assert fuzzer.get_cluster_mappings() == expected_mappings


@pytest.mark.parametrize("method", ["Unknown Model"])
def test_wrongbase_model(method):
with pytest.raises(ValueError):
Expand Down

0 comments on commit 74945f8

Please sign in to comment.