Handle booleans in frequent items and distribution metrics. (#1565)
## Description

Count booleans as `"True"` or `"False"` in `FrequentItemsMetric` and 1
or 0 in `DistributionMetric`.
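
A minimal usage sketch of the new behavior (the `flag` column name and the printed lookups are illustrative, following the tests added below):

```python
import pandas as pd

import whylogs as why

df = pd.DataFrame({"flag": [True, True, False]})
view = why.log(df).view().to_pandas()

# FrequentItemsMetric now records the booleans as the strings "True"/"False".
print(view["frequent_items/frequent_strings"])

# DistributionMetric now counts True as 1 and False as 0,
# so the mean here should come out to 2/3.
print(view["distribution/mean"])
```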

- [ ] I have reviewed the [Guidelines for Contributing](CONTRIBUTING.md)
and the [Code of Conduct](CODE_OF_CONDUCT.md).
richard-rogers authored Sep 20, 2024
1 parent 933f988 commit 0bd8ed3
Showing 2 changed files with 74 additions and 2 deletions.
36 changes: 36 additions & 0 deletions python/tests/core/metrics/test_metrics.py
@@ -128,6 +128,31 @@ def test_distribution_metrics_mixed_np_and_list() -> None:
    assert dist.variance == overall.m2 / (overall.n - 1)


def test_distribution_metrics_bool() -> None:
    import whylogs.core.metrics.metrics as met

    met._BOOL_LIST_CHUNK_SIZE = 2

    dist = DistributionMetric.zero()
    p_col = PreprocessedColumn.apply([True, True, True, True, False, "foo", "bar"])
    operation_result = dist.columnar_update(p_col)
    assert operation_result.ok
    assert round(dist.mean.value, 3) == 0.8


def test_distribution_metrics_bool_mixed() -> None:
    import whylogs.core.metrics.metrics as met

    met._BOOL_LIST_CHUNK_SIZE = 2

    dist = DistributionMetric.zero()
    p_col = PreprocessedColumn.apply([True, False, 42])
    operation_result = dist.columnar_update(p_col)
    assert operation_result.ok
    assert dist.kll.value.get_n() == 3
    assert round(dist.avg, 3) == round(43 / 3, 3)


def test_track_single_values_profile_mean() -> None:
    data = list(range(30))
    df = pd.DataFrame(data, columns=["col1"])
@@ -201,6 +226,17 @@ def test_frequent_items_handling_int_as_string() -> None:
    assert res.array[0][0].value == "1"  # type: ignore


def test_frequent_items_handling_bool_as_string() -> None:
    import whylogs.core.metrics.metrics as met

    met._BOOL_LIST_CHUNK_SIZE = 2
    df = pd.DataFrame({"bool": [True, True, True, True, False]})

    res = why.log(df).view().to_pandas()["frequent_items/frequent_strings"]
    assert res.array[0][0].value == "True"  # type: ignore
    assert res.array[0][1].value == "False"  # type: ignore


def test_frequent_items_bounds_order() -> None:
    df_gamma = pd.DataFrame({"feature1": np.random.gamma(1, 2, 1000).astype(int)})
    df_rand = pd.DataFrame({"feature1": np.random.randint(10000, size=9000)})
40 changes: 38 additions & 2 deletions python/whylogs/core/metrics/metrics.py
@@ -35,6 +35,9 @@
METRIC = TypeVar("METRIC", bound="Metric")


_BOOL_LIST_CHUNK_SIZE = 1000


@dataclass(frozen=True)
class MetricConfig:
    hll_lg_k: int = field(default_factory=lambda: conf.hll_lg_k)
@@ -279,19 +282,37 @@ def columnar_update(self, view: PreprocessedColumn) -> OperationResult:
                    else:
                        first = welford_online_variance_m2(existing=first, new_value=arr[0])

        for lst in [view.list.ints, view.list.floats]:
        def process_int_list(lst: List[int]) -> None:
            nonlocal first, second
            if lst is not None and len(lst) > 0:
                self.kll.value.update_list(num_items=lst)
                n_b = len(lst)
                if n_b > 1:
                    mean_b = statistics.mean(lst)
                    m2_b = statistics.variance(lst) * (n_b - 1)
                    second = VarianceM2Result(n=n_b, mean=mean_b, m2=m2_b)

                    first = parallel_variance_m2(first=first, second=second)
                else:
                    first = welford_online_variance_m2(existing=first, new_value=lst[0])

        for lst in [view.list.ints, view.list.floats]:
            process_int_list(lst)

        if view.bool_count > 0:
            count = view.bool_count_where_true
            while count > 0:
                chunk_size = min(count, _BOOL_LIST_CHUNK_SIZE)
                chunk = [1] * chunk_size
                process_int_list(chunk)
                count -= chunk_size

            count = view.bool_count - view.bool_count_where_true
            while count > 0:
                chunk_size = min(count, _BOOL_LIST_CHUNK_SIZE)
                chunk = [0] * chunk_size
                process_int_list(chunk)
                count -= chunk_size

        self.mean.set(first.mean)
        self.m2.set(first.m2)

@@ -448,6 +469,21 @@ def columnar_update(self, view: PreprocessedColumn) -> OperationResult:
            self.frequent_strings.value.update_int_list(view.list.ints)
            successes += len(view.list.ints)

        if view.bool_count > 0:
            count = view.bool_count_where_true
            while count > 0:
                chunk_size = min(count, _BOOL_LIST_CHUNK_SIZE)
                chunk = ["True"] * chunk_size
                self.frequent_strings.value.update_str_list(chunk)
                count -= chunk_size

            count = view.bool_count - view.bool_count_where_true
            while count > 0:
                chunk_size = min(count, _BOOL_LIST_CHUNK_SIZE)
                chunk = ["False"] * chunk_size
                self.frequent_strings.value.update_str_list(chunk)
                count -= chunk_size

        if view.list.floats is not None:
            self.frequent_strings.value.update_double_list(view.list.floats)
            successes += len(view.list.floats)
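
For context on the chunking in the diff above: instead of materializing one list entry per boolean row, the update loops feed the sketches lists of at most `_BOOL_LIST_CHUNK_SIZE` elements at a time. A standalone sketch of that pattern (the `_update_in_chunks` helper and its `update` callback are illustrative, not part of whylogs):

```python
from typing import Callable, List

_BOOL_LIST_CHUNK_SIZE = 1000


def _update_in_chunks(count: int, value: int, update: Callable[[List[int]], None]) -> None:
    """Feed `count` copies of `value` to `update` without building one huge list."""
    while count > 0:
        chunk_size = min(count, _BOOL_LIST_CHUNK_SIZE)
        update([value] * chunk_size)
        count -= chunk_size


# e.g. 2,500,000 True rows are processed as 2,500 chunks of 1,000 ones.
chunk_sizes: List[int] = []
_update_in_chunks(2_500_000, 1, lambda chunk: chunk_sizes.append(len(chunk)))
assert sum(chunk_sizes) == 2_500_000 and max(chunk_sizes) == 1_000
```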