Handle booleans in frequent items and distribution metrics. (#1565)
## Description

Count booleans as `"True"` or `"False"` in `FrequentItemsMetric` and 1
or 0 in `DistributionMetric`.
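
A minimal usage sketch of the new behavior (the `flag` column name and the printed lookups are illustrative, following the tests added below):

```python
import pandas as pd

import whylogs as why

df = pd.DataFrame({"flag": [True, True, False]})
view = why.log(df).view().to_pandas()

# FrequentItemsMetric now records the booleans as the strings "True"/"False".
print(view["frequent_items/frequent_strings"])

# DistributionMetric now counts True as 1 and False as 0,
# so the mean here should come out to 2/3.
print(view["distribution/mean"])
```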

- [ ] I have reviewed the [Guidelines for Contributing](CONTRIBUTING.md)
and the [Code of Conduct](CODE_OF_CONDUCT.md).
richard-rogers authored Sep 20, 2024
1 parent 933f988 commit 0bd8ed3
Showing 2 changed files with 74 additions and 2 deletions.
36 changes: 36 additions & 0 deletions python/tests/core/metrics/test_metrics.py
@@ -128,6 +128,31 @@ def test_distribution_metrics_mixed_np_and_list() -> None:
    assert dist.variance == overall.m2 / (overall.n - 1)


def test_distribution_metrics_bool() -> None:
    import whylogs.core.metrics.metrics as met

    met._BOOL_LIST_CHUNK_SIZE = 2

    dist = DistributionMetric.zero()
    p_col = PreprocessedColumn.apply([True, True, True, True, False, "foo", "bar"])
    operation_result = dist.columnar_update(p_col)
    assert operation_result.ok
    assert round(dist.mean.value, 3) == 0.8


def test_distribution_metrics_bool_mixed() -> None:
    import whylogs.core.metrics.metrics as met

    met._BOOL_LIST_CHUNK_SIZE = 2

    dist = DistributionMetric.zero()
    p_col = PreprocessedColumn.apply([True, False, 42])
    operation_result = dist.columnar_update(p_col)
    assert operation_result.ok
    assert dist.kll.value.get_n() == 3
    assert round(dist.avg, 3) == round(43 / 3, 3)


def test_track_single_values_profile_mean() -> None:
    data = list(range(30))
    df = pd.DataFrame(data, columns=["col1"])
@@ -201,6 +226,17 @@ def test_frequent_items_handling_int_as_string() -> None:
    assert res.array[0][0].value == "1"  # type: ignore


def test_frequent_items_handling_bool_as_string() -> None:
    import whylogs.core.metrics.metrics as met

    met._BOOL_LIST_CHUNK_SIZE = 2
    df = pd.DataFrame({"bool": [True, True, True, True, False]})

    res = why.log(df).view().to_pandas()["frequent_items/frequent_strings"]
    assert res.array[0][0].value == "True"  # type: ignore
    assert res.array[0][1].value == "False"  # type: ignore


def test_frequent_items_bounds_order() -> None:
    df_gamma = pd.DataFrame({"feature1": np.random.gamma(1, 2, 1000).astype(int)})
    df_rand = pd.DataFrame({"feature1": np.random.randint(10000, size=9000)})
40 changes: 38 additions & 2 deletions python/whylogs/core/metrics/metrics.py
@@ -35,6 +35,9 @@
METRIC = TypeVar("METRIC", bound="Metric")


_BOOL_LIST_CHUNK_SIZE = 1000


@dataclass(frozen=True)
class MetricConfig:
    hll_lg_k: int = field(default_factory=lambda: conf.hll_lg_k)
@@ -279,19 +282,37 @@ def columnar_update(self, view: PreprocessedColumn) -> OperationResult:
                    else:
                        first = welford_online_variance_m2(existing=first, new_value=arr[0])

        for lst in [view.list.ints, view.list.floats]:
        def process_int_list(lst: List[int]) -> None:
            nonlocal first, second
            if lst is not None and len(lst) > 0:
                self.kll.value.update_list(num_items=lst)
                n_b = len(lst)
                if n_b > 1:
                    mean_b = statistics.mean(lst)
                    m2_b = statistics.variance(lst) * (n_b - 1)
                    second = VarianceM2Result(n=n_b, mean=mean_b, m2=m2_b)

                    first = parallel_variance_m2(first=first, second=second)
                else:
                    first = welford_online_variance_m2(existing=first, new_value=lst[0])

        for lst in [view.list.ints, view.list.floats]:
            process_int_list(lst)

        if view.bool_count > 0:
            count = view.bool_count_where_true
            while count > 0:
                chunk_size = min(count, _BOOL_LIST_CHUNK_SIZE)
                chunk = [1] * chunk_size
                process_int_list(chunk)
                count -= chunk_size

            count = view.bool_count - view.bool_count_where_true
            while count > 0:
                chunk_size = min(count, _BOOL_LIST_CHUNK_SIZE)
                chunk = [0] * chunk_size
                process_int_list(chunk)
                count -= chunk_size

        self.mean.set(first.mean)
        self.m2.set(first.m2)

@@ -448,6 +469,21 @@ def columnar_update(self, view: PreprocessedColumn) -> OperationResult:
            self.frequent_strings.value.update_int_list(view.list.ints)
            successes += len(view.list.ints)

        if view.bool_count > 0:
            count = view.bool_count_where_true
            while count > 0:
                chunk_size = min(count, _BOOL_LIST_CHUNK_SIZE)
                chunk = ["True"] * chunk_size
                self.frequent_strings.value.update_str_list(chunk)
                count -= chunk_size

            count = view.bool_count - view.bool_count_where_true
            while count > 0:
                chunk_size = min(count, _BOOL_LIST_CHUNK_SIZE)
                chunk = ["False"] * chunk_size
                self.frequent_strings.value.update_str_list(chunk)
                count -= chunk_size

        if view.list.floats is not None:
            self.frequent_strings.value.update_double_list(view.list.floats)
            successes += len(view.list.floats)
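
For context on the chunking in the diff above: instead of materializing one list entry per boolean row, the update loops feed the sketches lists of at most `_BOOL_LIST_CHUNK_SIZE` elements at a time. A standalone sketch of that pattern (the `_update_in_chunks` helper and its `update` callback are illustrative, not part of whylogs):

```python
from typing import Callable, List

_BOOL_LIST_CHUNK_SIZE = 1000


def _update_in_chunks(count: int, value: int, update: Callable[[List[int]], None]) -> None:
    """Feed `count` copies of `value` to `update` without building one huge list."""
    while count > 0:
        chunk_size = min(count, _BOOL_LIST_CHUNK_SIZE)
        update([value] * chunk_size)
        count -= chunk_size


# e.g. 2,500,000 True rows are processed as 2,500 chunks of 1,000 ones.
chunk_sizes: List[int] = []
_update_in_chunks(2_500_000, 1, lambda chunk: chunk_sizes.append(len(chunk)))
assert sum(chunk_sizes) == 2_500_000 and max(chunk_sizes) == 1_000
```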