Ranking metrics API improvements (#1507)

## Description

Removes the `convert_non_numeric` argument and swaps `prediction_column` for `target_column` when only relevance scores are passed.

## Changes

- Determines `convert_non_numeric` automatically by looking for strings in the prediction & target columns
- Swaps `prediction_column` and `target_column` when only one column name is passed

Closes #1504
Closes #1505

- [ ] I have reviewed the [Guidelines for Contributing](CONTRIBUTING.md) and the [Code of Conduct](CODE_OF_CONDUCT.md).
whylabs · Apr 25, 2024 · b91949f · b91949f
1 parent 7b83a96
commit b91949f
Show file tree

Hide file tree

Showing 2 changed files with 45 additions and 13 deletions.
diff --git a/python/tests/experimental/api/test_logger.py b/python/tests/experimental/api/test_logger.py
@@ -17,7 +17,9 @@ def test_log_batch_ranking_metrics_single_simple():
         }
     )
     result = log_batch_ranking_metrics(
-        data=single_df, prediction_column="raw_predictions", target_column="raw_targets", convert_non_numeric=True
+        data=single_df,
+        prediction_column="raw_predictions",
+        target_column="raw_targets",
     )
     pandas_summary = result.view().to_pandas()
 
@@ -53,10 +55,10 @@ def test_log_batch_ranking_metrics_single_simple():
 
 def test_log_batch_ranking_metrics_binary_simple():
     binary_df = pd.DataFrame(
-        {"raw_predictions": [[True, False, True], [False, False, False], [True, True, False], [False, True, False]]}
+        {"raw_targets": [[True, False, True], [False, False, False], [True, True, False], [False, True, False]]}
     )
 
-    result = log_batch_ranking_metrics(data=binary_df, prediction_column="raw_predictions", k=2)
+    result = log_batch_ranking_metrics(data=binary_df, target_column="raw_targets", k=2)
     pandas_summary = result.view().to_pandas()
 
     k = 2
@@ -109,7 +111,6 @@ def test_log_batch_ranking_metrics_multiple_simple():
         prediction_column="raw_predictions",
         target_column="raw_targets",
         k=k,
-        convert_non_numeric=True,
     )
     pandas_summary = result.view().to_pandas()
 
@@ -139,9 +140,9 @@ def test_log_batch_ranking_metrics_multiple_simple():
 
 
 def test_log_batch_ranking_metrics_default_target():
-    multiple_df = pd.DataFrame({"raw_predictions": [[3, 2, 3, 0, 1, 2, 3, 2]]})
+    multiple_df = pd.DataFrame({"raw_targets": [[3, 2, 3, 0, 1, 2, 3, 2]]})
 
-    result = log_batch_ranking_metrics(data=multiple_df, prediction_column="raw_predictions", k=3)
+    result = log_batch_ranking_metrics(data=multiple_df, target_column="raw_targets", k=3)
     pandas_summary = result.view().to_pandas()
 
     k = 3

diff --git a/python/whylogs/experimental/api/logger/__init__.py b/python/whylogs/experimental/api/logger/__init__.py
@@ -131,18 +131,39 @@ def _calculate_average_precisions(
     return averages
 
 
+def _all_strings(data: pd.Series) -> bool:
+    return all([all([isinstance(y, str) for y in x]) for x in data])
+
+
 def log_batch_ranking_metrics(
     data: pd.core.frame.DataFrame,
     prediction_column: Optional[str] = None,
     target_column: Optional[str] = None,
     score_column: Optional[str] = None,
     k: Optional[int] = None,
-    convert_non_numeric=False,
     schema: Union[DatasetSchema, None] = None,
     log_full_data: bool = False,
 ) -> ViewResultSet:
     """Log ranking metrics for a batch of data.
 
+    You can call the function several ways:
+      - Pass both prediction_column and target_column.
+          - The named columns contain lists of strings. In this case, the prediction column contains the
+            items the model has predicted are relevant, and the target column contains the items that
+            are actually relevant. In this case, relevance is boolean.
+
+          - The prediction column contains lists of integers and the target column contains lists of numbers
+            or booleans. The value at the i-th position in the predicted list is the predicted rank of the i-th
+            element of the domain. The value at the i-th position in the target list is the true relevance score of the
+            i-th element of the domain. The score can be numeric or boolean. Higher scores indicate higher relevance.
+
+      - Pass both target_column and score_column. The value at the i-th position in the target list is the true relevance
+        of the i-th element of the domain (represented as a number, higher being more relevant; or boolean). The value at
+        the i-th position in the score list is the model output for the i-th element of the domain.
+
+      - Pass only target_column. The target column contains lists of numbers or booleans. The list entries are the true
+        relevance of the items predicted by the model in prediction order.
+
     Parameters
     ----------
     data : pd.core.frame.DataFrame
@@ -157,9 +178,6 @@ def log_batch_ranking_metrics(
     k : Optional[int], optional
         Consider the top k ranks for metrics calculation.
         If `None`, use all outputs, by default None
-    convert_non_numeric : bool, optional
-        Indicates whether prediction/target columns are non-numeric.
-        If True, prediction/target should be strings, by default False
     schema : Union[DatasetSchema, None], optional
         Defines the schema for tracking metrics in whylogs, by default None
     log_full_data : bool, optional
@@ -226,19 +244,28 @@ def log_batch_ranking_metrics(
 
         binary_single_df = pd.DataFrame(
             {
-                "raw_predictions": [
+                "raw_targets": [
                     [True, False, True], # First recommended item: Relevant, Second: Not relevant, Third: Relevant
                     [False, False, False], # None of the recommended items are relevant
                     [True, True, False], # First and second recommended items are relevant
                 ]
             }
         )
 
-        result = log_batch_ranking_metrics(data=binary_single_df, prediction_column="raw_predictions", k=3)
+        result = log_batch_ranking_metrics(data=binary_single_df, target_column="raw_targets", k=3)
 
     """
     formatted_data = data.copy(deep=True)  # TODO: does this have to be deep?
 
+    if score_column is not None and prediction_column is not None:
+        raise ValueError("Cannot specify both score_column and prediction_column")
+
+    if prediction_column is None and score_column is None and target_column is not None:
+        # https://github.com/whylabs/whylogs/issues/1505
+        # The column use logic is complex, so just swapping them here for this case
+        # rather than unraveling all the use cases.
+        prediction_column, target_column = target_column, prediction_column
+
     if prediction_column is None:
         if score_column is not None and target_column is not None:
             prediction_column = "__predictions"
@@ -248,7 +275,7 @@ def log_batch_ranking_metrics(
                 lambda row: list(np.argsort(np.argsort(-np.array(row))) + 1)
             )
         else:
-            raise ValueError("Either prediction_column or score+target columns must be specified")
+            raise ValueError("Either target_column or score+target columns must be specified")
 
     relevant_cols = [prediction_column]
 
@@ -280,6 +307,10 @@ def log_batch_ranking_metrics(
     if k and k < 1:
         raise ValueError("k must be a positive integer")
 
+    convert_non_numeric = _all_strings(formatted_data[prediction_column]) and _all_strings(
+        formatted_data[target_column]
+    )
+
     row_wise_functions = RowWiseMetrics(target_column, prediction_column, convert_non_numeric)
     formatted_data["count_at_k"] = formatted_data.apply(row_wise_functions.relevant_counter, args=(k,), axis=1)
     formatted_data["count_all"] = formatted_data.apply(row_wise_functions.relevant_counter, args=(_max_k,), axis=1)