From 016be70acb7de455d77929e9c1965a0718eea6a0 Mon Sep 17 00:00:00 2001 From: rly Date: Thu, 4 Apr 2024 02:18:36 -0700 Subject: [PATCH] Minor updates to schema, update get_class and tests --- hdmf_ml/results_table.py | 205 +++++++++++++++++----------------- hdmf_ml/schema/ml_table.yaml | 4 +- hdmf_ml/schema/namespace.yaml | 4 +- tests/test_results_table.py | 6 +- 4 files changed, 112 insertions(+), 107 deletions(-) diff --git a/hdmf_ml/results_table.py b/hdmf_ml/results_table.py index 3403995..fb8b458 100644 --- a/hdmf_ml/results_table.py +++ b/hdmf_ml/results_table.py @@ -1,16 +1,29 @@ from hdmf.utils import docval, popargs from hdmf.backends.hdf5 import H5DataIO -from hdmf.common import get_class, register_class +from hdmf.common import get_class, register_class, VectorData import numpy as np from sklearn.preprocessing import LabelEncoder data_type = ("array_data", "data") +SupervisedOutput = get_class("SupervisedOutput", "hdmf-ml") +TrainValidationTestSplit = get_class("TrainValidationTestSplit", "hdmf-ml") +CrossValidationSplit = get_class("CrossValidationSplit", "hdmf-ml") +ClassProbability = get_class("ClassProbability", "hdmf-ml") +ClassLabel = get_class("ClassLabel", "hdmf-ml") +TopKProbabilities = get_class("TopKProbabilities", "hdmf-ml") +TopKClasses = get_class("TopKClasses", "hdmf-ml") +RegressionOutput = get_class("RegressionOutput", "hdmf-ml") +ClusterLabel = get_class("ClusterLabel", "hdmf-ml") +EmbeddedValues = get_class("EmbeddedValues", "hdmf-ml") + +_AutoGenResultsTable = get_class("ResultsTable", "hdmf-ml") + @register_class("ResultsTable", "hdmf-ml") -class ResultsTable(get_class("ResultsTable", "hdmf-ml")): - # override the auto-generated ResultsTable class +class ResultsTable(_AutoGenResultsTable): + # extend the auto-generated ResultsTable class @docval( { @@ -48,9 +61,13 @@ def n_samples(self): {"name": "name", "type": str, "doc": "the name of this column"}, {"name": "description", "type": str, "doc": "a description for this column"}, { - "name": "dim2", + "name": "dim2_kwarg", "type": str, - "doc": "the argument holding the second dimension", + "doc": ( + "the name of the argument in kwargs holding the size of the other dimension(s)" + "as an int for a 2D shape or a list/tuple/1-D array for an N-D shape where " + "N is equal to the length of the list/tuple/1-D array + 1" + ), "default": None, }, { @@ -63,11 +80,12 @@ def n_samples(self): ) def __add_col(self, **kwargs): """A helper function to handle boiler-plate code for adding columns to a ResultsTable""" - cls, data, name, description, dim2, dtype = popargs( - "cls", "data", "name", "description", "dim2", "dtype", kwargs + cls, data, name, description, dim2_kwarg, dtype = popargs( + "cls", "data", "name", "description", "dim2_kwarg", "dtype", kwargs ) - if dim2 is not None: - dim2 = kwargs.pop(dim2) + # get the size of the other dimension(s) from kwargs + if dim2_kwarg is not None: + dim2 = kwargs.pop(dim2_kwarg) if data is None: if self.n_samples is None: raise ValueError( @@ -78,15 +96,20 @@ def __add_col(self, **kwargs): shape = (self.n_samples,) if dim2 is not None: if isinstance(dim2, (int, np.integer)): + # dim2 is an integer, so column is 2D shape = (self.n_samples, dim2) elif isinstance(dim2, (list, tuple)): + # dim2 is a list or tuple, so shape is N-D shape = (self.n_samples, *dim2) elif isinstance(dim2, np.array) and len(dim2.shape) == 1: + # dim2 is a 1D array, so shape is N-D shape = (self.n_samples, *dim2) else: ValueError( - f"Unrecognized type for dim2: {type(dim2)} - expected integether or 1-D 
array-like" + f"Unrecognized type for dim2: {type(dim2)} - expected integer or 1-D array-like" ) + + # create empty DataIO object data = H5DataIO(shape=shape, dtype=dtype) if name in self: @@ -95,14 +118,12 @@ def __add_col(self, **kwargs): ) if len(self.id) == 0: self.id.extend(np.arange(len(data))) - if len(self.id) != len(data): + elif len(self.id) != len(data): raise ValueError( f"New column {name} of length {len(data)} is not the same length as " f"existings columns of length {len(self.id)}" ) - if isinstance(cls, str): - cls = get_class(cls, "hdmf-ml") self.add_column( data=data, name=name, description=description, col_cls=cls, **kwargs ) @@ -115,15 +136,9 @@ def __add_col(self, **kwargs): { "name": "data", "type": data_type, - "doc": "data for this column", + "doc": "train/validation/test mask (enum: train, validation, test) for each sample", "default": None, }, - { - "name": "name", - "type": str, - "doc": "the name of this column", - "default": "tvt_split", - }, { "name": "description", "type": str, @@ -133,23 +148,18 @@ def __add_col(self, **kwargs): ) def add_tvt_split(self, **kwargs): """Add mask of 0, 1, 2 indicating which samples were used for training, validation, and testing.""" + kwargs["name"] = "tvt_split" kwargs["enum"] = ["train", "validate", "test"] kwargs["dtype"] = int - return self.__add_col("TrainValidationTestSplit", **kwargs) + return self.__add_col(TrainValidationTestSplit, **kwargs) @docval( { "name": "data", "type": data_type, - "doc": "train-validation-test split data", + "doc": "cross-validation split labels (int) for each sample", "default": None, }, - { - "name": "name", - "type": str, - "doc": "the name of this column", - "default": "tvt_split", - }, { "name": "description", "type": str, @@ -165,30 +175,27 @@ def add_tvt_split(self, **kwargs): ) def add_cv_split(self, **kwargs): """Add cross-validation split mask""" + kwargs["name"] = "cv_split" if kwargs["data"] is None or isinstance(kwargs["data"], H5DataIO): if kwargs["n_splits"] is None: raise ValueError("n_splits must be specified if not passing data in") else: if kwargs["n_splits"] is None: + # set n_splits to one more than the max value of the data kwargs["n_splits"] = np.max(kwargs["data"]) + 1 if not isinstance(kwargs["n_splits"], (int, np.integer)): + # this should have been checked in docval? 
raise ValueError("Got non-integer data for cross-validation split") kwargs["dtype"] = int - return self.__add_col("CrossValidationSplit", **kwargs) + return self.__add_col(CrossValidationSplit, **kwargs) @docval( { "name": "data", "type": data_type, - "doc": "ground truth labels for each sample", + "doc": "ground truth labels (int, bytes, or str) for each sample", "default": None, }, - { - "name": "name", - "type": str, - "doc": "the name of this column", - "default": "true_label", - }, { "name": "description", "type": str, @@ -197,27 +204,23 @@ def add_cv_split(self, **kwargs): }, ) def add_true_label(self, **kwargs): - """Add ground truth labels for each sample""" + """Add ground truth labels (int, bytes, or str) for each sample""" + kwargs["name"] = "true_label" if isinstance(kwargs["data"][0], (bytes, str)): + # if data are strings, convert to enum data type (data are ints, enum elements are strings) enc = LabelEncoder() - kwargs["data"] = enc.fit_transform(kwargs["data"]) + kwargs["data"] = np.uint(enc.fit_transform(kwargs["data"])) kwargs["enum"] = enc.classes_ kwargs["dtype"] = int - return self.__add_col("VectorData", **kwargs) + return self.__add_col(VectorData, **kwargs) @docval( { "name": "data", "type": data_type, - "doc": "probability of sample for each class", + "doc": "probability of sample (float) for each class", "default": None, }, - { - "name": "name", - "type": str, - "doc": "the name of this column", - "default": "predicted_probability", - }, { "name": "description", "type": str, @@ -227,29 +230,26 @@ def add_true_label(self, **kwargs): { "name": "n_classes", "type": int, - "doc": "the number of classes", + "doc": "the number of classes, used to define the shape of the column only if data is None", "default": None, }, ) def add_predicted_probability(self, **kwargs): """Add probability of the sample for each class in the model""" + kwargs["name"] = "predicted_probability" kwargs["dtype"] = float - kwargs["dim2"] = "n_classes" - return self.__add_col("ClassProbability", **kwargs) + kwargs["dim2_kwarg"] = "n_classes" + # n_classes kwarg is passed into __add_col and will be read as the length of the second dimension + # of the data only if the data kwarg is None. 
+ return self.__add_col(ClassProbability, **kwargs) @docval( { "name": "data", "type": data_type, - "doc": "predicted class label for each sample", + "doc": "predicted class label (int) for each sample", "default": None, }, - { - "name": "name", - "type": str, - "doc": "the name of this column", - "default": "predicted_class", - }, { "name": "description", "type": str, @@ -258,23 +258,18 @@ def add_predicted_probability(self, **kwargs): }, ) def add_predicted_class(self, **kwargs): - """Add predicted class label for each sample""" + """Add predicted class label (int) for each sample""" + kwargs["name"] = "predicted_class" kwargs["dtype"] = int - return self.__add_col("ClassLabel", **kwargs) + return self.__add_col(ClassLabel, **kwargs) @docval( { "name": "data", "type": data_type, - "doc": "predicted value for each sample", + "doc": "predicted value (float) for each sample", "default": None, }, - { - "name": "name", - "type": str, - "doc": "the name of this column", - "default": "predicted_value", - }, { "name": "description", "type": str, @@ -284,29 +279,29 @@ def add_predicted_class(self, **kwargs): { "name": "n_dims", "type": int, - "doc": "the number of dimensions in the regression output", + "doc": ( + "the number of dimensions in the regression output, " + "used to define the shape of the column only if data is None" + ), "default": None, }, ) def add_predicted_value(self, **kwargs): """Add predicted value (i.e. from a regression model) for each sample""" + kwargs["name"] = "predicted_value" kwargs["dtype"] = float - kwargs["dim2"] = "n_dims" - return self.__add_col("RegressionOutput", **kwargs) + kwargs["dim2_kwarg"] = "n_dims" + # n_dims kwarg is passed into __add_col and will be read as the length of the second dimension + # of the data only if the data kwarg is None. + return self.__add_col(RegressionOutput, **kwargs) @docval( { "name": "data", "type": data_type, - "doc": "cluster label for each sample", + "doc": "cluster label (int) for each sample", "default": None, }, - { - "name": "name", - "type": str, - "doc": "the name of this column", - "default": "cluster_label", - }, { "name": "description", "type": str, @@ -316,22 +311,17 @@ def add_predicted_value(self, **kwargs): ) def add_cluster_label(self, **kwargs): """Add cluster label for each sample""" + kwargs["name"] = "cluster_label" kwargs["dtype"] = int - return self.__add_col("ClusterLabel", **kwargs) + return self.__add_col(ClusterLabel, **kwargs) @docval( { "name": "data", "type": data_type, - "doc": "embedding of each sample", + "doc": "embedding (float) of each sample", "default": None, }, - { - "name": "name", - "type": str, - "doc": "the name of this column", - "default": "embedding", - }, { "name": "description", "type": str, @@ -341,48 +331,56 @@ def add_cluster_label(self, **kwargs): { "name": "n_dims", "type": int, - "doc": "the number of dimensions in the embedding", + "doc": ( + "the number of dimensions in the embedding, " + "used to define the shape of the column only if data is None" + ), "default": None, }, ) def add_embedding(self, **kwargs): """Add embedding (a.k.a. transformation or representation) of each sample""" + kwargs["name"] = "embedding" kwargs["dtype"] = float - kwargs["dim2"] = "n_dims" - return self.__add_col("EmbeddedValues", **kwargs) + kwargs["dim2_kwarg"] = "n_dims" + # n_dims kwarg is passed into __add_col and will be read as the length of the second dimension + # of the data only if the data kwarg is None. 
+ return self.__add_col(EmbeddedValues, **kwargs) @docval( { "name": "data", "type": data_type, - "doc": "top-k predicted classes for each sample", + "doc": "top-k predicted classes (int) for each sample", "default": None, }, - { - "name": "name", - "type": str, - "doc": "the name of this column", - "default": "topk_classes", - }, { "name": "description", "type": str, "doc": "a description for this column", "default": "the top k predicted classes", }, - {"name": "k", "type": int, "doc": "the number of top classes", "default": None}, + { + "name": "k", + "type": int, + "doc": "the number of top classes, used to define the shape of the column only if data is None", + "default": None, + }, ) def add_topk_classes(self, **kwargs): """Add the top *k* predicted classes for each sample""" + kwargs["name"] = "topk_classes" kwargs["dtype"] = int - kwargs["dim2"] = "k" - return self.__add_col("TopKClasses", **kwargs) + kwargs["dim2_kwarg"] = "k" + # k kwarg is passed into __add_col and will be read as the length of the second dimension + # of the data only if the data kwarg is None. + return self.__add_col(TopKClasses, **kwargs) @docval( { "name": "data", "type": data_type, - "doc": "probabilities of the top-k predicted classes for each sample", + "doc": "probabilities (float) of the top-k predicted classes for each sample", "default": None, }, { @@ -397,10 +395,17 @@ def add_topk_classes(self, **kwargs): "doc": "a description for this column", "default": "the probabilities of the top k predicted classes", }, - {"name": "k", "type": int, "doc": "the number of top classes", "default": None}, + { + "name": "k", + "type": int, + "doc": "the number of top predicted classes, used to define the shape of the column only if data is None", + "default": None, + }, ) def add_topk_probabilities(self, **kwargs): """Add probabilities for the top *k* predicted classes for each sample""" kwargs["dtype"] = float - kwargs["dim2"] = "k" - return self.__add_col("TopKProbabilities", **kwargs) + kwargs["dim2_kwarg"] = "k" + # k kwarg is passed into __add_col and will be read as the length of the second dimension + # of the data only if the data kwarg is None. + return self.__add_col(TopKProbabilities, **kwargs) diff --git a/hdmf_ml/schema/ml_table.yaml b/hdmf_ml/schema/ml_table.yaml index a48d02e..fd348ba 100644 --- a/hdmf_ml/schema/ml_table.yaml +++ b/hdmf_ml/schema/ml_table.yaml @@ -67,7 +67,7 @@ datasets: reftype: object target_type: VectorData required: false - doc: The training labels that were used. Reference the true_label column if present + doc: The training labels that were used. Reference the `true_label` column if present in the same ResultsTable. - data_type_def: EmbeddedValues @@ -100,7 +100,7 @@ groups: - name: true_label data_type_inc: VectorData doc: A column to store the true labels for each sample. To store labels as strings, use EnumData. - The training_labels attribute on other columns in the ResultsTable should reference this column + The `training_labels` attribute on other columns in the ResultsTable should reference this column, if present. quantity: '?' 
- name: predicted_probability diff --git a/hdmf_ml/schema/namespace.yaml b/hdmf_ml/schema/namespace.yaml index cade32e..19e8f57 100644 --- a/hdmf_ml/schema/namespace.yaml +++ b/hdmf_ml/schema/namespace.yaml @@ -1,6 +1,6 @@ namespaces: - name: hdmf-ml - doc: An extension to HDMF for storing data to be using for machine learning + doc: An extension to HDMF for storing results of machine learning algorithms author: - Andrew Tritt - Ryan Ly @@ -14,4 +14,4 @@ namespaces: - doc: structures for storing ML data in a tabular fashion source: ml_table.yaml title: Machine learning table - version: 0.0.1 + version: 0.1.0 diff --git a/tests/test_results_table.py b/tests/test_results_table.py index 0df193f..ce2c089 100644 --- a/tests/test_results_table.py +++ b/tests/test_results_table.py @@ -39,14 +39,14 @@ def test_add_col_diff_len(self): def test_add_col_dupe_name(self): rt = ResultsTable(name="foo", description="a test results table") - rt.add_tvt_split([0, 1, 2, 0, 1]) + rt.add_tvt_split(np.uint([0, 1, 2, 0, 1])) msg = "Column 'tvt_split' already exists in ResultsTable 'foo'" with self.assertRaisesRegex(ValueError, msg): - rt.add_tvt_split([0, 1, 2, 0, 1]) + rt.add_tvt_split(np.uint([0, 1, 2, 0, 1])) def test_add_tvt_split(self): rt = ResultsTable(name="foo", description="a test results table") - rt.add_tvt_split([0, 1, 2, 0, 1]) + rt.add_tvt_split(np.uint([0, 1, 2, 0, 1])) with self.get_hdf5io() as io: io.write(rt)
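
For reference, a minimal usage sketch of the ResultsTable API as updated by this patch. This is not part of the patch itself: the package-level import of ResultsTable, the output file name, and the use of hdmf.common.get_manager with HDF5IO are assumptions not shown in the diff; the column-adding calls mirror the method signatures and the tests above.

    import numpy as np
    from hdmf.backends.hdf5 import HDF5IO
    from hdmf.common import get_manager
    from hdmf_ml import ResultsTable  # assumed entry point, mirroring the test module

    # create a results table and populate one column per kind of ML output
    rt = ResultsTable(name="results", description="example results table")

    # train/validation/test mask: 0 = train, 1 = validate, 2 = test (as in the tests)
    rt.add_tvt_split(np.uint([0, 1, 2, 0, 1]))

    # string labels are converted to an enum column via sklearn's LabelEncoder
    rt.add_true_label(["cat", "dog", "dog", "cat", "dog"])

    # with data=None, the column is pre-allocated as an empty H5DataIO of shape
    # (n_samples, n_classes) and can be filled after the table is written
    rt.add_predicted_probability(n_classes=2)

    rt.add_predicted_class(np.uint([0, 1, 1, 0, 1]))

    with HDF5IO("results.h5", manager=get_manager(), mode="w") as io:
        io.write(rt)

Because every add_* method now fixes the column name internally (tvt_split, true_label, predicted_probability, ...), callers only supply the data (or a shape hint such as n_classes/n_dims/k) and an optional description, and the column classes resolved once at import time via get_class are passed to add_column directly.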