From 016be70acb7de455d77929e9c1965a0718eea6a0 Mon Sep 17 00:00:00 2001 From: rly Date: Thu, 4 Apr 2024 02:18:36 -0700 Subject: [PATCH] Minor updates to schema, update get_class and tests --- hdmf_ml/results_table.py | 205 +++++++++++++++++----------------- hdmf_ml/schema/ml_table.yaml | 4 +- hdmf_ml/schema/namespace.yaml | 4 +- tests/test_results_table.py | 6 +- 4 files changed, 112 insertions(+), 107 deletions(-) diff --git a/hdmf_ml/results_table.py b/hdmf_ml/results_table.py index 3403995..fb8b458 100644 --- a/hdmf_ml/results_table.py +++ b/hdmf_ml/results_table.py @@ -1,16 +1,29 @@ from hdmf.utils import docval, popargs from hdmf.backends.hdf5 import H5DataIO -from hdmf.common import get_class, register_class +from hdmf.common import get_class, register_class, VectorData import numpy as np from sklearn.preprocessing import LabelEncoder data_type = ("array_data", "data") +SupervisedOutput = get_class("SupervisedOutput", "hdmf-ml") +TrainValidationTestSplit = get_class("TrainValidationTestSplit", "hdmf-ml") +CrossValidationSplit = get_class("CrossValidationSplit", "hdmf-ml") +ClassProbability = get_class("ClassProbability", "hdmf-ml") +ClassLabel = get_class("ClassLabel", "hdmf-ml") +TopKProbabilities = get_class("TopKProbabilities", "hdmf-ml") +TopKClasses = get_class("TopKClasses", "hdmf-ml") +RegressionOutput = get_class("RegressionOutput", "hdmf-ml") +ClusterLabel = get_class("ClusterLabel", "hdmf-ml") +EmbeddedValues = get_class("EmbeddedValues", "hdmf-ml") + +_AutoGenResultsTable = get_class("ResultsTable", "hdmf-ml") + @register_class("ResultsTable", "hdmf-ml") -class ResultsTable(get_class("ResultsTable", "hdmf-ml")): - # override the auto-generated ResultsTable class +class ResultsTable(_AutoGenResultsTable): + # extend the auto-generated ResultsTable class @docval( { @@ -48,9 +61,13 @@ def n_samples(self): {"name": "name", "type": str, "doc": "the name of this column"}, {"name": "description", "type": str, "doc": "a description for this column"}, { - "name": "dim2", + "name": "dim2_kwarg", "type": str, - "doc": "the argument holding the second dimension", + "doc": ( + "the name of the argument in kwargs holding the size of the other dimension(s)" + "as an int for a 2D shape or a list/tuple/1-D array for an N-D shape where " + "N is equal to the length of the list/tuple/1-D array + 1" + ), "default": None, }, { @@ -63,11 +80,12 @@ def n_samples(self): ) def __add_col(self, **kwargs): """A helper function to handle boiler-plate code for adding columns to a ResultsTable""" - cls, data, name, description, dim2, dtype = popargs( - "cls", "data", "name", "description", "dim2", "dtype", kwargs + cls, data, name, description, dim2_kwarg, dtype = popargs( + "cls", "data", "name", "description", "dim2_kwarg", "dtype", kwargs ) - if dim2 is not None: - dim2 = kwargs.pop(dim2) + # get the size of the other dimension(s) from kwargs + if dim2_kwarg is not None: + dim2 = kwargs.pop(dim2_kwarg) if data is None: if self.n_samples is None: raise ValueError( @@ -78,15 +96,20 @@ def __add_col(self, **kwargs): shape = (self.n_samples,) if dim2 is not None: if isinstance(dim2, (int, np.integer)): + # dim2 is an integer, so column is 2D shape = (self.n_samples, dim2) elif isinstance(dim2, (list, tuple)): + # dim2 is a list or tuple, so shape is N-D shape = (self.n_samples, *dim2) elif isinstance(dim2, np.array) and len(dim2.shape) == 1: + # dim2 is a 1D array, so shape is N-D shape = (self.n_samples, *dim2) else: ValueError( - f"Unrecognized type for dim2: {type(dim2)} - expected integether or 1-D 
array-like" + f"Unrecognized type for dim2: {type(dim2)} - expected integer or 1-D array-like" ) + + # create empty DataIO object data = H5DataIO(shape=shape, dtype=dtype) if name in self: @@ -95,14 +118,12 @@ def __add_col(self, **kwargs): ) if len(self.id) == 0: self.id.extend(np.arange(len(data))) - if len(self.id) != len(data): + elif len(self.id) != len(data): raise ValueError( f"New column {name} of length {len(data)} is not the same length as " f"existings columns of length {len(self.id)}" ) - if isinstance(cls, str): - cls = get_class(cls, "hdmf-ml") self.add_column( data=data, name=name, description=description, col_cls=cls, **kwargs ) @@ -115,15 +136,9 @@ def __add_col(self, **kwargs): { "name": "data", "type": data_type, - "doc": "data for this column", + "doc": "train/validation/test mask (enum: train, validation, test) for each sample", "default": None, }, - { - "name": "name", - "type": str, - "doc": "the name of this column", - "default": "tvt_split", - }, { "name": "description", "type": str, @@ -133,23 +148,18 @@ def __add_col(self, **kwargs): ) def add_tvt_split(self, **kwargs): """Add mask of 0, 1, 2 indicating which samples were used for training, validation, and testing.""" + kwargs["name"] = "tvt_split" kwargs["enum"] = ["train", "validate", "test"] kwargs["dtype"] = int - return self.__add_col("TrainValidationTestSplit", **kwargs) + return self.__add_col(TrainValidationTestSplit, **kwargs) @docval( { "name": "data", "type": data_type, - "doc": "train-validation-test split data", + "doc": "cross-validation split labels (int) for each sample", "default": None, }, - { - "name": "name", - "type": str, - "doc": "the name of this column", - "default": "tvt_split", - }, { "name": "description", "type": str, @@ -165,30 +175,27 @@ def add_tvt_split(self, **kwargs): ) def add_cv_split(self, **kwargs): """Add cross-validation split mask""" + kwargs["name"] = "cv_split" if kwargs["data"] is None or isinstance(kwargs["data"], H5DataIO): if kwargs["n_splits"] is None: raise ValueError("n_splits must be specified if not passing data in") else: if kwargs["n_splits"] is None: + # set n_splits to one more than the max value of the data kwargs["n_splits"] = np.max(kwargs["data"]) + 1 if not isinstance(kwargs["n_splits"], (int, np.integer)): + # this should have been checked in docval? 
raise ValueError("Got non-integer data for cross-validation split") kwargs["dtype"] = int - return self.__add_col("CrossValidationSplit", **kwargs) + return self.__add_col(CrossValidationSplit, **kwargs) @docval( { "name": "data", "type": data_type, - "doc": "ground truth labels for each sample", + "doc": "ground truth labels (int, bytes, or str) for each sample", "default": None, }, - { - "name": "name", - "type": str, - "doc": "the name of this column", - "default": "true_label", - }, { "name": "description", "type": str, @@ -197,27 +204,23 @@ def add_cv_split(self, **kwargs): }, ) def add_true_label(self, **kwargs): - """Add ground truth labels for each sample""" + """Add ground truth labels (int, bytes, or str) for each sample""" + kwargs["name"] = "true_label" if isinstance(kwargs["data"][0], (bytes, str)): + # if data are strings, convert to enum data type (data are ints, enum elements are strings) enc = LabelEncoder() - kwargs["data"] = enc.fit_transform(kwargs["data"]) + kwargs["data"] = np.uint(enc.fit_transform(kwargs["data"])) kwargs["enum"] = enc.classes_ kwargs["dtype"] = int - return self.__add_col("VectorData", **kwargs) + return self.__add_col(VectorData, **kwargs) @docval( { "name": "data", "type": data_type, - "doc": "probability of sample for each class", + "doc": "probability of sample (float) for each class", "default": None, }, - { - "name": "name", - "type": str, - "doc": "the name of this column", - "default": "predicted_probability", - }, { "name": "description", "type": str, @@ -227,29 +230,26 @@ def add_true_label(self, **kwargs): { "name": "n_classes", "type": int, - "doc": "the number of classes", + "doc": "the number of classes, used to define the shape of the column only if data is None", "default": None, }, ) def add_predicted_probability(self, **kwargs): """Add probability of the sample for each class in the model""" + kwargs["name"] = "predicted_probability" kwargs["dtype"] = float - kwargs["dim2"] = "n_classes" - return self.__add_col("ClassProbability", **kwargs) + kwargs["dim2_kwarg"] = "n_classes" + # n_classes kwarg is passed into __add_col and will be read as the length of the second dimension + # of the data only if the data kwarg is None. 
+ return self.__add_col(ClassProbability, **kwargs) @docval( { "name": "data", "type": data_type, - "doc": "predicted class label for each sample", + "doc": "predicted class label (int) for each sample", "default": None, }, - { - "name": "name", - "type": str, - "doc": "the name of this column", - "default": "predicted_class", - }, { "name": "description", "type": str, @@ -258,23 +258,18 @@ def add_predicted_probability(self, **kwargs): }, ) def add_predicted_class(self, **kwargs): - """Add predicted class label for each sample""" + """Add predicted class label (int) for each sample""" + kwargs["name"] = "predicted_class" kwargs["dtype"] = int - return self.__add_col("ClassLabel", **kwargs) + return self.__add_col(ClassLabel, **kwargs) @docval( { "name": "data", "type": data_type, - "doc": "predicted value for each sample", + "doc": "predicted value (float) for each sample", "default": None, }, - { - "name": "name", - "type": str, - "doc": "the name of this column", - "default": "predicted_value", - }, { "name": "description", "type": str, @@ -284,29 +279,29 @@ def add_predicted_class(self, **kwargs): { "name": "n_dims", "type": int, - "doc": "the number of dimensions in the regression output", + "doc": ( + "the number of dimensions in the regression output, " + "used to define the shape of the column only if data is None" + ), "default": None, }, ) def add_predicted_value(self, **kwargs): """Add predicted value (i.e. from a regression model) for each sample""" + kwargs["name"] = "predicted_value" kwargs["dtype"] = float - kwargs["dim2"] = "n_dims" - return self.__add_col("RegressionOutput", **kwargs) + kwargs["dim2_kwarg"] = "n_dims" + # n_dims kwarg is passed into __add_col and will be read as the length of the second dimension + # of the data only if the data kwarg is None. + return self.__add_col(RegressionOutput, **kwargs) @docval( { "name": "data", "type": data_type, - "doc": "cluster label for each sample", + "doc": "cluster label (int) for each sample", "default": None, }, - { - "name": "name", - "type": str, - "doc": "the name of this column", - "default": "cluster_label", - }, { "name": "description", "type": str, @@ -316,22 +311,17 @@ def add_predicted_value(self, **kwargs): ) def add_cluster_label(self, **kwargs): """Add cluster label for each sample""" + kwargs["name"] = "cluster_label" kwargs["dtype"] = int - return self.__add_col("ClusterLabel", **kwargs) + return self.__add_col(ClusterLabel, **kwargs) @docval( { "name": "data", "type": data_type, - "doc": "embedding of each sample", + "doc": "embedding (float) of each sample", "default": None, }, - { - "name": "name", - "type": str, - "doc": "the name of this column", - "default": "embedding", - }, { "name": "description", "type": str, @@ -341,48 +331,56 @@ def add_cluster_label(self, **kwargs): { "name": "n_dims", "type": int, - "doc": "the number of dimensions in the embedding", + "doc": ( + "the number of dimensions in the embedding, " + "used to define the shape of the column only if data is None" + ), "default": None, }, ) def add_embedding(self, **kwargs): """Add embedding (a.k.a. transformation or representation) of each sample""" + kwargs["name"] = "embedding" kwargs["dtype"] = float - kwargs["dim2"] = "n_dims" - return self.__add_col("EmbeddedValues", **kwargs) + kwargs["dim2_kwarg"] = "n_dims" + # n_dims kwarg is passed into __add_col and will be read as the length of the second dimension + # of the data only if the data kwarg is None. 
+ return self.__add_col(EmbeddedValues, **kwargs) @docval( { "name": "data", "type": data_type, - "doc": "top-k predicted classes for each sample", + "doc": "top-k predicted classes (int) for each sample", "default": None, }, - { - "name": "name", - "type": str, - "doc": "the name of this column", - "default": "topk_classes", - }, { "name": "description", "type": str, "doc": "a description for this column", "default": "the top k predicted classes", }, - {"name": "k", "type": int, "doc": "the number of top classes", "default": None}, + { + "name": "k", + "type": int, + "doc": "the number of top classes, used to define the shape of the column only if data is None", + "default": None, + }, ) def add_topk_classes(self, **kwargs): """Add the top *k* predicted classes for each sample""" + kwargs["name"] = "topk_classes" kwargs["dtype"] = int - kwargs["dim2"] = "k" - return self.__add_col("TopKClasses", **kwargs) + kwargs["dim2_kwarg"] = "k" + # k kwarg is passed into __add_col and will be read as the length of the second dimension + # of the data only if the data kwarg is None. + return self.__add_col(TopKClasses, **kwargs) @docval( { "name": "data", "type": data_type, - "doc": "probabilities of the top-k predicted classes for each sample", + "doc": "probabilities (float) of the top-k predicted classes for each sample", "default": None, }, { @@ -397,10 +395,17 @@ def add_topk_classes(self, **kwargs): "doc": "a description for this column", "default": "the probabilities of the top k predicted classes", }, - {"name": "k", "type": int, "doc": "the number of top classes", "default": None}, + { + "name": "k", + "type": int, + "doc": "the number of top predicted classes, used to define the shape of the column only if data is None", + "default": None, + }, ) def add_topk_probabilities(self, **kwargs): """Add probabilities for the top *k* predicted classes for each sample""" kwargs["dtype"] = float - kwargs["dim2"] = "k" - return self.__add_col("TopKProbabilities", **kwargs) + kwargs["dim2_kwarg"] = "k" + # k kwarg is passed into __add_col and will be read as the length of the second dimension + # of the data only if the data kwarg is None. + return self.__add_col(TopKProbabilities, **kwargs) diff --git a/hdmf_ml/schema/ml_table.yaml b/hdmf_ml/schema/ml_table.yaml index a48d02e..fd348ba 100644 --- a/hdmf_ml/schema/ml_table.yaml +++ b/hdmf_ml/schema/ml_table.yaml @@ -67,7 +67,7 @@ datasets: reftype: object target_type: VectorData required: false - doc: The training labels that were used. Reference the true_label column if present + doc: The training labels that were used. Reference the `true_label` column if present in the same ResultsTable. - data_type_def: EmbeddedValues @@ -100,7 +100,7 @@ groups: - name: true_label data_type_inc: VectorData doc: A column to store the true labels for each sample. To store labels as strings, use EnumData. - The training_labels attribute on other columns in the ResultsTable should reference this column + The `training_labels` attribute on other columns in the ResultsTable should reference this column, if present. quantity: '?' 
- name: predicted_probability diff --git a/hdmf_ml/schema/namespace.yaml b/hdmf_ml/schema/namespace.yaml index cade32e..19e8f57 100644 --- a/hdmf_ml/schema/namespace.yaml +++ b/hdmf_ml/schema/namespace.yaml @@ -1,6 +1,6 @@ namespaces: - name: hdmf-ml - doc: An extension to HDMF for storing data to be using for machine learning + doc: An extension to HDMF for storing results of machine learning algorithms author: - Andrew Tritt - Ryan Ly @@ -14,4 +14,4 @@ namespaces: - doc: structures for storing ML data in a tabular fashion source: ml_table.yaml title: Machine learning table - version: 0.0.1 + version: 0.1.0 diff --git a/tests/test_results_table.py b/tests/test_results_table.py index 0df193f..ce2c089 100644 --- a/tests/test_results_table.py +++ b/tests/test_results_table.py @@ -39,14 +39,14 @@ def test_add_col_diff_len(self): def test_add_col_dupe_name(self): rt = ResultsTable(name="foo", description="a test results table") - rt.add_tvt_split([0, 1, 2, 0, 1]) + rt.add_tvt_split(np.uint([0, 1, 2, 0, 1])) msg = "Column 'tvt_split' already exists in ResultsTable 'foo'" with self.assertRaisesRegex(ValueError, msg): - rt.add_tvt_split([0, 1, 2, 0, 1]) + rt.add_tvt_split(np.uint([0, 1, 2, 0, 1])) def test_add_tvt_split(self): rt = ResultsTable(name="foo", description="a test results table") - rt.add_tvt_split([0, 1, 2, 0, 1]) + rt.add_tvt_split(np.uint([0, 1, 2, 0, 1])) with self.get_hdf5io() as io: io.write(rt)
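
For reference, a minimal usage sketch of the ResultsTable API as updated by this patch. This is not part of the patch itself: the package-level import of ResultsTable, the output file name, and the use of hdmf.common.get_manager with HDF5IO are assumptions not shown in the diff; the column-adding calls mirror the method signatures and the tests above.

    import numpy as np
    from hdmf.backends.hdf5 import HDF5IO
    from hdmf.common import get_manager
    from hdmf_ml import ResultsTable  # assumed entry point, mirroring the test module

    # create a results table and populate one column per kind of ML output
    rt = ResultsTable(name="results", description="example results table")

    # train/validation/test mask: 0 = train, 1 = validate, 2 = test (as in the tests)
    rt.add_tvt_split(np.uint([0, 1, 2, 0, 1]))

    # string labels are converted to an enum column via sklearn's LabelEncoder
    rt.add_true_label(["cat", "dog", "dog", "cat", "dog"])

    # with data=None, the column is pre-allocated as an empty H5DataIO of shape
    # (n_samples, n_classes) and can be filled after the table is written
    rt.add_predicted_probability(n_classes=2)

    rt.add_predicted_class(np.uint([0, 1, 1, 0, 1]))

    with HDF5IO("results.h5", manager=get_manager(), mode="w") as io:
        io.write(rt)

Because every add_* method now fixes the column name internally (tvt_split, true_label, predicted_probability, ...), callers only supply the data (or a shape hint such as n_classes/n_dims/k) and an optional description, and the column classes resolved once at import time via get_class are passed to add_column directly.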