diff --git a/pyproject.toml b/pyproject.toml index 0679c95..4da1d89 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,7 @@ packages = ["theseus"] [project] name = "theseus" -version = "1.1.0" +version = "1.2.0" description = "A general template for various Deep Learning tasks. Strongly relies on Pytorch" readme = "README.md" requires-python = ">=3.6" diff --git a/theseus/__init__.py b/theseus/__init__.py index 167706f..8c69bec 100644 --- a/theseus/__init__.py +++ b/theseus/__init__.py @@ -11,7 +11,7 @@ __author__ = "kaylode" __license__ = "MIT" __copyright__ = "Copyright 2020-present Kaylode" -__version__ = "1.1.0" +__version__ = "1.2.0" from .base import * from .registry import Registry diff --git a/theseus/base/metrics/accuracy.py b/theseus/base/metrics/accuracy.py index 2a8d7a7..3720077 100644 --- a/theseus/base/metrics/accuracy.py +++ b/theseus/base/metrics/accuracy.py @@ -15,15 +15,15 @@ def __init__(self, label_type: str = "multiclass", **kwargs): self.threshold = kwargs.get("threshold", 0.5) self.reset() - def update(self, output: Dict[str, Any], batch: Dict[str, Any]): + def update(self, outputs: Dict[str, Any], batch: Dict[str, Any]): """ Perform calculation based on prediction and targets """ - output = output["outputs"] + outputs = outputs["outputs"] target = batch["targets"] prediction = logits2labels( - output, label_type=self.type, threshold=self.threshold + outputs, label_type=self.type, threshold=self.threshold ) correct = (prediction.view(-1) == target.view(-1)).sum() diff --git a/theseus/tabular/base/preprocessors/splitter.py b/theseus/tabular/base/preprocessors/splitter.py index d9163e0..edf34a7 100644 --- a/theseus/tabular/base/preprocessors/splitter.py +++ b/theseus/tabular/base/preprocessors/splitter.py @@ -1,5 +1,6 @@ import os import os.path as osp +import random from sklearn.model_selection import StratifiedKFold, train_test_split @@ -27,6 +28,7 @@ def __init__( "default", "stratified", "stratifiedkfold", + "unique", ], "splitter type not supported" self.splitter_type = splitter_type @@ -49,6 +51,10 @@ def __init__( elif self.splitter_type == "default": assert ratio is not None, "should specify ratio" self.ratio = ratio + elif self.splitter_type == "unique": + assert ratio is not None, "should specify ratio" + self.splitter = random.sample + self.ratio = ratio def run(self, df): num_samples, num_features = df.shape @@ -63,6 +69,16 @@ def run(self, df): ) train_df.to_csv(osp.join(self.save_folder, "train.csv"), index=False) val_df.to_csv(osp.join(self.save_folder, "val.csv"), index=False) + elif self.splitter_type == "unique": + unique_values = df[self.label_column].unique().tolist() + num_unique_samples = len(unique_values) + train_idx = self.splitter( + unique_values, int(num_unique_samples * self.ratio) + ) + train_df = df[df[self.label_column].isin(train_idx)] + val_df = df[~df[self.label_column].isin(train_idx)] + train_df.to_csv(osp.join(self.save_folder, "train.csv"), index=False) + val_df.to_csv(osp.join(self.save_folder, "val.csv"), index=False) else: x, y = ( df.drop(self.label_column, axis=1).values,