From afb024bf7c0506aa63c929d5f79b83494484c316 Mon Sep 17 00:00:00 2001 From: omsh Date: Sun, 7 Jul 2024 20:37:26 +0200 Subject: [PATCH 1/4] trigger pypi only on main --- .github/workflows/pypi.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/pypi.yaml b/.github/workflows/pypi.yaml index 07f1f3c1..0291a54d 100644 --- a/.github/workflows/pypi.yaml +++ b/.github/workflows/pypi.yaml @@ -6,6 +6,8 @@ on: workflows: ["Build"] types: - completed + branches: + - main jobs: release: From 0ca1f1dcbfd5d639b1d5d1765f7cab48864470b2 Mon Sep 17 00:00:00 2001 From: omsh Date: Sun, 7 Jul 2024 20:58:20 +0200 Subject: [PATCH 2/4] trigger pypi workflow only on main and build succeeds on main --- .github/workflows/pypi.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pypi.yaml b/.github/workflows/pypi.yaml index 0291a54d..b298c196 100644 --- a/.github/workflows/pypi.yaml +++ b/.github/workflows/pypi.yaml @@ -12,7 +12,10 @@ on: jobs: release: runs-on: ubuntu-latest - if: ${{ github.event.workflow_run.conclusion == 'success' && github.ref == 'refs/heads/main'}} + if: | + github.event.workflow_run.conclusion == 'success' && + github.event.workflow_run.head_branch == 'main' && + github.ref == 'refs/heads/main' steps: - uses: actions/checkout@v4 - name: Set up Python From fcbe4db8573b7e5ade3721029f2a4e17a3300adb Mon Sep 17 00:00:00 2001 From: Omar Shouman Date: Tue, 1 Oct 2024 09:50:30 +0200 Subject: [PATCH 3/4] Fix/alphabet encoding dataset refactoring (#43) * test dataset fix * refactoring dataset classes with config class * processors changes to handle alphabet scenarios * model alphabet and embedding count fix * added tests for processors and extended tests for datasets * dev version 0 --- src/dlomix/__init__.py | 2 +- src/dlomix/data/charge_state.py | 29 +-- src/dlomix/data/dataset.py | 259 ++++++++++++---------- src/dlomix/data/dataset_config.py | 6 + src/dlomix/data/fragment_ion_intensity.py | 29 +-- src/dlomix/data/processing/__init__.py | 12 +- src/dlomix/data/processing/processors.py | 66 +++++- src/dlomix/data/retention_time.py | 29 +-- src/dlomix/models/prosit.py | 4 +- tests/test_datasets.py | 4 + tests/test_processors.py | 185 ++++++++++++++++ 11 files changed, 422 insertions(+), 203 deletions(-) create mode 100644 tests/test_processors.py diff --git a/src/dlomix/__init__.py b/src/dlomix/__init__.py index ab953da5..0d293ee8 100644 --- a/src/dlomix/__init__.py +++ b/src/dlomix/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.1.2" +__version__ = "0.1.3dev0" META_DATA = { "author": "Omar Shouman", diff --git a/src/dlomix/data/charge_state.py b/src/dlomix/data/charge_state.py index 3042d898..b960cc0d 100644 --- a/src/dlomix/data/charge_state.py +++ b/src/dlomix/data/charge_state.py @@ -2,6 +2,7 @@ from ..constants import ALPHABET_UNMOD from .dataset import PeptideDataset +from .dataset_config import DatasetConfig from .dataset_utils import EncodingScheme @@ -60,29 +61,5 @@ def __init__( num_proc: Optional[int] = None, batch_processing_size: int = 1000, ): - super().__init__( - data_source, - val_data_source, - test_data_source, - data_format, - sequence_column, - label_column, - val_ratio, - max_seq_len, - dataset_type, - batch_size, - model_features, - dataset_columns_to_keep, - features_to_extract, - pad, - padding_value, - alphabet, - with_termini, - encoding_scheme, - processed, - enable_tf_dataset_cache, - disable_cache, - auto_cleanup_cache, - num_proc, - batch_processing_size, - ) + kwargs = {k: v for k, v in locals().items() if k 
not in ["self", "__class__"]} + super().__init__(DatasetConfig(**kwargs)) diff --git a/src/dlomix/data/dataset.py b/src/dlomix/data/dataset.py index db5d8900..42aa8614 100644 --- a/src/dlomix/data/dataset.py +++ b/src/dlomix/data/dataset.py @@ -19,6 +19,7 @@ ) from .processing.processors import ( FunctionProcessor, + PeptideDatasetBaseProcessor, SequenceEncodingProcessor, SequencePaddingProcessor, SequenceParsingProcessor, @@ -105,79 +106,60 @@ class PeptideDataset: DEFAULT_SPLIT_NAMES = ["train", "val", "test"] CONFIG_JSON_NAME = "dlomix_peptide_dataset_config.json" - def __init__( - self, - data_source: Union[str, List], - val_data_source: Union[str, List], - test_data_source: Union[str, List], - data_format: str, - sequence_column: str, - label_column: str, - val_ratio: float, - max_seq_len: int, - dataset_type: str, - batch_size: int, - model_features: Optional[List[str]], - dataset_columns_to_keep: Optional[List[str]], - features_to_extract: Optional[List[Union[Callable, str]]] = None, - pad: bool = True, - padding_value: int = 0, - alphabet: Dict = ALPHABET_UNMOD, - with_termini: bool = True, - encoding_scheme: Union[str, EncodingScheme] = EncodingScheme.UNMOD, - processed: bool = False, - enable_tf_dataset_cache: bool = False, - disable_cache: bool = False, - auto_cleanup_cache: bool = True, - num_proc: Optional[int] = None, - batch_processing_size: Optional[int] = 1000, - ): + def __init__(self, config: DatasetConfig): super(PeptideDataset, self).__init__() - self.data_source = data_source - self.val_data_source = val_data_source - self.test_data_source = test_data_source + self.__dict__.update(**config.__dict__) + + # self.data_source = data_source + # self.val_data_source = val_data_source + # self.test_data_source = test_data_source - self.data_format = data_format + # self.data_format = data_format - self.sequence_column = sequence_column - self.label_column = label_column + # self.sequence_column = sequence_column + # self.label_column = label_column - self.val_ratio = val_ratio - self.max_seq_len = max_seq_len - self.dataset_type = dataset_type - self.batch_size = batch_size - self.model_features = model_features + # self.val_ratio = val_ratio + # self.max_seq_len = max_seq_len + # self.dataset_type = dataset_type + # self.batch_size = batch_size + # self.model_features = model_features # to be kept in the hf dataset, but not returned in the tensor dataset - if dataset_columns_to_keep is None: + if config.dataset_columns_to_keep is None: self.dataset_columns_to_keep = [] else: - self.dataset_columns_to_keep = dataset_columns_to_keep - - self.features_to_extract = features_to_extract - self.pad = pad - self.padding_value = padding_value - self.alphabet = alphabet - self.with_termini = with_termini - self.encoding_scheme = EncodingScheme(encoding_scheme) - self.processed = processed - self.enable_tf_dataset_cache = enable_tf_dataset_cache - self.disable_cache = disable_cache - self.auto_cleanup_cache = auto_cleanup_cache + self.dataset_columns_to_keep = config.dataset_columns_to_keep + + # self.features_to_extract = features_to_extract + # self.pad = pad + # self.padding_value = padding_value + # self.alphabet = alphabet + # self.with_termini = with_termini + self.encoding_scheme = EncodingScheme(config.encoding_scheme) + # self.processed = processed + # self.enable_tf_dataset_cache = enable_tf_dataset_cache + # self.disable_cache = disable_cache + # self.auto_cleanup_cache = auto_cleanup_cache self._set_hf_cache_management() - self.extended_alphabet = self.alphabet.copy() + 
self.extended_alphabet = None + self.learning_alphabet_mode = True - self._refresh_config() + if self.alphabet: + self.extended_alphabet = self.alphabet.copy() + self.learning_alphabet_mode = False + + self._config = config if not self.processed: self.hf_dataset: Optional[Union[Dataset, DatasetDict]] = None self._empty_dataset_mode = False self._is_predefined_split = False self._test_set_only = False - self._num_proc = num_proc + self._num_proc = config.num_proc self._set_num_proc() - self._batch_processing_size = batch_processing_size + # self.batch_processing_size = config.batch_processing_size self._data_files_available_splits = {} self._load_dataset() @@ -216,26 +198,26 @@ def _set_hf_cache_management(self): disable_caching() def _refresh_config(self): - self._config = DatasetConfig( - data_source=self.data_source, - val_data_source=self.val_data_source, - test_data_source=self.test_data_source, - data_format=self.data_format, - sequence_column=self.sequence_column, - label_column=self.label_column, - val_ratio=self.val_ratio, - max_seq_len=self.max_seq_len, - dataset_type=self.dataset_type, - batch_size=self.batch_size, - model_features=self.model_features, - dataset_columns_to_keep=self.dataset_columns_to_keep, - features_to_extract=self.features_to_extract, - pad=self.pad, - padding_value=self.padding_value, - alphabet=self.alphabet, - encoding_scheme=self.encoding_scheme, - processed=self.processed, - ) + self._config = DatasetConfig(**self._config.__dict__) + # data_source=self.data_source, + # val_data_source=self.val_data_source, + # test_data_source=self.test_data_source, + # data_format=self.data_format, + # sequence_column=self.sequence_column, + # label_column=self.label_column, + # val_ratio=self.val_ratio, + # max_seq_len=self.max_seq_len, + # dataset_type=self.dataset_type, + # batch_size=self.batch_size, + # model_features=self.model_features, + # dataset_columns_to_keep=self.dataset_columns_to_keep, + # features_to_extract=self.features_to_extract, + # pad=self.pad, + # padding_value=self.padding_value, + # alphabet=self.alphabet, + # encoding_scheme=self.encoding_scheme, + # processed=self.processed, + # ) self._config._additional_data.update( { @@ -311,6 +293,7 @@ def _load_from_inmemory_hf_dataset(self): split: f"in-memory Dataset object - {split}" for split in self.hf_dataset } + elif isinstance(self.data_source, Dataset): self.hf_dataset = DatasetDict() self.hf_dataset[PeptideDataset.DEFAULT_SPLIT_NAMES[0]] = self.data_source @@ -327,7 +310,12 @@ def _decide_on_splitting(self): # one non-train data source provided -> if test, then test only, if val, then do not split if count_loaded_data_sources == 1: - if self.test_data_source is not None: + if ( + self.test_data_source is not None + or PeptideDataset.DEFAULT_SPLIT_NAMES[2] + in self._data_files_available_splits + ): + # test data source provided OR hugging face dataset with test split only self._test_set_only = True if self.val_data_source is not None: self._is_predefined_split = True @@ -417,8 +405,8 @@ def _configure_encoding_step(self): sequence_column_name=self.sequence_column, alphabet=self.extended_alphabet, batched=True, + extend_alphabet=self.learning_alphabet_mode, ) - else: raise NotImplementedError( f"Encoding scheme {self.encoding_scheme} is not implemented. Available encoding schemes are: {list(EncodingScheme.__members__)}." 
@@ -494,20 +482,47 @@ def _configure_feature_extraction_step(self): def _apply_processing_pipeline(self): for processor in self._processors: - logger.info(f"Applying step: {processor.__class__.__name__}...") - logger.debug(f"Applying step with arguments:\n\n{processor}...") - self.hf_dataset = self.hf_dataset.map( - processor, - desc=f"Mapping {processor.__class__.__name__}", - batched=processor.batched, - batch_size=self._batch_processing_size, - num_proc=self._num_proc, - ) - logger.info(f"Done with step: {processor.__class__.__name__}.\n") + for split in self.hf_dataset.keys(): + logger.info( + f"Applying step: {processor.__class__.__name__} on split {split}..." + ) + print( + f"Applying step: {processor.__class__.__name__} on split {split}..." + ) + logger.debug( + f"Applying step with arguments:\n\n{processor} on split {split}..." + ) - if isinstance(processor, SequencePaddingProcessor): - for split in self.hf_dataset.keys(): - if split != "test": + # split-specific logic for encoding + if isinstance(processor, SequenceEncodingProcessor): + if split in PeptideDataset.DEFAULT_SPLIT_NAMES[0:2]: + # train/val split -> learn the alphabet unless otherwise specified + self._apply_processor_to_split(processor, split) + + self.extended_alphabet = processor.alphabet.copy() + + elif split == PeptideDataset.DEFAULT_SPLIT_NAMES[2]: + # test split -> use the learned alphabet from the train/val split + # and enable fallback to encoding unseen (AA, PTM) as unmodified Amino acids + processor.extend_alphabet = False + processor.set_alphabet(self.extended_alphabet) + processor.set_fallback(True) + + self._apply_processor_to_split(processor, split) + + else: + raise Warning( + f"When applying processors, found split '{split}' which is not a valid split name. Please use one of the default split names: {PeptideDataset.DEFAULT_SPLIT_NAMES} to ensure correct behavior." + ) + else: + # -------------------------------------------------------------------- + # split-agnostic logic -> run processor for all splits + self._apply_processor_to_split(processor, split) + # -------------------------------------------------------------------- + + # split-specific logic for truncating train/val sequences only after padding + if isinstance(processor, SequencePaddingProcessor): + if split != PeptideDataset.DEFAULT_SPLIT_NAMES[2]: logger.info( f"Removing truncated sequences in the {split} split ..." 
) @@ -516,11 +531,25 @@ def _apply_processing_pipeline(self): lambda batch: batch[processor.KEEP_COLUMN_NAME], batched=True, num_proc=self._num_proc, - batch_size=self._batch_processing_size, + batch_size=self.batch_processing_size, ) - self.hf_dataset = self.hf_dataset.remove_columns( - processor.KEEP_COLUMN_NAME - ) + + logger.info(f"Done with step: {processor.__class__.__name__}.\n") + + self.hf_dataset = self.hf_dataset.remove_columns( + SequencePaddingProcessor.KEEP_COLUMN_NAME + ) + + def _apply_processor_to_split( + self, processor: PeptideDatasetBaseProcessor, split: str + ): + self.hf_dataset[split] = self.hf_dataset[split].map( + processor, + desc=f"Mapping {processor.__class__.__name__}", + batched=processor.batched, + batch_size=self.batch_processing_size, + num_proc=self._num_proc, + ) def _cast_model_feature_types_to_float(self): for split in self.hf_dataset.keys(): @@ -540,7 +569,7 @@ def _cast_model_feature_types_to_float(self): self.hf_dataset[split] = self.hf_dataset[split].cast( new_features, num_proc=self._num_proc, - batch_size=self._batch_processing_size, + batch_size=self.batch_processing_size, ) def _cleanup_temp_dataset_cache_files(self): @@ -593,26 +622,26 @@ def load_from_disk(cls, path: str): @classmethod def from_dataset_config(cls, config: DatasetConfig): - d = cls( - data_source=config.data_source, - val_data_source=config.val_data_source, - test_data_source=config.test_data_source, - data_format=config.data_format, - sequence_column=config.sequence_column, - label_column=config.label_column, - val_ratio=config.val_ratio, - max_seq_len=config.max_seq_len, - dataset_type=config.dataset_type, - batch_size=config.batch_size, - model_features=config.model_features, - dataset_columns_to_keep=config.dataset_columns_to_keep, - features_to_extract=config.features_to_extract, - pad=config.pad, - padding_value=config.padding_value, - alphabet=config.alphabet, - encoding_scheme=config.encoding_scheme, - processed=config.processed, - ) + d = cls(**config.__dict__) + # data_source=config.data_source, + # val_data_source=config.val_data_source, + # test_data_source=config.test_data_source, + # data_format=config.data_format, + # sequence_column=config.sequence_column, + # label_column=config.label_column, + # val_ratio=config.val_ratio, + # max_seq_len=config.max_seq_len, + # dataset_type=config.dataset_type, + # batch_size=config.batch_size, + # model_features=config.model_features, + # dataset_columns_to_keep=config.dataset_columns_to_keep, + # features_to_extract=config.features_to_extract, + # pad=config.pad, + # padding_value=config.padding_value, + # alphabet=config.alphabet, + # encoding_scheme=config.encoding_scheme, + # processed=config.processed, + # ) for k, v in config._additional_data.items(): setattr(d, k, v) diff --git a/src/dlomix/data/dataset_config.py b/src/dlomix/data/dataset_config.py index 2ef10764..b309396f 100644 --- a/src/dlomix/data/dataset_config.py +++ b/src/dlomix/data/dataset_config.py @@ -28,8 +28,14 @@ class DatasetConfig: pad: bool padding_value: int alphabet: Dict + with_termini: bool encoding_scheme: Union[str, EncodingScheme] processed: bool + enable_tf_dataset_cache: bool + disable_cache: bool + auto_cleanup_cache: bool + num_proc: Optional[int] + batch_processing_size: int _additional_data: dict = field(default_factory=dict) def save_config_json(self, path: str): diff --git a/src/dlomix/data/fragment_ion_intensity.py b/src/dlomix/data/fragment_ion_intensity.py index 8fe42e44..a28c4f23 100644 --- 
a/src/dlomix/data/fragment_ion_intensity.py +++ b/src/dlomix/data/fragment_ion_intensity.py @@ -2,6 +2,7 @@ from ..constants import ALPHABET_UNMOD from .dataset import PeptideDataset +from .dataset_config import DatasetConfig from .dataset_utils import EncodingScheme @@ -61,29 +62,5 @@ def __init__( num_proc: Optional[int] = None, batch_processing_size: int = 1000, ): - super().__init__( - data_source, - val_data_source, - test_data_source, - data_format, - sequence_column, - label_column, - val_ratio, - max_seq_len, - dataset_type, - batch_size, - model_features, - dataset_columns_to_keep, - features_to_extract, - pad, - padding_value, - alphabet, - with_termini, - encoding_scheme, - processed, - enable_tf_dataset_cache, - disable_cache, - auto_cleanup_cache, - num_proc, - batch_processing_size, - ) + kwargs = {k: v for k, v in locals().items() if k not in ["self", "__class__"]} + super().__init__(DatasetConfig(**kwargs)) diff --git a/src/dlomix/data/processing/__init__.py b/src/dlomix/data/processing/__init__.py index 62b3748e..83830e31 100644 --- a/src/dlomix/data/processing/__init__.py +++ b/src/dlomix/data/processing/__init__.py @@ -6,13 +6,23 @@ FeatureExtractor, LookupFeatureExtractor, ) -from .processors import FunctionProcessor, SequenceParsingProcessor +from .processors import ( + FunctionProcessor, + SequenceEncodingProcessor, + SequencePaddingProcessor, + SequenceParsingProcessor, + SequencePTMRemovalProcessor, +) __all__ = [ "AVAILABLE_FEATURE_EXTRACTORS", "LookupFeatureExtractor", "FeatureExtractor", "FunctionProcessor", + "SequenceParsingProcessor", + "SequenceEncodingProcessor", + "SequencePaddingProcessor", + "SequencePTMRemovalProcessor", ] d = dict( diff --git a/src/dlomix/data/processing/processors.py b/src/dlomix/data/processing/processors.py index f14154bf..b84d9638 100644 --- a/src/dlomix/data/processing/processors.py +++ b/src/dlomix/data/processing/processors.py @@ -1,5 +1,6 @@ import abc import re +from typing import Optional class PeptideDatasetBaseProcessor(abc.ABC): @@ -240,18 +241,46 @@ class SequenceEncodingProcessor(PeptideDatasetBaseProcessor): ---------- sequence_column_name : str Name of the column containing the peptide sequence. - alphabet : dict - Dictionary mapping amino acids to integers. + alphabet : dict (default=None) + Dictionary mapping amino acids to integers. If None, the alphabet will be learned from the data. batched : bool (default=False) Whether to process data in batches. 
""" def __init__( - self, sequence_column_name: str, alphabet: dict, batched: bool = False + self, + sequence_column_name: str, + alphabet: Optional[dict] = None, + batched: bool = False, + extend_alphabet: bool = False, + unknown_token: int = 0, + fallback_unmodified: bool = False, ): super().__init__(sequence_column_name, batched) - self.alphabet = alphabet + self.extend_alphabet = extend_alphabet + + self.alphabet = {} + self.set_alphabet(alphabet) + self.set_fallback(fallback_unmodified) + + self.unknown_token = unknown_token + + def set_alphabet(self, alphabet): + if alphabet and not self.extend_alphabet: + self.alphabet = alphabet + self._encode = self._encode_with_vocab + else: + self._encode = self._encode_learn_vocab + + def set_fallback(self, fallback_unmodified): + self.fallback_unmodified = fallback_unmodified + if self.fallback_unmodified: + self._encode = self._encode_with_vocab_fallback + if len(self.alphabet) == 0: + raise ValueError( + "Alphabet must be provided if fallback_unmodified is True, to encode unseen modifications with the respective unmodified amino acid token." + ) def batch_process(self, input_data, **kwargs): return { @@ -267,8 +296,33 @@ def single_process(self, input_data, **kwargs): ) } - def _encode(self, sequence): - encoded = [self.alphabet.get(amino_acid) for amino_acid in sequence] + def _encode_learn_vocab(self, sequence): + encoded = [] + for amino_acid in sequence: + if amino_acid not in self.alphabet: + self.alphabet[amino_acid] = len(self.alphabet) + encoded.append(self.alphabet.get(amino_acid)) + + return encoded + + def _encode_with_vocab(self, sequence): + encoded = [ + self.alphabet.get(amino_acid, self.unknown_token) for amino_acid in sequence + ] + return encoded + + def _encode_with_vocab_fallback(self, sequence): + encoded = [] + for amino_acid in sequence: + if amino_acid not in self.alphabet: + if amino_acid.startswith(("[")): + amino_acid = "[]-" + elif amino_acid.startswith(("-[")): + amino_acid = "-[]" + else: + amino_acid = amino_acid[0] + + encoded.append(self.alphabet.get(amino_acid, self.unknown_token)) return encoded diff --git a/src/dlomix/data/retention_time.py b/src/dlomix/data/retention_time.py index e1f3465c..3944da38 100644 --- a/src/dlomix/data/retention_time.py +++ b/src/dlomix/data/retention_time.py @@ -2,6 +2,7 @@ from ..constants import ALPHABET_UNMOD from .dataset import PeptideDataset +from .dataset_config import DatasetConfig from .dataset_utils import EncodingScheme @@ -60,29 +61,5 @@ def __init__( num_proc: Optional[int] = None, batch_processing_size: int = 1000, ): - super().__init__( - data_source, - val_data_source, - test_data_source, - data_format, - sequence_column, - label_column, - val_ratio, - max_seq_len, - dataset_type, - batch_size, - model_features, - dataset_columns_to_keep, - features_to_extract, - pad, - padding_value, - alphabet, - with_termini, - encoding_scheme, - processed, - enable_tf_dataset_cache, - disable_cache, - auto_cleanup_cache, - num_proc, - batch_processing_size, - ) + kwargs = {k: v for k, v in locals().items() if k not in ["self", "__class__"]} + super().__init__(DatasetConfig(**kwargs)) diff --git a/src/dlomix/models/prosit.py b/src/dlomix/models/prosit.py index c709db47..20875352 100644 --- a/src/dlomix/models/prosit.py +++ b/src/dlomix/models/prosit.py @@ -44,7 +44,7 @@ def __init__( super(PrositRetentionTimePredictor, self).__init__() # tie the count of embeddings to the size of the vocabulary (count of amino acids) - self.embeddings_count = len(alphabet) + 2 + 
self.embeddings_count = len(alphabet) self.dropout_rate = dropout_rate self.latent_dropout_rate = latent_dropout_rate @@ -194,7 +194,7 @@ def __init__( self.alphabet = ALPHABET_UNMOD # tie the count of embeddings to the size of the vocabulary (count of amino acids) - self.embeddings_count = len(self.alphabet) + 2 + self.embeddings_count = len(self.alphabet) self.embedding = tf.keras.layers.Embedding( input_dim=self.embeddings_count, diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 369cdfc0..6325b86a 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -217,3 +217,7 @@ def test_csv_intensitydataset(): intensity_dataset[FragmentIonIntensityDataset.DEFAULT_SPLIT_NAMES[1]].num_rows > 0 ) + + # test saving and loading datasets with config + + # test learning alphabet for train/val and then using it for test with fallback diff --git a/tests/test_processors.py b/tests/test_processors.py new file mode 100644 index 00000000..f4c2a8f1 --- /dev/null +++ b/tests/test_processors.py @@ -0,0 +1,185 @@ +import logging +import urllib.request +import zipfile +from os import makedirs +from os.path import exists, join + +import pytest + +from dlomix.data.processing import ( + FunctionProcessor, + SequenceEncodingProcessor, + SequencePaddingProcessor, + SequenceParsingProcessor, + SequencePTMRemovalProcessor, +) + +logger = logging.getLogger(__name__) + +SEQ_COLUMN = "sequence" + +SEQUENCE_UNMODIFIED = "[]-DEL-[]" +PARSED_SEQUENCE_UNMODIFIED_NO_TERMINI = ["D", "E", "L"] + +SEQUENCE_MODIFIED = "[]-HC[UNIMOD:4]VD-[]" +PARSED_SEQUENCE_MODIFIED_NO_TERMINI = ["H", "C[UNIMOD:4]", "V", "D"] + +SEQUENCE_MODIFIED_WITH_N_MOD = "[UNIMOD:737]-ILC[UNIMOD:4]SIQGFK[UNIMOD:737]D-[]" +PARSED_SEQUENCE_MODIFIED_WITH_N_MOD_NO_TERMINI = [ + "I", + "L", + "C[UNIMOD:4]", + "S", + "I", + "Q", + "G", + "F", + "K[UNIMOD:737]", + "D", +] + + +def assert_parsed_data( + parsed_data, new_sequence_column_value, parsed_sequence_value, n_term, c_term +): + assert SequenceParsingProcessor.PARSED_COL_NAMES["seq"] in parsed_data.keys() + assert SequenceParsingProcessor.PARSED_COL_NAMES["n_term"] in parsed_data.keys() + assert SequenceParsingProcessor.PARSED_COL_NAMES["c_term"] in parsed_data + assert SEQ_COLUMN in parsed_data.keys() + + assert ( + parsed_data[SequenceParsingProcessor.PARSED_COL_NAMES["seq"]] + == parsed_sequence_value + ) + assert parsed_data[SequenceParsingProcessor.PARSED_COL_NAMES["n_term"]] == n_term + assert parsed_data[SequenceParsingProcessor.PARSED_COL_NAMES["c_term"]] == c_term + assert parsed_data[SEQ_COLUMN] == new_sequence_column_value + + +def test_sequence_parsing_processor_unmodified(): + p = SequenceParsingProcessor(sequence_column_name=SEQ_COLUMN, with_termini=False) + input_data = {SEQ_COLUMN: SEQUENCE_UNMODIFIED} + + parsed = p(input_data) + logger.info(parsed) + + assert_parsed_data( + parsed, + PARSED_SEQUENCE_UNMODIFIED_NO_TERMINI, + PARSED_SEQUENCE_UNMODIFIED_NO_TERMINI, + "[]-", + "-[]", + ) + + +def test_sequence_parsing_processor_batched(): + p = SequenceParsingProcessor( + sequence_column_name=SEQ_COLUMN, with_termini=False, batched=True + ) + input_data = {SEQ_COLUMN: [SEQUENCE_UNMODIFIED]} + + parsed = p(input_data) + logger.info(parsed) + + assert_parsed_data( + parsed, + [PARSED_SEQUENCE_UNMODIFIED_NO_TERMINI], + [PARSED_SEQUENCE_UNMODIFIED_NO_TERMINI], + ["[]-"], + ["-[]"], + ) + + +def test_sequence_parsing_processor_with_termini(): + p = SequenceParsingProcessor(sequence_column_name=SEQ_COLUMN, with_termini=True) + input_data = {SEQ_COLUMN: SEQUENCE_UNMODIFIED} + 
+ parsed = p(input_data) + logger.info(parsed) + + assert_parsed_data( + parsed, + ["[]-", *PARSED_SEQUENCE_UNMODIFIED_NO_TERMINI, "-[]"], + PARSED_SEQUENCE_UNMODIFIED_NO_TERMINI, + "[]-", + "-[]", + ) + + +def test_sequence_parsing_processor_with_modifications(): + p = SequenceParsingProcessor(sequence_column_name=SEQ_COLUMN, with_termini=True) + input_data = {SEQ_COLUMN: SEQUENCE_MODIFIED} + + parsed = p(input_data) + logger.info(parsed) + + assert_parsed_data( + parsed, + ["[]-", *PARSED_SEQUENCE_MODIFIED_NO_TERMINI, "-[]"], + PARSED_SEQUENCE_MODIFIED_NO_TERMINI, + "[]-", + "-[]", + ) + + +def test_sequence_parsing_processor_with_modifications_and_nterm_mods(): + p = SequenceParsingProcessor(sequence_column_name=SEQ_COLUMN, with_termini=True) + input_data = {SEQ_COLUMN: SEQUENCE_MODIFIED_WITH_N_MOD} + + parsed = p(input_data) + logger.info(parsed) + + assert_parsed_data( + parsed, + ["[UNIMOD:737]-", *PARSED_SEQUENCE_MODIFIED_WITH_N_MOD_NO_TERMINI, "-[]"], + PARSED_SEQUENCE_MODIFIED_WITH_N_MOD_NO_TERMINI, + "[UNIMOD:737]-", + "-[]", + ) + + +def test_sequence_padding_processor_keep(): + length = 5 + p = SequencePaddingProcessor(sequence_column_name=SEQ_COLUMN, max_length=length) + input_data = {SEQ_COLUMN: PARSED_SEQUENCE_UNMODIFIED_NO_TERMINI} + padded = p(input_data) + logger.info(padded) + + assert len(padded[SEQ_COLUMN]) == length + assert padded[SEQ_COLUMN] == ["D", "E", "L", 0, 0] + assert padded[SequencePaddingProcessor.KEEP_COLUMN_NAME] + + +def test_sequence_padding_processor_drop(): + length = 2 + p = SequencePaddingProcessor(sequence_column_name=SEQ_COLUMN, max_length=length) + input_data = {SEQ_COLUMN: PARSED_SEQUENCE_UNMODIFIED_NO_TERMINI} + padded = p(input_data) + logger.info(padded) + + assert padded[SEQ_COLUMN] == PARSED_SEQUENCE_UNMODIFIED_NO_TERMINI[:length] + assert not padded[SequencePaddingProcessor.KEEP_COLUMN_NAME] + + +def test_sequence_encoding_processor(): + pass + + +def test_sequence_encoding_processor_with_fixed_alphabet(): + pass + + +def test_sequence_encoding_processor_with_extend_alphabet_enabled(): + pass + + +def test_sequence_encoding_processor_with_fallback_enabled(): + pass + + +def test_sequence_ptm_removal_processor(): + pass + + +def test_function_processor(): + pass From 5bb4f14a54c8a5152f05424fce1fc75ac51823b4 Mon Sep 17 00:00:00 2001 From: omsh Date: Tue, 1 Oct 2024 09:59:26 +0200 Subject: [PATCH 4/4] version 0.1.3 --- src/dlomix/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dlomix/__init__.py b/src/dlomix/__init__.py index 0d293ee8..4154a85e 100644 --- a/src/dlomix/__init__.py +++ b/src/dlomix/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.1.3dev0" +__version__ = "0.1.3" META_DATA = { "author": "Omar Shouman",
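
The refactoring in PATCH 3/4 replaces the long positional super().__init__(...) calls in RetentionTimeDataset, FragmentIonIntensityDataset and ChargeStateDataset with a single DatasetConfig object built from the constructor's locals(). Below is a minimal, simplified sketch of that pattern, not the full dlomix API: the field set is reduced, defaults and the "rt.parquet" path are illustrative only.

from dataclasses import dataclass, field
from typing import Optional

@dataclass
class DatasetConfig:
    # reduced field set; the real config carries every PeptideDataset argument
    data_source: Optional[str] = None
    sequence_column: str = "sequence"
    max_seq_len: int = 30
    _additional_data: dict = field(default_factory=dict)

class PeptideDataset:
    def __init__(self, config: DatasetConfig):
        # copy every config attribute onto the dataset instance
        self.__dict__.update(**config.__dict__)
        self._config = config

class RetentionTimeDataset(PeptideDataset):
    def __init__(self, data_source=None, sequence_column="sequence", max_seq_len=30):
        # locals() is captured before any other local variable is defined,
        # so it holds exactly the constructor arguments (plus self/__class__)
        kwargs = {k: v for k, v in locals().items() if k not in ["self", "__class__"]}
        super().__init__(DatasetConfig(**kwargs))

rtdata = RetentionTimeDataset(data_source="rt.parquet", max_seq_len=20)
print(rtdata.max_seq_len)  # 20

Note that this pattern only works because locals() is read on the first statement of __init__; defining any helper variable before that call would leak it into the config kwargs.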
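
PATCH 3/4 also gives SequenceEncodingProcessor three encoding behaviours: learning the alphabet from the train/val splits, encoding with a fixed alphabet and an unknown token, and a test-split fallback that maps unseen (amino acid, PTM) tokens to the unmodified residue or plain terminus. The following is a rough stand-alone sketch of those behaviours written as plain functions rather than the processor class; token ids and example alphabets are illustrative.

def encode_learn_vocab(sequence, alphabet):
    # train/val behaviour: unseen tokens are appended to the alphabet
    encoded = []
    for aa in sequence:
        if aa not in alphabet:
            alphabet[aa] = len(alphabet)
        encoded.append(alphabet[aa])
    return encoded

def encode_with_vocab(sequence, alphabet, unknown_token=0):
    # fixed-alphabet behaviour: unseen tokens map to a dedicated unknown token
    return [alphabet.get(aa, unknown_token) for aa in sequence]

def encode_with_vocab_fallback(sequence, alphabet, unknown_token=0):
    # test-split behaviour: unseen modified residues fall back to the
    # unmodified amino acid, unseen terminal mods to the plain terminus
    encoded = []
    for aa in sequence:
        if aa not in alphabet:
            if aa.startswith("["):
                aa = "[]-"
            elif aa.startswith("-["):
                aa = "-[]"
            else:
                aa = aa[0]
        encoded.append(alphabet.get(aa, unknown_token))
    return encoded

learned = {}
print(encode_learn_vocab(["[]-", "D", "E", "L", "-[]"], learned))                   # [0, 1, 2, 3, 4]
fixed = {"[]-": 1, "-[]": 2, "C": 3, "V": 4, "D": 5}
print(encode_with_vocab_fallback(["[]-", "C[UNIMOD:4]", "V", "D", "-[]"], fixed))   # [1, 3, 4, 5, 2]

In the pipeline, the alphabet learned on the train/val splits is copied into extended_alphabet and reused for the test split with extend_alphabet disabled and the fallback enabled; the same patch also changes the Prosit models to size their embedding layer as len(alphabet) rather than len(alphabet) + 2.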