From afb024bf7c0506aa63c929d5f79b83494484c316 Mon Sep 17 00:00:00 2001 From: omsh Date: Sun, 7 Jul 2024 20:37:26 +0200 Subject: [PATCH 1/4] trigger pypi only on main --- .github/workflows/pypi.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/pypi.yaml b/.github/workflows/pypi.yaml index 07f1f3c1..0291a54d 100644 --- a/.github/workflows/pypi.yaml +++ b/.github/workflows/pypi.yaml @@ -6,6 +6,8 @@ on: workflows: ["Build"] types: - completed + branches: + - main jobs: release: From 0ca1f1dcbfd5d639b1d5d1765f7cab48864470b2 Mon Sep 17 00:00:00 2001 From: omsh Date: Sun, 7 Jul 2024 20:58:20 +0200 Subject: [PATCH 2/4] trigger pypi workflow only on main and build succeeds on main --- .github/workflows/pypi.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pypi.yaml b/.github/workflows/pypi.yaml index 0291a54d..b298c196 100644 --- a/.github/workflows/pypi.yaml +++ b/.github/workflows/pypi.yaml @@ -12,7 +12,10 @@ on: jobs: release: runs-on: ubuntu-latest - if: ${{ github.event.workflow_run.conclusion == 'success' && github.ref == 'refs/heads/main'}} + if: | + github.event.workflow_run.conclusion == 'success' && + github.event.workflow_run.head_branch == 'main' && + github.ref == 'refs/heads/main' steps: - uses: actions/checkout@v4 - name: Set up Python From fcbe4db8573b7e5ade3721029f2a4e17a3300adb Mon Sep 17 00:00:00 2001 From: Omar Shouman Date: Tue, 1 Oct 2024 09:50:30 +0200 Subject: [PATCH 3/4] Fix/alphabet encoding dataset refactoring (#43) * test dataset fix * refactoring dataset classes with config class * processors changes to handle alphabet scenarios * model alphabet and embedding count fix * added tests for processors and extended tests for datasets * dev version 0 --- src/dlomix/__init__.py | 2 +- src/dlomix/data/charge_state.py | 29 +-- src/dlomix/data/dataset.py | 259 ++++++++++++---------- src/dlomix/data/dataset_config.py | 6 + src/dlomix/data/fragment_ion_intensity.py | 29 +-- src/dlomix/data/processing/__init__.py | 12 +- src/dlomix/data/processing/processors.py | 66 +++++- src/dlomix/data/retention_time.py | 29 +-- src/dlomix/models/prosit.py | 4 +- tests/test_datasets.py | 4 + tests/test_processors.py | 185 ++++++++++++++++ 11 files changed, 422 insertions(+), 203 deletions(-) create mode 100644 tests/test_processors.py diff --git a/src/dlomix/__init__.py b/src/dlomix/__init__.py index ab953da5..0d293ee8 100644 --- a/src/dlomix/__init__.py +++ b/src/dlomix/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.1.2" +__version__ = "0.1.3dev0" META_DATA = { "author": "Omar Shouman", diff --git a/src/dlomix/data/charge_state.py b/src/dlomix/data/charge_state.py index 3042d898..b960cc0d 100644 --- a/src/dlomix/data/charge_state.py +++ b/src/dlomix/data/charge_state.py @@ -2,6 +2,7 @@ from ..constants import ALPHABET_UNMOD from .dataset import PeptideDataset +from .dataset_config import DatasetConfig from .dataset_utils import EncodingScheme @@ -60,29 +61,5 @@ def __init__( num_proc: Optional[int] = None, batch_processing_size: int = 1000, ): - super().__init__( - data_source, - val_data_source, - test_data_source, - data_format, - sequence_column, - label_column, - val_ratio, - max_seq_len, - dataset_type, - batch_size, - model_features, - dataset_columns_to_keep, - features_to_extract, - pad, - padding_value, - alphabet, - with_termini, - encoding_scheme, - processed, - enable_tf_dataset_cache, - disable_cache, - auto_cleanup_cache, - num_proc, - batch_processing_size, - ) + kwargs = {k: v for k, v in locals().items() if k 
not in ["self", "__class__"]} + super().__init__(DatasetConfig(**kwargs)) diff --git a/src/dlomix/data/dataset.py b/src/dlomix/data/dataset.py index db5d8900..42aa8614 100644 --- a/src/dlomix/data/dataset.py +++ b/src/dlomix/data/dataset.py @@ -19,6 +19,7 @@ ) from .processing.processors import ( FunctionProcessor, + PeptideDatasetBaseProcessor, SequenceEncodingProcessor, SequencePaddingProcessor, SequenceParsingProcessor, @@ -105,79 +106,60 @@ class PeptideDataset: DEFAULT_SPLIT_NAMES = ["train", "val", "test"] CONFIG_JSON_NAME = "dlomix_peptide_dataset_config.json" - def __init__( - self, - data_source: Union[str, List], - val_data_source: Union[str, List], - test_data_source: Union[str, List], - data_format: str, - sequence_column: str, - label_column: str, - val_ratio: float, - max_seq_len: int, - dataset_type: str, - batch_size: int, - model_features: Optional[List[str]], - dataset_columns_to_keep: Optional[List[str]], - features_to_extract: Optional[List[Union[Callable, str]]] = None, - pad: bool = True, - padding_value: int = 0, - alphabet: Dict = ALPHABET_UNMOD, - with_termini: bool = True, - encoding_scheme: Union[str, EncodingScheme] = EncodingScheme.UNMOD, - processed: bool = False, - enable_tf_dataset_cache: bool = False, - disable_cache: bool = False, - auto_cleanup_cache: bool = True, - num_proc: Optional[int] = None, - batch_processing_size: Optional[int] = 1000, - ): + def __init__(self, config: DatasetConfig): super(PeptideDataset, self).__init__() - self.data_source = data_source - self.val_data_source = val_data_source - self.test_data_source = test_data_source + self.__dict__.update(**config.__dict__) + + # self.data_source = data_source + # self.val_data_source = val_data_source + # self.test_data_source = test_data_source - self.data_format = data_format + # self.data_format = data_format - self.sequence_column = sequence_column - self.label_column = label_column + # self.sequence_column = sequence_column + # self.label_column = label_column - self.val_ratio = val_ratio - self.max_seq_len = max_seq_len - self.dataset_type = dataset_type - self.batch_size = batch_size - self.model_features = model_features + # self.val_ratio = val_ratio + # self.max_seq_len = max_seq_len + # self.dataset_type = dataset_type + # self.batch_size = batch_size + # self.model_features = model_features # to be kept in the hf dataset, but not returned in the tensor dataset - if dataset_columns_to_keep is None: + if config.dataset_columns_to_keep is None: self.dataset_columns_to_keep = [] else: - self.dataset_columns_to_keep = dataset_columns_to_keep - - self.features_to_extract = features_to_extract - self.pad = pad - self.padding_value = padding_value - self.alphabet = alphabet - self.with_termini = with_termini - self.encoding_scheme = EncodingScheme(encoding_scheme) - self.processed = processed - self.enable_tf_dataset_cache = enable_tf_dataset_cache - self.disable_cache = disable_cache - self.auto_cleanup_cache = auto_cleanup_cache + self.dataset_columns_to_keep = config.dataset_columns_to_keep + + # self.features_to_extract = features_to_extract + # self.pad = pad + # self.padding_value = padding_value + # self.alphabet = alphabet + # self.with_termini = with_termini + self.encoding_scheme = EncodingScheme(config.encoding_scheme) + # self.processed = processed + # self.enable_tf_dataset_cache = enable_tf_dataset_cache + # self.disable_cache = disable_cache + # self.auto_cleanup_cache = auto_cleanup_cache self._set_hf_cache_management() - self.extended_alphabet = self.alphabet.copy() + 
self.extended_alphabet = None + self.learning_alphabet_mode = True - self._refresh_config() + if self.alphabet: + self.extended_alphabet = self.alphabet.copy() + self.learning_alphabet_mode = False + + self._config = config if not self.processed: self.hf_dataset: Optional[Union[Dataset, DatasetDict]] = None self._empty_dataset_mode = False self._is_predefined_split = False self._test_set_only = False - self._num_proc = num_proc + self._num_proc = config.num_proc self._set_num_proc() - self._batch_processing_size = batch_processing_size + # self.batch_processing_size = config.batch_processing_size self._data_files_available_splits = {} self._load_dataset() @@ -216,26 +198,26 @@ def _set_hf_cache_management(self): disable_caching() def _refresh_config(self): - self._config = DatasetConfig( - data_source=self.data_source, - val_data_source=self.val_data_source, - test_data_source=self.test_data_source, - data_format=self.data_format, - sequence_column=self.sequence_column, - label_column=self.label_column, - val_ratio=self.val_ratio, - max_seq_len=self.max_seq_len, - dataset_type=self.dataset_type, - batch_size=self.batch_size, - model_features=self.model_features, - dataset_columns_to_keep=self.dataset_columns_to_keep, - features_to_extract=self.features_to_extract, - pad=self.pad, - padding_value=self.padding_value, - alphabet=self.alphabet, - encoding_scheme=self.encoding_scheme, - processed=self.processed, - ) + self._config = DatasetConfig(**self._config.__dict__) + # data_source=self.data_source, + # val_data_source=self.val_data_source, + # test_data_source=self.test_data_source, + # data_format=self.data_format, + # sequence_column=self.sequence_column, + # label_column=self.label_column, + # val_ratio=self.val_ratio, + # max_seq_len=self.max_seq_len, + # dataset_type=self.dataset_type, + # batch_size=self.batch_size, + # model_features=self.model_features, + # dataset_columns_to_keep=self.dataset_columns_to_keep, + # features_to_extract=self.features_to_extract, + # pad=self.pad, + # padding_value=self.padding_value, + # alphabet=self.alphabet, + # encoding_scheme=self.encoding_scheme, + # processed=self.processed, + # ) self._config._additional_data.update( { @@ -311,6 +293,7 @@ def _load_from_inmemory_hf_dataset(self): split: f"in-memory Dataset object - {split}" for split in self.hf_dataset } + elif isinstance(self.data_source, Dataset): self.hf_dataset = DatasetDict() self.hf_dataset[PeptideDataset.DEFAULT_SPLIT_NAMES[0]] = self.data_source @@ -327,7 +310,12 @@ def _decide_on_splitting(self): # one non-train data source provided -> if test, then test only, if val, then do not split if count_loaded_data_sources == 1: - if self.test_data_source is not None: + if ( + self.test_data_source is not None + or PeptideDataset.DEFAULT_SPLIT_NAMES[2] + in self._data_files_available_splits + ): + # test data source provided OR hugging face dataset with test split only self._test_set_only = True if self.val_data_source is not None: self._is_predefined_split = True @@ -417,8 +405,8 @@ def _configure_encoding_step(self): sequence_column_name=self.sequence_column, alphabet=self.extended_alphabet, batched=True, + extend_alphabet=self.learning_alphabet_mode, ) - else: raise NotImplementedError( f"Encoding scheme {self.encoding_scheme} is not implemented. Available encoding schemes are: {list(EncodingScheme.__members__)}." 
@@ -494,20 +482,47 @@ def _configure_feature_extraction_step(self): def _apply_processing_pipeline(self): for processor in self._processors: - logger.info(f"Applying step: {processor.__class__.__name__}...") - logger.debug(f"Applying step with arguments:\n\n{processor}...") - self.hf_dataset = self.hf_dataset.map( - processor, - desc=f"Mapping {processor.__class__.__name__}", - batched=processor.batched, - batch_size=self._batch_processing_size, - num_proc=self._num_proc, - ) - logger.info(f"Done with step: {processor.__class__.__name__}.\n") + for split in self.hf_dataset.keys(): + logger.info( + f"Applying step: {processor.__class__.__name__} on split {split}..." + ) + print( + f"Applying step: {processor.__class__.__name__} on split {split}..." + ) + logger.debug( + f"Applying step with arguments:\n\n{processor} on split {split}..." + ) - if isinstance(processor, SequencePaddingProcessor): - for split in self.hf_dataset.keys(): - if split != "test": + # split-specific logic for encoding + if isinstance(processor, SequenceEncodingProcessor): + if split in PeptideDataset.DEFAULT_SPLIT_NAMES[0:2]: + # train/val split -> learn the alphabet unless otherwise specified + self._apply_processor_to_split(processor, split) + + self.extended_alphabet = processor.alphabet.copy() + + elif split == PeptideDataset.DEFAULT_SPLIT_NAMES[2]: + # test split -> use the learned alphabet from the train/val split + # and enable fallback to encoding unseen (AA, PTM) as unmodified Amino acids + processor.extend_alphabet = False + processor.set_alphabet(self.extended_alphabet) + processor.set_fallback(True) + + self._apply_processor_to_split(processor, split) + + else: + raise Warning( + f"When applying processors, found split '{split}' which is not a valid split name. Please use one of the default split names: {PeptideDataset.DEFAULT_SPLIT_NAMES} to ensure correct behavior." + ) + else: + # -------------------------------------------------------------------- + # split-agnostic logic -> run processor for all splits + self._apply_processor_to_split(processor, split) + # -------------------------------------------------------------------- + + # split-specific logic for truncating train/val sequences only after padding + if isinstance(processor, SequencePaddingProcessor): + if split != PeptideDataset.DEFAULT_SPLIT_NAMES[2]: logger.info( f"Removing truncated sequences in the {split} split ..." 
) @@ -516,11 +531,25 @@ def _apply_processing_pipeline(self): lambda batch: batch[processor.KEEP_COLUMN_NAME], batched=True, num_proc=self._num_proc, - batch_size=self._batch_processing_size, + batch_size=self.batch_processing_size, ) - self.hf_dataset = self.hf_dataset.remove_columns( - processor.KEEP_COLUMN_NAME - ) + + logger.info(f"Done with step: {processor.__class__.__name__}.\n") + + self.hf_dataset = self.hf_dataset.remove_columns( + SequencePaddingProcessor.KEEP_COLUMN_NAME + ) + + def _apply_processor_to_split( + self, processor: PeptideDatasetBaseProcessor, split: str + ): + self.hf_dataset[split] = self.hf_dataset[split].map( + processor, + desc=f"Mapping {processor.__class__.__name__}", + batched=processor.batched, + batch_size=self.batch_processing_size, + num_proc=self._num_proc, + ) def _cast_model_feature_types_to_float(self): for split in self.hf_dataset.keys(): @@ -540,7 +569,7 @@ def _cast_model_feature_types_to_float(self): self.hf_dataset[split] = self.hf_dataset[split].cast( new_features, num_proc=self._num_proc, - batch_size=self._batch_processing_size, + batch_size=self.batch_processing_size, ) def _cleanup_temp_dataset_cache_files(self): @@ -593,26 +622,26 @@ def load_from_disk(cls, path: str): @classmethod def from_dataset_config(cls, config: DatasetConfig): - d = cls( - data_source=config.data_source, - val_data_source=config.val_data_source, - test_data_source=config.test_data_source, - data_format=config.data_format, - sequence_column=config.sequence_column, - label_column=config.label_column, - val_ratio=config.val_ratio, - max_seq_len=config.max_seq_len, - dataset_type=config.dataset_type, - batch_size=config.batch_size, - model_features=config.model_features, - dataset_columns_to_keep=config.dataset_columns_to_keep, - features_to_extract=config.features_to_extract, - pad=config.pad, - padding_value=config.padding_value, - alphabet=config.alphabet, - encoding_scheme=config.encoding_scheme, - processed=config.processed, - ) + d = cls(**config.__dict__) + # data_source=config.data_source, + # val_data_source=config.val_data_source, + # test_data_source=config.test_data_source, + # data_format=config.data_format, + # sequence_column=config.sequence_column, + # label_column=config.label_column, + # val_ratio=config.val_ratio, + # max_seq_len=config.max_seq_len, + # dataset_type=config.dataset_type, + # batch_size=config.batch_size, + # model_features=config.model_features, + # dataset_columns_to_keep=config.dataset_columns_to_keep, + # features_to_extract=config.features_to_extract, + # pad=config.pad, + # padding_value=config.padding_value, + # alphabet=config.alphabet, + # encoding_scheme=config.encoding_scheme, + # processed=config.processed, + # ) for k, v in config._additional_data.items(): setattr(d, k, v) diff --git a/src/dlomix/data/dataset_config.py b/src/dlomix/data/dataset_config.py index 2ef10764..b309396f 100644 --- a/src/dlomix/data/dataset_config.py +++ b/src/dlomix/data/dataset_config.py @@ -28,8 +28,14 @@ class DatasetConfig: pad: bool padding_value: int alphabet: Dict + with_termini: bool encoding_scheme: Union[str, EncodingScheme] processed: bool + enable_tf_dataset_cache: bool + disable_cache: bool + auto_cleanup_cache: bool + num_proc: Optional[int] + batch_processing_size: int _additional_data: dict = field(default_factory=dict) def save_config_json(self, path: str): diff --git a/src/dlomix/data/fragment_ion_intensity.py b/src/dlomix/data/fragment_ion_intensity.py index 8fe42e44..a28c4f23 100644 --- 
a/src/dlomix/data/fragment_ion_intensity.py +++ b/src/dlomix/data/fragment_ion_intensity.py @@ -2,6 +2,7 @@ from ..constants import ALPHABET_UNMOD from .dataset import PeptideDataset +from .dataset_config import DatasetConfig from .dataset_utils import EncodingScheme @@ -61,29 +62,5 @@ def __init__( num_proc: Optional[int] = None, batch_processing_size: int = 1000, ): - super().__init__( - data_source, - val_data_source, - test_data_source, - data_format, - sequence_column, - label_column, - val_ratio, - max_seq_len, - dataset_type, - batch_size, - model_features, - dataset_columns_to_keep, - features_to_extract, - pad, - padding_value, - alphabet, - with_termini, - encoding_scheme, - processed, - enable_tf_dataset_cache, - disable_cache, - auto_cleanup_cache, - num_proc, - batch_processing_size, - ) + kwargs = {k: v for k, v in locals().items() if k not in ["self", "__class__"]} + super().__init__(DatasetConfig(**kwargs)) diff --git a/src/dlomix/data/processing/__init__.py b/src/dlomix/data/processing/__init__.py index 62b3748e..83830e31 100644 --- a/src/dlomix/data/processing/__init__.py +++ b/src/dlomix/data/processing/__init__.py @@ -6,13 +6,23 @@ FeatureExtractor, LookupFeatureExtractor, ) -from .processors import FunctionProcessor, SequenceParsingProcessor +from .processors import ( + FunctionProcessor, + SequenceEncodingProcessor, + SequencePaddingProcessor, + SequenceParsingProcessor, + SequencePTMRemovalProcessor, +) __all__ = [ "AVAILABLE_FEATURE_EXTRACTORS", "LookupFeatureExtractor", "FeatureExtractor", "FunctionProcessor", + "SequenceParsingProcessor", + "SequenceEncodingProcessor", + "SequencePaddingProcessor", + "SequencePTMRemovalProcessor", ] d = dict( diff --git a/src/dlomix/data/processing/processors.py b/src/dlomix/data/processing/processors.py index f14154bf..b84d9638 100644 --- a/src/dlomix/data/processing/processors.py +++ b/src/dlomix/data/processing/processors.py @@ -1,5 +1,6 @@ import abc import re +from typing import Optional class PeptideDatasetBaseProcessor(abc.ABC): @@ -240,18 +241,46 @@ class SequenceEncodingProcessor(PeptideDatasetBaseProcessor): ---------- sequence_column_name : str Name of the column containing the peptide sequence. - alphabet : dict - Dictionary mapping amino acids to integers. + alphabet : dict (default=None) + Dictionary mapping amino acids to integers. If None, the alphabet will be learned from the data. batched : bool (default=False) Whether to process data in batches. 
""" def __init__( - self, sequence_column_name: str, alphabet: dict, batched: bool = False + self, + sequence_column_name: str, + alphabet: Optional[dict] = None, + batched: bool = False, + extend_alphabet: bool = False, + unknown_token: int = 0, + fallback_unmodified: bool = False, ): super().__init__(sequence_column_name, batched) - self.alphabet = alphabet + self.extend_alphabet = extend_alphabet + + self.alphabet = {} + self.set_alphabet(alphabet) + self.set_fallback(fallback_unmodified) + + self.unknown_token = unknown_token + + def set_alphabet(self, alphabet): + if alphabet and not self.extend_alphabet: + self.alphabet = alphabet + self._encode = self._encode_with_vocab + else: + self._encode = self._encode_learn_vocab + + def set_fallback(self, fallback_unmodified): + self.fallback_unmodified = fallback_unmodified + if self.fallback_unmodified: + self._encode = self._encode_with_vocab_fallback + if len(self.alphabet) == 0: + raise ValueError( + "Alphabet must be provided if fallback_unmodified is True, to encode unseen modifications with the respective unmodified amino acid token." + ) def batch_process(self, input_data, **kwargs): return { @@ -267,8 +296,33 @@ def single_process(self, input_data, **kwargs): ) } - def _encode(self, sequence): - encoded = [self.alphabet.get(amino_acid) for amino_acid in sequence] + def _encode_learn_vocab(self, sequence): + encoded = [] + for amino_acid in sequence: + if amino_acid not in self.alphabet: + self.alphabet[amino_acid] = len(self.alphabet) + encoded.append(self.alphabet.get(amino_acid)) + + return encoded + + def _encode_with_vocab(self, sequence): + encoded = [ + self.alphabet.get(amino_acid, self.unknown_token) for amino_acid in sequence + ] + return encoded + + def _encode_with_vocab_fallback(self, sequence): + encoded = [] + for amino_acid in sequence: + if amino_acid not in self.alphabet: + if amino_acid.startswith(("[")): + amino_acid = "[]-" + elif amino_acid.startswith(("-[")): + amino_acid = "-[]" + else: + amino_acid = amino_acid[0] + + encoded.append(self.alphabet.get(amino_acid, self.unknown_token)) return encoded diff --git a/src/dlomix/data/retention_time.py b/src/dlomix/data/retention_time.py index e1f3465c..3944da38 100644 --- a/src/dlomix/data/retention_time.py +++ b/src/dlomix/data/retention_time.py @@ -2,6 +2,7 @@ from ..constants import ALPHABET_UNMOD from .dataset import PeptideDataset +from .dataset_config import DatasetConfig from .dataset_utils import EncodingScheme @@ -60,29 +61,5 @@ def __init__( num_proc: Optional[int] = None, batch_processing_size: int = 1000, ): - super().__init__( - data_source, - val_data_source, - test_data_source, - data_format, - sequence_column, - label_column, - val_ratio, - max_seq_len, - dataset_type, - batch_size, - model_features, - dataset_columns_to_keep, - features_to_extract, - pad, - padding_value, - alphabet, - with_termini, - encoding_scheme, - processed, - enable_tf_dataset_cache, - disable_cache, - auto_cleanup_cache, - num_proc, - batch_processing_size, - ) + kwargs = {k: v for k, v in locals().items() if k not in ["self", "__class__"]} + super().__init__(DatasetConfig(**kwargs)) diff --git a/src/dlomix/models/prosit.py b/src/dlomix/models/prosit.py index c709db47..20875352 100644 --- a/src/dlomix/models/prosit.py +++ b/src/dlomix/models/prosit.py @@ -44,7 +44,7 @@ def __init__( super(PrositRetentionTimePredictor, self).__init__() # tie the count of embeddings to the size of the vocabulary (count of amino acids) - self.embeddings_count = len(alphabet) + 2 + 
self.embeddings_count = len(alphabet) self.dropout_rate = dropout_rate self.latent_dropout_rate = latent_dropout_rate @@ -194,7 +194,7 @@ def __init__( self.alphabet = ALPHABET_UNMOD # tie the count of embeddings to the size of the vocabulary (count of amino acids) - self.embeddings_count = len(self.alphabet) + 2 + self.embeddings_count = len(self.alphabet) self.embedding = tf.keras.layers.Embedding( input_dim=self.embeddings_count, diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 369cdfc0..6325b86a 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -217,3 +217,7 @@ def test_csv_intensitydataset(): intensity_dataset[FragmentIonIntensityDataset.DEFAULT_SPLIT_NAMES[1]].num_rows > 0 ) + + # test saving and loading datasets with config + + # test learning alphabet for train/val and then using it for test with fallback diff --git a/tests/test_processors.py b/tests/test_processors.py new file mode 100644 index 00000000..f4c2a8f1 --- /dev/null +++ b/tests/test_processors.py @@ -0,0 +1,185 @@ +import logging +import urllib.request +import zipfile +from os import makedirs +from os.path import exists, join + +import pytest + +from dlomix.data.processing import ( + FunctionProcessor, + SequenceEncodingProcessor, + SequencePaddingProcessor, + SequenceParsingProcessor, + SequencePTMRemovalProcessor, +) + +logger = logging.getLogger(__name__) + +SEQ_COLUMN = "sequence" + +SEQUENCE_UNMODIFIED = "[]-DEL-[]" +PARSED_SEQUENCE_UNMODIFIED_NO_TERMINI = ["D", "E", "L"] + +SEQUENCE_MODIFIED = "[]-HC[UNIMOD:4]VD-[]" +PARSED_SEQUENCE_MODIFIED_NO_TERMINI = ["H", "C[UNIMOD:4]", "V", "D"] + +SEQUENCE_MODIFIED_WITH_N_MOD = "[UNIMOD:737]-ILC[UNIMOD:4]SIQGFK[UNIMOD:737]D-[]" +PARSED_SEQUENCE_MODIFIED_WITH_N_MOD_NO_TERMINI = [ + "I", + "L", + "C[UNIMOD:4]", + "S", + "I", + "Q", + "G", + "F", + "K[UNIMOD:737]", + "D", +] + + +def assert_parsed_data( + parsed_data, new_sequence_column_value, parsed_sequence_value, n_term, c_term +): + assert SequenceParsingProcessor.PARSED_COL_NAMES["seq"] in parsed_data.keys() + assert SequenceParsingProcessor.PARSED_COL_NAMES["n_term"] in parsed_data.keys() + assert SequenceParsingProcessor.PARSED_COL_NAMES["c_term"] in parsed_data + assert SEQ_COLUMN in parsed_data.keys() + + assert ( + parsed_data[SequenceParsingProcessor.PARSED_COL_NAMES["seq"]] + == parsed_sequence_value + ) + assert parsed_data[SequenceParsingProcessor.PARSED_COL_NAMES["n_term"]] == n_term + assert parsed_data[SequenceParsingProcessor.PARSED_COL_NAMES["c_term"]] == c_term + assert parsed_data[SEQ_COLUMN] == new_sequence_column_value + + +def test_sequence_parsing_processor_unmodified(): + p = SequenceParsingProcessor(sequence_column_name=SEQ_COLUMN, with_termini=False) + input_data = {SEQ_COLUMN: SEQUENCE_UNMODIFIED} + + parsed = p(input_data) + logger.info(parsed) + + assert_parsed_data( + parsed, + PARSED_SEQUENCE_UNMODIFIED_NO_TERMINI, + PARSED_SEQUENCE_UNMODIFIED_NO_TERMINI, + "[]-", + "-[]", + ) + + +def test_sequence_parsing_processor_batched(): + p = SequenceParsingProcessor( + sequence_column_name=SEQ_COLUMN, with_termini=False, batched=True + ) + input_data = {SEQ_COLUMN: [SEQUENCE_UNMODIFIED]} + + parsed = p(input_data) + logger.info(parsed) + + assert_parsed_data( + parsed, + [PARSED_SEQUENCE_UNMODIFIED_NO_TERMINI], + [PARSED_SEQUENCE_UNMODIFIED_NO_TERMINI], + ["[]-"], + ["-[]"], + ) + + +def test_sequence_parsing_processor_with_termini(): + p = SequenceParsingProcessor(sequence_column_name=SEQ_COLUMN, with_termini=True) + input_data = {SEQ_COLUMN: SEQUENCE_UNMODIFIED} + 
+ parsed = p(input_data) + logger.info(parsed) + + assert_parsed_data( + parsed, + ["[]-", *PARSED_SEQUENCE_UNMODIFIED_NO_TERMINI, "-[]"], + PARSED_SEQUENCE_UNMODIFIED_NO_TERMINI, + "[]-", + "-[]", + ) + + +def test_sequence_parsing_processor_with_modifications(): + p = SequenceParsingProcessor(sequence_column_name=SEQ_COLUMN, with_termini=True) + input_data = {SEQ_COLUMN: SEQUENCE_MODIFIED} + + parsed = p(input_data) + logger.info(parsed) + + assert_parsed_data( + parsed, + ["[]-", *PARSED_SEQUENCE_MODIFIED_NO_TERMINI, "-[]"], + PARSED_SEQUENCE_MODIFIED_NO_TERMINI, + "[]-", + "-[]", + ) + + +def test_sequence_parsing_processor_with_modifications_and_nterm_mods(): + p = SequenceParsingProcessor(sequence_column_name=SEQ_COLUMN, with_termini=True) + input_data = {SEQ_COLUMN: SEQUENCE_MODIFIED_WITH_N_MOD} + + parsed = p(input_data) + logger.info(parsed) + + assert_parsed_data( + parsed, + ["[UNIMOD:737]-", *PARSED_SEQUENCE_MODIFIED_WITH_N_MOD_NO_TERMINI, "-[]"], + PARSED_SEQUENCE_MODIFIED_WITH_N_MOD_NO_TERMINI, + "[UNIMOD:737]-", + "-[]", + ) + + +def test_sequence_padding_processor_keep(): + length = 5 + p = SequencePaddingProcessor(sequence_column_name=SEQ_COLUMN, max_length=length) + input_data = {SEQ_COLUMN: PARSED_SEQUENCE_UNMODIFIED_NO_TERMINI} + padded = p(input_data) + logger.info(padded) + + assert len(padded[SEQ_COLUMN]) == length + assert padded[SEQ_COLUMN] == ["D", "E", "L", 0, 0] + assert padded[SequencePaddingProcessor.KEEP_COLUMN_NAME] + + +def test_sequence_padding_processor_drop(): + length = 2 + p = SequencePaddingProcessor(sequence_column_name=SEQ_COLUMN, max_length=length) + input_data = {SEQ_COLUMN: PARSED_SEQUENCE_UNMODIFIED_NO_TERMINI} + padded = p(input_data) + logger.info(padded) + + assert padded[SEQ_COLUMN] == PARSED_SEQUENCE_UNMODIFIED_NO_TERMINI[:length] + assert not padded[SequencePaddingProcessor.KEEP_COLUMN_NAME] + + +def test_sequence_encoding_processor(): + pass + + +def test_sequence_encoding_processor_with_fixed_alphabet(): + pass + + +def test_sequence_encoding_processor_with_extend_alphabet_enabled(): + pass + + +def test_sequence_encoding_processor_with_fallback_enabled(): + pass + + +def test_sequence_ptm_removal_processor(): + pass + + +def test_function_processor(): + pass From 5bb4f14a54c8a5152f05424fce1fc75ac51823b4 Mon Sep 17 00:00:00 2001 From: omsh Date: Tue, 1 Oct 2024 09:59:26 +0200 Subject: [PATCH 4/4] version 0.1.3 --- src/dlomix/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dlomix/__init__.py b/src/dlomix/__init__.py index 0d293ee8..4154a85e 100644 --- a/src/dlomix/__init__.py +++ b/src/dlomix/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.1.3dev0" +__version__ = "0.1.3" META_DATA = { "author": "Omar Shouman",
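
The refactoring in PATCH 3/4 replaces the long positional super().__init__(...) calls in RetentionTimeDataset, FragmentIonIntensityDataset and ChargeStateDataset with a single DatasetConfig object built from the constructor's locals(). Below is a minimal, simplified sketch of that pattern, not the full dlomix API: the field set is reduced, defaults and the "rt.parquet" path are illustrative only.

from dataclasses import dataclass, field
from typing import Optional

@dataclass
class DatasetConfig:
    # reduced field set; the real config carries every PeptideDataset argument
    data_source: Optional[str] = None
    sequence_column: str = "sequence"
    max_seq_len: int = 30
    _additional_data: dict = field(default_factory=dict)

class PeptideDataset:
    def __init__(self, config: DatasetConfig):
        # copy every config attribute onto the dataset instance
        self.__dict__.update(**config.__dict__)
        self._config = config

class RetentionTimeDataset(PeptideDataset):
    def __init__(self, data_source=None, sequence_column="sequence", max_seq_len=30):
        # locals() is captured before any other local variable is defined,
        # so it holds exactly the constructor arguments (plus self/__class__)
        kwargs = {k: v for k, v in locals().items() if k not in ["self", "__class__"]}
        super().__init__(DatasetConfig(**kwargs))

rtdata = RetentionTimeDataset(data_source="rt.parquet", max_seq_len=20)
print(rtdata.max_seq_len)  # 20

Note that this pattern only works because locals() is read on the first statement of __init__; defining any helper variable before that call would leak it into the config kwargs.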
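
PATCH 3/4 also gives SequenceEncodingProcessor three encoding behaviours: learning the alphabet from the train/val splits, encoding with a fixed alphabet and an unknown token, and a test-split fallback that maps unseen (amino acid, PTM) tokens to the unmodified residue or plain terminus. The following is a rough stand-alone sketch of those behaviours written as plain functions rather than the processor class; token ids and example alphabets are illustrative.

def encode_learn_vocab(sequence, alphabet):
    # train/val behaviour: unseen tokens are appended to the alphabet
    encoded = []
    for aa in sequence:
        if aa not in alphabet:
            alphabet[aa] = len(alphabet)
        encoded.append(alphabet[aa])
    return encoded

def encode_with_vocab(sequence, alphabet, unknown_token=0):
    # fixed-alphabet behaviour: unseen tokens map to a dedicated unknown token
    return [alphabet.get(aa, unknown_token) for aa in sequence]

def encode_with_vocab_fallback(sequence, alphabet, unknown_token=0):
    # test-split behaviour: unseen modified residues fall back to the
    # unmodified amino acid, unseen terminal mods to the plain terminus
    encoded = []
    for aa in sequence:
        if aa not in alphabet:
            if aa.startswith("["):
                aa = "[]-"
            elif aa.startswith("-["):
                aa = "-[]"
            else:
                aa = aa[0]
        encoded.append(alphabet.get(aa, unknown_token))
    return encoded

learned = {}
print(encode_learn_vocab(["[]-", "D", "E", "L", "-[]"], learned))                   # [0, 1, 2, 3, 4]
fixed = {"[]-": 1, "-[]": 2, "C": 3, "V": 4, "D": 5}
print(encode_with_vocab_fallback(["[]-", "C[UNIMOD:4]", "V", "D", "-[]"], fixed))   # [1, 3, 4, 5, 2]

In the pipeline, the alphabet learned on the train/val splits is copied into extended_alphabet and reused for the test split with extend_alphabet disabled and the fallback enabled; the same patch also changes the Prosit models to size their embedding layer as len(alphabet) rather than len(alphabet) + 2.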