Feature/detectability integration hf updates (#45)
* First push detectability model
* test update notebook 1
* Update on Notebooks
* updated and merged constants
* dataset class
* detectability models
* detectability report
* notebook walkthrough detectability
* notebook branch install
* temp version update
* version
* fix unintended splitting
* updated notebooks
* remove detectability test dataset csv file
* minor changes
* removed unnecessary notebooks
* renamed notebooks
* final review on detectability notebooks

Co-authored-by: naimakg <naimakg@gmail.com>
Showing 19 changed files with 3,027 additions and 159 deletions.
notebooks/Example_Detectability_Model_Walkthrough_prediction_colab.ipynb
768 additions, 0 deletions. Large diffs are not rendered by default.
notebooks/Example_Detectability_Model_Walkthrough_training_and_fine_tuning.ipynb
1,146 additions, 0 deletions. Large diffs are not rendered by default.
pretrained_models/original_detectability_base_model/checkpoint
2 additions, 0 deletions:

```
model_checkpoint_path: "base_attention_model_es_final"
all_model_checkpoint_paths: "base_attention_model_es_final"
```
Binary file added (+1.44 MB): ...s/original_detectability_base_model/original_detectability_base_model.data-00000-of-00001
Binary file added (+4.54 KB): pretrained_models/original_detectability_base_model/original_detectability_base_model.index
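Together, the `checkpoint`, `.data-*`, and `.index` files above form a standard TensorFlow checkpoint. A minimal restoring sketch, assuming the package is importable as `dlomix` and that the pretrained weights match `num_units=64` (both are assumptions, not stated in this diff):

```python
# Sketch: restore the pretrained base model from the checkpoint files above.
# Assumptions (not stated in this diff): the package imports as `dlomix`,
# and the checkpoint was trained with num_units=64.
import tensorflow as tf

from dlomix.models import DetectabilityModel

model = DetectabilityModel(num_units=64)
_ = model(tf.ones((1, 40)))  # build the variables with a dummy batch
model.load_weights(
    "pretrained_models/original_detectability_base_model/original_detectability_base_model"
)
```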
pretrained_models/original_detectability_fine_tuned_model_FINAL/checkpoint
2 additions, 0 deletions:

```
model_checkpoint_path: "fine_tuned_weights_attention_model_FINAL_NON"
all_model_checkpoint_paths: "fine_tuned_weights_attention_model_FINAL_NON"
```
Binary file added (+1.44 MB): ..._fine_tuned_model_FINAL/original_detectability_fine_tuned_model_FINAL.data-00000-of-00001
Binary file added (+4.54 KB): ..._detectability_fine_tuned_model_FINAL/original_detectability_fine_tuned_model_FINAL.index
Version bump in the package metadata file (path not shown in this view):

```diff
@@ -1,4 +1,4 @@
-__version__ = "0.1.3"
+__version__ = "0.1.3dev"
 
 META_DATA = {
     "author": "Omar Shouman",
```
New file, 65 additions (path not shown in this view): the dataset class for detectability prediction.

```python
from typing import Callable, Dict, List, Optional, Union

from ..constants import ALPHABET_UNMOD
from .dataset import PeptideDataset
from .dataset_config import DatasetConfig
from .dataset_utils import EncodingScheme


class DetectabilityDataset(PeptideDataset):
    """
    A dataset class for handling detectability prediction data.

    Args:
        data_source (Optional[Union[str, List]]): The path or list of paths to the data source file(s).
        val_data_source (Optional[Union[str, List]]): The path or list of paths to the validation data source file(s).
        test_data_source (Optional[Union[str, List]]): The path or list of paths to the test data source file(s).
        data_format (str): The format of the data source file(s). Default is "csv".
        sequence_column (str): The name of the column containing the peptide sequences. Default is "Sequences".
        label_column (str): The name of the column containing the class labels. Default is "Classes".
        val_ratio (float): The ratio of validation data to split from the training data. Default is 0.2.
        max_seq_len (Union[int, str]): The maximum length of the peptide sequences. Default is 40.
        dataset_type (str): The type of dataset to use. Default is "tf".
        batch_size (int): The batch size for training and evaluation. Default is 256.
        model_features (Optional[List[str]]): The list of features to use for the model. Default is None.
        dataset_columns_to_keep (Optional[List[str]]): The list of columns to keep in the dataset. Default is ["Proteins"].
        features_to_extract (Optional[List[Union[Callable, str]]]): The list of features to extract from the dataset. Default is None.
        pad (bool): Whether to pad the sequences to the maximum length. Default is True.
        padding_value (int): The value to use for padding. Default is 0.
        alphabet (Dict): The mapping of characters to integers for encoding the sequences. Default is ALPHABET_UNMOD.
        with_termini (bool): Whether to add the N- and C-termini to the sequence column, even if they do not exist. Default is True.
        encoding_scheme (Union[str, EncodingScheme]): The encoding scheme to use for encoding the sequences. Default is EncodingScheme.UNMOD.
        processed (bool): Whether the data has already been preprocessed. Default is False.
        enable_tf_dataset_cache (bool): Whether to enable TensorFlow Dataset caching (calls `.cache()` on the generated TF datasets). Default is False.
        disable_cache (bool): Whether to disable Hugging Face datasets caching. Default is False.
        auto_cleanup_cache (bool): Whether to automatically clean up the Hugging Face datasets cache files. Default is True.
        num_proc (Optional[int]): The number of processes to use for dataset processing. Default is None.
        batch_processing_size (int): The batch size for mapping processing functions over the dataset. Default is 1000.
    """

    def __init__(
        self,
        data_source: Optional[Union[str, List]] = None,
        val_data_source: Optional[Union[str, List]] = None,
        test_data_source: Optional[Union[str, List]] = None,
        data_format: str = "csv",
        sequence_column: str = "Sequences",
        label_column: str = "Classes",
        val_ratio: float = 0.2,
        max_seq_len: Union[int, str] = 40,
        dataset_type: str = "tf",
        batch_size: int = 256,
        model_features: Optional[List[str]] = None,
        dataset_columns_to_keep: Optional[List[str]] = ["Proteins"],
        features_to_extract: Optional[List[Union[Callable, str]]] = None,
        pad: bool = True,
        padding_value: int = 0,
        alphabet: Dict = ALPHABET_UNMOD,
        with_termini: bool = True,
        encoding_scheme: Union[str, EncodingScheme] = EncodingScheme.UNMOD,
        processed: bool = False,
        enable_tf_dataset_cache: bool = False,
        disable_cache: bool = False,
        auto_cleanup_cache: bool = True,
        num_proc: Optional[int] = None,
        batch_processing_size: int = 1000,
    ):
        # Collect all constructor arguments and forward them to the shared
        # PeptideDataset pipeline via a DatasetConfig.
        kwargs = {k: v for k, v in locals().items() if k not in ["self", "__class__"]}
        super().__init__(DatasetConfig(**kwargs))
```
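A minimal usage sketch for the class above. The file name `peptides_train.csv` is hypothetical, and both the import path and the `tensor_train_data`/`tensor_val_data` attribute names are assumptions based on the package's conventions, not shown in this diff:

```python
from dlomix.data import DetectabilityDataset  # import path is an assumption

# "peptides_train.csv" is a hypothetical file with "Sequences",
# "Classes", and "Proteins" columns.
detectability_data = DetectabilityDataset(
    data_source="peptides_train.csv",
    data_format="csv",
    max_seq_len=40,
    label_column="Classes",
    batch_size=256,
)

# Attribute names below follow PeptideDataset conventions (an assumption here).
train_ds = detectability_data.tensor_train_data
val_ds = detectability_data.tensor_val_data
```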
New file, 12 additions (path not shown in this view): detectability constants.

```python
import numpy as np

CLASSES_LABELS = ['Non-Flyer', 'Weak Flyer', 'Intermediate Flyer', 'Strong Flyer']

alphabet = ['0', 'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

aa_to_int_dict = dict((aa, i) for i, aa in enumerate(alphabet))

int_to_aa_dict = dict((i, aa) for i, aa in enumerate(alphabet))

# One-hot vector for the padding symbol '0' (index 0 of the alphabet)
padding_char = np.zeros(len(alphabet))
padding_char[0] = 1
```
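For illustration, a short sketch of how these constants integer-encode a peptide before the model one-hot encodes it. The `encode_peptide` helper is hypothetical, not part of this diff; it uses `aa_to_int_dict` from the constants above:

```python
import numpy as np


def encode_peptide(sequence: str, max_len: int = 40) -> np.ndarray:
    """Map amino acids to alphabet indices, padding with 0 (the '0' symbol)."""
    encoded = np.zeros(max_len)
    for i, aa in enumerate(sequence[:max_len]):
        encoded[i] = aa_to_int_dict[aa]
    return encoded


print(encode_peptide("ACDK", max_len=8))  # [1. 2. 3. 9. 0. 0. 0. 0.]
```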
Models package exports, 2 additions (path not shown in this view): registers the new detectability model. The resulting file:

```python
from .base import *
from .deepLC import *
from .detectability import *
from .prosit import *

__all__ = [
    "RetentionTimePredictor",
    "PrositRetentionTimePredictor",
    "DeepLCRetentionTimePredictor",
    "PrositIntensityPredictor",
    "DetectabilityModel",
]
```
New file, 140 additions (path not shown in this view): the detectability model with its encoder, attention, and decoder layers.

```python
import tensorflow as tf

from ..constants import CLASSES_LABELS, padding_char


class DetectabilityModel(tf.keras.Model):
    def __init__(
        self,
        num_units,
        num_clases=len(CLASSES_LABELS),
        name="autoencoder",
        padding_char=padding_char,
        **kwargs
    ):
        super(DetectabilityModel, self).__init__(name=name, **kwargs)

        self.num_units = num_units
        self.num_clases = num_clases
        self.padding_char = padding_char
        self.alphabet_size = len(padding_char)
        # One-hot encode the integer-encoded sequences on the fly
        self.one_hot_encoder = tf.keras.layers.Lambda(
            lambda x: tf.one_hot(tf.cast(x, "int32"), depth=self.alphabet_size)
        )
        self.encoder = Encoder(self.num_units)
        self.decoder = Decoder(self.num_units, self.num_clases)

    def call(self, inputs):
        onehot_inputs = self.one_hot_encoder(inputs)
        enc_outputs, enc_state_f, enc_state_b = self.encoder(onehot_inputs)

        # Concatenated forward/backward encoder states serve as the decoder query
        dec_outputs = tf.concat([enc_state_f, enc_state_b], axis=-1)

        decoder_inputs = {
            "decoder_outputs": dec_outputs,
            "state_f": enc_state_f,
            "state_b": enc_state_b,
            "encoder_outputs": enc_outputs,
        }

        decoder_output = self.decoder(decoder_inputs)

        return decoder_output


class Encoder(tf.keras.layers.Layer):
    def __init__(self, units, name="encoder", **kwargs):
        super(Encoder, self).__init__(name=name, **kwargs)

        self.units = units

        self.mask_enco = tf.keras.layers.Masking(mask_value=padding_char)

        self.encoder_gru = tf.keras.layers.GRU(
            self.units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer="glorot_uniform",
        )

        self.encoder_bi = tf.keras.layers.Bidirectional(self.encoder_gru)

    def call(self, inputs):
        # Mask padding positions so the bidirectional GRU ignores them
        mask_ = self.mask_enco.compute_mask(inputs)

        mask_bi = self.encoder_bi.compute_mask(inputs, mask_)

        encoder_outputs, encoder_state_f, encoder_state_b = self.encoder_bi(
            inputs, initial_state=None, mask=mask_bi
        )

        return encoder_outputs, encoder_state_f, encoder_state_b


class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units, name="attention_layer", **kwargs):
        super(BahdanauAttention, self).__init__(name=name, **kwargs)
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, inputs):
        query = inputs["query"]
        values = inputs["values"]

        query_with_time_axis = tf.expand_dims(query, axis=1)

        # Additive (Bahdanau) attention scores over the encoder time steps
        scores = self.V(tf.nn.tanh(self.W1(query_with_time_axis) + self.W2(values)))

        attention_weights = tf.nn.softmax(scores, axis=1)

        context_vector = attention_weights * values

        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector


class Decoder(tf.keras.layers.Layer):
    def __init__(self, units, num_classes, name="decoder", **kwargs):
        super(Decoder, self).__init__(name=name, **kwargs)
        self.units = units
        self.num_classes = num_classes

        self.decoder_gru = tf.keras.layers.GRU(
            self.units, return_state=True, recurrent_initializer="glorot_uniform"
        )

        self.attention = BahdanauAttention(self.units)

        self.decoder_bi = tf.keras.layers.Bidirectional(self.decoder_gru)

        self.decoder_dense = tf.keras.layers.Dense(
            self.num_classes, activation=tf.nn.softmax
        )

    def call(self, inputs):
        decoder_outputs = inputs["decoder_outputs"]
        state_f = inputs["state_f"]
        state_b = inputs["state_b"]
        encoder_outputs = inputs["encoder_outputs"]

        states = [state_f, state_b]

        attention_inputs = {"query": decoder_outputs, "values": encoder_outputs}

        context_vector = self.attention(attention_inputs)

        context_vector = tf.expand_dims(context_vector, axis=1)

        x = context_vector

        (
            decoder_outputs,
            decoder_state_forward,
            decoder_state_backward,
        ) = self.decoder_bi(x, initial_state=states)

        # Softmax over the detectability classes
        x = self.decoder_dense(decoder_outputs)
        # x = tf.expand_dims(x, axis = 1)
        return x
```
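A quick forward-pass sketch for the model above on integer-encoded input; `num_units=64` is an assumed value, not taken from this diff:

```python
import tensorflow as tf

model = DetectabilityModel(num_units=64)  # num_units value is an assumption

# Batch of 2 integer-encoded peptides of length 40, filled with index 1 ('A');
# index 0 is the padding symbol.
dummy_batch = tf.ones((2, 40))

predictions = model(dummy_batch)
print(predictions.shape)  # (2, 4): softmax over the four flyer classes
```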