From ec0e3edc9c63dbce272744da214610447d3850c0 Mon Sep 17 00:00:00 2001 From: RobbinBouwmeester Date: Wed, 15 Nov 2023 10:13:17 +0100 Subject: [PATCH] Update deeplc.py --- deeplc/deeplc.py | 500 ++++++++++++++++++++++++++++++----------------- 1 file changed, 319 insertions(+), 181 deletions(-) diff --git a/deeplc/deeplc.py b/deeplc/deeplc.py index 15cb280..5c2c29e 100644 --- a/deeplc/deeplc.py +++ b/deeplc/deeplc.py @@ -12,8 +12,7 @@ __credits__ = [ "Robbin Bouwmeester", "Ralf Gabriels", - "Arthur Declercq" - "Lennart Martens", + "Arthur Declercq" "Lennart Martens", "Sven Degroeve", ] @@ -26,7 +25,7 @@ DEFAULT_MODELS = [ "mods/full_hc_PXD005573_mcp_1fd8363d9af9dcad3be7553c39396960.hdf5", "mods/full_hc_PXD005573_mcp_8c22d89667368f2f02ad996469ba157e.hdf5", - "mods/full_hc_PXD005573_mcp_cb975cfdd4105f97efa0b3afffe075cc.hdf5" + "mods/full_hc_PXD005573_mcp_cb975cfdd4105f97efa0b3afffe075cc.hdf5", ] DEFAULT_MODELS = [os.path.join(deeplc_dir, dm) for dm in DEFAULT_MODELS] @@ -58,8 +57,8 @@ warnings.filterwarnings("ignore", category=UserWarning) # Supress warnings (or at least try...) -logging.getLogger('tensorflow').setLevel(logging.ERROR) -os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' +logging.getLogger("tensorflow").setLevel(logging.ERROR) +os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" import numpy as np import pandas as pd @@ -114,23 +113,30 @@ from deeplc.feat_extractor import FeatExtractor from pygam import LinearGAM, s + def warn(*args, **kwargs): pass + + import warnings + warnings.warn = warn -warnings.filterwarnings('ignore', category=DeprecationWarning) -warnings.filterwarnings('ignore', category=FutureWarning) +warnings.filterwarnings("ignore", category=DeprecationWarning) +warnings.filterwarnings("ignore", category=FutureWarning) logger = logging.getLogger(__name__) + def split_list(a, n): k, m = divmod(len(a), n) - return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + return (a[i * k + min(i, m) : (i + 1) * k + min(i + 1, m)] for i in range(n)) + def divide_chunks(l, n): - for i in range(0, len(l), n): - yield l[i:i + n] + for i in range(0, len(l), n): + yield l[i : i + n] + def reset_keras(): """Reset Keras session.""" @@ -216,7 +222,7 @@ def __init__( deeplc_retrain=False, predict_ccs=False, n_epochs=20, - ): + ): # if a config file is defined overwrite standard parameters if config_file: cparser = ConfigParser() @@ -320,19 +326,23 @@ def do_f_extraction(self, seqs, mods, identifiers, charges=[]): list_of_psms = [] if not self.predict_ccs: - for seq,mod,ident in zip(seqs,mods,identifiers): - list_of_psms.append(PSM(peptide=peprec_to_proforma(seq,mod),spectrum_id=ident)) + for seq, mod, ident in zip(seqs, mods, identifiers): + list_of_psms.append( + PSM(peptide=peprec_to_proforma(seq, mod), spectrum_id=ident) + ) else: - for seq,mod,ident,z in zip(seqs,mods,identifiers,charges): - list_of_psms.append(PSM(peptide=peprec_to_proforma(seq,mod,z),spectrum_id=ident)) + for seq, mod, ident, z in zip(seqs, mods, identifiers, charges): + list_of_psms.append( + PSM(peptide=peprec_to_proforma(seq, mod, z), spectrum_id=ident) + ) psm_list = PSMList(psm_list=list_of_psms) - return self.f_extractor.full_feat_extract(psm_list,predict_ccs=self.predict_ccs) + return self.f_extractor.full_feat_extract( + psm_list, predict_ccs=self.predict_ccs + ) - def do_f_extraction_pd(self, - df_instances, - charges=[]): + def do_f_extraction_pd(self, df_instances, charges=[]): """ Extract all features we can extract; without parallelization; use if you want to run feature extraction with a single thread; and use a @@ -352,15 +362,31 @@ def do_f_extraction_pd(self, list_of_psms = [] if len(charges) == 0: - for seq,mod,ident in zip(df_instances["seq"],df_instances["modifications"],df_instances.index): - list_of_psms.append(PSM(peptide=peprec_to_proforma(seq,mod),spectrum_id=ident)) + for seq, mod, ident in zip( + df_instances["seq"], df_instances["modifications"], df_instances.index + ): + list_of_psms.append( + PSM(peptide=peprec_to_proforma(seq, mod), spectrum_id=ident) + ) else: - for seq,mod,ident,z in zip(df_instances["seq"],df_instances["modifications"],df_instances.index,charges=df_instances["charges"]): - list_of_psms.append(PSM(peptide=peprec_to_proforma(seq,mod,charge=z),spectrum_id=ident)) + for seq, mod, ident, z in zip( + df_instances["seq"], + df_instances["modifications"], + df_instances.index, + charges=df_instances["charges"], + ): + list_of_psms.append( + PSM( + peptide=peprec_to_proforma(seq, mod, charge=z), + spectrum_id=ident, + ) + ) psm_list = PSMList(psm_list=list_of_psms) - return self.f_extractor.full_feat_extract(psm_list,predict_ccs=self.predict_ccs) + return self.f_extractor.full_feat_extract( + psm_list, predict_ccs=self.predict_ccs + ) def do_f_extraction_pd_parallel(self, df_instances): """ @@ -379,32 +405,26 @@ def do_f_extraction_pd_parallel(self, df_instances): pd.DataFrame feature matrix """ - #self.n_jobs = 1 + # self.n_jobs = 1 - df_instances_split = np.array_split(df_instances, math.ceil(self.n_jobs/4.0)) + df_instances_split = np.array_split(df_instances, math.ceil(self.n_jobs / 4.0)) if multiprocessing.current_process().daemon: logger.warning( "DeepLC is running in a daemon process. Disabling multiprocessing as daemonic processes can't have children." ) pool = multiprocessing.dummy.Pool(1) else: - pool = multiprocessing.Pool(math.ceil(self.n_jobs/4.0)) + pool = multiprocessing.Pool(math.ceil(self.n_jobs / 4.0)) if self.n_jobs == 1: df = self.do_f_extraction_pd(df_instances) else: - df = pd.concat( - pool.map( - self.do_f_extraction_pd, - df_instances_split)) + df = pd.concat(pool.map(self.do_f_extraction_pd, df_instances_split)) pool.close() pool.join() return df - def do_f_extraction_psm_list( - self, - psm_list - ): + def do_f_extraction_psm_list(self, psm_list): """ Extract all features we can extract; without parallelization; use if you want to run feature extraction with a single thread; and use a @@ -421,12 +441,11 @@ def do_f_extraction_psm_list( pd.DataFrame feature matrix """ - return self.f_extractor.full_feat_extract(psm_list,predict_ccs=self.predict_ccs) + return self.f_extractor.full_feat_extract( + psm_list, predict_ccs=self.predict_ccs + ) - def do_f_extraction_psm_list_parallel( - self, - psm_list - ): + def do_f_extraction_psm_list_parallel(self, psm_list): """ Extract all features we can extract; without parallelization; use if you want to run feature extraction with a single thread; and use a @@ -444,10 +463,12 @@ def do_f_extraction_psm_list_parallel( feature matrix """ # TODO for multiproc I am still expecting a pd dataframe, this is not the case anymore, they are dicts - self.n_jobs = 1 + # self.n_jobs = 1 logger.debug("prepare feature extraction") if multiprocessing.current_process().daemon: - logger.warning("DeepLC is running in a daemon process. Disabling multiprocessing as daemonic processes can't have children.") + logger.warning( + "DeepLC is running in a daemon process. Disabling multiprocessing as daemonic processes can't have children." + ) psm_list_split = split_list(psm_list, self.n_jobs) pool = multiprocessing.dummy.Pool(1) elif self.n_jobs > 1: @@ -461,8 +482,8 @@ def do_f_extraction_psm_list_parallel( else: logger.debug("start feature extraction") all_feats_async = pool.map_async( - self.do_f_extraction_psm_list, - psm_list_split) + self.do_f_extraction_psm_list, psm_list_split + ) logger.debug("wait for feature extraction") all_feats_async.wait() @@ -475,7 +496,7 @@ def do_f_extraction_psm_list_parallel( return all_feats - def calibration_core(self,uncal_preds,cal_dict,cal_min,cal_max): + def calibration_core(self, uncal_preds, cal_dict, cal_min, cal_max): cal_preds = [] if len(uncal_preds) == 0: return np.array(cal_preds) @@ -500,32 +521,39 @@ def calibration_core(self,uncal_preds,cal_dict,cal_min,cal_max): cal_preds.append(slope * (uncal_pred) + intercept) return np.array(cal_preds) - def make_preds_core_library(self, - psm_list=[], - calibrate=True, - mod_name=None - ): + def make_preds_core_library(self, psm_list=[], calibrate=True, mod_name=None): ret_preds = [] for psm in psm_list: - ret_preds.append(LIBRARY[psm.peptidoform.proforma+"|"+mod_name]) + ret_preds.append(LIBRARY[psm.peptidoform.proforma + "|" + mod_name]) if calibrate: try: - ret_preds = self.calibration_core(ret_preds,self.calibrate_dict[mod_name],self.calibrate_min[mod_name],self.calibrate_max[mod_name]) + ret_preds = self.calibration_core( + ret_preds, + self.calibrate_dict[mod_name], + self.calibrate_min[mod_name], + self.calibrate_max[mod_name], + ) except: - ret_preds = self.calibration_core(ret_preds,self.calibrate_dict,self.calibrate_min,self.calibrate_max) - + ret_preds = self.calibration_core( + ret_preds, + self.calibrate_dict, + self.calibrate_min, + self.calibrate_max, + ) + return ret_preds - def make_preds_core(self, - X=[], - X_sum=[], - X_global=[], - X_hc=[], - psm_list=[], - calibrate=True, - mod_name=None - ): + def make_preds_core( + self, + X=[], + X_sum=[], + X_global=[], + X_hc=[], + psm_list=[], + calibrate=True, + mod_name=None, + ): """ Make predictions for sequences Parameters @@ -561,13 +589,17 @@ def make_preds_core(self, if len(X) == 0 and len(psm_list) > 0: if self.verbose: logger.debug("Extracting features for the CNN model ...") - #X = self.do_f_extraction_psm_list(psm_list) + # X = self.do_f_extraction_psm_list(psm_list) X = self.do_f_extraction_psm_list_parallel(psm_list) X_sum = np.stack(list(X["matrix_sum"].values())) - X_global = np.concatenate((np.stack(list(X["matrix_all"].values())), - np.stack(list(X["pos_matrix"].values()))), - axis=1) + X_global = np.concatenate( + ( + np.stack(list(X["matrix_all"].values())), + np.stack(list(X["pos_matrix"].values())), + ), + axis=1, + ) X_hc = np.stack(list(X["matrix_hc"].values())) X = np.stack(list(X["matrix"].values())) elif len(X) == 0 and len(psm_list) == 0: @@ -575,34 +607,39 @@ def make_preds_core(self, ret_preds = [] - mod = load_model( - mod_name, - custom_objects={'': lrelu} - ) + mod = load_model(mod_name, custom_objects={"": lrelu}) try: X ret_preds = mod.predict( - [X, X_sum, X_global, X_hc], batch_size=self.batch_num_tf).flatten() + [X, X_sum, X_global, X_hc], batch_size=self.batch_num_tf + ).flatten() except UnboundLocalError: logger.debug("X is empty, skipping...") ret_preds = [] if calibrate: try: - ret_preds = self.calibration_core(ret_preds,self.calibrate_dict[mod_name],self.calibrate_min[mod_name],self.calibrate_max[mod_name]) + ret_preds = self.calibration_core( + ret_preds, + self.calibrate_dict[mod_name], + self.calibrate_min[mod_name], + self.calibrate_max[mod_name], + ) except: - ret_preds = self.calibration_core(ret_preds,self.calibrate_dict,self.calibrate_min,self.calibrate_max) - + ret_preds = self.calibration_core( + ret_preds, + self.calibrate_dict, + self.calibrate_min, + self.calibrate_max, + ) + clear_session() gc.collect() return ret_preds - def make_preds(self, - psm_list=None, - infile="", - calibrate=True, - seq_df=None, - mod_name=None): + def make_preds( + self, psm_list=None, infile="", calibrate=True, seq_df=None, mod_name=None + ): """ Make predictions for sequences, in batches if required. @@ -634,17 +671,37 @@ def make_preds(self, if type(seq_df) == pd.core.frame.DataFrame: list_of_psms = [] if self.predict_ccs: - for seq,mod,ident,z in zip(seq_df["seq"],seq_df["modifications"],seq_df.index,seq_df["charge"]): - list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod,charge=z),spectrum_id=ident)) + for seq, mod, ident, z in zip( + seq_df["seq"], + seq_df["modifications"], + seq_df.index, + seq_df["charge"], + ): + list_of_psms.append( + PSM( + peptidoform=peprec_to_proforma(seq, mod, charge=z), + spectrum_id=ident, + ) + ) else: - for seq,mod,ident in zip(seq_df["seq"],seq_df["modifications"],seq_df.index): - list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident)) + for seq, mod, ident in zip( + seq_df["seq"], seq_df["modifications"], seq_df.index + ): + list_of_psms.append( + PSM(peptidoform=peprec_to_proforma(seq, mod), spectrum_id=ident) + ) psm_list = PSMList(psm_list=list_of_psms) - + if len(infile) > 0: psm_list = read_file(infile) if "msms" in infile and ".txt" in infile: - mapper = pd.read_csv(os.path.join(os.path.dirname(os.path.realpath(__file__)), "unimod/map_mq_file.csv"),index_col=0)["value"].to_dict() + mapper = pd.read_csv( + os.path.join( + os.path.dirname(os.path.realpath(__file__)), + "unimod/map_mq_file.csv", + ), + index_col=0, + )["value"].to_dict() psm_list.rename_modifications(mapper) ret_preds_batches = [] @@ -656,78 +713,113 @@ def make_preds(self, X = self.do_f_extraction_psm_list_parallel(psm_list_t) X_sum = np.stack(list(X["matrix_sum"].values())) - X_global = np.concatenate((np.stack(list(X["matrix_all"].values())), - np.stack(list(X["pos_matrix"].values()))), - axis=1) + X_global = np.concatenate( + ( + np.stack(list(X["matrix_all"].values())), + np.stack(list(X["pos_matrix"].values())), + ), + axis=1, + ) X_hc = np.stack(list(X["matrix_hc"].values())) X = np.stack(list(X["matrix"].values())) else: return [] if isinstance(self.model, dict): - for m_group_name,m_name in self.model.items(): - ret_preds.append(self.make_preds_core(X=X, - X_sum=X_sum, - X_global=X_global, - X_hc=X_hc, - calibrate=calibrate, - mod_name=m_name)) - ret_preds = np.array([sum(a)/len(a) for a in zip(*ret_preds)]) + for m_group_name, m_name in self.model.items(): + ret_preds.append( + self.make_preds_core( + X=X, + X_sum=X_sum, + X_global=X_global, + X_hc=X_hc, + calibrate=calibrate, + mod_name=m_name, + ) + ) + ret_preds = np.array([sum(a) / len(a) for a in zip(*ret_preds)]) elif mod_name is not None: - ret_preds = self.make_preds_core(X=X, - X_sum=X_sum, - X_global=X_global, - X_hc=X_hc, - calibrate=calibrate, - mod_name=mod_name) + ret_preds = self.make_preds_core( + X=X, + X_sum=X_sum, + X_global=X_global, + X_hc=X_hc, + calibrate=calibrate, + mod_name=mod_name, + ) elif isinstance(self.model, list): for m_name in self.model: - ret_preds.append(self.make_preds_core(X=X, - X_sum=X_sum, - X_global=X_global, - X_hc=X_hc, - calibrate=calibrate, - mod_name=m_name)) - ret_preds = np.array([sum(a)/len(a) for a in zip(*ret_preds)]) + ret_preds.append( + self.make_preds_core( + X=X, + X_sum=X_sum, + X_global=X_global, + X_hc=X_hc, + calibrate=calibrate, + mod_name=m_name, + ) + ) + ret_preds = np.array([sum(a) / len(a) for a in zip(*ret_preds)]) else: - ret_preds = self.make_preds_core(X=X, - X_sum=X_sum, - X_global=X_global, - X_hc=X_hc, - calibrate=calibrate, - mod_name=self.model) + ret_preds = self.make_preds_core( + X=X, + X_sum=X_sum, + X_global=X_global, + X_hc=X_hc, + calibrate=calibrate, + mod_name=self.model, + ) ret_preds_batches.extend(ret_preds) return ret_preds_batches # TODO make this multithreaded # should be possible with the batched list - def calibrate_preds_func_pygam(self, - psm_list=None, - correction_factor=1.0, - seq_df=None, - measured_tr=None, - use_median=True, - mod_name=None): + def calibrate_preds_func_pygam( + self, + psm_list=None, + correction_factor=1.0, + seq_df=None, + measured_tr=None, + use_median=True, + mod_name=None, + ): # TODO make a df to psm_list function # TODO make sure either psm_list or seq_df is supplied if type(seq_df) == pd.core.frame.DataFrame: list_of_psms = [] # TODO include charge here if self.predict_ccs: - for seq,mod,ident,tr,z in zip(seq_df["seq"],seq_df["modifications"],seq_df.index,seq_df["tr"],seq_df["charge"]): - list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod,charge=z),spectrum_id=ident,retention_time=tr)) + for seq, mod, ident, tr, z in zip( + seq_df["seq"], + seq_df["modifications"], + seq_df.index, + seq_df["tr"], + seq_df["charge"], + ): + list_of_psms.append( + PSM( + peptidoform=peprec_to_proforma(seq, mod, charge=z), + spectrum_id=ident, + retention_time=tr, + ) + ) else: - for seq,mod,ident,tr in zip(seq_df["seq"],seq_df["modifications"],seq_df.index,seq_df["tr"]): - list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident,retention_time=tr)) + for seq, mod, ident, tr in zip( + seq_df["seq"], seq_df["modifications"], seq_df.index, seq_df["tr"] + ): + list_of_psms.append( + PSM( + peptidoform=peprec_to_proforma(seq, mod), + spectrum_id=ident, + retention_time=tr, + ) + ) psm_list = PSMList(psm_list=list_of_psms) measured_tr = [psm.retention_time for psm in psm_list] - predicted_tr = self.make_preds( - psm_list, - calibrate=False, - mod_name=mod_name) + predicted_tr = self.make_preds(psm_list, calibrate=False, mod_name=mod_name) # sort two lists, predicted and observed based on measured tr tr_sort = [ @@ -747,12 +839,14 @@ def calibrate_preds_func_pygam(self, calibrate_max = max(predicted_tr) return calibrate_min, calibrate_max, gam_model_cv - def calibrate_preds_func(self, - psm_list=None, - correction_factor=1.0, - seq_df=None, - use_median=True, - mod_name=None): + def calibrate_preds_func( + self, + psm_list=None, + correction_factor=1.0, + seq_df=None, + use_median=True, + mod_name=None, + ): """ Make calibration curve for predictions @@ -797,19 +891,36 @@ def calibrate_preds_func(self, list_of_psms = [] # TODO include charge here if self.predict_ccs: - for seq,mod,tr,ident,z in zip(seq_df["seq"],seq_df["modifications"],seq_df["tr"],seq_df.index,seq_df["charge"]): - list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod,charge=z),spectrum_id=ident,retention_time=tr)) + for seq, mod, tr, ident, z in zip( + seq_df["seq"], + seq_df["modifications"], + seq_df["tr"], + seq_df.index, + seq_df["charge"], + ): + list_of_psms.append( + PSM( + peptidoform=peprec_to_proforma(seq, mod, charge=z), + spectrum_id=ident, + retention_time=tr, + ) + ) else: - for seq,mod,tr,ident in zip(seq_df["seq"],seq_df["modifications"],seq_df["tr"],seq_df.index): - list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident,retention_time=tr)) + for seq, mod, tr, ident in zip( + seq_df["seq"], seq_df["modifications"], seq_df["tr"], seq_df.index + ): + list_of_psms.append( + PSM( + peptidoform=peprec_to_proforma(seq, mod), + spectrum_id=ident, + retention_time=tr, + ) + ) psm_list = PSMList(psm_list=list_of_psms) - + measured_tr = [psm.retention_time for psm in psm_list] - predicted_tr = self.make_preds( - psm_list, - calibrate=False, - mod_name=mod_name) + predicted_tr = self.make_preds(psm_list, calibrate=False, mod_name=mod_name) # sort two lists, predicted and observed based on measured tr tr_sort = [ @@ -905,17 +1016,19 @@ def calibrate_preds_func(self, return calibrate_min, calibrate_max, calibrate_dict - def calibrate_preds(self, - psm_list=None, - infile="", - measured_tr=[], - correction_factor=1.0, - location_retraining_models="", - psm_utils_obj=None, - sample_for_calibration_curve=None, - seq_df=None, - use_median=True, - return_plotly_report=False): + def calibrate_preds( + self, + psm_list=None, + infile="", + measured_tr=[], + correction_factor=1.0, + location_retraining_models="", + psm_utils_obj=None, + sample_for_calibration_curve=None, + seq_df=None, + use_median=True, + return_plotly_report=False, + ): """ Find best model and calibrate. @@ -947,22 +1060,48 @@ def calibrate_preds(self, if type(seq_df) == pd.core.frame.DataFrame: list_of_psms = [] if self.predict_ccs: - for seq,mod,ident,tr,z in zip(seq_df["seq"],seq_df["modifications"],seq_df.index,seq_df["tr"],seq_df["charge"]): - list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod,charge=z),spectrum_id=ident,retention_time=tr)) + for seq, mod, ident, tr, z in zip( + seq_df["seq"], + seq_df["modifications"], + seq_df.index, + seq_df["tr"], + seq_df["charge"], + ): + list_of_psms.append( + PSM( + peptidoform=peprec_to_proforma(seq, mod, charge=z), + spectrum_id=ident, + retention_time=tr, + ) + ) else: - for seq,mod,ident,tr in zip(seq_df["seq"],seq_df["modifications"],seq_df.index,seq_df["tr"]): - list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident,retention_time=tr)) + for seq, mod, ident, tr in zip( + seq_df["seq"], seq_df["modifications"], seq_df.index, seq_df["tr"] + ): + list_of_psms.append( + PSM( + peptidoform=peprec_to_proforma(seq, mod), + spectrum_id=ident, + retention_time=tr, + ) + ) psm_list = PSMList(psm_list=list_of_psms) elif psm_utils_obj: - psm_list = psm_utils_obj + psm_list = psm_utils_obj if isinstance(self.model, str): self.model = [self.model] - + if len(infile) > 0: psm_list = read_file(infile) if "msms" in infile and ".txt" in infile: - mapper = pd.read_csv(os.path.join(os.path.dirname(os.path.realpath(__file__)), "unimod/map_mq_file.csv"),index_col=0)["value"].to_dict() + mapper = pd.read_csv( + os.path.join( + os.path.dirname(os.path.realpath(__file__)), + "unimod/map_mq_file.csv", + ), + index_col=0, + )["value"].to_dict() psm_list.rename_modifications(mapper) measured_tr = [psm.retention_time for psm in psm_list] @@ -970,11 +1109,8 @@ def calibrate_preds(self, if self.verbose: logger.debug("Start to calibrate predictions ...") if self.verbose: + logger.debug("Ready to find the best model out of: %s" % (self.model)) - logger.debug( - "Ready to find the best model out of: %s" % - (self.model)) - best_perf = float("inf") best_calibrate_min = 0.0 best_calibrate_max = 0.0 @@ -1008,7 +1144,7 @@ def calibrate_preds(self, # Here we will apply transfer learning we specify previously trained models in the 'mods_transfer_learning' models = deeplcretrainer.retrain( - {"deeplc_transferlearn":psm_list}, + {"deeplc_transferlearn": psm_list}, outpath=t_dir_models, mods_transfer_learning=self.model, freeze_layers=True, @@ -1052,13 +1188,10 @@ def calibrate_preds(self, if type(self.calibrate_dict) == dict: if len(self.calibrate_dict.keys()) == 0: continue - + m_name = m.split("/")[-1] - preds = self.make_preds(psm_list, - calibrate=True, - seq_df=seq_df, - mod_name=m) + preds = self.make_preds(psm_list, calibrate=True, seq_df=seq_df, mod_name=m) if self.deepcallc_mod: m_group_name = "deepcallc" @@ -1120,21 +1253,26 @@ def calibrate_preds(self, self.model = best_model if self.deepcallc_mod: - self.deepcallc_model = train_en(pd.DataFrame(pred_dict["deepcallc"]),seq_df["tr"]) + self.deepcallc_model = train_en( + pd.DataFrame(pred_dict["deepcallc"]), seq_df["tr"] + ) - self.n_jobs = 1 + # self.n_jobs = 1 logger.debug("Model with the best performance got selected: %s" % (best_model)) if return_plotly_report: import deeplc.plot + plotly_return_dict = {} plotly_df = pd.DataFrame( - list(zip(temp_obs,temp_pred)), - columns=["Observed retention time","Predicted retention time"] - ) + list(zip(temp_obs, temp_pred)), + columns=["Observed retention time", "Predicted retention time"], + ) plotly_return_dict["scatter"] = deeplc.plot.scatter(plotly_df) - plotly_return_dict["baseline_dist"] = deeplc.plot.distribution_baseline(plotly_df) + plotly_return_dict["baseline_dist"] = deeplc.plot.distribution_baseline( + plotly_df + ) return plotly_return_dict return {}