diff --git a/spectrum_fundamentals/metrics/percolator.py b/spectrum_fundamentals/metrics/percolator.py index 3a71fba..78773ec 100644 --- a/spectrum_fundamentals/metrics/percolator.py +++ b/spectrum_fundamentals/metrics/percolator.py @@ -10,6 +10,8 @@ from scipy import interpolate from sklearn.discriminant_analysis import LinearDiscriminantAnalysis +import spectrum_fundamentals.constants as cs + from . import fragments_ratio as fr from . import similarity as sim from .metric import Metric @@ -40,6 +42,7 @@ class Percolator(Metric): FRAGMENTATION: fragmentation method, e.g. HCD, CID RETENTION_TIME: observed retention time PREDICTED_RETENTION_TIME: predicted retention time by Prosit + PROTEINS """ metadata: pd.DataFrame @@ -272,11 +275,16 @@ def add_percolator_metadata_columns(self): self.metrics_val["Label"] = self.target_decoy_labels self.metrics_val["ScanNr"] = self.metadata["SCAN_NUMBER"] self.metrics_val["filename"] = self.metadata["RAW_FILE"] + # self.metrics_val.insert(self.metrics_val.columns.get_loc("filename") + 1, "ExpMass", self.metrics_val.pop("ExpMass")) self.metrics_val["Peptide"] = self.metadata["MODIFIED_SEQUENCE"].apply(lambda x: "_." + x + "._") + # added a variable for proton mass + proton_mass = cs.PARTICLE_MASSES["PROTON"] + # added theorictical/expected (mass/charge) column including the charge - self.metrics_val["Proteins"] = self.metadata[ - "MODIFIED_SEQUENCE" - ] # we don't need the protein ID to get PSM / peptide results, fill with peptide sequence + self.metrics_val["ExpMass"] = ( + self.metadata["CALCULATED_MASS"] + proton_mass * self.metadata["PRECURSOR_CHARGE"] + ) / self.metadata["PRECURSOR_CHARGE"] + self.metrics_val["Proteins"] = self.metadata["PROTEINS"] def apply_lda_and_get_indices_below_fdr( self, initial_scoring_feature: str = "spectral_angle", fdr_cutoff: float = 0.01 @@ -371,7 +379,7 @@ def fdrs_to_qvals(fdrs: np.ndarray) -> np.ndarray: def _reorder_columns_for_percolator(self): all_columns = self.metrics_val.columns - first_columns = ["SpecId", "Label", "ScanNr", "filename"] + first_columns = ["SpecId", "Label", "ScanNr", "filename", "ExpMass"] last_columns = ["Peptide", "Proteins"] mid_columns = list(set(all_columns) - set(first_columns) - set(last_columns)) new_columns = first_columns + sorted(mid_columns) + last_columns