Merge pull request #135 from wilhelm-lab/patch/0.7.3

Patch/0.7.3
wilhelm-lab · Aug 13, 2024 · 040d3c5 · 040d3c5
2 parents 6f0465f + 9a8c0ae
commit 040d3c5
Show file tree

Hide file tree

Showing 18 changed files with 2,500 additions and 227 deletions.
diff --git a/.cookietemple.yml b/.cookietemple.yml
@@ -15,5 +15,5 @@ full_name: Victor Giurcoiu
 email: victor.giurcoiu@tum.de
 project_name: spectrum_fundamentals
 project_short_description: Fundamentals public repo
-version: 0.7.2
+version: 0.7.3
 license: MIT
diff --git a/.github/release-drafter.yml b/.github/release-drafter.yml
@@ -1,5 +1,5 @@
-name-template: "0.7.2 🌈" # <<COOKIETEMPLE_FORCE_BUMP>>
-tag-template: 0.7.2 # <<COOKIETEMPLE_FORCE_BUMP>>
+name-template: "0.7.3 🌈" # <<COOKIETEMPLE_FORCE_BUMP>>
+tag-template: 0.7.3 # <<COOKIETEMPLE_FORCE_BUMP>>
 exclude-labels:
     - "skip-changelog"
 

diff --git a/cookietemple.cfg b/cookietemple.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.7.2
+current_version = 0.7.3
 
 [bumpversion_files_whitelisted]
 init_file = spectrum_fundamentals/__init__.py

diff --git a/docs/conf.py b/docs/conf.py
@@ -52,9 +52,9 @@
 # the built documents.
 #
 # The short X.Y version.
-version = "0.7.2"
+version = "0.7.3"
 # The full version, including alpha/beta/rc tags.
-release = "0.7.2"
+release = "0.7.3"
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "spectrum_fundamentals"
-version = "0.7.2"  # <<COOKIETEMPLE_FORCE_BUMP>>
+version = "0.7.3"  # <<COOKIETEMPLE_FORCE_BUMP>>
 description = "Fundamental functions, annotation pipeline and constants for oktoberfest"
 authors = ["Wilhelmlab at Technical University of Munich"]
 license = "MIT"

diff --git a/spectrum_fundamentals/__init__.py b/spectrum_fundamentals/__init__.py
@@ -2,7 +2,7 @@
 
 __author__ = "Mario Picciani"
 __email__ = "mario.picciani@tum.de"
-__version__ = "0.7.2"
+__version__ = "0.7.3"
 
 import logging
 import logging.handlers

diff --git a/spectrum_fundamentals/__main__.py b/spectrum_fundamentals/__main__.py
@@ -5,7 +5,7 @@
 
 
 @click.command()
-@click.version_option(version="0.7.2", message=click.style("spectrum_fundamentals Version: 0.7.2"))
+@click.version_option(version="0.7.3", message=click.style("spectrum_fundamentals Version: 0.7.3"))
 def main() -> None:
     """spectrum_fundamentals."""
 

diff --git a/spectrum_fundamentals/annotation/annotation.py b/spectrum_fundamentals/annotation/annotation.py
@@ -322,7 +322,7 @@ def generate_annotation_matrix(
     exp_mass_col = matched_peaks.columns.get_loc("exp_mass")
 
     for peak in matched_peaks.values:
-        ion_type_index = ion_types.index(peak[ion_type][0])
+        ion_type_index = ion_types.index(peak[ion_type].split("-", 1)[0])
         peak_pos = ((peak[no_col] - 1) * charge_const * len(ion_types)) + (peak[charge_col] - 1) + 3 * ion_type_index
 
         if peak_pos >= constants.VEC_LENGTH:

diff --git a/spectrum_fundamentals/constants.py b/spectrum_fundamentals/constants.py
@@ -17,6 +17,8 @@
 VEC_LENGTH_CMS2 = (SEQ_LEN - 1) * 2 * 3 * 2
 # peptide of length 30 can have 29 b, y, b_short, y_short, b_long and y_long ions, each with charge 1+, 2+ and 3+
 # we do not annotate fragments with charge 3+. All fragments with charge 3+ convert to -1
+
+
 #############
 # ALPHABETS #
 #############
@@ -380,36 +382,89 @@
     "[UNIMOD:35]": "[Oxidation (O)]",
 }
 
-FRAGMENTATION_ENCODING = {"HCD": 2, "CID": 1}
+FRAGMENTATION_ENCODING = {
+    "CID": 1,
+    "HCD": 2,
+    "ETD": 3,
+    "ETHCD": 4,
+    "ETCID": 5,
+    "UVPD": 6,
+    "EID": 7,
+    "ECD": 8,
+    "AIECD": 9,
+}
+
+########################
+# RESCORING PARAMETERS #
+########################
+
+
+class RescoreType(Enum):
+    """Class for rescoring types."""
+
+    PROSIT = "prosit"
+    ANDROMEDA = "andromeda"
+
+
+#############
+# ION TYPES #
+#############
+FORWARD_IONS = ["a", "b", "c"]
+BACKWARDS_IONS = ["x", "y", "z", "z_r"]  # z_r is z plus one hydrogen mass (see ION_DELTAS)
+IONS = FORWARD_IONS + BACKWARDS_IONS
+
+FRAGMENTATION_TO_IONS_BY_PAIRS = {
+    "HCD": [BACKWARDS_IONS[1], FORWARD_IONS[1]],  # y,b
+    "CID": [BACKWARDS_IONS[1], FORWARD_IONS[1]],  # y,b
+    "ETD": [BACKWARDS_IONS[-1], FORWARD_IONS[2]],  # z_r,c
+    "ECD": [BACKWARDS_IONS[-1], FORWARD_IONS[2]],  # z_r,c
+    "ETHCD": [BACKWARDS_IONS[1], FORWARD_IONS[1], BACKWARDS_IONS[-1], FORWARD_IONS[2]],  # y,b,z_r,c
+    "ETCID": [BACKWARDS_IONS[1], FORWARD_IONS[1], BACKWARDS_IONS[-1], FORWARD_IONS[2]],  # y,b,z_r,c
+    "UVPD": [
+        BACKWARDS_IONS[0],
+        FORWARD_IONS[0],
+        BACKWARDS_IONS[1],
+        FORWARD_IONS[1],
+        BACKWARDS_IONS[2],
+        FORWARD_IONS[2],
+    ],  # x,a,y,b,z,c
+}
+
+FRAGMENTATION_TO_IONS_BY_DIRECTION = {
+    "HCD": [BACKWARDS_IONS[1], FORWARD_IONS[1]],  # y,b
+    "CID": [BACKWARDS_IONS[1], FORWARD_IONS[1]],  # y,b
+    "ETD": [BACKWARDS_IONS[-1], FORWARD_IONS[2]],  # z_r,c
+    "ECD": [BACKWARDS_IONS[-1], FORWARD_IONS[2]],  # z_r,c
+    "ETHCD": [BACKWARDS_IONS[1], BACKWARDS_IONS[-1]] + FORWARD_IONS[1:],  # y,z_r,b,c
+    "ETCID": [BACKWARDS_IONS[1], BACKWARDS_IONS[-1]] + FORWARD_IONS[1:],  # y,z_r,b,c
+    "UVPD": BACKWARDS_IONS[:-1] + FORWARD_IONS,  # x,y,z,a,b,c
+}
+
+ION_DELTAS = {
+    "a": -ATOM_MASSES["O"] - ATOM_MASSES["C"],
+    "b": 0.0,
+    "c": 3 * ATOM_MASSES["H"] + ATOM_MASSES["N"],
+    "x": 2 * ATOM_MASSES["O"] + ATOM_MASSES["C"],
+    "y": ATOM_MASSES["O"] + 2 * ATOM_MASSES["H"],
+    "z": ATOM_MASSES["O"] - ATOM_MASSES["N"] - ATOM_MASSES["H"],
+    "z_r": ATOM_MASSES["O"] - ATOM_MASSES["N"],
+}
 
 ############################
 # GENERATION OF ANNOTATION #
 ############################
 
-IONS = ["y", "b"]  # limited to single character unicode string when array is created
 CHARGES = [1, 2, 3]  # limited to uint8 (0-255) when array is created
 POSITIONS = [x for x in range(1, 30)]  # fragment numbers 1-29 -- limited to uint8 (0-255) when array is created
 
 ANNOTATION_FRAGMENT_TYPE = []
 ANNOTATION_FRAGMENT_CHARGE = []
 ANNOTATION_FRAGMENT_NUMBER = []
 for pos in POSITIONS:
-    for ion in IONS:
+    for ion in FRAGMENTATION_TO_IONS_BY_DIRECTION["HCD"]:
         for charge in CHARGES:
             ANNOTATION_FRAGMENT_TYPE.append(ion)
             ANNOTATION_FRAGMENT_CHARGE.append(charge)
             ANNOTATION_FRAGMENT_NUMBER.append(pos)
 
 ANNOTATION = [ANNOTATION_FRAGMENT_TYPE, ANNOTATION_FRAGMENT_CHARGE, ANNOTATION_FRAGMENT_NUMBER]
-
-
-########################
-# RESCORING PARAMETERS #
-########################
-
-
-class RescoreType(Enum):
-    """Class for rescoring types."""
-
-    PROSIT = "prosit"
-    ANDROMEDA = "andromeda"
diff --git a/spectrum_fundamentals/fragments.py b/spectrum_fundamentals/fragments.py
@@ -1,12 +1,14 @@
+import itertools
 import logging
 import re
 from operator import itemgetter
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Literal, Optional, Tuple, Union
 
 import numpy as np
 import pandas as pd
 
-from .constants import AA_MASSES, ATOM_MASSES, MOD_MASSES, PARTICLE_MASSES
+import spectrum_fundamentals.constants as c
+
 from .mod_string import internal_without_mods
 
 logger = logging.getLogger(__name__)
@@ -38,7 +40,7 @@ def _get_modifications(peptide_sequence: str, custom_mods: Optional[Dict[str, fl
     pattern = re.compile(r"\[.{8}[^\]]*\]")
     matches = pattern.finditer(peptide_sequence)
 
-    mod_masses = MOD_MASSES | (custom_mods or {})
+    mod_masses = c.MOD_MASSES | (custom_mods or {})
 
     for match in matches:
         start_pos, end_pos = match.span()
@@ -56,14 +58,14 @@ def compute_peptide_mass(sequence: str, custom_mods: Optional[Dict[str, float]]
     :param custom_mods: Custom Modifications with the identifier, the unimod equivalent and the respective mass
     :return: Theoretical mass of the sequence
     """
-    terminal_masses = 2 * ATOM_MASSES["H"] + ATOM_MASSES["O"]  # add terminal masses HO- and H-
+    terminal_masses = 2 * c.ATOM_MASSES["H"] + c.ATOM_MASSES["O"]  # add terminal masses HO- and H-
 
     modification_deltas = _get_modifications(sequence, custom_mods=custom_mods)
     if modification_deltas:  # there were modifications
         sequence = internal_without_mods([sequence])[0]
         terminal_masses += modification_deltas.get(-2, 0.0)  # prime with n_term_mod delta if present
 
-    peptide_sum = sum([AA_MASSES[c] + modification_deltas.get(i, 0.0) for i, c in enumerate(sequence)])
+    peptide_sum = sum([c.AA_MASSES[aa] + modification_deltas.get(i, 0.0) for i, aa in enumerate(sequence)])
 
     return terminal_masses + peptide_sum
 
@@ -90,21 +92,15 @@ def retrieve_ion_types(fragmentation_method: str) -> List[str]:
 
     Given the fragmentation method the function returns all ion types that can result from it.
 
-    : param fragmentation_method: fragmentation method used during the MS
-    : raises ValueError: if fragmentation_method is other than one of HCD, CID, ETD, ECD, ETCID, ETHCD, UVPD
-    : return: list of possible ion types
+    :param fragmentation_method: fragmentation method used during the MS
+    :raises ValueError: if fragmentation_method is not supported
+    :return: list of possible ion types
     """
     fragmentation_method = fragmentation_method.upper()
-    if fragmentation_method == "HCD" or fragmentation_method == "CID":
-        return ["y", "b"]
-    elif fragmentation_method == "ETD" or fragmentation_method == "ECD":
-        return ["z", "c"]
-    elif fragmentation_method == "ETCID" or fragmentation_method == "ETHCD":
-        return ["y", "b", "z", "c"]
-    elif fragmentation_method == "UVPD":
-        return ["y", "b", "z", "c", "x", "a"]
-    else:
+    ions = c.FRAGMENTATION_TO_IONS_BY_PAIRS.get(fragmentation_method, [])
+    if not ions:
         raise ValueError(f"Unknown fragmentation method provided: {fragmentation_method}")
+    return ions
 
 
 def retrieve_ion_types_for_peak_initialization(fragmentation_method: str) -> List[str]:
@@ -113,21 +109,15 @@ def retrieve_ion_types_for_peak_initialization(fragmentation_method: str) -> Lis
 
     Given the fragmentation method the function returns all ion types that can result from it.
 
-    : param fragmentation_method: fragmentation method used during the MS
-    : raises ValueError: if fragmentation_method is other than one of HCD, CID, ETD, ECD, ETCID, ETHCD, UVPD
-    : return: list of possible ion types
+    :param fragmentation_method: fragmentation method used during the MS
+    :raises ValueError: if fragmentation_method is not supported
+    :return: list of possible ion types
     """
     fragmentation_method = fragmentation_method.upper()
-    if fragmentation_method == "HCD" or fragmentation_method == "CID":
-        return ["y", "b"]
-    elif fragmentation_method == "ETD" or fragmentation_method == "ECD":
-        return ["z", "c"]
-    elif fragmentation_method == "ETCID" or fragmentation_method == "ETHCD":
-        return ["y", "z", "b", "c"]
-    elif fragmentation_method == "UVPD":
-        return ["x", "y", "z", "a", "b", "c"]
-    else:
+    ions = c.FRAGMENTATION_TO_IONS_BY_DIRECTION.get(fragmentation_method, [])
+    if not ions:
         raise ValueError(f"Unknown fragmentation method provided: {fragmentation_method}")
+    return ions
 
 
 def get_ion_delta(ion_types: List[str]) -> np.ndarray:
@@ -137,18 +127,7 @@ def get_ion_delta(ion_types: List[str]) -> np.ndarray:
     :param ion_types: type of ions for which mass should be calculated
     :return: numpy array with masses of the ions
     """
-    ion_type_offsets = {
-        "a": -ATOM_MASSES["O"] - ATOM_MASSES["C"],
-        "b": 0.0,
-        "c": 3 * ATOM_MASSES["H"] + ATOM_MASSES["N"],
-        "x": 2 * ATOM_MASSES["O"] + ATOM_MASSES["C"],
-        "y": ATOM_MASSES["O"] + 2 * ATOM_MASSES["H"],
-        "z": ATOM_MASSES["O"] - ATOM_MASSES["N"] - ATOM_MASSES["H"],
-    }
-    # I think list comprehension is fastest way
-    deltas = np.array([ion_type_offsets[ion_type] for ion_type in ion_types]).reshape(len(ion_types), 1)
-
-    return deltas
+    return np.array([c.ION_DELTAS[ion_type] for ion_type in ion_types]).reshape(len(ion_types), 1)
 
 
 def initialize_peaks(
@@ -196,13 +175,13 @@ def initialize_peaks(
 
     if modification_deltas:  # there were modifications
         sequence = internal_without_mods([sequence])[0]
-        n_term_delta = modification_deltas.get(-2, 0.0)
+        n_term_delta = modification_deltas.pop(-2, 0.0)  # directly pop it to avoid readding it later
         if n_term_delta != 0:
             n_term_mod = 2
             # add n_term mass to first aa for easy processing in the following calculation
             modification_deltas[0] = modification_deltas.get(0, 0.0) + n_term_delta
 
-    mass_arr = np.array([AA_MASSES[_] for _ in sequence])
+    mass_arr = np.array([c.AA_MASSES[_] for _ in sequence])
     for pos, mod_mass in modification_deltas.items():
         mass_arr[pos] += mod_mass
 
@@ -221,7 +200,7 @@ def initialize_peaks(
     # calculate for m/z for charges 1, 2, 3
     # shape of ion_mzs: (n_ions, n_fragments, max_charge)
     charges = np.arange(1, max_charge + 1)
-    ion_mzs = (sum_array[..., np.newaxis] + charges * PARTICLE_MASSES["PROTON"]) / charges
+    ion_mzs = (sum_array[..., np.newaxis] + charges * c.PARTICLE_MASSES["PROTON"]) / charges
 
     min_mzs, max_mzs = get_min_max_mass(mass_analyzer, ion_mzs, mass_tolerance, unit_mass_tolerance)
 
@@ -246,7 +225,7 @@ def initialize_peaks(
         fragments_meta_data,
         n_term_mod,
         sequence,
-        (peptide_mass + ATOM_MASSES["O"] + 2 * ATOM_MASSES["H"]),
+        (peptide_mass + c.ATOM_MASSES["O"] + 2 * c.ATOM_MASSES["H"]),
     )
 
 
@@ -422,3 +401,53 @@ def get_min_max_mass(
     else:
         raise ValueError(f"Unsupported mass_analyzer: {mass_analyzer}")
     return (min_mass, max_mass)
+
+
+FragmentIonComponent = Literal["ion_type", "position", "charge"]
+
+
+def generate_fragment_ion_annotations(
+    ion_types: List[str], order: Tuple[FragmentIonComponent, FragmentIonComponent, FragmentIonComponent]
+) -> List[Tuple[str, int, int]]:
+    """Generate full list of fragment ions for permitted ion types and specified order.
+
+    :param ion_types: List of permitted ion types
+    :param order: What fragment ion parameters (ion type, position & charge) to group the annotations by
+    :return: List of (ion_type, position, charge) tuples sorted by specified component order
+    :raises ValueError: if ion types are missing or redundant, or order does not use each component exactly once
+    """
+    fragment_ion_components: Dict[str, Union[List[str]]] = {
+        "ion_type": ion_types,
+        "position": [str(pos) for pos in c.POSITIONS],
+        "charge": [str(charge) for charge in c.CHARGES],
+    }
+
+    if len(set(ion_types)) != len(ion_types):
+        raise ValueError("Redundant ion types specified")
+    elif len(ion_types) == 0:
+        raise ValueError("No ion types specified")
+    if set(order) != {"ion_type", "position", "charge"}:
+        raise ValueError("Duplicate component used for ordering fragment ions")
+
+    raw_annotations = list(itertools.product(*[fragment_ion_components[component] for component in order]))
+
+    ordered_raw_annotations = [
+        (
+            str(combination[order.index("ion_type")]),
+            int(combination[order.index("position")]),
+            int(combination[order.index("charge")]),
+        )
+        for combination in raw_annotations
+    ]
+
+    return ordered_raw_annotations
+
+
+def format_fragment_ion_annotation(raw_annotation: Tuple[str, int, int]) -> str:
+    """Transform (ion_type, position, charge) tuple into <ion_type><position>+<charge> string.
+
+    :param raw_annotation: `(ion_type, position, charge)` tuple
+    :returns: formatted annotation string
+    """
+    ion_type, pos, charge = raw_annotation
+    return f"{ion_type}{pos}+{charge}"