Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Speedup of peptide parsing and annotation #61

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 38 additions & 13 deletions spectrum_utils/proforma.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@
import pyteomics.mass as pmass


UNMODIFIED_PEPTIDE_REGEX = re.compile(r"^([A-Za-z]+)(/-?[0-9]+)?$")

# Set to None to disable caching.
cache_dir = appdirs.user_cache_dir("spectrum_utils", False)

Expand Down Expand Up @@ -378,9 +380,9 @@ def proteoform(self, tree) -> Proteoform:
self._modifications.sort(key=_modification_sort_key)
proteoform = Proteoform(
sequence=sequence,
modifications=self._modifications
if len(self._modifications) > 0
else None,
modifications=(
self._modifications if len(self._modifications) > 0 else None
),
charge=charge,
)
# Reset class variables.
Expand Down Expand Up @@ -615,6 +617,25 @@ def _modification_sort_key(mod: Modification):
return -4


@functools.lru_cache(1)
def _build_parser() -> lark.Lark:
"""Build a lark parser for proforma sequences.

This function also caches the parser in-memory, thus loading it only
once per process.
"""
dir_name = os.path.dirname(os.path.realpath(__file__))
with open(os.path.join(dir_name, "proforma.ebnf")) as f_in:
parser = lark.Lark(
f_in.read(),
start="proforma",
parser="earley",
lexer="dynamic_complete",
import_paths=[dir_name],
)
return parser


def parse(proforma: str) -> List[Proteoform]:
"""
Parse a ProForma-encoded string.
Expand Down Expand Up @@ -647,18 +668,22 @@ def parse(proforma: str) -> List[Proteoform]:
ValueError
If no mass was specified for a GNO term or its parent terms.
"""
dir_name = os.path.dirname(os.path.realpath(__file__))
with open(os.path.join(dir_name, "proforma.ebnf")) as f_in:
parser = lark.Lark(
f_in.read(),
start="proforma",
parser="earley",
lexer="dynamic_complete",
import_paths=[dir_name],
)
match_unmod = UNMODIFIED_PEPTIDE_REGEX.match(proforma)
if match_unmod is not None:
# Fast path for unmodified peptides.
charge = match_unmod.group(2)
if charge is not None:
charge = Charge(int(charge[1:]))
return [
Proteoform(sequence=match_unmod.group(1).upper(), charge=charge)
]

parser = _build_parser()
# noinspection PyUnresolvedReferences
try:
return ProFormaTransformer().transform(parser.parse(proforma))
parsed = parser.parse(proforma)
parsed = ProFormaTransformer().transform(parsed)
return parsed
except lark.visitors.VisitError as e:
raise e.orig_exc

Expand Down
142 changes: 94 additions & 48 deletions spectrum_utils/spectrum.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from __future__ import annotations

import copy
import functools
import urllib.parse
Expand Down Expand Up @@ -622,73 +624,49 @@ def scale_intensity(
)
return self

def annotate_proforma(
def _annotate_proteoforms(
self,
proteoforms: list[proforma.Proteoform],
proforma_str: str,
fragment_tol_mass: float,
fragment_tol_mode: str,
ion_types: str = "by",
max_ion_charge: Optional[int] = None,
neutral_losses: Union[bool, Dict[Optional[str], float]] = False,
) -> "MsmsSpectrum":
) -> MsmsSpectrum:
"""
Assign fragment ion labels to the peaks from a ProForma annotation.

Parameters
----------
proforma_str : str
The ProForma spectrum annotation.
fragment_tol_mass : float
Fragment mass tolerance to match spectrum peaks against theoretical
peaks.
fragment_tol_mode : {'Da', 'ppm'}
Fragment mass tolerance unit. Either 'Da' or 'ppm'.
ion_types : str, optional
The ion types to generate. Can be any combination of 'a', 'b', 'c',
'x', 'y', and 'z' for peptide fragments, 'I' for immonium ions, 'm'
for internal fragment ions, 'p' for the precursor ion, and 'r' for
reporter ions. The default is 'by', which means that b and y
peptide ions will be generated.
max_ion_charge : Optional[int], optional
All fragments up to and including the given charge will be
annotated (by default all fragments with a charge up to the
precursor minus one (minimum charge one) will be annotated).
neutral_losses : Union[bool, Dict[Optional[str], float]], optional
Neutral losses to consider for each peak. If `None` or `False`, no
neutral losses are considered. If specified as a dictionary, keys
should be the molecular formulas of the neutral losses and values
the corresponding mass differences. Note that mass differences
should typically be negative. If `True`, all of the following
neutral losses are considered:
This is meant to be an internal function that uses a pre-parsed proforma string
instead of parsing it internally. This can be useful when the same parsed
sequence is used multiple times (since parsing the sequence is a lot slower
than annotating the peaks)

- Loss of hydrogen (H): -1.007825.
- Loss of ammonia (NH3): -17.026549.
- Loss of water (H2O): -18.010565.
- Loss of carbon monoxide (CO): -27.994915.
- Loss of carbon dioxide (CO2): -43.989829.
- Loss of formamide (HCONH2): -45.021464.
- Loss of formic acid (HCOOH): -46.005479.
- Loss of methanesulfenic acid (CH4OS): -63.998301.
- Loss of sulfur trioxide (SO3): -79.956818.
- Loss of metaphosphoric acid (HPO3): -79.966331.
- Loss of mercaptoacetamide (C2H5NOS): -91.009195.
- Loss of mercaptoacetic acid (C2H4O2S): -91.993211.
- Loss of phosphoric acid (H3PO4): -97.976896.
>>> proforma_sequence = "MYPEPTIDEK/2"
>>> parsed_proforma = proforma.parse(proforma_sequence)
>>> spectrum.annotate_proforma(proforma_sequence, ...)

Returns
-------
MsmsSpectrum
or

>>> spectrum._annotate_proteoforms(parsed_proforma, proforma_sequence, ...)

WARN:
This function does not check that the passed sequence corresponds to the
passed proteoforms.

For additional information on the arguments, see the
`MsmsSpectrum.annotate_proforma` documentation.
"""
if fragment_tol_mode not in ("Da", "ppm"):
raise ValueError(
"Unknown fragment mass tolerance unit specified. Supported "
'values are "Da" or "ppm".'
)
self.proforma = proforma_str
mass_diff = functools.partial(
utils.mass_diff, mode_is_da=fragment_tol_mode == "Da"
)

self.proforma = proforma_str
self._annotation = np.full_like(self.mz, None, object)
# By default, peak charges are assumed to be smaller than the precursor
# charge.
Expand All @@ -703,9 +681,7 @@ def annotate_proforma(
neutral_losses = fa._neutral_loss
if neutral_losses is not None and None not in neutral_losses:
neutral_losses[None] = 0
# Parse the ProForma string and find peaks that match the theoretical
# fragments.
proteoforms = proforma.parse(self.proforma)

analyte_number = 1 if len(proteoforms) > 1 else None
for proteoform in proteoforms:
fragments = fa.get_theoretical_fragments(
Expand Down Expand Up @@ -742,4 +718,74 @@ def annotate_proforma(
self.annotation[peak_i] = pi
if analyte_number is not None:
analyte_number += 1

return self

def annotate_proforma(
self,
proforma_str: str,
fragment_tol_mass: float,
fragment_tol_mode: str,
ion_types: str = "by",
max_ion_charge: Optional[int] = None,
neutral_losses: Union[bool, Dict[Optional[str], float]] = False,
) -> MsmsSpectrum:
"""
Assign fragment ion labels to the peaks from a ProForma annotation.

Parameters
----------
proforma_str : str
The ProForma spectrum annotation.
fragment_tol_mass : float
Fragment mass tolerance to match spectrum peaks against theoretical
peaks.
fragment_tol_mode : {'Da', 'ppm'}
Fragment mass tolerance unit. Either 'Da' or 'ppm'.
ion_types : str, optional
The ion types to generate. Can be any combination of 'a', 'b', 'c',
'x', 'y', and 'z' for peptide fragments, 'I' for immonium ions, 'm'
for internal fragment ions, 'p' for the precursor ion, and 'r' for
reporter ions. The default is 'by', which means that b and y
peptide ions will be generated.
max_ion_charge : Optional[int], optional
All fragments up to and including the given charge will be
annotated (by default all fragments with a charge up to the
precursor minus one (minimum charge one) will be annotated).
neutral_losses : Union[bool, Dict[Optional[str], float]], optional
Neutral losses to consider for each peak. If `None` or `False`, no
neutral losses are considered. If specified as a dictionary, keys
should be the molecular formulas of the neutral losses and values
the corresponding mass differences. Note that mass differences
should typically be negative. If `True`, all of the following
neutral losses are considered:

- Loss of hydrogen (H): -1.007825.
- Loss of ammonia (NH3): -17.026549.
- Loss of water (H2O): -18.010565.
- Loss of carbon monoxide (CO): -27.994915.
- Loss of carbon dioxide (CO2): -43.989829.
- Loss of formamide (HCONH2): -45.021464.
- Loss of formic acid (HCOOH): -46.005479.
- Loss of methanesulfenic acid (CH4OS): -63.998301.
- Loss of sulfur trioxide (SO3): -79.956818.
- Loss of metaphosphoric acid (HPO3): -79.966331.
- Loss of mercaptoacetamide (C2H5NOS): -91.009195.
- Loss of mercaptoacetic acid (C2H4O2S): -91.993211.
- Loss of phosphoric acid (H3PO4): -97.976896.

Returns
-------
MsmsSpectrum
"""
proteoforms = proforma.parse(proforma_str)

return self._annotate_proteoforms(
proforma_str=proforma_str,
proteoforms=proteoforms,
fragment_tol_mass=fragment_tol_mass,
fragment_tol_mode=fragment_tol_mode,
ion_types=ion_types,
max_ion_charge=max_ion_charge,
neutral_losses=neutral_losses,
)
4 changes: 2 additions & 2 deletions tests/spectrum_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -419,7 +419,7 @@ def test_scale_intensity_root():
intensity_unscaled = spec.intensity.copy()
spec.scale_intensity(scaling="root", degree=degree)
np.testing.assert_allclose(
spec.intensity ** degree, intensity_unscaled, rtol=1e-5
spec.intensity**degree, intensity_unscaled, rtol=1e-5
)


Expand All @@ -434,7 +434,7 @@ def test_scale_intensity_log():
intensity_unscaled = spec.intensity.copy()
spec.scale_intensity(scaling="log", base=base)
np.testing.assert_allclose(
base ** spec.intensity - 1, intensity_unscaled, rtol=1e-5
base**spec.intensity - 1, intensity_unscaled, rtol=1e-5
)


Expand Down
Loading