Coverage for encoding_decoding.py : 92%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import logging
3from collections import namedtuple
4import pandas as pd
5from elfragmentador import constants, annotate
6from pandas.core.frame import DataFrame
7from torch import Tensor
8from typing import Dict, List, Optional, Sequence, Union
# Return type of encode_mod_seq: ``aas`` carries the encoded residues and
# ``mods`` the encoded modifications, one entry per sequence position.
SequencePair = namedtuple("SequencePair", "aas, mods")
def encode_mod_seq(seq):
    """
    Encodes a peptide sequence to a numeric vector

    The sequence is split with ``annotate.peptide_parser``; the first
    character of every element is looked up in ``constants.ALPHABET`` and
    elements longer than one character are resolved to a modification index
    through ``constants.MOD_PEPTIDE_ALIASES`` and ``constants.MOD_INDICES``.
    Both outputs are zero-padded to ``constants.MAX_TENSOR_SEQUENCE``
    entries; longer peptides are clipped to that length (with a warning).

    Args:
        seq: Peptide sequence, as accepted by ``annotate.peptide_parser``.

    Returns:
        SequencePair whose ``aas`` and ``mods`` lists each have exactly
        ``constants.MAX_TENSOR_SEQUENCE`` elements.

    Raises:
        ValueError: when a ValueError is raised while parsing/encoding the
            sequence; the original error is logged and chained as the cause.

    Example
    =======
    >>> samp_seq = "_AAIFVVAR_"
    >>> print(constants.MAX_TENSOR_SEQUENCE)
    32
    >>> out = encode_mod_seq(samp_seq)
    >>> out
    SequencePair(aas=[23, 1, 1, 8, 5, 19, 19, 1, 15, ..., 0], mods=[0, 0, 0, 0,..., 0, 0])
    >>> len(out)
    2
    >>> [len(x) for x in out]
    [32, 32]
    """
    max_len = constants.MAX_TENSOR_SEQUENCE
    seq_out = [0] * max_len
    mod_out = [0] * max_len

    try:
        split_seq = list(annotate.peptide_parser(seq, solve_aliases=True))
        seq_out_i = [constants.ALPHABET[x[:1]] for x in split_seq]
        mod_out_i = [
            constants.MOD_PEPTIDE_ALIASES[x] if len(x) > 1 else 0 for x in split_seq
        ]
        mod_out_i = [constants.MOD_INDICES.get(x, 0) for x in mod_out_i]
        if len(seq_out_i) > max_len:
            logging.warning(
                f"Length of the encoded sequence is more than the one allowed {constants.MAX_SEQUENCE}."
                f" Sequence={seq}, the remainder will be clipped"
            )
            # Actually clip: a slice assignment with a longer right-hand side
            # would otherwise grow the output past the fixed tensor length.
            seq_out_i = seq_out_i[:max_len]
            mod_out_i = mod_out_i[:max_len]

        seq_out[: len(seq_out_i)] = seq_out_i
        mod_out[: len(mod_out_i)] = mod_out_i
    except ValueError as e:
        logging.error(seq)
        logging.error(e)
        # Chain the cause so the underlying parser error is not lost.
        raise ValueError(
            f"Sequence provided is longer than the supported length of {constants.MAX_SEQUENCE}"
        ) from e

    return SequencePair(seq_out, mod_out)
def clip_explicit_terminus(seq):
    """Remove explicit terminus

    Args:
        seq: Sequence to be stripped from explicit termini. It is only
            indexed and sliced, so both a string and a list of string
            tokens work; the result has the same type as the input.

    Returns:
        Same as sequence input but removing explicit
        n and c termini. A leading "n" is kept when the next element
        starts with "[" (i.e. the terminus carries a modification).
        Empty input is returned unchanged.

    Examples:
        >>> clip_explicit_terminus("PEPTIDEPINK")
        'PEPTIDEPINK'
        >>> clip_explicit_terminus("nPEPTIDEPINKc")
        'PEPTIDEPINK'
        >>> clip_explicit_terminus("n[ACETYL]PEPTIDEPINKc")
        'n[ACETYL]PEPTIDEPINK'
        >>> clip_explicit_terminus("")
        ''
    """
    # Nothing to strip; also avoids indexing into an empty sequence.
    if not seq:
        return seq

    # Only look at seq[1] when it exists, so a lone "n" does not raise.
    if seq[0] == "n" and not (len(seq) > 1 and seq[1].startswith("[")):
        seq = seq[1:]

    # seq may have become empty after dropping the n-terminus.
    if seq and seq[-1] == "c":
        seq = seq[:-1]

    return seq
def decode_mod_seq(
    seq_encoding: List[int],
    mod_encoding: Optional[List[int]] = None,
    clip_explicit_term=True,
) -> str:
    """Decode an integer-encoded peptide back into a sequence string.

    Args:
        seq_encoding: Encoded residues; each value is looked up in
            ``constants.ALPHABET_S``. Decoding stops at the first 0.
        mod_encoding: Encoded modifications, indexed in parallel with
            ``seq_encoding``; non-zero values are looked up in
            ``constants.MOD_INDICES_S``. Defaults to all zeros.
        clip_explicit_term: When true, the decoded tokens are passed through
            ``clip_explicit_terminus`` before being joined.

    Returns:
        The decoded sequence, with each modification rendered as
        ``[NAME]`` right after the residue it belongs to. An encoding that
        starts with 0 (or is empty) decodes to an empty string.
    """
    if mod_encoding is None:
        mod_encoding = [0] * len(seq_encoding)

    out = []
    for i, s in enumerate(seq_encoding):
        if s == 0:
            break

        out.append(constants.ALPHABET_S[s])
        if mod_encoding[i] != 0:
            out.append(f"[{constants.MOD_INDICES_S[mod_encoding[i]]}]")

    # Skip clipping when nothing was decoded: the terminus check indexes the
    # first element and would fail on an empty token list.
    if clip_explicit_term and out:
        out = clip_explicit_terminus(out)
    return "".join(out)
def get_fragment_encoding_labels(
    annotated_peaks: Optional[Union[Dict[str, int], Dict[str, float]]] = None
) -> Union[List[Union[int, float]], List[int], List[str]]:
    """
    Gets either the labels or a sequence that encodes a spectrum

    Fragment keys are generated by nesting three axes in the order given by
    ``constants.ION_ENCODING_NESTING``. Without ``annotated_peaks`` the keys
    themselves are returned; otherwise each key is replaced by its value in
    ``annotated_peaks`` (0 when the key is absent).

    Examples
    ========
    >>> get_fragment_encoding_labels()
    ['z1b1', 'z1y1', ..., 'z3b29', 'z3y29']
    >>> get_fragment_encoding_labels({'z1y2': 100, 'z2y2': 52})
    [0, 0, 0, 100, ..., 0, 52, ...]
    """

    # TODO just redefine this to use the constant keys for fragments ...
    axes = {
        "ION_TYPE": "".join(sorted(constants.ION_TYPES)),
        "CHARGE": [f"z{z}" for z in range(1, constants.MAX_FRAG_CHARGE + 1)],
        "POSITION": list(range(1, constants.MAX_ION + 1)),
    }
    nesting = constants.ION_ENCODING_NESTING
    outer_axis = axes[nesting[0]]
    middle_axis = axes[nesting[1]]
    inner_axis = axes[nesting[2]]

    # TODO implement neutral losses ... if needed
    # Keys are always formatted as <outer><inner><middle>, matching the
    # charge / ion / position layout shown in the examples.
    keys = [
        f"{charge}{ion}{pos}"
        for charge in outer_axis
        for pos in middle_axis
        for ion in inner_axis
    ]

    if annotated_peaks is None:
        return keys
    return [annotated_peaks.get(key, 0) for key in keys]
def decode_fragment_tensor(
    sequence: str,
    tensor: Union[List[int], Tensor],
) -> DataFrame:
    """
    Returns a data frame with annotations from sequence
    and a tensor encoding a spectra

    Args:
        sequence: Peptide sequence, passed to ``annotate.get_peptide_ions``
            to obtain the fragment masses.
        tensor: One intensity per entry of ``constants.FRAG_EMBEDING_LABELS``;
            every element is converted with ``float``.

    Returns:
        DataFrame with columns ``Fragment``, ``Mass`` and ``Intensity``.
        Rows whose mass is 0 (labels with no entry in the peptide ions)
        are dropped.

    Raises:
        AssertionError: when the number of intensities differs from the
            number of fragment labels.

    Example
    =======
    >>> import torch
    >>> foo = decode_fragment_tensor("AAACK", torch.arange(0, (constants.NUM_FRAG_EMBEDINGS)))
    >>> foo.head()
       Fragment        Mass  Intensity
    0      z1b1   72.044390        0.0
    1      z1y1  147.112804        1.0
    2      z1b2  143.081504        2.0
    3      z1y2  307.143453        3.0
    4      z1b3  214.118618        4.0
    >>> # import matplotlib.pyplot as plt
    >>> # plt.vlines(foo['Mass'], 0, foo['Intensity'])
    >>> # plt.show()
    """
    key_list = constants.FRAG_EMBEDING_LABELS
    fragment_ions = annotate.get_peptide_ions(sequence)
    masses = [fragment_ions.get(k, 0) for k in key_list]
    intensities = [float(x) for x in tensor]

    # Explicit check instead of `assert`, so it is not stripped under -O and
    # the exception carries the message. AssertionError is kept so callers
    # that relied on the previous exception type keep working.
    if len(intensities) != len(masses):
        msg = f"Int {len(intensities)}: \n{intensities}\n\nmasses {len(masses)}: \n{masses}"
        logging.error(msg)
        raise AssertionError(msg)

    out_dict = {"Fragment": key_list, "Mass": masses, "Intensity": intensities}
    out_df = pd.DataFrame(out_dict)
    out_df = out_df[out_df["Mass"] != 0].copy()

    return out_df