0
|
1 # Copyright 2012 Anton Goloborodko, Lev Levitsky
|
|
2 #
|
|
3 # Licensed under the Apache License, Version 2.0 (the "License");
|
|
4 # you may not use this file except in compliance with the License.
|
|
5 # You may obtain a copy of the License at
|
|
6 #
|
|
7 # http://www.apache.org/licenses/LICENSE-2.0
|
|
8 #
|
|
9 # Unless required by applicable law or agreed to in writing, software
|
|
10 # distributed under the License is distributed on an "AS IS" BASIS,
|
|
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12 # See the License for the specific language governing permissions and
|
|
13 # limitations under the License.
|
|
14
|
|
15 import re
|
|
16 from collections import deque
|
|
17 import itertools as it
|
|
18
|
|
19 def cleave(sequence, rule, missed_cleavages=0, min_length=None):
|
|
20 """Cleaves a polypeptide sequence using a given rule.
|
|
21
|
|
22 Parameters
|
|
23 ----------
|
|
24 sequence : str
|
|
25 The sequence of a polypeptide.
|
|
26
|
|
27 .. note::
|
|
28 The sequence is expected to be in one-letter uppercase notation.
|
|
29 Otherwise, some of the cleavage rules in :py:data:`expasy_rules`
|
|
30 will not work as expected.
|
|
31
|
|
32 rule : str or compiled regex
|
|
33 A regular expression describing the site of cleavage. It is recommended
|
|
34 to design the regex so that it matches only the residue whose C-terminal
|
|
35 bond is to be cleaved. All additional requirements should be specified
|
|
36 using `lookaround assertions
|
|
37 <http://www.regular-expressions.info/lookaround.html>`_.
|
|
38 :py:data:`expasy_rules` contains cleavage rules for popular cleavage agents.
|
|
39 missed_cleavages : int, optional
|
|
40 Maximum number of allowed missed cleavages. Defaults to 0.
|
|
41 min_length : int or None, optional
|
|
42 Minimum peptide length. Defaults to :py:const:`None`.
|
|
43
|
|
44 ..note ::
|
|
45 This checks for string length, which is only correct for one-letter
|
|
46 notation and not for full *modX*. Use :py:func:`length` manually if
|
|
47 you know what you are doing and apply :py:func:`cleave` to *modX*
|
|
48 sequences.
|
|
49
|
|
50 Returns
|
|
51 -------
|
|
52 out : set
|
|
53 A set of unique (!) peptides.
|
|
54
|
|
55 Examples
|
|
56 --------
|
|
57 >>> cleave('AKAKBK', expasy_rules['trypsin'], 0) == {'AK', 'BK'}
|
|
58 True
|
|
59 >>> cleave('GKGKYKCK', expasy_rules['trypsin'], 2) == \
|
|
60 {'CK', 'GKYK', 'YKCK', 'GKGK', 'GKYKCK', 'GK', 'GKGKYK', 'YK'}
|
|
61 True
|
|
62
|
|
63 """
|
|
64 return set(_cleave(sequence, rule, missed_cleavages, min_length))
|
|
65
|
|
66 def _cleave(sequence, rule, missed_cleavages=0, min_length=None):
|
|
67 """Like :py:func:`cleave`, but the result is a list. Refer to
|
|
68 :py:func:`cleave` for explanation of parameters.
|
|
69 """
|
|
70 peptides = []
|
|
71 ml = missed_cleavages+2
|
|
72 trange = range(ml)
|
|
73 cleavage_sites = deque([0], maxlen=ml)
|
|
74 cl = 1
|
|
75 for i in it.chain([x.end() for x in re.finditer(rule, sequence)],
|
|
76 [None]):
|
|
77 cleavage_sites.append(i)
|
|
78 if cl < ml:
|
|
79 cl += 1
|
|
80 for j in trange[:cl-1]:
|
|
81 seq = sequence[cleavage_sites[j]:cleavage_sites[-1]]
|
|
82 if seq:
|
|
83 if min_length is None or len(seq) >= min_length:
|
|
84 peptides.append(seq)
|
|
85 return peptides
|
|
86
|
|
87 def num_sites(sequence, rule, **kwargs):
|
|
88 """Count the number of sites where `sequence` can be cleaved using
|
|
89 the given `rule` (e.g. number of miscleavages for a peptide).
|
|
90
|
|
91 Parameters
|
|
92 ----------
|
|
93 sequence : str
|
|
94 The sequence of a polypeptide.
|
|
95 rule : str or compiled regex
|
|
96 A regular expression describing the site of cleavage. It is recommended
|
|
97 to design the regex so that it matches only the residue whose C-terminal
|
|
98 bond is to be cleaved. All additional requirements should be specified
|
|
99 using `lookaround assertions
|
|
100 <http://www.regular-expressions.info/lookaround.html>`_.
|
|
101 labels : list, optional
|
|
102 A list of allowed labels for amino acids and terminal modifications.
|
|
103
|
|
104 Returns
|
|
105 -------
|
|
106 out : int
|
|
107 Number of cleavage sites.
|
|
108 """
|
|
109 return len(_cleave(sequence, rule, **kwargs)) - 1
|
|
110
|
|
111 expasy_rules = {
|
|
112 'arg-c': r'R',
|
|
113 'asp-n': r'\w(?=D)',
|
|
114 'bnps-skatole' : r'W',
|
|
115 'caspase 1': r'(?<=[FWYL]\w[HAT])D(?=[^PEDQKR])',
|
|
116 'caspase 2': r'(?<=DVA)D(?=[^PEDQKR])',
|
|
117 'caspase 3': r'(?<=DMQ)D(?=[^PEDQKR])',
|
|
118 'caspase 4': r'(?<=LEV)D(?=[^PEDQKR])',
|
|
119 'caspase 5': r'(?<=[LW]EH)D',
|
|
120 'caspase 6': r'(?<=VE[HI])D(?=[^PEDQKR])',
|
|
121 'caspase 7': r'(?<=DEV)D(?=[^PEDQKR])',
|
|
122 'caspase 8': r'(?<=[IL]ET)D(?=[^PEDQKR])',
|
|
123 'caspase 9': r'(?<=LEH)D',
|
|
124 'caspase 10': r'(?<=IEA)D',
|
|
125 'chymotrypsin high specificity' : r'([FY](?=[^P]))|(W(?=[^MP]))',
|
|
126 'chymotrypsin low specificity':
|
|
127 r'([FLY](?=[^P]))|(W(?=[^MP]))|(M(?=[^PY]))|(H(?=[^DMPW]))',
|
|
128 'clostripain': r'R',
|
|
129 'cnbr': r'M',
|
|
130 'enterokinase': r'(?<=[DE]{3})K',
|
|
131 'factor xa': r'(?<=[AFGILTVM][DE]G)R',
|
|
132 'formic acid': r'D',
|
|
133 'glutamyl endopeptidase': r'E',
|
|
134 'granzyme b': r'(?<=IEP)D',
|
|
135 'hydroxylamine': r'N(?=G)',
|
|
136 'iodosobenzoic acid': r'W',
|
|
137 'lysc': r'K',
|
|
138 'ntcb': r'\w(?=C)',
|
|
139 'pepsin ph1.3': r'((?<=[^HKR][^P])[^R](?=[FLWY][^P]))|'
|
|
140 r'((?<=[^HKR][^P])[FLWY](?=\w[^P]))',
|
|
141 'pepsin ph2.0': r'((?<=[^HKR][^P])[^R](?=[FL][^P]))|'
|
|
142 r'((?<=[^HKR][^P])[FL](?=\w[^P]))',
|
|
143 'proline endopeptidase': r'(?<=[HKR])P(?=[^P])',
|
|
144 'proteinase k': r'[AEFILTVWY]',
|
|
145 'staphylococcal peptidase i': r'(?<=[^E])E',
|
|
146 'thermolysin': r'[^DE](?=[AFILMV])',
|
|
147 'thrombin': r'((?<=G)R(?=G))|'
|
|
148 r'((?<=[AFGILTVM][AFGILTVWA]P)R(?=[^DE][^DE]))',
|
|
149 'trypsin': r'([KR](?=[^P]))|((?<=W)K(?=P))|((?<=M)R(?=P))'
|
|
150 }
|
|
151 """
|
|
152 This dict contains regular expressions for cleavage rules of the most
|
|
153 popular proteolytic enzymes. The rules were taken from the
|
|
154 `PeptideCutter tool
|
|
155 <http://ca.expasy.org/tools/peptidecutter/peptidecutter_enzymes.html>`_
|
|
156 at Expasy.
|
|
157 """
|
|
158
|