comparison digest.py @ 0:a8218b11216f draft

Uploaded
author jjohnson
date Wed, 29 Nov 2017 15:55:59 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:a8218b11216f
1 # Copyright 2012 Anton Goloborodko, Lev Levitsky
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 import re
16 from collections import deque
17 import itertools as it
18
19 def cleave(sequence, rule, missed_cleavages=0, min_length=None):
20 """Cleaves a polypeptide sequence using a given rule.
21
22 Parameters
23 ----------
24 sequence : str
25 The sequence of a polypeptide.
26
27 .. note::
28 The sequence is expected to be in one-letter uppercase notation.
29 Otherwise, some of the cleavage rules in :py:data:`expasy_rules`
30 will not work as expected.
31
32 rule : str or compiled regex
33 A regular expression describing the site of cleavage. It is recommended
34 to design the regex so that it matches only the residue whose C-terminal
35 bond is to be cleaved. All additional requirements should be specified
36 using `lookaround assertions
37 <http://www.regular-expressions.info/lookaround.html>`_.
38 :py:data:`expasy_rules` contains cleavage rules for popular cleavage agents.
39 missed_cleavages : int, optional
40 Maximum number of allowed missed cleavages. Defaults to 0.
41 min_length : int or None, optional
42 Minimum peptide length. Defaults to :py:const:`None`.
43
44 ..note ::
45 This checks for string length, which is only correct for one-letter
46 notation and not for full *modX*. Use :py:func:`length` manually if
47 you know what you are doing and apply :py:func:`cleave` to *modX*
48 sequences.
49
50 Returns
51 -------
52 out : set
53 A set of unique (!) peptides.
54
55 Examples
56 --------
57 >>> cleave('AKAKBK', expasy_rules['trypsin'], 0) == {'AK', 'BK'}
58 True
59 >>> cleave('GKGKYKCK', expasy_rules['trypsin'], 2) == \
60 {'CK', 'GKYK', 'YKCK', 'GKGK', 'GKYKCK', 'GK', 'GKGKYK', 'YK'}
61 True
62
63 """
64 return set(_cleave(sequence, rule, missed_cleavages, min_length))
65
66 def _cleave(sequence, rule, missed_cleavages=0, min_length=None):
67 """Like :py:func:`cleave`, but the result is a list. Refer to
68 :py:func:`cleave` for explanation of parameters.
69 """
70 peptides = []
71 ml = missed_cleavages+2
72 trange = range(ml)
73 cleavage_sites = deque([0], maxlen=ml)
74 cl = 1
75 for i in it.chain([x.end() for x in re.finditer(rule, sequence)],
76 [None]):
77 cleavage_sites.append(i)
78 if cl < ml:
79 cl += 1
80 for j in trange[:cl-1]:
81 seq = sequence[cleavage_sites[j]:cleavage_sites[-1]]
82 if seq:
83 if min_length is None or len(seq) >= min_length:
84 peptides.append(seq)
85 return peptides
86
87 def num_sites(sequence, rule, **kwargs):
88 """Count the number of sites where `sequence` can be cleaved using
89 the given `rule` (e.g. number of miscleavages for a peptide).
90
91 Parameters
92 ----------
93 sequence : str
94 The sequence of a polypeptide.
95 rule : str or compiled regex
96 A regular expression describing the site of cleavage. It is recommended
97 to design the regex so that it matches only the residue whose C-terminal
98 bond is to be cleaved. All additional requirements should be specified
99 using `lookaround assertions
100 <http://www.regular-expressions.info/lookaround.html>`_.
101 labels : list, optional
102 A list of allowed labels for amino acids and terminal modifications.
103
104 Returns
105 -------
106 out : int
107 Number of cleavage sites.
108 """
109 return len(_cleave(sequence, rule, **kwargs)) - 1
110
111 expasy_rules = {
112 'arg-c': r'R',
113 'asp-n': r'\w(?=D)',
114 'bnps-skatole' : r'W',
115 'caspase 1': r'(?<=[FWYL]\w[HAT])D(?=[^PEDQKR])',
116 'caspase 2': r'(?<=DVA)D(?=[^PEDQKR])',
117 'caspase 3': r'(?<=DMQ)D(?=[^PEDQKR])',
118 'caspase 4': r'(?<=LEV)D(?=[^PEDQKR])',
119 'caspase 5': r'(?<=[LW]EH)D',
120 'caspase 6': r'(?<=VE[HI])D(?=[^PEDQKR])',
121 'caspase 7': r'(?<=DEV)D(?=[^PEDQKR])',
122 'caspase 8': r'(?<=[IL]ET)D(?=[^PEDQKR])',
123 'caspase 9': r'(?<=LEH)D',
124 'caspase 10': r'(?<=IEA)D',
125 'chymotrypsin high specificity' : r'([FY](?=[^P]))|(W(?=[^MP]))',
126 'chymotrypsin low specificity':
127 r'([FLY](?=[^P]))|(W(?=[^MP]))|(M(?=[^PY]))|(H(?=[^DMPW]))',
128 'clostripain': r'R',
129 'cnbr': r'M',
130 'enterokinase': r'(?<=[DE]{3})K',
131 'factor xa': r'(?<=[AFGILTVM][DE]G)R',
132 'formic acid': r'D',
133 'glutamyl endopeptidase': r'E',
134 'granzyme b': r'(?<=IEP)D',
135 'hydroxylamine': r'N(?=G)',
136 'iodosobenzoic acid': r'W',
137 'lysc': r'K',
138 'ntcb': r'\w(?=C)',
139 'pepsin ph1.3': r'((?<=[^HKR][^P])[^R](?=[FLWY][^P]))|'
140 r'((?<=[^HKR][^P])[FLWY](?=\w[^P]))',
141 'pepsin ph2.0': r'((?<=[^HKR][^P])[^R](?=[FL][^P]))|'
142 r'((?<=[^HKR][^P])[FL](?=\w[^P]))',
143 'proline endopeptidase': r'(?<=[HKR])P(?=[^P])',
144 'proteinase k': r'[AEFILTVWY]',
145 'staphylococcal peptidase i': r'(?<=[^E])E',
146 'thermolysin': r'[^DE](?=[AFILMV])',
147 'thrombin': r'((?<=G)R(?=G))|'
148 r'((?<=[AFGILTVM][AFGILTVWA]P)R(?=[^DE][^DE]))',
149 'trypsin': r'([KR](?=[^P]))|((?<=W)K(?=P))|((?<=M)R(?=P))'
150 }
151 """
152 This dict contains regular expressions for cleavage rules of the most
153 popular proteolytic enzymes. The rules were taken from the
154 `PeptideCutter tool
155 <http://ca.expasy.org/tools/peptidecutter/peptidecutter_enzymes.html>`_
156 at Expasy.
157 """
158