Mercurial > repos > jjohnson > ensembl_cdna_translate
comparison digest.py @ 0:a8218b11216f draft
Uploaded
author | jjohnson |
---|---|
date | Wed, 29 Nov 2017 15:55:59 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:a8218b11216f |
---|---|
1 # Copyright 2012 Anton Goloborodko, Lev Levitsky | |
2 # | |
3 # Licensed under the Apache License, Version 2.0 (the "License"); | |
4 # you may not use this file except in compliance with the License. | |
5 # You may obtain a copy of the License at | |
6 # | |
7 # http://www.apache.org/licenses/LICENSE-2.0 | |
8 # | |
9 # Unless required by applicable law or agreed to in writing, software | |
10 # distributed under the License is distributed on an "AS IS" BASIS, | |
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
12 # See the License for the specific language governing permissions and | |
13 # limitations under the License. | |
14 | |
15 import re | |
16 from collections import deque | |
17 import itertools as it | |
18 | |
19 def cleave(sequence, rule, missed_cleavages=0, min_length=None): | |
20 """Cleaves a polypeptide sequence using a given rule. | |
21 | |
22 Parameters | |
23 ---------- | |
24 sequence : str | |
25 The sequence of a polypeptide. | |
26 | |
27 .. note:: | |
28 The sequence is expected to be in one-letter uppercase notation. | |
29 Otherwise, some of the cleavage rules in :py:data:`expasy_rules` | |
30 will not work as expected. | |
31 | |
32 rule : str or compiled regex | |
33 A regular expression describing the site of cleavage. It is recommended | |
34 to design the regex so that it matches only the residue whose C-terminal | |
35 bond is to be cleaved. All additional requirements should be specified | |
36 using `lookaround assertions | |
37 <http://www.regular-expressions.info/lookaround.html>`_. | |
38 :py:data:`expasy_rules` contains cleavage rules for popular cleavage agents. | |
39 missed_cleavages : int, optional | |
40 Maximum number of allowed missed cleavages. Defaults to 0. | |
41 min_length : int or None, optional | |
42 Minimum peptide length. Defaults to :py:const:`None`. | |
43 | |
44 ..note :: | |
45 This checks for string length, which is only correct for one-letter | |
46 notation and not for full *modX*. Use :py:func:`length` manually if | |
47 you know what you are doing and apply :py:func:`cleave` to *modX* | |
48 sequences. | |
49 | |
50 Returns | |
51 ------- | |
52 out : set | |
53 A set of unique (!) peptides. | |
54 | |
55 Examples | |
56 -------- | |
57 >>> cleave('AKAKBK', expasy_rules['trypsin'], 0) == {'AK', 'BK'} | |
58 True | |
59 >>> cleave('GKGKYKCK', expasy_rules['trypsin'], 2) == \ | |
60 {'CK', 'GKYK', 'YKCK', 'GKGK', 'GKYKCK', 'GK', 'GKGKYK', 'YK'} | |
61 True | |
62 | |
63 """ | |
64 return set(_cleave(sequence, rule, missed_cleavages, min_length)) | |
65 | |
66 def _cleave(sequence, rule, missed_cleavages=0, min_length=None): | |
67 """Like :py:func:`cleave`, but the result is a list. Refer to | |
68 :py:func:`cleave` for explanation of parameters. | |
69 """ | |
70 peptides = [] | |
71 ml = missed_cleavages+2 | |
72 trange = range(ml) | |
73 cleavage_sites = deque([0], maxlen=ml) | |
74 cl = 1 | |
75 for i in it.chain([x.end() for x in re.finditer(rule, sequence)], | |
76 [None]): | |
77 cleavage_sites.append(i) | |
78 if cl < ml: | |
79 cl += 1 | |
80 for j in trange[:cl-1]: | |
81 seq = sequence[cleavage_sites[j]:cleavage_sites[-1]] | |
82 if seq: | |
83 if min_length is None or len(seq) >= min_length: | |
84 peptides.append(seq) | |
85 return peptides | |
86 | |
87 def num_sites(sequence, rule, **kwargs): | |
88 """Count the number of sites where `sequence` can be cleaved using | |
89 the given `rule` (e.g. number of miscleavages for a peptide). | |
90 | |
91 Parameters | |
92 ---------- | |
93 sequence : str | |
94 The sequence of a polypeptide. | |
95 rule : str or compiled regex | |
96 A regular expression describing the site of cleavage. It is recommended | |
97 to design the regex so that it matches only the residue whose C-terminal | |
98 bond is to be cleaved. All additional requirements should be specified | |
99 using `lookaround assertions | |
100 <http://www.regular-expressions.info/lookaround.html>`_. | |
101 labels : list, optional | |
102 A list of allowed labels for amino acids and terminal modifications. | |
103 | |
104 Returns | |
105 ------- | |
106 out : int | |
107 Number of cleavage sites. | |
108 """ | |
109 return len(_cleave(sequence, rule, **kwargs)) - 1 | |
110 | |
111 expasy_rules = { | |
112 'arg-c': r'R', | |
113 'asp-n': r'\w(?=D)', | |
114 'bnps-skatole' : r'W', | |
115 'caspase 1': r'(?<=[FWYL]\w[HAT])D(?=[^PEDQKR])', | |
116 'caspase 2': r'(?<=DVA)D(?=[^PEDQKR])', | |
117 'caspase 3': r'(?<=DMQ)D(?=[^PEDQKR])', | |
118 'caspase 4': r'(?<=LEV)D(?=[^PEDQKR])', | |
119 'caspase 5': r'(?<=[LW]EH)D', | |
120 'caspase 6': r'(?<=VE[HI])D(?=[^PEDQKR])', | |
121 'caspase 7': r'(?<=DEV)D(?=[^PEDQKR])', | |
122 'caspase 8': r'(?<=[IL]ET)D(?=[^PEDQKR])', | |
123 'caspase 9': r'(?<=LEH)D', | |
124 'caspase 10': r'(?<=IEA)D', | |
125 'chymotrypsin high specificity' : r'([FY](?=[^P]))|(W(?=[^MP]))', | |
126 'chymotrypsin low specificity': | |
127 r'([FLY](?=[^P]))|(W(?=[^MP]))|(M(?=[^PY]))|(H(?=[^DMPW]))', | |
128 'clostripain': r'R', | |
129 'cnbr': r'M', | |
130 'enterokinase': r'(?<=[DE]{3})K', | |
131 'factor xa': r'(?<=[AFGILTVM][DE]G)R', | |
132 'formic acid': r'D', | |
133 'glutamyl endopeptidase': r'E', | |
134 'granzyme b': r'(?<=IEP)D', | |
135 'hydroxylamine': r'N(?=G)', | |
136 'iodosobenzoic acid': r'W', | |
137 'lysc': r'K', | |
138 'ntcb': r'\w(?=C)', | |
139 'pepsin ph1.3': r'((?<=[^HKR][^P])[^R](?=[FLWY][^P]))|' | |
140 r'((?<=[^HKR][^P])[FLWY](?=\w[^P]))', | |
141 'pepsin ph2.0': r'((?<=[^HKR][^P])[^R](?=[FL][^P]))|' | |
142 r'((?<=[^HKR][^P])[FL](?=\w[^P]))', | |
143 'proline endopeptidase': r'(?<=[HKR])P(?=[^P])', | |
144 'proteinase k': r'[AEFILTVWY]', | |
145 'staphylococcal peptidase i': r'(?<=[^E])E', | |
146 'thermolysin': r'[^DE](?=[AFILMV])', | |
147 'thrombin': r'((?<=G)R(?=G))|' | |
148 r'((?<=[AFGILTVM][AFGILTVWA]P)R(?=[^DE][^DE]))', | |
149 'trypsin': r'([KR](?=[^P]))|((?<=W)K(?=P))|((?<=M)R(?=P))' | |
150 } | |
151 """ | |
152 This dict contains regular expressions for cleavage rules of the most | |
153 popular proteolytic enzymes. The rules were taken from the | |
154 `PeptideCutter tool | |
155 <http://ca.expasy.org/tools/peptidecutter/peptidecutter_enzymes.html>`_ | |
156 at Expasy. | |
157 """ | |
158 |