Mercurial > repos > jjohnson > ensembl_cdna_translate

diff digest.py @ 0:a8218b11216f draft
Uploaded
author: jjohnson
date: Wed, 29 Nov 2017 15:55:59 -0500
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/digest.py	Wed Nov 29 15:55:59 2017 -0500
@@ -0,0 +1,158 @@
+#   Copyright 2012 Anton Goloborodko, Lev Levitsky
+#
+#   Licensed under the Apache License, Version 2.0 (the "License");
+#   you may not use this file except in compliance with the License.
+#   You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the License is distributed on an "AS IS" BASIS,
+#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   See the License for the specific language governing permissions and
+#   limitations under the License.
+
+import re
+from collections import deque
+import itertools as it
+
+def cleave(sequence, rule, missed_cleavages=0, min_length=None):
+    """Cleaves a polypeptide sequence using a given rule.
+
+    Parameters
+    ----------
+    sequence : str
+        The sequence of a polypeptide.
+
+        .. note::
+            The sequence is expected to be in one-letter uppercase notation.
+            Otherwise, some of the cleavage rules in :py:data:`expasy_rules`
+            will not work as expected.
+
+    rule : str or compiled regex
+        A regular expression describing the site of cleavage. It is recommended
+        to design the regex so that it matches only the residue whose C-terminal
+        bond is to be cleaved. All additional requirements should be specified
+        using `lookaround assertions
+        <http://www.regular-expressions.info/lookaround.html>`_.
+        :py:data:`expasy_rules` contains cleavage rules for popular cleavage agents.
+    missed_cleavages : int, optional
+        Maximum number of allowed missed cleavages. Defaults to 0.
+    min_length : int or None, optional
+        Minimum peptide length. Defaults to :py:const:`None`.
+
+        ..note ::
+            This checks for string length, which is only correct for one-letter
+            notation and not for full *modX*. Use :py:func:`length` manually if
+            you know what you are doing and apply :py:func:`cleave` to *modX*
+            sequences.
+
+    Returns
+    -------
+    out : set
+        A set of unique (!) peptides.
+
+    Examples
+    --------
+    >>> cleave('AKAKBK', expasy_rules['trypsin'], 0) == {'AK', 'BK'}
+    True
+    >>> cleave('GKGKYKCK', expasy_rules['trypsin'], 2) == \
+    {'CK', 'GKYK', 'YKCK', 'GKGK', 'GKYKCK', 'GK', 'GKGKYK', 'YK'}
+    True
+
+    """
+    return set(_cleave(sequence, rule, missed_cleavages, min_length))
+
+def _cleave(sequence, rule, missed_cleavages=0, min_length=None):
+    """Like :py:func:`cleave`, but the result is a list. Refer to
+    :py:func:`cleave` for explanation of parameters.
+    """
+    peptides = []
+    ml = missed_cleavages+2
+    trange = range(ml)
+    cleavage_sites = deque([0], maxlen=ml)
+    cl = 1
+    for i in it.chain([x.end() for x in re.finditer(rule, sequence)],
+                   [None]):
+        cleavage_sites.append(i)
+        if cl < ml:
+            cl += 1
+        for j in trange[:cl-1]:
+            seq = sequence[cleavage_sites[j]:cleavage_sites[-1]]
+            if seq:
+                if min_length is None or len(seq) >= min_length:
+                    peptides.append(seq)
+    return peptides
+
+def num_sites(sequence, rule, **kwargs):
+    """Count the number of sites where `sequence` can be cleaved using
+    the given `rule` (e.g. number of miscleavages for a peptide).
+
+    Parameters
+    ----------
+    sequence : str
+        The sequence of a polypeptide.
+    rule : str or compiled regex
+        A regular expression describing the site of cleavage. It is recommended
+        to design the regex so that it matches only the residue whose C-terminal
+        bond is to be cleaved. All additional requirements should be specified
+        using `lookaround assertions
+        <http://www.regular-expressions.info/lookaround.html>`_.
+    labels : list, optional
+        A list of allowed labels for amino acids and terminal modifications.
+
+    Returns
+    -------
+    out : int
+        Number of cleavage sites.
+    """
+    return len(_cleave(sequence, rule, **kwargs)) - 1
+
+expasy_rules = {
+    'arg-c':         r'R',
+    'asp-n':         r'\w(?=D)',
+    'bnps-skatole' : r'W',
+    'caspase 1':     r'(?<=[FWYL]\w[HAT])D(?=[^PEDQKR])',
+    'caspase 2':     r'(?<=DVA)D(?=[^PEDQKR])',
+    'caspase 3':     r'(?<=DMQ)D(?=[^PEDQKR])',
+    'caspase 4':     r'(?<=LEV)D(?=[^PEDQKR])',
+    'caspase 5':     r'(?<=[LW]EH)D',
+    'caspase 6':     r'(?<=VE[HI])D(?=[^PEDQKR])',
+    'caspase 7':     r'(?<=DEV)D(?=[^PEDQKR])',
+    'caspase 8':     r'(?<=[IL]ET)D(?=[^PEDQKR])',
+    'caspase 9':     r'(?<=LEH)D',
+    'caspase 10':    r'(?<=IEA)D',
+    'chymotrypsin high specificity' : r'([FY](?=[^P]))|(W(?=[^MP]))',
+    'chymotrypsin low specificity':
+        r'([FLY](?=[^P]))|(W(?=[^MP]))|(M(?=[^PY]))|(H(?=[^DMPW]))',
+    'clostripain':   r'R',
+    'cnbr':          r'M',
+    'enterokinase':  r'(?<=[DE]{3})K',
+    'factor xa':     r'(?<=[AFGILTVM][DE]G)R',
+    'formic acid':   r'D',
+    'glutamyl endopeptidase': r'E',
+    'granzyme b':    r'(?<=IEP)D',
+    'hydroxylamine': r'N(?=G)',
+    'iodosobenzoic acid': r'W',
+    'lysc':          r'K',
+    'ntcb':          r'\w(?=C)',
+    'pepsin ph1.3':  r'((?<=[^HKR][^P])[^R](?=[FLWY][^P]))|'
+                     r'((?<=[^HKR][^P])[FLWY](?=\w[^P]))',
+    'pepsin ph2.0':  r'((?<=[^HKR][^P])[^R](?=[FL][^P]))|'
+                     r'((?<=[^HKR][^P])[FL](?=\w[^P]))',
+    'proline endopeptidase': r'(?<=[HKR])P(?=[^P])',
+    'proteinase k':  r'[AEFILTVWY]',
+    'staphylococcal peptidase i': r'(?<=[^E])E',
+    'thermolysin':   r'[^DE](?=[AFILMV])',
+    'thrombin':      r'((?<=G)R(?=G))|'
+                     r'((?<=[AFGILTVM][AFGILTVWA]P)R(?=[^DE][^DE]))',
+    'trypsin':       r'([KR](?=[^P]))|((?<=W)K(?=P))|((?<=M)R(?=P))'
+    }
+"""
+This dict contains regular expressions for cleavage rules of the most
+popular proteolytic enzymes. The rules were taken from the
+`PeptideCutter tool
+<http://ca.expasy.org/tools/peptidecutter/peptidecutter_enzymes.html>`_
+at Expasy.
+"""
+
author	jjohnson
date	Wed, 29 Nov 2017 15:55:59 -0500
parents
children