annotate shinefind.py @ 0:f678e282b320 draft default tip

"planemo upload"
author cpt_testbed
date Fri, 06 May 2022 07:07:23 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
1 #!/usr/bin/env python
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
2 import re
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
3 import sys
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
4 import argparse
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
5 import logging
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
6 from CPT_GFFParser import gffParse, gffWrite, gffSeqFeature
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
7 from Bio import SeqIO
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
8 from Bio.SeqRecord import SeqRecord
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
9 from Bio.SeqFeature import FeatureLocation
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
10 from gff3 import (
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
11 feature_lambda,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
12 feature_test_type,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
13 feature_test_true,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
14 feature_test_quals,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
15 get_id,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
16 ensure_location_in_bounds,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
17 )
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
18
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
19 logging.basicConfig(level=logging.INFO)
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
20 log = logging.getLogger()
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
21
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
22
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
23 class NaiveSDCaller(object):
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
24
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
25 # TODO May make switch for different sequence sets
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
26 SD_SEQUENCES = (
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
27 "AGGAGGT",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
28 "GGAGGT",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
29 "AGGAGG",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
30 "GGGGGG",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
31 "AGGAG",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
32 "GAGGT",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
33 "GGAGG",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
34 "GGGGG",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
35 "AGGT",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
36 "GGGT",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
37 "GAGG",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
38 "GGGG",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
39 "AGGA",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
40 "GGAG",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
41 "GGA",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
42 "GAG",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
43 "AGG",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
44 "GGT",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
45 "GGG",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
46 )
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
47
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
48 def __init__(self):
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
49 self.sd_reg = [re.compile(x, re.IGNORECASE) for x in self.SD_SEQUENCES]
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
50
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
51 def list_sds(self, sequence, sd_min=3, sd_max=17):
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
52 hits = []
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
53 for regex in self.sd_reg:
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
54 for match in regex.finditer(sequence):
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
55 spacing = len(sequence) - len(match.group()) - match.start()
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
56 if sd_max >= spacing+sd_min and spacing+sd_min >= sd_min:
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
57 #if the spacing is within gap limits, add
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
58 #(search space is [sd_max+7 .. sd_min] so actual gap is spacing+sd_min)
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
59 #print('min %d max %d - adding SD with gap %d' % (sd_min, sd_max, spacing+sd_min))
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
60 hits.append(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
61 {
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
62 "spacing": spacing,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
63 "hit": match.group(),
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
64 "start": match.start(),
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
65 "end": match.end(),
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
66 "len": len(match.group()),
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
67 }
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
68 )
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
69 hits = sorted(hits, key= lambda x: (-x['len'],x['spacing']))
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
70 return hits
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
71
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
72 @classmethod
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
73 def highlight_sd(cls, sequence, start, end):
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
74 return " ".join(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
75 [
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
76 sequence[0:start].lower(),
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
77 sequence[start:end].upper(),
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
78 sequence[end:].lower(),
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
79 ]
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
80 )
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
81
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
82 @classmethod
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
83 def to_features(cls, hits, strand, parent_start, parent_end, feature_id=None, sd_min=3, sd_max=17):
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
84 results = []
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
85 for idx, hit in enumerate(hits):
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
86 # gene complement(124..486)
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
87 # -1 491 501 0 5 5
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
88 # -1 491 501 0 4 5
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
89 # -1 491 501 1 4 5
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
90 # -1 491 501 2 3 5
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
91 # -1 491 501 1 3 5
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
92 # -1 491 501 0 3 5
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
93
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
94 qualifiers = {
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
95 "source": "CPT_ShineFind",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
96 "ID": "%s.rbs-%s" % (feature_id, idx),
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
97 }
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
98
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
99 if strand > 0:
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
100 start = parent_end - hit["spacing"] - hit["len"]
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
101 end = parent_end - hit["spacing"]
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
102 else:
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
103 start = parent_start + hit["spacing"]
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
104 end = parent_start + hit["spacing"] + hit["len"]
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
105 # check that the END of the SD sequence is within the given min/max of parent start/end
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
106
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
107 # gap is either the sd_start-cds_end (neg strand) or the sd_end-cds_start (pos strand)
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
108 # minimum absolute value of these two will be the proper gap regardless of strand
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
109 tmp = gffSeqFeature(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
110 FeatureLocation(min(start, end), max(start, end), strand=strand),
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
111 #FeatureLocation(min(start, end), max(start, end), strand=strand),
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
112 type="Shine_Dalgarno_sequence",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
113 qualifiers=qualifiers,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
114 )
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
115 results.append(tmp)
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
116 return results
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
117
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
118 def testFeatureUpstream(self, feature, record, sd_min=3, sd_max=17):
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
119 # Strand information necessary to getting correct upstream sequence
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
120 strand = feature.location.strand
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
121
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
122 # n_bases_upstream (plus/minus 7 upstream to make the min/max define the possible gap position)
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
123 if strand > 0:
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
124 start = feature.location.start - sd_max - 7
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
125 end = feature.location.start - sd_min
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
126 else:
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
127 start = feature.location.end + sd_min
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
128 end = feature.location.end + sd_max + 7
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
129
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
130 (start, end) = ensure_location_in_bounds(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
131 start=start, end=end, parent_length=len(record)
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
132 )
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
133
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
134 # Create our temp feature used to obtain correct portion of
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
135 # genome
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
136 tmp = gffSeqFeature(FeatureLocation(min(start, end), max(start, end), strand=strand), type="domain")
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
137 seq = str(tmp.extract(record.seq))
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
138 return self.list_sds(seq, sd_min, sd_max), start, end, seq
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
139
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
140 def hasSd(self, feature, record, sd_min=3, sd_max=17):
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
141 sds, start, end, seq = self.testFeatureUpstream(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
142 feature, record, sd_min=sd_min, sd_max=sd_max
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
143 )
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
144 return len(sds) > 0
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
145
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
146
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
147 # Cycle through subfeatures, set feature's location to be equal
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
148 # to the smallest start and largest end.
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
149 # Remove pending bugfix for feature display in Apollo
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
150 def fminmax(feature):
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
151 fmin = None
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
152 fmax = None
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
153 for sf in feature_lambda([feature], feature_test_true, {}, subfeatures=True):
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
154 if fmin is None:
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
155 fmin = sf.location.start
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
156 fmax = sf.location.end
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
157 if sf.location.start < fmin:
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
158 fmin = sf.location.start
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
159 if sf.location.end > fmax:
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
160 fmax = sf.location.end
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
161 return fmin, fmax
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
162
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
163
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
164 def fix_gene_boundaries(feature):
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
165 # There is a bug in Apollo whereby we have created gene
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
166 # features which are larger than expected, but we cannot see this.
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
167 # We only see a perfect sized gene + SD together.
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
168 #
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
169 # So, we clamp the location of the gene feature to the
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
170 # contained mRNAs. Will remove pending Apollo upgrade.
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
171 fmin, fmax = fminmax(feature)
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
172 if feature.location.strand > 0:
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
173 feature.location = FeatureLocation(fmin, fmax, strand=1)
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
174 else:
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
175 feature.location = FeatureLocation(fmin, fmax, strand=-1)
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
176 return feature
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
177
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
178 def shinefind(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
179 fasta,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
180 gff3,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
181 gff3_output=None,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
182 table_output=None,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
183 lookahead_min=3,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
184 lookahead_max=17,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
185 top_only=False,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
186 add=False,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
187 ):
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
188 table_output.write(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
189 "\t".join(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
190 [
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
191 "ID",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
192 "Name",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
193 "Terminus",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
194 "Terminus",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
195 "Strand",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
196 "Upstream Sequence",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
197 "SD",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
198 "Spacing",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
199 ]
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
200 )
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
201 + "\n"
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
202 )
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
203
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
204 sd_finder = NaiveSDCaller()
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
205 # Load up sequence(s) for GFF3 data
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
206 seq_dict = SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
207 # Parse GFF3 records
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
208 for record in gffParse(gff3, base_dict=seq_dict):
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
209 # Shinefind's gff3_output.
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
210 gff3_output_record = SeqRecord(record.seq, record.id)
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
211 # Filter out just coding sequences
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
212 ignored_features = []
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
213 for x in record.features:
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
214 # If feature X does NOT contain a CDS, add to ignored_features
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
215 # list. This means if we have a top level gene feature with or
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
216 # without a CDS subfeature, we're catch it appropriately here.
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
217 if (
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
218 len(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
219 list(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
220 feature_lambda(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
221 [x], feature_test_type, {"type": "CDS"}, subfeatures=True
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
222 )
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
223 )
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
224 )
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
225 == 0
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
226 ):
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
227 ignored_features.append(x)
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
228
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
229 # Loop over all gene features
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
230 for gene in feature_lambda(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
231 record.features, feature_test_type, {"type": "gene"}, subfeatures=True
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
232 ):
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
233
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
234 # Get the CDS from this gene.
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
235 feature = sorted(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
236 list(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
237 feature_lambda(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
238 gene.sub_features,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
239 feature_test_type,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
240 {"type": "CDS"},
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
241 subfeatures=True,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
242 )
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
243 ),
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
244 key=lambda x: x.location.start,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
245 )
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
246 # If no CDSs are in this gene feature, then quit
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
247 if len(feature) == 0:
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
248 # We've already caught these above in our ignored_features
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
249 # list, so we skip out on the rest of this for loop
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
250 continue
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
251 else:
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
252 # Otherwise pull the first on the strand.
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
253 feature = feature[0]
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
254
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
255 # Three different ways RBSs can be stored that we expect.
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
256 rbs_rbs = list(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
257 feature_lambda(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
258 gene.sub_features,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
259 feature_test_type,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
260 {"type": "RBS"},
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
261 subfeatures=False,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
262 )
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
263 )
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
264 rbs_sds = list(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
265 feature_lambda(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
266 gene.sub_features,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
267 feature_test_type,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
268 {"type": "Shine_Dalgarno_sequence"},
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
269 subfeatures=False,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
270 )
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
271 )
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
272 regulatory_elements = list(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
273 feature_lambda(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
274 gene.sub_features,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
275 feature_test_type,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
276 {"type": "regulatory"},
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
277 subfeatures=False,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
278 )
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
279 )
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
280 rbs_regulatory = list(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
281 feature_lambda(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
282 regulatory_elements,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
283 feature_test_quals,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
284 {"regulatory_class": ["ribosome_binding_site"]},
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
285 subfeatures=False,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
286 )
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
287 )
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
288 rbss = rbs_rbs + rbs_sds + rbs_regulatory
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
289
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
290 # If someone has already annotated an RBS, we move to the next gene
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
291 if len(rbss) > 0:
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
292 log.debug("Has %s RBSs", len(rbss))
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
293 ignored_features.append(gene)
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
294 continue
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
295
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
296 sds, start, end, seq = sd_finder.testFeatureUpstream(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
297 feature, record, sd_min=lookahead_min, sd_max=lookahead_max
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
298 )
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
299
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
300 feature_id = get_id(feature)
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
301 sd_features = sd_finder.to_features(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
302 sds, feature.location.strand, start, end, feature_id=feature.id
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
303 )
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
304
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
305 human_strand = "+" if feature.location.strand == 1 else "-"
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
306
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
307 # http://book.pythontips.com/en/latest/for_-_else.html
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
308 log.debug("Found %s SDs", len(sds))
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
309 for (sd, sd_feature) in zip(sds, sd_features):
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
310 # If we only want the top feature, after the bulk of the
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
311 # forloop executes once, we append the top feature, and fake a
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
312 # break, because an actual break triggers the else: block
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
313 table_output.write(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
314 "\t".join(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
315 map(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
316 str,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
317 [
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
318 feature.id,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
319 feature_id,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
320 feature.location.start,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
321 feature.location.end,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
322 human_strand,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
323 sd_finder.highlight_sd(seq, sd["start"], sd["end"]),
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
324 sd["hit"],
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
325 int(sd["spacing"]) + lookahead_min,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
326 ],
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
327 )
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
328 )
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
329 + "\n"
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
330 )
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
331
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
332 if add:
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
333 # Append the top RBS to the gene feature
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
334 gene.sub_features.append(sd_feature)
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
335 # Pick out start/end locations for all sub_features
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
336 locations = [x.location.start for x in gene.sub_features] + [
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
337 x.location.end for x in gene.sub_features
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
338 ]
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
339 # Update gene's start/end to be inclusive
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
340 gene.location._start = min(locations)
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
341 gene.location._end = max(locations)
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
342 # Also register the feature with the separate GFF3 output
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
343 sd_feature = fix_gene_boundaries(sd_feature)
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
344 gff3_output_record.features.append(sd_feature)
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
345
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
346 if top_only or sd == (sds[-1]):
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
347 break
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
348 else:
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
349 table_output.write(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
350 "\t".join(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
351 map(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
352 str,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
353 [
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
354 feature.id,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
355 feature_id,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
356 feature.location.start,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
357 feature.location.end,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
358 human_strand,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
359 seq,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
360 None,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
361 -1,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
362 ],
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
363 )
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
364 )
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
365 + "\n"
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
366 )
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
367
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
368 record.annotations = {}
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
369 gffWrite([record], sys.stdout)
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
370
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
371 gff3_output_record.features = sorted(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
372 gff3_output_record.features, key=lambda x: x.location.start
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
373 )
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
374 gff3_output_record.annotations = {}
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
375 gffWrite([gff3_output_record], gff3_output)
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
376
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
377
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
378 if __name__ == "__main__":
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
379 parser = argparse.ArgumentParser(description="Identify shine-dalgarno sequences")
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
380 parser.add_argument("fasta", type=argparse.FileType("r"), help="Fasta Genome")
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
381 parser.add_argument("gff3", type=argparse.FileType("r"), help="GFF3 annotations")
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
382
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
383 parser.add_argument(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
384 "--gff3_output",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
385 type=argparse.FileType("w"),
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
386 help="GFF3 Output",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
387 default="shinefind.gff3",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
388 )
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
389 parser.add_argument(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
390 "--table_output",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
391 type=argparse.FileType("w"),
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
392 help="Tabular Output",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
393 default="shinefind.tbl",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
394 )
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
395
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
396 parser.add_argument(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
397 "--lookahead_min",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
398 nargs="?",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
399 type=int,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
400 help="Number of bases upstream of CDSs to end search",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
401 default=3,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
402 )
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
403 parser.add_argument(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
404 "--lookahead_max",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
405 nargs="?",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
406 type=int,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
407 help="Number of bases upstream of CDSs to begin search",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
408 default=17,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
409 )
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
410
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
411 parser.add_argument("--top_only", action="store_true", help="Only report best hits")
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
412 parser.add_argument(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
413 "--add",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
414 action="store_true",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
415 help='Function in "addition" mode whereby the '
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
416 + "RBSs are added directly to the gene model.",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
417 )
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
418
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
419 args = parser.parse_args()
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
420 shinefind(**vars(args))