0
|
1 #!/usr/bin/env python
|
|
2 import os
|
|
3 import sys
|
|
4 import argparse
|
|
5 import subprocess
|
|
6 import tempfile
|
|
7 import re
|
|
8 import logging
|
|
9 from Bio.Seq import Seq
|
|
10 from Bio.SeqRecord import SeqRecord
|
|
11 from Bio.SeqFeature import SeqFeature, FeatureLocation
|
|
12 from CPT_GFFParser import gffParse, gffWrite, gffSeqFeature
|
|
13 from gff3 import feature_lambda, feature_test_type
|
|
14
|
|
15 logging.basicConfig(level=logging.INFO)
|
|
16 log = logging.getLogger(__name__)
|
|
17 SCRIPT_PATH = os.path.dirname(os.path.realpath(__file__))
|
|
18
|
|
19 REGEX_TERM = re.compile(
|
|
20 " TERM \d+ \s* (\d+) - (\d+)\s*(\+|-) [^ ]* \s* (\d+)\s* ([0-9.-]+)\s* -([0-9.]+)\s*\|\s*(.*)"
|
|
21 )
|
|
22 PARTS_TERM = re.compile(" ([^ ]*)\s+([^ ]*)\s+([^ ]*)\s+([^ ]*)\s+([^ ]*)")
|
|
23 COLS = (
|
|
24 "start",
|
|
25 "end",
|
|
26 "strand",
|
|
27 "confidence",
|
|
28 "hp score",
|
|
29 "tail score",
|
|
30 "notes",
|
|
31 "5' tail",
|
|
32 "5' stem",
|
|
33 "loop",
|
|
34 "3' stem",
|
|
35 "3' tail",
|
|
36 )
|
|
37
|
|
38
|
|
39 def build_expterm():
|
|
40 pass
|
|
41
|
|
42
|
|
43 def generate_annotation_file(gff3):
|
|
44 # TODO: cleanup
|
|
45 t = tempfile.NamedTemporaryFile(mode="w",delete=False, suffix=".coords")
|
|
46 for rec in gffParse(gff3):
|
|
47 features = feature_lambda(
|
|
48 rec.features, feature_test_type, {"type": "CDS"}, subfeatures=False
|
|
49 )
|
|
50 for feature in sorted(features, key=lambda x: x.location.start):
|
|
51 t.write(
|
|
52 "\t".join(
|
|
53 map(
|
|
54 str,
|
|
55 [
|
|
56 feature.id,
|
|
57 feature.location.start + 1,
|
|
58 feature.location.end,
|
|
59 rec.id,
|
|
60 ],
|
|
61 )
|
|
62 )
|
|
63 + "\n"
|
|
64 )
|
|
65 name = t.name
|
|
66 t.close()
|
|
67 return name
|
|
68
|
|
69
|
|
70 def run_transterm(expterm, fasta, annotations):
|
|
71 output = subprocess.check_output(
|
|
72 ["transterm", "-p", expterm, "--all-context", fasta.name, annotations]
|
|
73 )
|
|
74 return output
|
|
75
|
|
76
|
|
77 def pairwise(it):
|
|
78 it = iter(it)
|
|
79 while True:
|
|
80 try:
|
|
81 yield next(it), next(it)
|
|
82 except StopIteration:
|
|
83 return
|
|
84
|
|
85 def parse_transterm(data):
|
|
86 data = data.decode("utf-8")
|
|
87 data = data.split("SEQUENCE")[1:]
|
|
88 for datum in data:
|
|
89 lines = datum.split("\n")
|
|
90
|
|
91 seq_name = lines[0][1:]
|
|
92 seq_name = seq_name[0 : seq_name.index(" ")]
|
|
93
|
|
94 record = SeqRecord(Seq("ACTG"), id=seq_name)
|
|
95
|
|
96 important_lines = [
|
|
97 x for x in lines if len(x.strip()) > 0 and x.startswith(" ")
|
|
98 ]
|
|
99 # Must have an even #
|
|
100 assert len(important_lines) % 2 == 0
|
|
101 for idx, (a, b) in enumerate(pairwise(important_lines)):
|
|
102 parsed_data = REGEX_TERM.match(a).groups() + PARTS_TERM.match(b).groups()
|
|
103
|
|
104 pd = {k: v for (k, v) in zip(COLS, parsed_data)}
|
|
105 # start , end , strand , confidence , hp score , tail score , notes
|
|
106 # , 5' tail , 5'stem , loop , 3' stem , 3'loop
|
|
107 start = int(pd["start"])
|
|
108 end = int(pd["end"])
|
|
109
|
|
110 notes = {k: pd[k] for k in COLS[6:]}
|
|
111 notes2 = {}
|
|
112 # However, if on - strand, we need to swap 5', 3':
|
|
113 if pd["strand"] == "-":
|
|
114 # Also need to flip the data
|
|
115 notes2 = {
|
|
116 "[1] 5' stem": str(Seq(notes["3' stem"]).reverse_complement()),
|
|
117 "[2] loop": str(Seq(notes["loop"]).reverse_complement()),
|
|
118 "[3] 3' stem": str(Seq(notes["5' stem"]).reverse_complement()),
|
|
119 "[4] 3' tail": str(Seq(notes["5' tail"]).reverse_complement()),
|
|
120 }
|
|
121 else:
|
|
122 notes2 = {
|
|
123 "[1] 5' stem": notes["5' stem"],
|
|
124 "[2] loop": notes["loop"],
|
|
125 "[3] 3' stem": notes["3' stem"],
|
|
126 "[4] 3' tail": notes["3' tail"],
|
|
127 }
|
|
128
|
|
129 qualifiers = {
|
|
130 "score": [pd["confidence"]],
|
|
131 "source": ["TranstermHP"],
|
|
132 "ID": ["terminator_%s" % idx],
|
|
133 }
|
|
134 current_start = min(start, end) - 1
|
|
135 current_end = max(start, end)
|
|
136
|
|
137 if pd["strand"] == "+":
|
|
138 # Let's extend the current_end to include any Ts we find.
|
|
139 # Take the 3' tail, and check to see how many Ts we can strip:
|
|
140 #
|
|
141 # Updated algo: take as many chars as possible until >1 is non-T
|
|
142 # If the non-T is last, strip.
|
|
143 # Otherwise (internal), leave it.
|
|
144 addition = ""
|
|
145 prime3tail = notes["3' tail"]
|
|
146 for idx in range(len(prime3tail)):
|
|
147 if (
|
|
148 prime3tail[idx] != "T"
|
|
149 and addition.count("A")
|
|
150 + addition.count("C")
|
|
151 + addition.count("G")
|
|
152 == 1
|
|
153 ):
|
|
154 break
|
|
155
|
|
156 addition += prime3tail[idx]
|
|
157
|
|
158 if addition[-1] != "T":
|
|
159 addition = addition[0:-1]
|
|
160
|
|
161 current_end += len(addition)
|
|
162 else:
|
|
163 addition = ""
|
|
164 prime5tail = notes["5' tail"][::-1]
|
|
165 for idx in range(len(prime5tail)):
|
|
166 if (
|
|
167 prime5tail[idx] != "A"
|
|
168 and addition.count("T")
|
|
169 + addition.count("C")
|
|
170 + addition.count("G")
|
|
171 == 1
|
|
172 ):
|
|
173 break
|
|
174
|
|
175 addition += prime5tail[idx]
|
|
176
|
|
177 if addition[-1] != "A":
|
|
178 addition = addition[0:-1]
|
|
179
|
|
180 current_start -= len(addition)
|
|
181
|
|
182 qualifiers.update(notes2)
|
|
183 feature = gffSeqFeature(
|
|
184 FeatureLocation(current_start, current_end),
|
|
185 type="terminator",
|
|
186 strand=1 if pd["strand"] == "+" else -1,
|
|
187 qualifiers=qualifiers,
|
|
188 )
|
|
189 record.features.append(feature)
|
|
190
|
|
191 yield record
|
|
192
|
|
193
|
|
194 def main(fasta, gff3, existing_expterm="", **kwargs):
|
|
195 coords_file = generate_annotation_file(gff3)
|
|
196 transterm_output = run_transterm(existing_expterm, fasta, coords_file)
|
|
197 try:
|
|
198 os.unlink(coords_file)
|
|
199 except Exception:
|
|
200 pass
|
|
201 # Not my job
|
|
202
|
|
203 for record in parse_transterm(transterm_output):
|
|
204 yield record
|
|
205
|
|
206
|
|
207 if __name__ == "__main__":
|
|
208 parser = argparse.ArgumentParser(
|
|
209 description="Export corresponding sequence in genome from GFF3", epilog=""
|
|
210 )
|
|
211 parser.add_argument("fasta", type=argparse.FileType("r"), help="Fasta Genome")
|
|
212 parser.add_argument("gff3", type=argparse.FileType("r"), help="GFF3 File")
|
|
213
|
|
214 parser.add_argument(
|
|
215 "--min_conf",
|
|
216 type=int,
|
|
217 default=76,
|
|
218 help="Only output terminators with confidence >= n",
|
|
219 )
|
|
220
|
|
221 # parser.add_argument('--gc', type=float, default=-2.3, help='Score of a G-C pair')
|
|
222 # parser.add_argument('--au', type=float, default=-0.9, help='Score of an A-U pair')
|
|
223 # parser.add_argument('--gu', type=float, default=1.3, help='Score of a G-U pair')
|
|
224 # parser.add_argument('--mm', type=float, default=3.5, help='Score of any other pair')
|
|
225 # parser.add_argument('--gap', type=int, default=6, help='Score of a gap in the hairpin')
|
|
226 # parser.add_argument('--max_hp_score', type=float, default=-2, help='Maximum allowable hairpin score')
|
|
227 # parser.add_argument('--max_tail_score', type=float, default=-2.5, help='Maximum allowable tail score')
|
|
228 # parser.add_argument('--max_len', type=int, default=59, help='Total extent of hairpin <= n NT long')
|
|
229 # parser.add_argument('--min_stem', type=int, default=4, help='Stem must be n nucleotides long')
|
|
230 # parser.add_argument('--max_loop', type=int, default=13, help='The loop portion can be no longer than n')
|
|
231 # parser.add_argument('--min_loop', type=int, default=3, help='Loop portion of the hairpin must be at least n long')
|
|
232 # parser.add_argument('--uwin_require', type=int, default=3, help='Number of "U" nucleotides in the --uwin_length long region.')
|
|
233 # parser.add_argument('--loop_penalty', default='1,2,3,4,5,6,7,8,9,10,11', help='The cost of loops of various lengths can be set using --loop_penalty=f1,f2,f3,f4,f5,...fn, where f1 is the cost of a loop of length --min_loop, f2 is the cost of a loop of length --min_loop+1, as so on. If there are too few terms to cover up to max_loop, the last term is repeated.',)
|
|
234
|
|
235 args = parser.parse_args()
|
|
236
|
|
237 for record in main(
|
|
238 existing_expterm=os.path.join(SCRIPT_PATH, "expterm.dat"), **vars(args)
|
|
239 ):
|
|
240 gffWrite([record], sys.stdout)
|