annotate cpt_convert_mga_to_gff3.py @ 0:d5c3354c166d draft default tip

Uploaded
author cpt_testbed
date Fri, 29 Apr 2022 10:33:36 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
1 #!/usr/bin/env python
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
2 import sys
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
3 import argparse
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
4 from Bio import SeqIO
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
5 from Bio.SeqFeature import SeqFeature
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
6 from Bio.SeqFeature import FeatureLocation
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
7 from CPT_GFFParser import gffParse, gffWrite, gffSeqFeature
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
8 import logging
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
9
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
10 logging.basicConfig(level=logging.INFO)
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
11
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
12
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
13 def mga_to_gff3(mga_output, genome):
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
14 seq_dict = SeqIO.to_dict(SeqIO.parse(genome, "fasta"))
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
15
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
16 current_record = None
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
17 for line in mga_output:
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
18 if line.startswith("#"):
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
19 if line.startswith("# gc = ") or line.startswith("# self:"):
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
20 continue
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
21 chromId = line.strip().replace("# ", "")
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
22
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
23 if " " in chromId:
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
24 chromId = chromId[0 : chromId.index(" ")]
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
25
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
26 if chromId in seq_dict:
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
27 if current_record is not None:
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
28 yield current_record
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
29 current_record = seq_dict[chromId]
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
30 else:
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
31 raise Exception(
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
32 "Found results for sequence %s which was not in fasta file sequences (%s)"
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
33 % (chromId, ", ".join(seq_dict.keys()))
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
34 )
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
35
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
36 else:
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
37 (
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
38 gene_id,
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
39 start,
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
40 end,
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
41 strand,
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
42 phase,
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
43 complete,
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
44 score,
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
45 model,
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
46 rbs_start,
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
47 rbs_end,
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
48 rbs_score,
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
49 ) = line.strip().split("\t")
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
50 start = int(start)
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
51 end = int(end)
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
52 strand = +1 if strand == "+" else -1
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
53
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
54 # Correct for gff3
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
55 start -= 1
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
56
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
57 rbs_feat = None
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
58 if rbs_start != "-":
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
59 rbs_start = int(rbs_start)
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
60 rbs_end = int(rbs_end)
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
61 rbs_feat = gffSeqFeature(
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
62 FeatureLocation(rbs_start, rbs_end),
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
63 type="Shine_Dalgarno_sequence",
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
64 strand=strand,
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
65 qualifiers={
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
66 "ID": "%s.rbs_%s" % (current_record.id, gene_id),
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
67 "Source": "MGA",
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
68 },
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
69 phase=phase,
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
70 source="MGA"
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
71 )
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
72
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
73 cds_feat = gffSeqFeature(
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
74 FeatureLocation(start, end),
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
75 type="CDS",
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
76 strand=strand,
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
77 qualifiers={
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
78 "Source": "MGA",
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
79 "ID": "%s.cds_%s" % (current_record.id, gene_id),
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
80 },
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
81 phase=phase,
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
82 source="MGA"
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
83 )
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
84
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
85 if rbs_feat is not None:
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
86 if strand > 0:
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
87 gene_start = rbs_start
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
88 gene_end = end
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
89 else:
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
90 gene_start = start
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
91 gene_end = rbs_end
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
92 else:
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
93 gene_start = start
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
94 gene_end = end
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
95
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
96 gene = gffSeqFeature(
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
97 FeatureLocation(gene_start, gene_end),
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
98 type="gene",
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
99 strand=strand,
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
100 id="%s.%s" % (current_record.id, gene_id),
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
101 qualifiers={
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
102 "Source": "MGA",
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
103 "ID": "%s.%s" % (current_record.id, gene_id),
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
104 },
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
105 phase=phase,
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
106 source="MGA"
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
107 )
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
108
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
109 gene.sub_features = [cds_feat]
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
110 if rbs_feat is not None:
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
111 gene.sub_features.append(rbs_feat)
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
112 current_record.features.append(gene)
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
113 yield current_record
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
114
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
115
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
116 if __name__ == "__main__":
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
117 parser = argparse.ArgumentParser(description="Convert MGA to GFF3", epilog="")
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
118 parser.add_argument(
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
119 "mga_output", type=argparse.FileType("r"), help="MetaGeneAnnotator Output"
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
120 )
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
121 parser.add_argument("genome", type=argparse.FileType("r"), help="Fasta Genome")
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
122 args = parser.parse_args()
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
123
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
124 for result in mga_to_gff3(**vars(args)):
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
125 gffWrite([result], sys.stdout)