0
|
1 #!/usr/bin/env python
|
|
2 import sys
|
|
3 import argparse
|
|
4 from CPT_GFFParser import gffParse, gffWrite, gffSeqFeature
|
|
5 from Bio import SeqIO
|
|
6 from Bio.SeqFeature import SeqFeature
|
|
7 from Bio.SeqFeature import FeatureLocation
|
|
8 import logging
|
|
9
|
|
10 logging.basicConfig(level=logging.INFO)
|
|
11
|
|
12
|
|
13 def glimmer3_to_gff3(glimmer, genome):
|
|
14 seq_dict = SeqIO.to_dict(SeqIO.parse(genome, "fasta"))
|
|
15
|
|
16 current_record = None
|
|
17 for line in glimmer:
|
|
18 if line.startswith(">"):
|
|
19 chromId = line.strip().replace(">", "")
|
|
20 if chromId in seq_dict:
|
|
21 if current_record is not None:
|
|
22 yield current_record
|
|
23 current_record = seq_dict[chromId]
|
|
24 else:
|
|
25 raise Exception(
|
|
26 "Found results for sequence %s which was not in fasta file sequences (%s)"
|
|
27 % (chromId, ", ".join(seq_dict.keys()))
|
|
28 )
|
|
29
|
|
30 if not line.startswith(">"):
|
|
31 (gene_id, gstart, gend, phase, score) = line.strip().split()
|
|
32 gstart = int(gstart)
|
|
33 gend = int(gend)
|
|
34
|
|
35 if "+" in phase:
|
|
36 strand = 1
|
|
37 start = gstart
|
|
38 end = gend
|
|
39 else:
|
|
40 strand = -1
|
|
41 start = gend
|
|
42 end = gstart
|
|
43
|
|
44 # Correct for gff3
|
|
45 start -= 1
|
|
46
|
|
47 if start > end:
|
|
48 #gene found on boundary (ex [4000, 200]) from glimmer assuming circular genome
|
|
49 #-------------start<=======|sequence end|========>end------
|
|
50 if strand > 0:
|
|
51 end = len(current_record)
|
|
52 else:
|
|
53 start = 0
|
|
54 gene_id+="_truncated"
|
|
55
|
|
56 cds_feat = gffSeqFeature(
|
|
57 FeatureLocation(start, end),
|
|
58 type="CDS",
|
|
59 strand=strand,
|
|
60 id="%s.%s" % (current_record.id, gene_id),
|
|
61 qualifiers={
|
|
62 "source": "Glimmer3",
|
|
63 "ID": "%s.cds_%s" % (current_record.id, gene_id),
|
|
64 },
|
|
65 source="Glimmer3"
|
|
66 )
|
|
67
|
|
68 gene = gffSeqFeature(
|
|
69 FeatureLocation(start, end),
|
|
70 type="gene",
|
|
71 strand=strand,
|
|
72 id="%s.%s" % (current_record.id, gene_id),
|
|
73 qualifiers={
|
|
74 "source": "Glimmer3",
|
|
75 "ID": "%s.%s" % (current_record.id, gene_id),
|
|
76 },
|
|
77 source="Glimmer3"
|
|
78 )
|
|
79 gene.sub_features = [cds_feat]
|
|
80 current_record.features.append(gene)
|
|
81 yield current_record
|
|
82
|
|
83
|
|
84 if __name__ == "__main__":
|
|
85 parser = argparse.ArgumentParser(description="Convert Glimmer to GFF3")
|
|
86 parser.add_argument("glimmer", type=argparse.FileType("r"), help="Glimmer3 Output")
|
|
87 parser.add_argument("genome", type=argparse.FileType("r"), help="Fasta Genome")
|
|
88 args = parser.parse_args()
|
|
89
|
|
90 for result in glimmer3_to_gff3(**vars(args)):
|
|
91 gffWrite([result], sys.stdout)
|