annotate cpt_convert_glimmer_to_gff3.py @ 0:d5c3354c166d draft default tip

Uploaded
author cpt_testbed
date Fri, 29 Apr 2022 10:33:36 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
1 #!/usr/bin/env python
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
2 import sys
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
3 import argparse
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
4 from CPT_GFFParser import gffParse, gffWrite, gffSeqFeature
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
5 from Bio import SeqIO
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
6 from Bio.SeqFeature import SeqFeature
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
7 from Bio.SeqFeature import FeatureLocation
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
8 import logging
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
9
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
10 logging.basicConfig(level=logging.INFO)
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
11
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
12
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
13 def glimmer3_to_gff3(glimmer, genome):
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
14 seq_dict = SeqIO.to_dict(SeqIO.parse(genome, "fasta"))
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
15
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
16 current_record = None
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
17 for line in glimmer:
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
18 if line.startswith(">"):
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
19 chromId = line.strip().replace(">", "")
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
20 if chromId in seq_dict:
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
21 if current_record is not None:
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
22 yield current_record
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
23 current_record = seq_dict[chromId]
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
24 else:
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
25 raise Exception(
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
26 "Found results for sequence %s which was not in fasta file sequences (%s)"
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
27 % (chromId, ", ".join(seq_dict.keys()))
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
28 )
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
29
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
30 if not line.startswith(">"):
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
31 (gene_id, gstart, gend, phase, score) = line.strip().split()
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
32 gstart = int(gstart)
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
33 gend = int(gend)
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
34
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
35 if "+" in phase:
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
36 strand = 1
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
37 start = gstart
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
38 end = gend
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
39 else:
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
40 strand = -1
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
41 start = gend
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
42 end = gstart
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
43
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
44 # Correct for gff3
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
45 start -= 1
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
46
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
47 if start > end:
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
48 #gene found on boundary (ex [4000, 200]) from glimmer assuming circular genome
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
49 #-------------start<=======|sequence end|========>end------
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
50 if strand > 0:
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
51 end = len(current_record)
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
52 else:
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
53 start = 0
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
54 gene_id+="_truncated"
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
55
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
56 cds_feat = gffSeqFeature(
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
57 FeatureLocation(start, end),
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
58 type="CDS",
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
59 strand=strand,
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
60 id="%s.%s" % (current_record.id, gene_id),
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
61 qualifiers={
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
62 "source": "Glimmer3",
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
63 "ID": "%s.cds_%s" % (current_record.id, gene_id),
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
64 },
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
65 source="Glimmer3"
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
66 )
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
67
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
68 gene = gffSeqFeature(
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
69 FeatureLocation(start, end),
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
70 type="gene",
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
71 strand=strand,
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
72 id="%s.%s" % (current_record.id, gene_id),
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
73 qualifiers={
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
74 "source": "Glimmer3",
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
75 "ID": "%s.%s" % (current_record.id, gene_id),
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
76 },
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
77 source="Glimmer3"
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
78 )
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
79 gene.sub_features = [cds_feat]
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
80 current_record.features.append(gene)
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
81 yield current_record
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
82
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
83
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
84 if __name__ == "__main__":
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
85 parser = argparse.ArgumentParser(description="Convert Glimmer to GFF3")
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
86 parser.add_argument("glimmer", type=argparse.FileType("r"), help="Glimmer3 Output")
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
87 parser.add_argument("genome", type=argparse.FileType("r"), help="Fasta Genome")
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
88 args = parser.parse_args()
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
89
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
90 for result in glimmer3_to_gff3(**vars(args)):
d5c3354c166d Uploaded
cpt_testbed
parents:
diff changeset
91 gffWrite([result], sys.stdout)