annotate lipory.py @ 0:f678e282b320 draft default tip

"planemo upload"
author cpt_testbed
date Fri, 06 May 2022 07:07:23 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
1 #!/usr/bin/env python
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
2 import re
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
3 import sys
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
4 import argparse
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
5 import logging
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
6 from Bio import SeqIO
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
7 from CPT_GFFParser import gffParse, gffWrite, gffSeqFeature
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
8 from gff3 import feature_lambda, feature_test_type, get_id
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
9 from Bio.SeqFeature import SeqFeature, FeatureLocation
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
10
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
11 logging.basicConfig(level=logging.INFO)
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
12 log = logging.getLogger(__name__)
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
13
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
14
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
15 def find_lipoprotein(gff3_file, fasta_genome, lipobox_mindist=10, lipobox_maxdist=40):
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
16 seq_dict = SeqIO.to_dict(SeqIO.parse(fasta_genome, "fasta"))
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
17
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
18 CASES = [
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
19 re.compile(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
20 "^.{%s,%s}[ILMFTV][^REKD][GAS]C" % (lipobox_mindist, lipobox_maxdist)
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
21 ),
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
22 re.compile(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
23 "^.{%s,%s}AW[AGS]C" % (lipobox_mindist, lipobox_maxdist)
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
24 ),
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
25 # Make sure to not have multiple cases that share matches, will introduce duplicate features into gff3 file
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
26 ]
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
27
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
28 for record in gffParse(gff3_file, base_dict=seq_dict):
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
29 good_features = []
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
30
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
31 genes = list(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
32 feature_lambda(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
33 record.features, feature_test_type, {"type": "gene"}, subfeatures=True
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
34 )
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
35 )
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
36 for gene in genes:
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
37 cdss = list(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
38 feature_lambda(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
39 gene.sub_features,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
40 feature_test_type,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
41 {"type": "CDS"},
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
42 subfeatures=False,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
43 )
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
44 )
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
45 if len(cdss) == 0:
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
46 continue
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
47
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
48 for cds in cdss:
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
49 try:
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
50 tmpseq = str(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
51 cds.extract(record.seq).translate(table=11, cds=True)
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
52 ).replace("*", "")
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
53 except:
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
54 continue
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
55
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
56 for case in CASES:
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
57 m = case.search(tmpseq)
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
58 if m:
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
59 if cds.location.strand > 0:
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
60 start = cds.location.start + (3 * (m.end() - 4))
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
61 end = cds.location.start + (3 * m.end())
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
62 else:
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
63 start = cds.location.end - (3 * (m.end() - 4))
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
64 end = cds.location.end - (3 * m.end())
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
65
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
66 tmp = gffSeqFeature(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
67 FeatureLocation(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
68 min(start, end),
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
69 max(start, end),
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
70 strand=cds.location.strand,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
71 ),
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
72 type="Lipobox",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
73 qualifiers={
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
74 "source": "CPT_LipoRy",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
75 "ID": "%s.lipobox" % get_id(gene),
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
76 },
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
77 )
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
78 tmp.qualifiers["sequence"] = str(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
79 tmp.extract(record).seq.translate()
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
80 )
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
81
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
82 gene.sub_features.append(tmp)
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
83 good_features.append(gene)
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
84
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
85 record.features = good_features
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
86 yield [record]
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
87
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
88
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
89 if __name__ == "__main__":
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
90 parser = argparse.ArgumentParser(description="Filter out lipoproteins", epilog="")
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
91 parser.add_argument(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
92 "gff3_file", type=argparse.FileType("r"), help="Naive ORF Calls"
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
93 )
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
94 parser.add_argument(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
95 "fasta_genome", type=argparse.FileType("r"), help="Fasta genome sequence"
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
96 )
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
97
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
98 parser.add_argument(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
99 "--lipobox_mindist",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
100 type=int,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
101 help="Minimum distance in codons to start of lipobox",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
102 default=10,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
103 )
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
104 parser.add_argument(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
105 "--lipobox_maxdist",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
106 type=int,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
107 help="Maximum distance in codons to start of lipobox",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
108 default=40,
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
109 )
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
110
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
111 args = parser.parse_args()
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
112
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
113 args = vars(parser.parse_args())
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
114 for record in find_lipoprotein(**args):
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
115 record[0].annotations = {}
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
116 gffWrite(record, sys.stdout)