comparison lipory.py @ 0:f678e282b320 draft default tip

"planemo upload"
author cpt_testbed
date Fri, 06 May 2022 07:07:23 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:f678e282b320
1 #!/usr/bin/env python
2 import re
3 import sys
4 import argparse
5 import logging
6 from Bio import SeqIO
7 from CPT_GFFParser import gffParse, gffWrite, gffSeqFeature
8 from gff3 import feature_lambda, feature_test_type, get_id
9 from Bio.SeqFeature import SeqFeature, FeatureLocation
10
11 logging.basicConfig(level=logging.INFO)
12 log = logging.getLogger(__name__)
13
14
15 def find_lipoprotein(gff3_file, fasta_genome, lipobox_mindist=10, lipobox_maxdist=40):
16 seq_dict = SeqIO.to_dict(SeqIO.parse(fasta_genome, "fasta"))
17
18 CASES = [
19 re.compile(
20 "^.{%s,%s}[ILMFTV][^REKD][GAS]C" % (lipobox_mindist, lipobox_maxdist)
21 ),
22 re.compile(
23 "^.{%s,%s}AW[AGS]C" % (lipobox_mindist, lipobox_maxdist)
24 ),
25 # Make sure to not have multiple cases that share matches, will introduce duplicate features into gff3 file
26 ]
27
28 for record in gffParse(gff3_file, base_dict=seq_dict):
29 good_features = []
30
31 genes = list(
32 feature_lambda(
33 record.features, feature_test_type, {"type": "gene"}, subfeatures=True
34 )
35 )
36 for gene in genes:
37 cdss = list(
38 feature_lambda(
39 gene.sub_features,
40 feature_test_type,
41 {"type": "CDS"},
42 subfeatures=False,
43 )
44 )
45 if len(cdss) == 0:
46 continue
47
48 for cds in cdss:
49 try:
50 tmpseq = str(
51 cds.extract(record.seq).translate(table=11, cds=True)
52 ).replace("*", "")
53 except:
54 continue
55
56 for case in CASES:
57 m = case.search(tmpseq)
58 if m:
59 if cds.location.strand > 0:
60 start = cds.location.start + (3 * (m.end() - 4))
61 end = cds.location.start + (3 * m.end())
62 else:
63 start = cds.location.end - (3 * (m.end() - 4))
64 end = cds.location.end - (3 * m.end())
65
66 tmp = gffSeqFeature(
67 FeatureLocation(
68 min(start, end),
69 max(start, end),
70 strand=cds.location.strand,
71 ),
72 type="Lipobox",
73 qualifiers={
74 "source": "CPT_LipoRy",
75 "ID": "%s.lipobox" % get_id(gene),
76 },
77 )
78 tmp.qualifiers["sequence"] = str(
79 tmp.extract(record).seq.translate()
80 )
81
82 gene.sub_features.append(tmp)
83 good_features.append(gene)
84
85 record.features = good_features
86 yield [record]
87
88
89 if __name__ == "__main__":
90 parser = argparse.ArgumentParser(description="Filter out lipoproteins", epilog="")
91 parser.add_argument(
92 "gff3_file", type=argparse.FileType("r"), help="Naive ORF Calls"
93 )
94 parser.add_argument(
95 "fasta_genome", type=argparse.FileType("r"), help="Fasta genome sequence"
96 )
97
98 parser.add_argument(
99 "--lipobox_mindist",
100 type=int,
101 help="Minimum distance in codons to start of lipobox",
102 default=10,
103 )
104 parser.add_argument(
105 "--lipobox_maxdist",
106 type=int,
107 help="Maximum distance in codons to start of lipobox",
108 default=40,
109 )
110
111 args = parser.parse_args()
112
113 args = vars(parser.parse_args())
114 for record in find_lipoprotein(**args):
115 record[0].annotations = {}
116 gffWrite(record, sys.stdout)