0
|
1 #!/usr/bin/env python
|
|
2 import re
|
|
3 import sys
|
|
4 import argparse
|
|
5 import logging
|
|
6 from Bio import SeqIO
|
|
7 from CPT_GFFParser import gffParse, gffWrite, gffSeqFeature
|
|
8 from gff3 import feature_lambda, feature_test_type, get_id
|
|
9 from Bio.SeqFeature import SeqFeature, FeatureLocation
|
|
10
|
|
11 logging.basicConfig(level=logging.INFO)
|
|
12 log = logging.getLogger(__name__)
|
|
13
|
|
14
|
|
15 def find_lipoprotein(gff3_file, fasta_genome, lipobox_mindist=10, lipobox_maxdist=40):
|
|
16 seq_dict = SeqIO.to_dict(SeqIO.parse(fasta_genome, "fasta"))
|
|
17
|
|
18 CASES = [
|
|
19 re.compile(
|
|
20 "^.{%s,%s}[ILMFTV][^REKD][GAS]C" % (lipobox_mindist, lipobox_maxdist)
|
|
21 ),
|
|
22 re.compile(
|
|
23 "^.{%s,%s}AW[AGS]C" % (lipobox_mindist, lipobox_maxdist)
|
|
24 ),
|
|
25 # Make sure to not have multiple cases that share matches, will introduce duplicate features into gff3 file
|
|
26 ]
|
|
27
|
|
28 for record in gffParse(gff3_file, base_dict=seq_dict):
|
|
29 good_features = []
|
|
30
|
|
31 genes = list(
|
|
32 feature_lambda(
|
|
33 record.features, feature_test_type, {"type": "gene"}, subfeatures=True
|
|
34 )
|
|
35 )
|
|
36 for gene in genes:
|
|
37 cdss = list(
|
|
38 feature_lambda(
|
|
39 gene.sub_features,
|
|
40 feature_test_type,
|
|
41 {"type": "CDS"},
|
|
42 subfeatures=False,
|
|
43 )
|
|
44 )
|
|
45 if len(cdss) == 0:
|
|
46 continue
|
|
47
|
|
48 for cds in cdss:
|
|
49 try:
|
|
50 tmpseq = str(
|
|
51 cds.extract(record.seq).translate(table=11, cds=True)
|
|
52 ).replace("*", "")
|
|
53 except:
|
|
54 continue
|
|
55
|
|
56 for case in CASES:
|
|
57 m = case.search(tmpseq)
|
|
58 if m:
|
|
59 if cds.location.strand > 0:
|
|
60 start = cds.location.start + (3 * (m.end() - 4))
|
|
61 end = cds.location.start + (3 * m.end())
|
|
62 else:
|
|
63 start = cds.location.end - (3 * (m.end() - 4))
|
|
64 end = cds.location.end - (3 * m.end())
|
|
65
|
|
66 tmp = gffSeqFeature(
|
|
67 FeatureLocation(
|
|
68 min(start, end),
|
|
69 max(start, end),
|
|
70 strand=cds.location.strand,
|
|
71 ),
|
|
72 type="Lipobox",
|
|
73 qualifiers={
|
|
74 "source": "CPT_LipoRy",
|
|
75 "ID": "%s.lipobox" % get_id(gene),
|
|
76 },
|
|
77 )
|
|
78 tmp.qualifiers["sequence"] = str(
|
|
79 tmp.extract(record).seq.translate()
|
|
80 )
|
|
81
|
|
82 gene.sub_features.append(tmp)
|
|
83 good_features.append(gene)
|
|
84
|
|
85 record.features = good_features
|
|
86 yield [record]
|
|
87
|
|
88
|
|
89 if __name__ == "__main__":
|
|
90 parser = argparse.ArgumentParser(description="Filter out lipoproteins", epilog="")
|
|
91 parser.add_argument(
|
|
92 "gff3_file", type=argparse.FileType("r"), help="Naive ORF Calls"
|
|
93 )
|
|
94 parser.add_argument(
|
|
95 "fasta_genome", type=argparse.FileType("r"), help="Fasta genome sequence"
|
|
96 )
|
|
97
|
|
98 parser.add_argument(
|
|
99 "--lipobox_mindist",
|
|
100 type=int,
|
|
101 help="Minimum distance in codons to start of lipobox",
|
|
102 default=10,
|
|
103 )
|
|
104 parser.add_argument(
|
|
105 "--lipobox_maxdist",
|
|
106 type=int,
|
|
107 help="Maximum distance in codons to start of lipobox",
|
|
108 default=40,
|
|
109 )
|
|
110
|
|
111 args = parser.parse_args()
|
|
112
|
|
113 args = vars(parser.parse_args())
|
|
114 for record in find_lipoprotein(**args):
|
|
115 record[0].annotations = {}
|
|
116 gffWrite(record, sys.stdout)
|