Mercurial > repos > cpt_testbed > functionalworkflow
comparison lipory.py @ 0:f678e282b320 draft default tip
"planemo upload"
author | cpt_testbed |
---|---|
date | Fri, 06 May 2022 07:07:23 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:f678e282b320 |
---|---|
1 #!/usr/bin/env python | |
2 import re | |
3 import sys | |
4 import argparse | |
5 import logging | |
6 from Bio import SeqIO | |
7 from CPT_GFFParser import gffParse, gffWrite, gffSeqFeature | |
8 from gff3 import feature_lambda, feature_test_type, get_id | |
9 from Bio.SeqFeature import SeqFeature, FeatureLocation | |
10 | |
11 logging.basicConfig(level=logging.INFO) | |
12 log = logging.getLogger(__name__) | |
13 | |
14 | |
15 def find_lipoprotein(gff3_file, fasta_genome, lipobox_mindist=10, lipobox_maxdist=40): | |
16 seq_dict = SeqIO.to_dict(SeqIO.parse(fasta_genome, "fasta")) | |
17 | |
18 CASES = [ | |
19 re.compile( | |
20 "^.{%s,%s}[ILMFTV][^REKD][GAS]C" % (lipobox_mindist, lipobox_maxdist) | |
21 ), | |
22 re.compile( | |
23 "^.{%s,%s}AW[AGS]C" % (lipobox_mindist, lipobox_maxdist) | |
24 ), | |
25 # Make sure to not have multiple cases that share matches, will introduce duplicate features into gff3 file | |
26 ] | |
27 | |
28 for record in gffParse(gff3_file, base_dict=seq_dict): | |
29 good_features = [] | |
30 | |
31 genes = list( | |
32 feature_lambda( | |
33 record.features, feature_test_type, {"type": "gene"}, subfeatures=True | |
34 ) | |
35 ) | |
36 for gene in genes: | |
37 cdss = list( | |
38 feature_lambda( | |
39 gene.sub_features, | |
40 feature_test_type, | |
41 {"type": "CDS"}, | |
42 subfeatures=False, | |
43 ) | |
44 ) | |
45 if len(cdss) == 0: | |
46 continue | |
47 | |
48 for cds in cdss: | |
49 try: | |
50 tmpseq = str( | |
51 cds.extract(record.seq).translate(table=11, cds=True) | |
52 ).replace("*", "") | |
53 except: | |
54 continue | |
55 | |
56 for case in CASES: | |
57 m = case.search(tmpseq) | |
58 if m: | |
59 if cds.location.strand > 0: | |
60 start = cds.location.start + (3 * (m.end() - 4)) | |
61 end = cds.location.start + (3 * m.end()) | |
62 else: | |
63 start = cds.location.end - (3 * (m.end() - 4)) | |
64 end = cds.location.end - (3 * m.end()) | |
65 | |
66 tmp = gffSeqFeature( | |
67 FeatureLocation( | |
68 min(start, end), | |
69 max(start, end), | |
70 strand=cds.location.strand, | |
71 ), | |
72 type="Lipobox", | |
73 qualifiers={ | |
74 "source": "CPT_LipoRy", | |
75 "ID": "%s.lipobox" % get_id(gene), | |
76 }, | |
77 ) | |
78 tmp.qualifiers["sequence"] = str( | |
79 tmp.extract(record).seq.translate() | |
80 ) | |
81 | |
82 gene.sub_features.append(tmp) | |
83 good_features.append(gene) | |
84 | |
85 record.features = good_features | |
86 yield [record] | |
87 | |
88 | |
89 if __name__ == "__main__": | |
90 parser = argparse.ArgumentParser(description="Filter out lipoproteins", epilog="") | |
91 parser.add_argument( | |
92 "gff3_file", type=argparse.FileType("r"), help="Naive ORF Calls" | |
93 ) | |
94 parser.add_argument( | |
95 "fasta_genome", type=argparse.FileType("r"), help="Fasta genome sequence" | |
96 ) | |
97 | |
98 parser.add_argument( | |
99 "--lipobox_mindist", | |
100 type=int, | |
101 help="Minimum distance in codons to start of lipobox", | |
102 default=10, | |
103 ) | |
104 parser.add_argument( | |
105 "--lipobox_maxdist", | |
106 type=int, | |
107 help="Maximum distance in codons to start of lipobox", | |
108 default=40, | |
109 ) | |
110 | |
111 args = parser.parse_args() | |
112 | |
113 args = vars(parser.parse_args()) | |
114 for record in find_lipoprotein(**args): | |
115 record[0].annotations = {} | |
116 gffWrite(record, sys.stdout) |