annotate lipoP_to_gff3.py @ 0:f678e282b320 draft default tip

"planemo upload"
author cpt_testbed
date Fri, 06 May 2022 07:07:23 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
1 #!/usr/bin/env python
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
2 import sys
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
3 import copy
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
4 import argparse
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
5 from CPT_GFFParser import gffParse, gffWrite, gffSeqFeature
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
6 from Bio.Seq import Seq
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
7 from Bio.SeqRecord import SeqRecord
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
8 from Bio.SeqFeature import FeatureLocation
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
9 from gff3 import feature_lambda, feature_test_type, get_id
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
10
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
11
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
def _parse_lipop_sites(lipoIn, filterSP2):
    """Collect cleavage-site positions per sequence ID from LipoP output.

    Args:
        lipoIn: Iterable of text lines (e.g. an open file) holding
            tab-separated LipoP rows. Lines starting with ``#`` are skipped.
        filterSP2: When true, keep only ``CleavII`` rows; otherwise keep
            both ``CleavI`` and ``CleavII`` rows.

    Returns:
        dict mapping the first column (sequence ID) to a list of the integer
        values found in the fourth column, in file order.

    Raises:
        ValueError: If the fourth column of a kept row is not an integer.
    """
    # Exact matches only: the previous ``value in "CleavII"`` was a substring
    # test and also accepted "", "I", "Cleav", etc.
    wanted = ("CleavII",) if filterSP2 else ("CleavI", "CleavII")

    sites = {}
    for row in lipoIn:
        if row.startswith("#"):
            continue

        rowElem = row.rstrip("\r\n").split("\t")
        # Blank or truncated rows cannot carry a cleavage record.
        if len(rowElem) < 4:
            continue

        orgID = rowElem[0]
        if not orgID or rowElem[2] not in wanted:
            continue

        sites.setdefault(orgID, []).append(int(rowElem[3]))
    return sites


def lipoP_gff(lipoIn, gff3In, jBrowseOut, filterSP2):
    """Add LipoP cleavage sites to a GFF3 annotation and write it to stdout.

    For every top-level feature in ``gff3In``, the first CDS sub-feature whose
    ID appears in the LipoP results receives one ``cleavage_site`` sub-feature
    per reported site; the new sub-features are appended to the top-level
    feature. Each parsed record is written to ``sys.stdout`` with ``gffWrite``.

    Args:
        lipoIn: Iterable of LipoP output lines (tab-separated text).
        gff3In: GFF3 input handed to ``gffParse``.
        jBrowseOut: When true, top-level features without a matching CDS are
            dropped from the output; otherwise they are kept unchanged.
        filterSP2: When true, only ``CleavII`` sites are used; otherwise both
            ``CleavI`` and ``CleavII`` sites are used.
    """
    orgIDs = _parse_lipop_sites(lipoIn, filterSP2)

    # Rebase the cleavage positions onto the annotated CDSs.
    for gff in gffParse(gff3In):
        keepSeq = []
        for xRec in gff.features:
            cdss = feature_lambda(
                xRec.sub_features,
                feature_test_type,
                {"type": "CDS"},
                subfeatures=False,
            )
            # First CDS (if any) that LipoP reported a cleavage site for.
            hit = next((cds for cds in cdss if cds.id in orgIDs), None)

            if hit is None:
                if not jBrowseOut:
                    keepSeq.append(xRec)
                continue

            for i, cleaveBase in enumerate(orgIDs[hit.id], start=1):
                tempQuals = xRec.qualifiers.copy()
                tempQuals["ID"] = xRec.id + "_cleavage_" + str(i)

                # The reported position is scaled by 3 (presumably residue ->
                # nucleotide) and a 2-base window is placed around it.
                # NOTE(review): the offset is always measured from
                # location.start regardless of strand -- confirm this is
                # intended for minus-strand CDSs.
                siteOffset = hit.location.start + (cleaveBase * 3)
                xRec.sub_features.append(
                    gffSeqFeature(
                        FeatureLocation(siteOffset - 1, siteOffset + 1),
                        type="cleavage_site",
                        strand=xRec.location.strand,
                        qualifiers=tempQuals,
                    )
                )
            keepSeq.append(xRec)

        gff.features = keepSeq
        gffWrite([gff], sys.stdout)
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
88
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
89
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
if __name__ == "__main__":

    def _str_to_bool(value):
        """Interpret a command-line string as a boolean.

        ``type=bool`` would treat every non-empty string (including
        "False") as True, so explicit false spellings are recognised here.
        Any other non-empty value is still treated as True.
        """
        return value.strip().lower() not in ("", "0", "false", "f", "no", "n")

    # Command-line entry point: parse arguments and forward them by name to
    # lipoP_gff (argument names must match its parameter names).
    parser = argparse.ArgumentParser(
        description="Rebase LipoP cleavage sites onto a GFF3 annotation"
    )
    parser.add_argument(
        "lipoIn", type=argparse.FileType("r"), help="LipoP tool's .txt output"
    )
    parser.add_argument(
        "gff3In", type=argparse.FileType("r"), help="GFF3 to rebase LipoP results"
    )
    parser.add_argument(
        "--jBrowseOut",
        type=_str_to_bool,
        default=False,
        help="Prepare Output for jBrowse instance",
    )
    parser.add_argument(
        "--filterSP2",
        action='store_true',
        help="Filter for only SPII sites",
    )
    args = parser.parse_args()
    lipoP_gff(**vars(args))