annotate wig_rebase.py @ 0:f678e282b320 draft default tip

"planemo upload"
author cpt_testbed
date Fri, 06 May 2022 07:07:23 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
1 #!/usr/bin/env python
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
2 import re
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
3 import sys
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
4 import logging
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
5 import argparse
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
6 import numpy
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
7 from gff3 import feature_lambda, feature_test_true
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
8 from CPT_GFFParser import gffParse, gffWrite
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
9
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
10 log = logging.getLogger(__name__)
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
11 logging.basicConfig(level=logging.INFO)
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
12
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
13
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
14 def __update_feature_location(pos, parent, protein2dna):
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
15 if protein2dna:
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
16 pos *= 3
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
17 # Move back so location is correct.
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
18 if parent.strand > 0:
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
19 pos -= 3
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
20
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
21 if parent.strand >= 0:
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
22 new_pos = parent.start + pos
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
23 else:
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
24 new_pos = parent.end - pos
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
25
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
26 # print(start, end, ns, ne, st)
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
27
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
28 # Don't let start/stops be less than zero.
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
29 # Instead, we'll replace with %3 to try and keep it in the same reading
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
30 # frame that it should be in.
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
31 if new_pos < 0:
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
32 new_pos %= 3
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
33 return new_pos
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
34
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
35
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
36 def getGff3Locations(parent, map_by="ID"):
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
37 featureLocations = {}
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
38 recs = gffParse(parent)
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
39 # Only parse first.
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
40 rec = recs[0]
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
41 # Get all the feature locations in this genome
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
42 for feature in feature_lambda(rec.features, feature_test_true, {}):
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
43 id = feature.qualifiers.get(map_by, [feature.id])[0]
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
44 featureLocations[id] = feature.location
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
45 return rec, featureLocations
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
46
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
47
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
48 def rebase_wig(parent, wigData, protein2dna=False, map_by="ID"):
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
49 rec, locations = getGff3Locations(parent, map_by=map_by)
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
50 current_id = None
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
51 current_ft = None
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
52 # We have to store in a giant array so we can overwrite safely and don't
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
53 # emit multiple values.
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
54 values = numpy.empty(1000000)
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
55
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
56 maxFtLoc = 0
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
57 for line in wigData:
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
58 if line.startswith("track"):
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
59 # pass through
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
60 sys.stdout.write(line)
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
61 sys.stdout.write("variableStep chrom=%s span=1\n" % rec.id)
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
62 continue
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
63 if line.startswith("variableStep"):
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
64 # No passthrough
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
65 current_id = re.findall("chrom=([^ ]+)", line)[0]
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
66 try:
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
67 current_ft = locations[current_id]
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
68 except:
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
69 continue
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
70 # Update max value
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
71 if current_ft.end > maxFtLoc:
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
72 maxFtLoc = current_ft.end
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
73 else:
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
74 (pos, val) = line.strip().split()
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
75 pos = int(pos)
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
76 val = float(val)
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
77
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
78 npos = __update_feature_location(pos, current_ft, protein2dna=protein2dna)
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
79 values[npos] = val
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
80 values[npos + 1] = val
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
81 values[npos + 2] = val
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
82
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
83 for i in range(maxFtLoc):
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
84 sys.stdout.write("%s %s\n" % (i + 1, values[i]))
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
85
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
86
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
87 if __name__ == "__main__":
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
88 parser = argparse.ArgumentParser(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
89 description="rebase wig data against parent locations"
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
90 )
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
91 parser.add_argument("parent", type=argparse.FileType("r"))
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
92 parser.add_argument("wigData", type=argparse.FileType("r"))
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
93 parser.add_argument(
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
94 "--protein2dna",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
95 action="store_true",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
96 help="Map protein translated results to original DNA data",
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
97 )
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
98 parser.add_argument("--map_by", help="Map by key", default="ID")
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
99 args = parser.parse_args()
f678e282b320 "planemo upload"
cpt_testbed
parents:
diff changeset
100 rebase_wig(**vars(args))