comparison getIndelRates_3way.py @ 0:02e619ae15dc draft default tip

Imported from capsule None
author devteam
date Tue, 01 Apr 2014 09:12:55 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:02e619ae15dc
1 #!/usr/bin/env python
2 #Guruprasad Ananda
3
4 import sys, os, tempfile
5 import fileinput
6 from warnings import warn
7
8 from galaxy.tools.util.galaxyops import *
9 from bx.intervals.io import *
10
11 from bx.intervals.operations import quicksect
12
13 def stop_err(msg):
14 sys.stderr.write(msg)
15 sys.exit()
16
17
18 def counter(node, start, end, sort_col):
19 global full, blk_len, blk_list
20 if node.start < start:
21 if node.right:
22 counter(node.right, start, end, sort_col)
23 elif start <= node.start <= end and start <= node.end <= end:
24 full += 1
25 if node.other[0] not in blk_list:
26 blk_list.append(node.other[0])
27 blk_len += int(node.other[sort_col+2])
28 if node.left and node.left.maxend > start:
29 counter(node.left, start, end, sort_col)
30 if node.right:
31 counter(node.right, start, end, sort_col)
32 elif node.start > end:
33 if node.left:
34 counter(node.left, start, end, sort_col)
35
36
37 infile = sys.argv[1]
38 fout = open(sys.argv[2],'w')
39 int_file = sys.argv[3]
40 if int_file != "None": #User has specified an interval file
41 try:
42 fint = open(int_file, 'r')
43 dbkey_i = sys.argv[4]
44 chr_col_i, start_col_i, end_col_i, strand_col_i = parse_cols_arg( sys.argv[5] )
45 except:
46 stop_err("Unable to open input Interval file")
47
48
49 def main():
50 for i, line in enumerate( file ( infile )):
51 line = line.rstrip('\r\n')
52 if len( line )>0 and not line.startswith( '#' ):
53 elems = line.split( '\t' )
54 break
55 if i == 30:
56 break # Hopefully we'll never get here...
57
58 if len( elems ) != 18:
59 stop_err( "This tool only works on tabular data output by 'Fetch Indels from 3-way alignments' tool. The data in your input dataset is either missing or not formatted properly." )
60
61 for i, line in enumerate( file ( infile )):
62 line = line.rstrip('\r\n')
63 elems = line.split('\t')
64 try:
65 assert int(elems[0])
66 assert len(elems) == 18
67 if int_file != "None":
68 if dbkey_i not in elems[3] and dbkey_i not in elems[8] and dbkey_i not in elems[13]:
69 stop_err("The species build corresponding to your interval file is not present in the Indel file.")
70 if dbkey_i in elems[3]:
71 sort_col = 4
72 elif dbkey_i in elems[8]:
73 sort_col = 9
74 elif dbkey_i in elems[13]:
75 sort_col = 14
76 else:
77 species = []
78 species.append( elems[3].split('.')[0] )
79 species.append( elems[8].split('.')[0] )
80 species.append( elems[13].split('.')[0] )
81 sort_col = 0 #Based on block numbers
82 break
83 except:
84 continue
85
86 fin = open(infile, 'r')
87 skipped = 0
88
89 if int_file == "None":
90 sorted_infile = tempfile.NamedTemporaryFile()
91 cmdline = "sort -n -k"+str(1)+" -o "+sorted_infile.name+" "+infile
92 try:
93 os.system(cmdline)
94 except:
95 stop_err("Encountered error while sorting the input file.")
96 print >> fout, "#Block\t%s_InsRate\t%s_InsRate\t%s_InsRate\t%s_DelRate\t%s_DelRate\t%s_DelRate" % ( species[0], species[1], species[2], species[0], species[1], species[2] )
97 prev_bnum = -1
98 sorted_infile.seek(0)
99 for line in sorted_infile.readlines():
100 line = line.rstrip('\r\n')
101 elems = line.split('\t')
102 try:
103 assert int(elems[0])
104 assert len(elems) == 18
105 new_bnum = int(elems[0])
106 if new_bnum != prev_bnum:
107 if prev_bnum != -1:
108 irate = []
109 drate = []
110 for i, elem in enumerate(inserts):
111 try:
112 irate.append(str("%.2e" % (inserts[i]/blen[i])))
113 except:
114 irate.append('0')
115 try:
116 drate.append(str("%.2e" % (deletes[i]/blen[i])))
117 except:
118 drate.append('0')
119 print >> fout, "%s\t%s\t%s" % ( prev_bnum, '\t'.join(irate) , '\t'.join(drate) )
120 inserts = [0.0, 0.0, 0.0]
121 deletes = [0.0, 0.0, 0.0]
122 blen = []
123 blen.append( int(elems[6]) )
124 blen.append( int(elems[11]) )
125 blen.append( int(elems[16]) )
126 line_sp = elems[1].split('.')[0]
127 sp_ind = species.index(line_sp)
128 if elems[1].endswith('insert'):
129 inserts[sp_ind] += 1
130 elif elems[1].endswith('delete'):
131 deletes[sp_ind] += 1
132 prev_bnum = new_bnum
133 except Exception, ei:
134 #print >>sys.stderr, ei
135 continue
136 irate = []
137 drate = []
138 for i, elem in enumerate(inserts):
139 try:
140 irate.append(str("%.2e" % (inserts[i]/blen[i])))
141 except:
142 irate.append('0')
143 try:
144 drate.append(str("%.2e" % (deletes[i]/blen[i])))
145 except:
146 drate.append('0')
147 print >> fout, "%s\t%s\t%s" % ( prev_bnum, '\t'.join(irate) , '\t'.join(drate) )
148 sys.exit()
149
150 inf = open(infile, 'r')
151 start_met = False
152 end_met = False
153 sp_file = tempfile.NamedTemporaryFile()
154 for n, line in enumerate(inf):
155 line = line.rstrip('\r\n')
156 elems = line.split('\t')
157 try:
158 assert int(elems[0])
159 assert len(elems) == 18
160 if dbkey_i not in elems[1]:
161 if not(start_met):
162 continue
163 else:
164 sp_end = n
165 break
166 else:
167 print >> sp_file, line
168 if not(start_met):
169 start_met = True
170 sp_start = n
171 except:
172 continue
173
174 try:
175 assert sp_end
176 except:
177 sp_end = n+1
178
179 sp_file.seek(0)
180 win = NiceReaderWrapper( fileinput.FileInput( int_file ),
181 chrom_col=chr_col_i,
182 start_col=start_col_i,
183 end_col=end_col_i,
184 strand_col=strand_col_i,
185 fix_strand=True)
186
187 indel = NiceReaderWrapper( fileinput.FileInput( sp_file.name ),
188 chrom_col=1,
189 start_col=sort_col,
190 end_col=sort_col+1,
191 strand_col=-1,
192 fix_strand=True)
193
194 indelTree = quicksect.IntervalTree()
195 for item in indel:
196 if type( item ) is GenomicInterval:
197 indelTree.insert( item, indel.linenum, item.fields )
198 result = []
199
200 global full, blk_len, blk_list
201 for interval in win:
202 if type( interval ) is Header:
203 pass
204 if type( interval ) is Comment:
205 pass
206 elif type( interval ) == GenomicInterval:
207 chrom = interval.chrom
208 start = int(interval.start)
209 end = int(interval.end)
210 if start > end:
211 warn( "Interval start after end!" )
212 ins_chr = "%s.%s_insert" % ( dbkey_i, chrom )
213 del_chr = "%s.%s_delete" % ( dbkey_i, chrom )
214 irate = 0
215 drate = 0
216 if ins_chr not in indelTree.chroms and del_chr not in indelTree.chroms:
217 pass
218 else:
219 if ins_chr in indelTree.chroms:
220 full = 0.0
221 blk_len = 0
222 blk_list = []
223 root = indelTree.chroms[ins_chr] #root node for the chrom insertion tree
224 counter(root, start, end, sort_col)
225 if blk_len:
226 irate = full/blk_len
227
228 if del_chr in indelTree.chroms:
229 full = 0.0
230 blk_len = 0
231 blk_list = []
232 root = indelTree.chroms[del_chr] #root node for the chrom insertion tree
233 counter(root, start, end, sort_col)
234 if blk_len:
235 drate = full/blk_len
236
237 interval.fields.append(str("%.2e" %irate))
238 interval.fields.append(str("%.2e" %drate))
239 print >> fout, "\t".join(interval.fields)
240 fout.flush()
241
242
243 if __name__ == "__main__":
244 main()