Mercurial > repos > devteam > getindelrates_3way
comparison getIndelRates_3way.py @ 0:02e619ae15dc draft default tip
Imported from capsule None
author | devteam |
---|---|
date | Tue, 01 Apr 2014 09:12:55 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:02e619ae15dc |
---|---|
1 #!/usr/bin/env python | |
2 #Guruprasad Ananda | |
3 | |
4 import sys, os, tempfile | |
5 import fileinput | |
6 from warnings import warn | |
7 | |
8 from galaxy.tools.util.galaxyops import * | |
9 from bx.intervals.io import * | |
10 | |
11 from bx.intervals.operations import quicksect | |
12 | |
13 def stop_err(msg): | |
14 sys.stderr.write(msg) | |
15 sys.exit() | |
16 | |
17 | |
18 def counter(node, start, end, sort_col): | |
19 global full, blk_len, blk_list | |
20 if node.start < start: | |
21 if node.right: | |
22 counter(node.right, start, end, sort_col) | |
23 elif start <= node.start <= end and start <= node.end <= end: | |
24 full += 1 | |
25 if node.other[0] not in blk_list: | |
26 blk_list.append(node.other[0]) | |
27 blk_len += int(node.other[sort_col+2]) | |
28 if node.left and node.left.maxend > start: | |
29 counter(node.left, start, end, sort_col) | |
30 if node.right: | |
31 counter(node.right, start, end, sort_col) | |
32 elif node.start > end: | |
33 if node.left: | |
34 counter(node.left, start, end, sort_col) | |
35 | |
36 | |
37 infile = sys.argv[1] | |
38 fout = open(sys.argv[2],'w') | |
39 int_file = sys.argv[3] | |
40 if int_file != "None": #User has specified an interval file | |
41 try: | |
42 fint = open(int_file, 'r') | |
43 dbkey_i = sys.argv[4] | |
44 chr_col_i, start_col_i, end_col_i, strand_col_i = parse_cols_arg( sys.argv[5] ) | |
45 except: | |
46 stop_err("Unable to open input Interval file") | |
47 | |
48 | |
49 def main(): | |
50 for i, line in enumerate( file ( infile )): | |
51 line = line.rstrip('\r\n') | |
52 if len( line )>0 and not line.startswith( '#' ): | |
53 elems = line.split( '\t' ) | |
54 break | |
55 if i == 30: | |
56 break # Hopefully we'll never get here... | |
57 | |
58 if len( elems ) != 18: | |
59 stop_err( "This tool only works on tabular data output by 'Fetch Indels from 3-way alignments' tool. The data in your input dataset is either missing or not formatted properly." ) | |
60 | |
61 for i, line in enumerate( file ( infile )): | |
62 line = line.rstrip('\r\n') | |
63 elems = line.split('\t') | |
64 try: | |
65 assert int(elems[0]) | |
66 assert len(elems) == 18 | |
67 if int_file != "None": | |
68 if dbkey_i not in elems[3] and dbkey_i not in elems[8] and dbkey_i not in elems[13]: | |
69 stop_err("The species build corresponding to your interval file is not present in the Indel file.") | |
70 if dbkey_i in elems[3]: | |
71 sort_col = 4 | |
72 elif dbkey_i in elems[8]: | |
73 sort_col = 9 | |
74 elif dbkey_i in elems[13]: | |
75 sort_col = 14 | |
76 else: | |
77 species = [] | |
78 species.append( elems[3].split('.')[0] ) | |
79 species.append( elems[8].split('.')[0] ) | |
80 species.append( elems[13].split('.')[0] ) | |
81 sort_col = 0 #Based on block numbers | |
82 break | |
83 except: | |
84 continue | |
85 | |
86 fin = open(infile, 'r') | |
87 skipped = 0 | |
88 | |
89 if int_file == "None": | |
90 sorted_infile = tempfile.NamedTemporaryFile() | |
91 cmdline = "sort -n -k"+str(1)+" -o "+sorted_infile.name+" "+infile | |
92 try: | |
93 os.system(cmdline) | |
94 except: | |
95 stop_err("Encountered error while sorting the input file.") | |
96 print >> fout, "#Block\t%s_InsRate\t%s_InsRate\t%s_InsRate\t%s_DelRate\t%s_DelRate\t%s_DelRate" % ( species[0], species[1], species[2], species[0], species[1], species[2] ) | |
97 prev_bnum = -1 | |
98 sorted_infile.seek(0) | |
99 for line in sorted_infile.readlines(): | |
100 line = line.rstrip('\r\n') | |
101 elems = line.split('\t') | |
102 try: | |
103 assert int(elems[0]) | |
104 assert len(elems) == 18 | |
105 new_bnum = int(elems[0]) | |
106 if new_bnum != prev_bnum: | |
107 if prev_bnum != -1: | |
108 irate = [] | |
109 drate = [] | |
110 for i, elem in enumerate(inserts): | |
111 try: | |
112 irate.append(str("%.2e" % (inserts[i]/blen[i]))) | |
113 except: | |
114 irate.append('0') | |
115 try: | |
116 drate.append(str("%.2e" % (deletes[i]/blen[i]))) | |
117 except: | |
118 drate.append('0') | |
119 print >> fout, "%s\t%s\t%s" % ( prev_bnum, '\t'.join(irate) , '\t'.join(drate) ) | |
120 inserts = [0.0, 0.0, 0.0] | |
121 deletes = [0.0, 0.0, 0.0] | |
122 blen = [] | |
123 blen.append( int(elems[6]) ) | |
124 blen.append( int(elems[11]) ) | |
125 blen.append( int(elems[16]) ) | |
126 line_sp = elems[1].split('.')[0] | |
127 sp_ind = species.index(line_sp) | |
128 if elems[1].endswith('insert'): | |
129 inserts[sp_ind] += 1 | |
130 elif elems[1].endswith('delete'): | |
131 deletes[sp_ind] += 1 | |
132 prev_bnum = new_bnum | |
133 except Exception, ei: | |
134 #print >>sys.stderr, ei | |
135 continue | |
136 irate = [] | |
137 drate = [] | |
138 for i, elem in enumerate(inserts): | |
139 try: | |
140 irate.append(str("%.2e" % (inserts[i]/blen[i]))) | |
141 except: | |
142 irate.append('0') | |
143 try: | |
144 drate.append(str("%.2e" % (deletes[i]/blen[i]))) | |
145 except: | |
146 drate.append('0') | |
147 print >> fout, "%s\t%s\t%s" % ( prev_bnum, '\t'.join(irate) , '\t'.join(drate) ) | |
148 sys.exit() | |
149 | |
150 inf = open(infile, 'r') | |
151 start_met = False | |
152 end_met = False | |
153 sp_file = tempfile.NamedTemporaryFile() | |
154 for n, line in enumerate(inf): | |
155 line = line.rstrip('\r\n') | |
156 elems = line.split('\t') | |
157 try: | |
158 assert int(elems[0]) | |
159 assert len(elems) == 18 | |
160 if dbkey_i not in elems[1]: | |
161 if not(start_met): | |
162 continue | |
163 else: | |
164 sp_end = n | |
165 break | |
166 else: | |
167 print >> sp_file, line | |
168 if not(start_met): | |
169 start_met = True | |
170 sp_start = n | |
171 except: | |
172 continue | |
173 | |
174 try: | |
175 assert sp_end | |
176 except: | |
177 sp_end = n+1 | |
178 | |
179 sp_file.seek(0) | |
180 win = NiceReaderWrapper( fileinput.FileInput( int_file ), | |
181 chrom_col=chr_col_i, | |
182 start_col=start_col_i, | |
183 end_col=end_col_i, | |
184 strand_col=strand_col_i, | |
185 fix_strand=True) | |
186 | |
187 indel = NiceReaderWrapper( fileinput.FileInput( sp_file.name ), | |
188 chrom_col=1, | |
189 start_col=sort_col, | |
190 end_col=sort_col+1, | |
191 strand_col=-1, | |
192 fix_strand=True) | |
193 | |
194 indelTree = quicksect.IntervalTree() | |
195 for item in indel: | |
196 if type( item ) is GenomicInterval: | |
197 indelTree.insert( item, indel.linenum, item.fields ) | |
198 result = [] | |
199 | |
200 global full, blk_len, blk_list | |
201 for interval in win: | |
202 if type( interval ) is Header: | |
203 pass | |
204 if type( interval ) is Comment: | |
205 pass | |
206 elif type( interval ) == GenomicInterval: | |
207 chrom = interval.chrom | |
208 start = int(interval.start) | |
209 end = int(interval.end) | |
210 if start > end: | |
211 warn( "Interval start after end!" ) | |
212 ins_chr = "%s.%s_insert" % ( dbkey_i, chrom ) | |
213 del_chr = "%s.%s_delete" % ( dbkey_i, chrom ) | |
214 irate = 0 | |
215 drate = 0 | |
216 if ins_chr not in indelTree.chroms and del_chr not in indelTree.chroms: | |
217 pass | |
218 else: | |
219 if ins_chr in indelTree.chroms: | |
220 full = 0.0 | |
221 blk_len = 0 | |
222 blk_list = [] | |
223 root = indelTree.chroms[ins_chr] #root node for the chrom insertion tree | |
224 counter(root, start, end, sort_col) | |
225 if blk_len: | |
226 irate = full/blk_len | |
227 | |
228 if del_chr in indelTree.chroms: | |
229 full = 0.0 | |
230 blk_len = 0 | |
231 blk_list = [] | |
232 root = indelTree.chroms[del_chr] #root node for the chrom insertion tree | |
233 counter(root, start, end, sort_col) | |
234 if blk_len: | |
235 drate = full/blk_len | |
236 | |
237 interval.fields.append(str("%.2e" %irate)) | |
238 interval.fields.append(str("%.2e" %drate)) | |
239 print >> fout, "\t".join(interval.fields) | |
240 fout.flush() | |
241 | |
242 | |
243 if __name__ == "__main__": | |
244 main() |