0
|
1 #!/usr/bin/env python
|
|
2 #Guruprasad Ananda
|
|
3
|
|
4 import sys, os, tempfile
|
|
5 import fileinput
|
|
6 from warnings import warn
|
|
7
|
|
8 from galaxy.tools.util.galaxyops import *
|
|
9 from bx.intervals.io import *
|
|
10
|
|
11 from bx.intervals.operations import quicksect
|
|
12
|
|
13 def stop_err(msg):
|
|
14 sys.stderr.write(msg)
|
|
15 sys.exit()
|
|
16
|
|
17
|
|
18 def counter(node, start, end, sort_col):
|
|
19 global full, blk_len, blk_list
|
|
20 if node.start < start:
|
|
21 if node.right:
|
|
22 counter(node.right, start, end, sort_col)
|
|
23 elif start <= node.start <= end and start <= node.end <= end:
|
|
24 full += 1
|
|
25 if node.other[0] not in blk_list:
|
|
26 blk_list.append(node.other[0])
|
|
27 blk_len += int(node.other[sort_col+2])
|
|
28 if node.left and node.left.maxend > start:
|
|
29 counter(node.left, start, end, sort_col)
|
|
30 if node.right:
|
|
31 counter(node.right, start, end, sort_col)
|
|
32 elif node.start > end:
|
|
33 if node.left:
|
|
34 counter(node.left, start, end, sort_col)
|
|
35
|
|
36
|
|
37 infile = sys.argv[1]
|
|
38 fout = open(sys.argv[2],'w')
|
|
39 int_file = sys.argv[3]
|
|
40 if int_file != "None": #User has specified an interval file
|
|
41 try:
|
|
42 fint = open(int_file, 'r')
|
|
43 dbkey_i = sys.argv[4]
|
|
44 chr_col_i, start_col_i, end_col_i, strand_col_i = parse_cols_arg( sys.argv[5] )
|
|
45 except:
|
|
46 stop_err("Unable to open input Interval file")
|
|
47
|
|
48
|
|
49 def main():
|
|
50 for i, line in enumerate( file ( infile )):
|
|
51 line = line.rstrip('\r\n')
|
|
52 if len( line )>0 and not line.startswith( '#' ):
|
|
53 elems = line.split( '\t' )
|
|
54 break
|
|
55 if i == 30:
|
|
56 break # Hopefully we'll never get here...
|
|
57
|
|
58 if len( elems ) != 18:
|
|
59 stop_err( "This tool only works on tabular data output by 'Fetch Indels from 3-way alignments' tool. The data in your input dataset is either missing or not formatted properly." )
|
|
60
|
|
61 for i, line in enumerate( file ( infile )):
|
|
62 line = line.rstrip('\r\n')
|
|
63 elems = line.split('\t')
|
|
64 try:
|
|
65 assert int(elems[0])
|
|
66 assert len(elems) == 18
|
|
67 if int_file != "None":
|
|
68 if dbkey_i not in elems[3] and dbkey_i not in elems[8] and dbkey_i not in elems[13]:
|
|
69 stop_err("The species build corresponding to your interval file is not present in the Indel file.")
|
|
70 if dbkey_i in elems[3]:
|
|
71 sort_col = 4
|
|
72 elif dbkey_i in elems[8]:
|
|
73 sort_col = 9
|
|
74 elif dbkey_i in elems[13]:
|
|
75 sort_col = 14
|
|
76 else:
|
|
77 species = []
|
|
78 species.append( elems[3].split('.')[0] )
|
|
79 species.append( elems[8].split('.')[0] )
|
|
80 species.append( elems[13].split('.')[0] )
|
|
81 sort_col = 0 #Based on block numbers
|
|
82 break
|
|
83 except:
|
|
84 continue
|
|
85
|
|
86 fin = open(infile, 'r')
|
|
87 skipped = 0
|
|
88
|
|
89 if int_file == "None":
|
|
90 sorted_infile = tempfile.NamedTemporaryFile()
|
|
91 cmdline = "sort -n -k"+str(1)+" -o "+sorted_infile.name+" "+infile
|
|
92 try:
|
|
93 os.system(cmdline)
|
|
94 except:
|
|
95 stop_err("Encountered error while sorting the input file.")
|
|
96 print >> fout, "#Block\t%s_InsRate\t%s_InsRate\t%s_InsRate\t%s_DelRate\t%s_DelRate\t%s_DelRate" % ( species[0], species[1], species[2], species[0], species[1], species[2] )
|
|
97 prev_bnum = -1
|
|
98 sorted_infile.seek(0)
|
|
99 for line in sorted_infile.readlines():
|
|
100 line = line.rstrip('\r\n')
|
|
101 elems = line.split('\t')
|
|
102 try:
|
|
103 assert int(elems[0])
|
|
104 assert len(elems) == 18
|
|
105 new_bnum = int(elems[0])
|
|
106 if new_bnum != prev_bnum:
|
|
107 if prev_bnum != -1:
|
|
108 irate = []
|
|
109 drate = []
|
|
110 for i, elem in enumerate(inserts):
|
|
111 try:
|
|
112 irate.append(str("%.2e" % (inserts[i]/blen[i])))
|
|
113 except:
|
|
114 irate.append('0')
|
|
115 try:
|
|
116 drate.append(str("%.2e" % (deletes[i]/blen[i])))
|
|
117 except:
|
|
118 drate.append('0')
|
|
119 print >> fout, "%s\t%s\t%s" % ( prev_bnum, '\t'.join(irate) , '\t'.join(drate) )
|
|
120 inserts = [0.0, 0.0, 0.0]
|
|
121 deletes = [0.0, 0.0, 0.0]
|
|
122 blen = []
|
|
123 blen.append( int(elems[6]) )
|
|
124 blen.append( int(elems[11]) )
|
|
125 blen.append( int(elems[16]) )
|
|
126 line_sp = elems[1].split('.')[0]
|
|
127 sp_ind = species.index(line_sp)
|
|
128 if elems[1].endswith('insert'):
|
|
129 inserts[sp_ind] += 1
|
|
130 elif elems[1].endswith('delete'):
|
|
131 deletes[sp_ind] += 1
|
|
132 prev_bnum = new_bnum
|
|
133 except Exception, ei:
|
|
134 #print >>sys.stderr, ei
|
|
135 continue
|
|
136 irate = []
|
|
137 drate = []
|
|
138 for i, elem in enumerate(inserts):
|
|
139 try:
|
|
140 irate.append(str("%.2e" % (inserts[i]/blen[i])))
|
|
141 except:
|
|
142 irate.append('0')
|
|
143 try:
|
|
144 drate.append(str("%.2e" % (deletes[i]/blen[i])))
|
|
145 except:
|
|
146 drate.append('0')
|
|
147 print >> fout, "%s\t%s\t%s" % ( prev_bnum, '\t'.join(irate) , '\t'.join(drate) )
|
|
148 sys.exit()
|
|
149
|
|
150 inf = open(infile, 'r')
|
|
151 start_met = False
|
|
152 end_met = False
|
|
153 sp_file = tempfile.NamedTemporaryFile()
|
|
154 for n, line in enumerate(inf):
|
|
155 line = line.rstrip('\r\n')
|
|
156 elems = line.split('\t')
|
|
157 try:
|
|
158 assert int(elems[0])
|
|
159 assert len(elems) == 18
|
|
160 if dbkey_i not in elems[1]:
|
|
161 if not(start_met):
|
|
162 continue
|
|
163 else:
|
|
164 sp_end = n
|
|
165 break
|
|
166 else:
|
|
167 print >> sp_file, line
|
|
168 if not(start_met):
|
|
169 start_met = True
|
|
170 sp_start = n
|
|
171 except:
|
|
172 continue
|
|
173
|
|
174 try:
|
|
175 assert sp_end
|
|
176 except:
|
|
177 sp_end = n+1
|
|
178
|
|
179 sp_file.seek(0)
|
|
180 win = NiceReaderWrapper( fileinput.FileInput( int_file ),
|
|
181 chrom_col=chr_col_i,
|
|
182 start_col=start_col_i,
|
|
183 end_col=end_col_i,
|
|
184 strand_col=strand_col_i,
|
|
185 fix_strand=True)
|
|
186
|
|
187 indel = NiceReaderWrapper( fileinput.FileInput( sp_file.name ),
|
|
188 chrom_col=1,
|
|
189 start_col=sort_col,
|
|
190 end_col=sort_col+1,
|
|
191 strand_col=-1,
|
|
192 fix_strand=True)
|
|
193
|
|
194 indelTree = quicksect.IntervalTree()
|
|
195 for item in indel:
|
|
196 if type( item ) is GenomicInterval:
|
|
197 indelTree.insert( item, indel.linenum, item.fields )
|
|
198 result = []
|
|
199
|
|
200 global full, blk_len, blk_list
|
|
201 for interval in win:
|
|
202 if type( interval ) is Header:
|
|
203 pass
|
|
204 if type( interval ) is Comment:
|
|
205 pass
|
|
206 elif type( interval ) == GenomicInterval:
|
|
207 chrom = interval.chrom
|
|
208 start = int(interval.start)
|
|
209 end = int(interval.end)
|
|
210 if start > end:
|
|
211 warn( "Interval start after end!" )
|
|
212 ins_chr = "%s.%s_insert" % ( dbkey_i, chrom )
|
|
213 del_chr = "%s.%s_delete" % ( dbkey_i, chrom )
|
|
214 irate = 0
|
|
215 drate = 0
|
|
216 if ins_chr not in indelTree.chroms and del_chr not in indelTree.chroms:
|
|
217 pass
|
|
218 else:
|
|
219 if ins_chr in indelTree.chroms:
|
|
220 full = 0.0
|
|
221 blk_len = 0
|
|
222 blk_list = []
|
|
223 root = indelTree.chroms[ins_chr] #root node for the chrom insertion tree
|
|
224 counter(root, start, end, sort_col)
|
|
225 if blk_len:
|
|
226 irate = full/blk_len
|
|
227
|
|
228 if del_chr in indelTree.chroms:
|
|
229 full = 0.0
|
|
230 blk_len = 0
|
|
231 blk_list = []
|
|
232 root = indelTree.chroms[del_chr] #root node for the chrom insertion tree
|
|
233 counter(root, start, end, sort_col)
|
|
234 if blk_len:
|
|
235 drate = full/blk_len
|
|
236
|
|
237 interval.fields.append(str("%.2e" %irate))
|
|
238 interval.fields.append(str("%.2e" %drate))
|
|
239 print >> fout, "\t".join(interval.fields)
|
|
240 fout.flush()
|
|
241
|
|
242
|
|
243 if __name__ == "__main__":
|
|
244 main()
|