80
|
1 #!/usr/bin/env python
|
|
2 """
|
|
3 boxplot:
|
|
4 - box: first quartile and third quartile
|
|
5 - line inside the box: median
|
|
6 - outlier: 1.5 IQR higher than the third quartile or 1.5 IQR lower than the first quartile
|
|
7 IQR = third quartile - first quartile
|
|
8 - The smallest/largest value that is not an outlier is connected to the box by with a horizontal line.
|
|
9 """
|
|
10
|
|
11 import os, sys, math, tempfile, re
|
|
12 #from rpy import *
|
|
13 import rpy2.robjects as robjects
|
|
14 import rpy2.rlike.container as rlc
|
|
15 import rpy2.rinterface as ri
|
|
16 r = robjects.r
|
|
17
|
|
18 assert sys.version_info[:2] >= ( 2, 4 )
|
|
19
|
|
20 def stop_err( msg ):
|
|
21 sys.stderr.write( "%s\n" % msg )
|
|
22 sys.exit()
|
|
23
|
|
24 def merge_to_20_datapoints( score ):
|
|
25 number_of_points = 20
|
|
26 read_length = len( score )
|
|
27 step = int( math.floor( ( read_length - 1 ) * 1.0 / number_of_points ) )
|
|
28 scores = []
|
|
29 point = 1
|
|
30 point_sum = 0
|
|
31 step_average = 0
|
|
32 score_points = 0
|
|
33
|
|
34 for i in xrange( 1, read_length ):
|
|
35 if i < ( point * step ):
|
|
36 point_sum += int( score[i] )
|
|
37 step_average += 1
|
|
38 else:
|
|
39 point_avg = point_sum * 1.0 / step_average
|
|
40 scores.append( point_avg )
|
|
41 point += 1
|
|
42 point_sum = 0
|
|
43 step_average = 0
|
|
44 if step_average > 0:
|
|
45 point_avg = point_sum * 1.0 / step_average
|
|
46 scores.append( point_avg )
|
|
47 if len( scores ) > number_of_points:
|
|
48 last_avg = 0
|
|
49 for j in xrange( number_of_points - 1, len( scores ) ):
|
|
50 last_avg += scores[j]
|
|
51 last_avg = last_avg / ( len(scores) - number_of_points + 1 )
|
|
52 else:
|
|
53 last_avg = scores[-1]
|
|
54 score_points = []
|
|
55 for k in range( number_of_points - 1 ):
|
|
56 score_points.append( scores[k] )
|
|
57 score_points.append( last_avg )
|
|
58 return score_points
|
|
59
|
|
60 def __main__():
|
|
61
|
|
62 invalid_lines = 0
|
|
63
|
|
64 infile_score_name = sys.argv[1].strip()
|
|
65 outfile_R_name = sys.argv[2].strip()
|
|
66
|
|
67 infile_name = infile_score_name
|
|
68
|
|
69 # Determine tabular or fasta format within the first 100 lines
|
|
70 seq_method = None
|
|
71 data_type = None
|
|
72 for i, line in enumerate( file( infile_name ) ):
|
|
73 line = line.rstrip( '\r\n' )
|
|
74 if not line or line.startswith( '#' ):
|
|
75 continue
|
|
76 if data_type == None:
|
|
77 if line.startswith( '>' ):
|
|
78 data_type = 'fasta'
|
|
79 continue
|
|
80 elif len( line.split( '\t' ) ) > 0:
|
|
81 fields = line.split()
|
|
82 for score in fields:
|
|
83 try:
|
|
84 int( score )
|
|
85 data_type = 'tabular'
|
|
86 seq_method = 'solexa'
|
|
87 break
|
|
88 except:
|
|
89 break
|
|
90 elif data_type == 'fasta':
|
|
91 fields = line.split()
|
|
92 for score in fields:
|
|
93 try:
|
|
94 int( score )
|
|
95 seq_method = '454'
|
|
96 break
|
|
97 except:
|
|
98 break
|
|
99 if i == 100:
|
|
100 break
|
|
101
|
|
102 if data_type is None:
|
|
103 stop_err( 'This tool can only use fasta data or tabular data.' )
|
|
104 if seq_method is None:
|
|
105 stop_err( 'Invalid data for fasta format.')
|
|
106
|
|
107 # Determine fixed length or variable length within the first 100 lines
|
|
108 read_length = 0
|
|
109 variable_length = False
|
|
110 if seq_method == 'solexa':
|
|
111 for i, line in enumerate( file( infile_name ) ):
|
|
112 line = line.rstrip( '\r\n' )
|
|
113 if not line or line.startswith( '#' ):
|
|
114 continue
|
|
115 scores = line.split('\t')
|
|
116 if read_length == 0:
|
|
117 read_length = len( scores )
|
|
118 if read_length != len( scores ):
|
|
119 variable_length = True
|
|
120 break
|
|
121 if i == 100:
|
|
122 break
|
|
123 elif seq_method == '454':
|
|
124 score = ''
|
|
125 for i, line in enumerate( file( infile_name ) ):
|
|
126 line = line.rstrip( '\r\n' )
|
|
127 if not line or line.startswith( '#' ):
|
|
128 continue
|
|
129 if line.startswith( '>' ):
|
|
130 if len( score ) > 0:
|
|
131 score = score.split()
|
|
132 if read_length == 0:
|
|
133 read_length = len( score )
|
|
134 if read_length != len( score ):
|
|
135 variable_length = True
|
|
136 break
|
|
137 score = ''
|
|
138 else:
|
|
139 score = score + ' ' + line
|
|
140 if i == 100:
|
|
141 break
|
|
142
|
|
143 if variable_length:
|
|
144 number_of_points = 20
|
|
145 else:
|
|
146 number_of_points = read_length
|
|
147 read_length_threshold = 100 # minimal read length for 454 file
|
|
148 score_points = []
|
|
149 score_matrix = []
|
|
150 invalid_scores = 0
|
|
151
|
|
152 if seq_method == 'solexa':
|
|
153 for i, line in enumerate( open( infile_name ) ):
|
|
154 line = line.rstrip( '\r\n' )
|
|
155 if not line or line.startswith( '#' ):
|
|
156 continue
|
|
157 tmp_array = []
|
|
158 scores = line.split( '\t' )
|
|
159 for bases in scores:
|
|
160 nuc_errors = bases.split()
|
|
161 try:
|
|
162 nuc_errors[0] = int( nuc_errors[0] )
|
|
163 nuc_errors[1] = int( nuc_errors[1] )
|
|
164 nuc_errors[2] = int( nuc_errors[2] )
|
|
165 nuc_errors[3] = int( nuc_errors[3] )
|
|
166 big = max( nuc_errors )
|
|
167 except:
|
|
168 #print 'Invalid numbers in the file. Skipped.'
|
|
169 invalid_scores += 1
|
|
170 big = 0
|
|
171 tmp_array.append( big )
|
|
172 score_points.append( tmp_array )
|
|
173 elif seq_method == '454':
|
|
174 # skip the last fasta sequence
|
|
175 score = ''
|
|
176 for i, line in enumerate( open( infile_name ) ):
|
|
177 line = line.rstrip( '\r\n' )
|
|
178 if not line or line.startswith( '#' ):
|
|
179 continue
|
|
180 if line.startswith( '>' ):
|
|
181 if len( score ) > 0:
|
|
182 score = ['0'] + score.split()
|
|
183 read_length = len( score )
|
|
184 tmp_array = []
|
|
185 if not variable_length:
|
|
186 score.pop(0)
|
|
187 score_points.append( score )
|
|
188 tmp_array = score
|
|
189 elif read_length > read_length_threshold:
|
|
190 score_points_tmp = merge_to_20_datapoints( score )
|
|
191 score_points.append( score_points_tmp )
|
|
192 tmp_array = score_points_tmp
|
|
193 score = ''
|
|
194 else:
|
|
195 score = "%s %s" % ( score, line )
|
|
196 if len( score ) > 0:
|
|
197 score = ['0'] + score.split()
|
|
198 read_length = len( score )
|
|
199 if not variable_length:
|
|
200 score.pop(0)
|
|
201 score_points.append( score )
|
|
202 elif read_length > read_length_threshold:
|
|
203 score_points_tmp = merge_to_20_datapoints( score )
|
|
204 score_points.append( score_points_tmp )
|
|
205 tmp_array = score_points_tmp
|
|
206
|
|
207 # reverse the matrix, for R
|
|
208 for i in range( number_of_points - 1 ):
|
|
209 tmp_array = []
|
|
210 for j in range( len( score_points ) ):
|
|
211 try:
|
|
212 tmp_array.append( int( score_points[j][i] ) )
|
|
213 except:
|
|
214 invalid_lines += 1
|
|
215 score_matrix.append( tmp_array )
|
|
216
|
|
217 # generate pdf figures
|
|
218 #outfile_R_pdf = outfile_R_name
|
|
219 #r.pdf( outfile_R_pdf )
|
|
220 outfile_R_png = outfile_R_name
|
|
221 print 'Writing bitmap'
|
|
222 r.bitmap( outfile_R_png )
|
|
223
|
|
224 title = "boxplot of quality scores"
|
|
225 empty_score_matrix_columns = 0
|
|
226 for i, subset in enumerate( score_matrix ):
|
|
227 if not subset:
|
|
228 empty_score_matrix_columns += 1
|
|
229 score_matrix[i] = [0]
|
|
230
|
|
231 if not variable_length:
|
|
232 print 'Creating fixed boxplot '
|
|
233 r.boxplot( score_matrix, xlab="location in read length", main=title )
|
|
234 else:
|
|
235 print 'Creating variable boxplot'
|
|
236 r.boxplot( score_matrix, xlab="position within read (% of total length)", xaxt="n", main=title )
|
|
237 x_old_range = []
|
|
238 x_new_range = []
|
|
239 step = read_length_threshold / number_of_points
|
|
240 for i in xrange( 0, read_length_threshold, step ):
|
|
241 x_old_range.append( ( i / step ) )
|
|
242 x_new_range.append( i )
|
|
243 print 'Writing axis'
|
|
244 r.axis( 1, x_old_range, x_new_range )
|
|
245
|
|
246 print 'calling dev.off()'
|
|
247 r('dev.off()')
|
|
248
|
|
249 if invalid_scores > 0:
|
|
250 print 'Skipped %d invalid scores. ' % invalid_scores
|
|
251 if invalid_lines > 0:
|
|
252 print 'Skipped %d invalid lines. ' % invalid_lines
|
|
253 if empty_score_matrix_columns > 0:
|
|
254 print '%d missing scores in score_matrix. ' % empty_score_matrix_columns
|
|
255
|
|
256 #r.quit(save = "no")
|
|
257
|
|
258 if __name__=="__main__":__main__()
|