comparison short_reads_figure_score.py @ 90:b061185bcb83 draft

Uploaded
author bernhardlutz
date Thu, 23 Jan 2014 14:53:46 -0500
parents c4a3a8999945
children
comparison
equal deleted inserted replaced
89:71e953a4191d 90:b061185bcb83
1 #!/usr/bin/env python
2 """
3 boxplot:
4 - box: first quartile and third quartile
5 - line inside the box: median
6 - outlier: 1.5 IQR higher than the third quartile or 1.5 IQR lower than the first quartile
7 IQR = third quartile - first quartile
8 - The smallest/largest value that is not an outlier is connected to the box by with a horizontal line.
9 """
10
11 import os, sys, math, tempfile, re
12 #from rpy import *
13 import rpy2.robjects as robjects
14 import rpy2.rlike.container as rlc
15 import rpy2.rinterface as ri
16 r = robjects.r
17
18 assert sys.version_info[:2] >= ( 2, 4 )
19
20 def stop_err( msg ):
21 sys.stderr.write( "%s\n" % msg )
22 sys.exit()
23
24 def merge_to_20_datapoints( score ):
25 number_of_points = 20
26 read_length = len( score )
27 step = int( math.floor( ( read_length - 1 ) * 1.0 / number_of_points ) )
28 scores = []
29 point = 1
30 point_sum = 0
31 step_average = 0
32 score_points = 0
33
34 for i in xrange( 1, read_length ):
35 if i < ( point * step ):
36 point_sum += int( score[i] )
37 step_average += 1
38 else:
39 point_avg = point_sum * 1.0 / step_average
40 scores.append( point_avg )
41 point += 1
42 point_sum = 0
43 step_average = 0
44 if step_average > 0:
45 point_avg = point_sum * 1.0 / step_average
46 scores.append( point_avg )
47 if len( scores ) > number_of_points:
48 last_avg = 0
49 for j in xrange( number_of_points - 1, len( scores ) ):
50 last_avg += scores[j]
51 last_avg = last_avg / ( len(scores) - number_of_points + 1 )
52 else:
53 last_avg = scores[-1]
54 score_points = []
55 for k in range( number_of_points - 1 ):
56 score_points.append( scores[k] )
57 score_points.append( last_avg )
58 return score_points
59
60 def __main__():
61
62 invalid_lines = 0
63
64 infile_score_name = sys.argv[1].strip()
65 outfile_R_name = sys.argv[2].strip()
66
67 infile_name = infile_score_name
68
69 # Determine tabular or fasta format within the first 100 lines
70 seq_method = None
71 data_type = None
72 for i, line in enumerate( file( infile_name ) ):
73 line = line.rstrip( '\r\n' )
74 if not line or line.startswith( '#' ):
75 continue
76 if data_type == None:
77 if line.startswith( '>' ):
78 data_type = 'fasta'
79 continue
80 elif len( line.split( '\t' ) ) > 0:
81 fields = line.split()
82 for score in fields:
83 try:
84 int( score )
85 data_type = 'tabular'
86 seq_method = 'solexa'
87 break
88 except:
89 break
90 elif data_type == 'fasta':
91 fields = line.split()
92 for score in fields:
93 try:
94 int( score )
95 seq_method = '454'
96 break
97 except:
98 break
99 if i == 100:
100 break
101
102 if data_type is None:
103 stop_err( 'This tool can only use fasta data or tabular data.' )
104 if seq_method is None:
105 stop_err( 'Invalid data for fasta format.')
106
107 # Determine fixed length or variable length within the first 100 lines
108 read_length = 0
109 variable_length = False
110 if seq_method == 'solexa':
111 for i, line in enumerate( file( infile_name ) ):
112 line = line.rstrip( '\r\n' )
113 if not line or line.startswith( '#' ):
114 continue
115 scores = line.split('\t')
116 if read_length == 0:
117 read_length = len( scores )
118 if read_length != len( scores ):
119 variable_length = True
120 break
121 if i == 100:
122 break
123 elif seq_method == '454':
124 score = ''
125 for i, line in enumerate( file( infile_name ) ):
126 line = line.rstrip( '\r\n' )
127 if not line or line.startswith( '#' ):
128 continue
129 if line.startswith( '>' ):
130 if len( score ) > 0:
131 score = score.split()
132 if read_length == 0:
133 read_length = len( score )
134 if read_length != len( score ):
135 variable_length = True
136 break
137 score = ''
138 else:
139 score = score + ' ' + line
140 if i == 100:
141 break
142
143 if variable_length:
144 number_of_points = 20
145 else:
146 number_of_points = read_length
147 read_length_threshold = 100 # minimal read length for 454 file
148 score_points = []
149 score_matrix = []
150 invalid_scores = 0
151
152 if seq_method == 'solexa':
153 for i, line in enumerate( open( infile_name ) ):
154 line = line.rstrip( '\r\n' )
155 if not line or line.startswith( '#' ):
156 continue
157 tmp_array = []
158 scores = line.split( '\t' )
159 for bases in scores:
160 nuc_errors = bases.split()
161 try:
162 nuc_errors[0] = int( nuc_errors[0] )
163 nuc_errors[1] = int( nuc_errors[1] )
164 nuc_errors[2] = int( nuc_errors[2] )
165 nuc_errors[3] = int( nuc_errors[3] )
166 big = max( nuc_errors )
167 except:
168 #print 'Invalid numbers in the file. Skipped.'
169 invalid_scores += 1
170 big = 0
171 tmp_array.append( big )
172 score_points.append( tmp_array )
173 elif seq_method == '454':
174 # skip the last fasta sequence
175 score = ''
176 for i, line in enumerate( open( infile_name ) ):
177 line = line.rstrip( '\r\n' )
178 if not line or line.startswith( '#' ):
179 continue
180 if line.startswith( '>' ):
181 if len( score ) > 0:
182 score = ['0'] + score.split()
183 read_length = len( score )
184 tmp_array = []
185 if not variable_length:
186 score.pop(0)
187 score_points.append( score )
188 tmp_array = score
189 elif read_length > read_length_threshold:
190 score_points_tmp = merge_to_20_datapoints( score )
191 score_points.append( score_points_tmp )
192 tmp_array = score_points_tmp
193 score = ''
194 else:
195 score = "%s %s" % ( score, line )
196 if len( score ) > 0:
197 score = ['0'] + score.split()
198 read_length = len( score )
199 if not variable_length:
200 score.pop(0)
201 score_points.append( score )
202 elif read_length > read_length_threshold:
203 score_points_tmp = merge_to_20_datapoints( score )
204 score_points.append( score_points_tmp )
205 tmp_array = score_points_tmp
206
207 # reverse the matrix, for R
208 for i in range( number_of_points - 1 ):
209 tmp_array = []
210 for j in range( len( score_points ) ):
211 try:
212 tmp_array.append( int( score_points[j][i] ) )
213 except:
214 invalid_lines += 1
215 score_matrix.append( tmp_array )
216
217 # generate pdf figures
218 #outfile_R_pdf = outfile_R_name
219 #r.pdf( outfile_R_pdf )
220 outfile_R_png = outfile_R_name
221 print 'Writing bitmap'
222 r.bitmap( outfile_R_png )
223
224 title = "boxplot of quality scores"
225 empty_score_matrix_columns = 0
226 for i, subset in enumerate( score_matrix ):
227 if not subset:
228 empty_score_matrix_columns += 1
229 score_matrix[i] = [0]
230
231 if not variable_length:
232 print 'Creating fixed boxplot '
233 r.boxplot( score_matrix, xlab="location in read length", main=title )
234 else:
235 print 'Creating variable boxplot'
236 r.boxplot( score_matrix, xlab="position within read (% of total length)", xaxt="n", main=title )
237 x_old_range = []
238 x_new_range = []
239 step = read_length_threshold / number_of_points
240 for i in xrange( 0, read_length_threshold, step ):
241 x_old_range.append( ( i / step ) )
242 x_new_range.append( i )
243 print 'Writing axis'
244 r.axis( 1, x_old_range, x_new_range )
245
246 print 'calling dev.off()'
247 r('dev.off()')
248
249 if invalid_scores > 0:
250 print 'Skipped %d invalid scores. ' % invalid_scores
251 if invalid_lines > 0:
252 print 'Skipped %d invalid lines. ' % invalid_lines
253 if empty_score_matrix_columns > 0:
254 print '%d missing scores in score_matrix. ' % empty_score_matrix_columns
255
256 #r.quit(save = "no")
257
258 if __name__=="__main__":__main__()