Mercurial > repos > bgruening > upload_testing
comparison short_reads_figure_score.py @ 80:c4a3a8999945 draft
Uploaded
author | bernhardlutz |
---|---|
date | Mon, 20 Jan 2014 14:39:43 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
79:dc82017052ac | 80:c4a3a8999945 |
---|---|
1 #!/usr/bin/env python | |
2 """ | |
3 boxplot: | |
4 - box: first quartile and third quartile | |
5 - line inside the box: median | |
6 - outlier: 1.5 IQR higher than the third quartile or 1.5 IQR lower than the first quartile | |
7 IQR = third quartile - first quartile | |
8 - The smallest/largest value that is not an outlier is connected to the box by with a horizontal line. | |
9 """ | |
10 | |
11 import os, sys, math, tempfile, re | |
12 #from rpy import * | |
13 import rpy2.robjects as robjects | |
14 import rpy2.rlike.container as rlc | |
15 import rpy2.rinterface as ri | |
16 r = robjects.r | |
17 | |
18 assert sys.version_info[:2] >= ( 2, 4 ) | |
19 | |
20 def stop_err( msg ): | |
21 sys.stderr.write( "%s\n" % msg ) | |
22 sys.exit() | |
23 | |
24 def merge_to_20_datapoints( score ): | |
25 number_of_points = 20 | |
26 read_length = len( score ) | |
27 step = int( math.floor( ( read_length - 1 ) * 1.0 / number_of_points ) ) | |
28 scores = [] | |
29 point = 1 | |
30 point_sum = 0 | |
31 step_average = 0 | |
32 score_points = 0 | |
33 | |
34 for i in xrange( 1, read_length ): | |
35 if i < ( point * step ): | |
36 point_sum += int( score[i] ) | |
37 step_average += 1 | |
38 else: | |
39 point_avg = point_sum * 1.0 / step_average | |
40 scores.append( point_avg ) | |
41 point += 1 | |
42 point_sum = 0 | |
43 step_average = 0 | |
44 if step_average > 0: | |
45 point_avg = point_sum * 1.0 / step_average | |
46 scores.append( point_avg ) | |
47 if len( scores ) > number_of_points: | |
48 last_avg = 0 | |
49 for j in xrange( number_of_points - 1, len( scores ) ): | |
50 last_avg += scores[j] | |
51 last_avg = last_avg / ( len(scores) - number_of_points + 1 ) | |
52 else: | |
53 last_avg = scores[-1] | |
54 score_points = [] | |
55 for k in range( number_of_points - 1 ): | |
56 score_points.append( scores[k] ) | |
57 score_points.append( last_avg ) | |
58 return score_points | |
59 | |
60 def __main__(): | |
61 | |
62 invalid_lines = 0 | |
63 | |
64 infile_score_name = sys.argv[1].strip() | |
65 outfile_R_name = sys.argv[2].strip() | |
66 | |
67 infile_name = infile_score_name | |
68 | |
69 # Determine tabular or fasta format within the first 100 lines | |
70 seq_method = None | |
71 data_type = None | |
72 for i, line in enumerate( file( infile_name ) ): | |
73 line = line.rstrip( '\r\n' ) | |
74 if not line or line.startswith( '#' ): | |
75 continue | |
76 if data_type == None: | |
77 if line.startswith( '>' ): | |
78 data_type = 'fasta' | |
79 continue | |
80 elif len( line.split( '\t' ) ) > 0: | |
81 fields = line.split() | |
82 for score in fields: | |
83 try: | |
84 int( score ) | |
85 data_type = 'tabular' | |
86 seq_method = 'solexa' | |
87 break | |
88 except: | |
89 break | |
90 elif data_type == 'fasta': | |
91 fields = line.split() | |
92 for score in fields: | |
93 try: | |
94 int( score ) | |
95 seq_method = '454' | |
96 break | |
97 except: | |
98 break | |
99 if i == 100: | |
100 break | |
101 | |
102 if data_type is None: | |
103 stop_err( 'This tool can only use fasta data or tabular data.' ) | |
104 if seq_method is None: | |
105 stop_err( 'Invalid data for fasta format.') | |
106 | |
107 # Determine fixed length or variable length within the first 100 lines | |
108 read_length = 0 | |
109 variable_length = False | |
110 if seq_method == 'solexa': | |
111 for i, line in enumerate( file( infile_name ) ): | |
112 line = line.rstrip( '\r\n' ) | |
113 if not line or line.startswith( '#' ): | |
114 continue | |
115 scores = line.split('\t') | |
116 if read_length == 0: | |
117 read_length = len( scores ) | |
118 if read_length != len( scores ): | |
119 variable_length = True | |
120 break | |
121 if i == 100: | |
122 break | |
123 elif seq_method == '454': | |
124 score = '' | |
125 for i, line in enumerate( file( infile_name ) ): | |
126 line = line.rstrip( '\r\n' ) | |
127 if not line or line.startswith( '#' ): | |
128 continue | |
129 if line.startswith( '>' ): | |
130 if len( score ) > 0: | |
131 score = score.split() | |
132 if read_length == 0: | |
133 read_length = len( score ) | |
134 if read_length != len( score ): | |
135 variable_length = True | |
136 break | |
137 score = '' | |
138 else: | |
139 score = score + ' ' + line | |
140 if i == 100: | |
141 break | |
142 | |
143 if variable_length: | |
144 number_of_points = 20 | |
145 else: | |
146 number_of_points = read_length | |
147 read_length_threshold = 100 # minimal read length for 454 file | |
148 score_points = [] | |
149 score_matrix = [] | |
150 invalid_scores = 0 | |
151 | |
152 if seq_method == 'solexa': | |
153 for i, line in enumerate( open( infile_name ) ): | |
154 line = line.rstrip( '\r\n' ) | |
155 if not line or line.startswith( '#' ): | |
156 continue | |
157 tmp_array = [] | |
158 scores = line.split( '\t' ) | |
159 for bases in scores: | |
160 nuc_errors = bases.split() | |
161 try: | |
162 nuc_errors[0] = int( nuc_errors[0] ) | |
163 nuc_errors[1] = int( nuc_errors[1] ) | |
164 nuc_errors[2] = int( nuc_errors[2] ) | |
165 nuc_errors[3] = int( nuc_errors[3] ) | |
166 big = max( nuc_errors ) | |
167 except: | |
168 #print 'Invalid numbers in the file. Skipped.' | |
169 invalid_scores += 1 | |
170 big = 0 | |
171 tmp_array.append( big ) | |
172 score_points.append( tmp_array ) | |
173 elif seq_method == '454': | |
174 # skip the last fasta sequence | |
175 score = '' | |
176 for i, line in enumerate( open( infile_name ) ): | |
177 line = line.rstrip( '\r\n' ) | |
178 if not line or line.startswith( '#' ): | |
179 continue | |
180 if line.startswith( '>' ): | |
181 if len( score ) > 0: | |
182 score = ['0'] + score.split() | |
183 read_length = len( score ) | |
184 tmp_array = [] | |
185 if not variable_length: | |
186 score.pop(0) | |
187 score_points.append( score ) | |
188 tmp_array = score | |
189 elif read_length > read_length_threshold: | |
190 score_points_tmp = merge_to_20_datapoints( score ) | |
191 score_points.append( score_points_tmp ) | |
192 tmp_array = score_points_tmp | |
193 score = '' | |
194 else: | |
195 score = "%s %s" % ( score, line ) | |
196 if len( score ) > 0: | |
197 score = ['0'] + score.split() | |
198 read_length = len( score ) | |
199 if not variable_length: | |
200 score.pop(0) | |
201 score_points.append( score ) | |
202 elif read_length > read_length_threshold: | |
203 score_points_tmp = merge_to_20_datapoints( score ) | |
204 score_points.append( score_points_tmp ) | |
205 tmp_array = score_points_tmp | |
206 | |
207 # reverse the matrix, for R | |
208 for i in range( number_of_points - 1 ): | |
209 tmp_array = [] | |
210 for j in range( len( score_points ) ): | |
211 try: | |
212 tmp_array.append( int( score_points[j][i] ) ) | |
213 except: | |
214 invalid_lines += 1 | |
215 score_matrix.append( tmp_array ) | |
216 | |
217 # generate pdf figures | |
218 #outfile_R_pdf = outfile_R_name | |
219 #r.pdf( outfile_R_pdf ) | |
220 outfile_R_png = outfile_R_name | |
221 print 'Writing bitmap' | |
222 r.bitmap( outfile_R_png ) | |
223 | |
224 title = "boxplot of quality scores" | |
225 empty_score_matrix_columns = 0 | |
226 for i, subset in enumerate( score_matrix ): | |
227 if not subset: | |
228 empty_score_matrix_columns += 1 | |
229 score_matrix[i] = [0] | |
230 | |
231 if not variable_length: | |
232 print 'Creating fixed boxplot ' | |
233 r.boxplot( score_matrix, xlab="location in read length", main=title ) | |
234 else: | |
235 print 'Creating variable boxplot' | |
236 r.boxplot( score_matrix, xlab="position within read (% of total length)", xaxt="n", main=title ) | |
237 x_old_range = [] | |
238 x_new_range = [] | |
239 step = read_length_threshold / number_of_points | |
240 for i in xrange( 0, read_length_threshold, step ): | |
241 x_old_range.append( ( i / step ) ) | |
242 x_new_range.append( i ) | |
243 print 'Writing axis' | |
244 r.axis( 1, x_old_range, x_new_range ) | |
245 | |
246 print 'calling dev.off()' | |
247 r('dev.off()') | |
248 | |
249 if invalid_scores > 0: | |
250 print 'Skipped %d invalid scores. ' % invalid_scores | |
251 if invalid_lines > 0: | |
252 print 'Skipped %d invalid lines. ' % invalid_lines | |
253 if empty_score_matrix_columns > 0: | |
254 print '%d missing scores in score_matrix. ' % empty_score_matrix_columns | |
255 | |
256 #r.quit(save = "no") | |
257 | |
258 if __name__=="__main__":__main__() |