annotate short_reads_figure_score.py @ 80:c4a3a8999945 draft

Uploaded
author bernhardlutz
date Mon, 20 Jan 2014 14:39:43 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
80
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
1 #!/usr/bin/env python
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
2 """
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
3 boxplot:
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
4 - box: first quartile and third quartile
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
5 - line inside the box: median
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
6 - outlier: 1.5 IQR higher than the third quartile or 1.5 IQR lower than the first quartile
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
7 IQR = third quartile - first quartile
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
8 - The smallest/largest value that is not an outlier is connected to the box with a horizontal line.
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
9 """
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
10
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
11 import os, sys, math, tempfile, re
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
12 #from rpy import *
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
13 import rpy2.robjects as robjects
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
14 import rpy2.rlike.container as rlc
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
15 import rpy2.rinterface as ri
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
# Shorthand for rpy2's R entry point; __main__ uses it to call the R
# functions bitmap, boxplot and axis and to evaluate 'dev.off()'.
r = robjects.r

# Refuse to run on interpreters older than Python 2.4.
assert sys.version_info[:2] >= ( 2, 4 )
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
19
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
def stop_err( msg ):
    """Write msg to standard error and terminate the program.

    msg -- text of the error message; a newline is appended.

    Exits with status 1 (the original exited with status 0) so that the
    calling process can tell from the exit code that the run failed.
    Raises SystemExit.
    """
    sys.stderr.write( "%s\n" % msg )
    sys.exit( 1 )
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
23
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
def merge_to_20_datapoints( score ):
    """Condense the quality scores of one read into 20 averaged data points.

    score -- sequence of values accepted by int(). score[0] is never read
             (the caller in this file stores a '0' placeholder there); the
             real scores are score[1:], and there must be at least 20 of
             them.

    The positions are cut into consecutive bins of
    step = ( len( score ) - 1 ) // 20 positions; bin k covers the indices
    ( k - 1 ) * step up to k * step - 1 (index 0 excluded). Each bin is
    replaced by the mean of its scores. The first 19 means are returned
    as they are; the 20th value is the mean of the means of every
    remaining bin.

    Returns a list of 20 floats.
    Raises ValueError if fewer than 20 real scores are supplied or if a
    score cannot be converted by int().
    """
    number_of_points = 20
    read_length = len( score )
    if read_length - 1 < number_of_points:
        raise ValueError( 'at least %d scores are required, got %d'
                          % ( number_of_points, max( read_length - 1, 0 ) ) )
    step = ( read_length - 1 ) // number_of_points

    scores = []
    point = 1
    point_sum = 0
    step_average = 0
    for i in range( 1, read_length ):
        if i >= point * step:
            # Bin boundary: close the running bin (it is still empty at the
            # very first boundary when step == 1) and let the current score
            # open the next bin instead of being discarded.
            if step_average > 0:
                scores.append( point_sum * 1.0 / step_average )
            point += 1
            point_sum = 0
            step_average = 0
        point_sum += int( score[i] )
        step_average += 1
    if step_average > 0:
        scores.append( point_sum * 1.0 / step_average )

    # Everything from the 20th bin onwards is folded into one final point.
    tail = scores[number_of_points - 1:]
    last_avg = sum( tail ) / len( tail )
    return scores[:number_of_points - 1] + [ last_avg ]
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
59
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
def _is_int( text ):
    """Return True if int( text ) succeeds, False if it raises ValueError."""
    try:
        int( text )
    except ValueError:
        return False
    return True


def _detect_format( infile_name ):
    """Sniff the input format from roughly the first 100 lines of the file.

    Returns ( data_type, seq_method ):
      ( 'tabular', 'solexa' ) -- the first field of a line is an integer
      ( 'fasta', '454' )      -- a '>' header line was seen and the first
                                 field of a later line is an integer
      ( 'fasta', None )       -- a header was seen but no integer data
      ( None, None )          -- nothing recognisable was found
    Blank lines and lines starting with '#' are ignored.
    """
    data_type = None
    seq_method = None
    infile = open( infile_name )
    try:
        for i, line in enumerate( infile ):
            line = line.rstrip( '\r\n' )
            if not line or line.startswith( '#' ):
                continue
            if data_type is None:
                if line.startswith( '>' ):
                    data_type = 'fasta'
                    continue
                # Only the first whitespace-separated field is examined.
                fields = line.split()
                if fields and _is_int( fields[0] ):
                    data_type = 'tabular'
                    seq_method = 'solexa'
            elif data_type == 'fasta':
                fields = line.split()
                if fields and _is_int( fields[0] ):
                    seq_method = '454'
            if i == 100:
                break
    finally:
        infile.close()
    return data_type, seq_method


def _detect_variable_length( infile_name, seq_method ):
    """Decide from roughly the first 100 lines whether reads differ in length.

    Returns ( read_length, variable_length ). read_length is the number of
    scores of the first read that was measured; it stays 0 if none was.
    For '454' input a read is only measured once the header of the
    following read has been seen.
    """
    read_length = 0
    variable_length = False
    infile = open( infile_name )
    try:
        if seq_method == 'solexa':
            for i, line in enumerate( infile ):
                line = line.rstrip( '\r\n' )
                if not line or line.startswith( '#' ):
                    continue
                scores = line.split( '\t' )
                if read_length == 0:
                    read_length = len( scores )
                if read_length != len( scores ):
                    variable_length = True
                    break
                if i == 100:
                    break
        elif seq_method == '454':
            score = ''
            for i, line in enumerate( infile ):
                line = line.rstrip( '\r\n' )
                if not line or line.startswith( '#' ):
                    continue
                if line.startswith( '>' ):
                    if len( score ) > 0:
                        score = score.split()
                        if read_length == 0:
                            read_length = len( score )
                        if read_length != len( score ):
                            variable_length = True
                            break
                    score = ''
                else:
                    score = score + ' ' + line
                if i == 100:
                    break
    finally:
        infile.close()
    return read_length, variable_length


def _load_solexa_scores( infile_name ):
    """Read tab-separated rows whose cells hold four integers per base.

    Each cell is reduced to the largest of its values. A cell with fewer
    than four fields or a non-integer among the first four counts as
    invalid and is stored as 0.

    Returns ( score_points, invalid_scores ): one list per input row and
    the number of invalid cells.
    """
    score_points = []
    invalid_scores = 0
    infile = open( infile_name )
    try:
        for line in infile:
            line = line.rstrip( '\r\n' )
            if not line or line.startswith( '#' ):
                continue
            row = []
            for bases in line.split( '\t' ):
                nuc_errors = bases.split()
                try:
                    for n in range( 4 ):
                        nuc_errors[n] = int( nuc_errors[n] )
                    big = max( nuc_errors )
                except ( ValueError, IndexError, TypeError ):
                    invalid_scores += 1
                    big = 0
                row.append( big )
            score_points.append( row )
    finally:
        infile.close()
    return score_points, invalid_scores


def _load_454_scores( infile_name, variable_length, read_length_threshold ):
    """Read fasta-style quality records ('>' header, then score lines).

    For fixed-length input every record is kept as its list of score
    strings. For variable-length input a record is condensed with
    merge_to_20_datapoints() when it holds at least read_length_threshold
    scores; shorter records are dropped. The final record of the file is
    processed like any other.

    Returns the list of per-read score lists.
    """
    score_points = []

    def store_record( score_text ):
        if len( score_text ) == 0:
            return
        score = score_text.split()
        if not variable_length:
            score_points.append( score )
        elif len( score ) + 1 > read_length_threshold:
            # merge_to_20_datapoints() ignores index 0, hence the '0'.
            score_points.append( merge_to_20_datapoints( ['0'] + score ) )

    score = ''
    infile = open( infile_name )
    try:
        for line in infile:
            line = line.rstrip( '\r\n' )
            if not line or line.startswith( '#' ):
                continue
            if line.startswith( '>' ):
                store_record( score )
                score = ''
            else:
                score = "%s %s" % ( score, line )
    finally:
        infile.close()
    store_record( score )
    return score_points


def _transpose_scores( score_points, number_of_points ):
    """Turn per-read rows into per-position columns of ints for R.

    Returns ( score_matrix, invalid_lines ). invalid_lines counts the
    cells that were missing or could not be converted by int(); such
    cells are left out of their column.
    """
    score_matrix = []
    invalid_lines = 0
    # NOTE(review): only number_of_points - 1 columns are produced, so the
    # last position of every read is never plotted. Kept as in the
    # original -- confirm whether that is intended.
    for i in range( number_of_points - 1 ):
        column = []
        for row in score_points:
            try:
                column.append( int( row[i] ) )
            except ( ValueError, IndexError ):
                invalid_lines += 1
        score_matrix.append( column )
    return score_matrix, invalid_lines


def __main__():
    """Draw a boxplot of per-position quality scores with R.

    Command line: <score file> <output image file>

    The score file is either tabular (sequencing method 'solexa') or
    fasta-style (method '454'); the format is sniffed from the start of
    the file. Fixed-length reads get one box per position,
    variable-length reads are first condensed to 20 points per read.
    Progress and skip counts are printed to standard output; unusable
    input ends the program through stop_err().
    """
    if len( sys.argv ) < 3:
        stop_err( 'Usage: %s <score file> <output image file>' % sys.argv[0] )
    infile_name = sys.argv[1].strip()
    outfile_R_name = sys.argv[2].strip()

    # Determine tabular or fasta format within the first 100 lines
    data_type, seq_method = _detect_format( infile_name )
    if data_type is None:
        stop_err( 'This tool can only use fasta data or tabular data.' )
    if seq_method is None:
        stop_err( 'Invalid data for fasta format.')

    # Determine fixed length or variable length within the first 100 lines
    read_length, variable_length = _detect_variable_length( infile_name, seq_method )
    if variable_length:
        number_of_points = 20
    else:
        number_of_points = read_length
    read_length_threshold = 100 # minimal read length for 454 file

    invalid_scores = 0
    if seq_method == 'solexa':
        score_points, invalid_scores = _load_solexa_scores( infile_name )
    else:
        score_points = _load_454_scores( infile_name, variable_length, read_length_threshold )

    # reverse the matrix, for R
    score_matrix, invalid_lines = _transpose_scores( score_points, number_of_points )

    # generate the figure on R's bitmap device
    print( 'Writing bitmap' )
    r.bitmap( outfile_R_name )

    title = "boxplot of quality scores"
    # Columns that ended up empty are replaced by a single 0 before plotting.
    empty_score_matrix_columns = 0
    for i, subset in enumerate( score_matrix ):
        if not subset:
            empty_score_matrix_columns += 1
            score_matrix[i] = [0]

    if not variable_length:
        print( 'Creating fixed boxplot ' )
        r.boxplot( score_matrix, xlab="location in read length", main=title )
    else:
        print( 'Creating variable boxplot' )
        r.boxplot( score_matrix, xlab="position within read (% of total length)", xaxt="n", main=title )
        # Label the condensed positions as percentages (0, 5, ..., 95).
        x_old_range = []
        x_new_range = []
        step = read_length_threshold // number_of_points
        for i in range( 0, read_length_threshold, step ):
            x_old_range.append( i // step )
            x_new_range.append( i )
        print( 'Writing axis' )
        r.axis( 1, x_old_range, x_new_range )

    print( 'calling dev.off()' )
    r('dev.off()')

    if invalid_scores > 0:
        print( 'Skipped %d invalid scores. ' % invalid_scores )
    if invalid_lines > 0:
        print( 'Skipped %d invalid lines. ' % invalid_lines )
    if empty_score_matrix_columns > 0:
        print( '%d missing scores in score_matrix. ' % empty_score_matrix_columns )
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
257
c4a3a8999945 Uploaded
bernhardlutz
parents:
diff changeset
# Run the tool only when this file is executed as a script.
if __name__ == "__main__":
    __main__()