0
|
1 #!/usr/bin/env python
|
|
2 """
|
|
3 trim reads based on the quality scores
|
|
4 input: read file and quality score file
|
|
5 output: trimmed read file
|
|
6 """
|
|
7
|
|
8 import os, sys, math, tempfile, re
|
|
9
|
|
10 assert sys.version_info[:2] >= ( 2, 4 )
|
|
11
|
|
12 def stop_err( msg ):
|
|
13 sys.stderr.write( "%s\n" % msg )
|
|
14 sys.exit()
|
|
15
|
|
16 def append_to_outfile( outfile_name, seq_title, segments ):
|
|
17 segments = segments.split( ',' )
|
|
18 if len( segments ) > 1:
|
|
19 outfile = open( outfile_name, 'a' )
|
|
20 for i in range( len( segments ) ):
|
|
21 outfile.write( "%s_%d\n%s\n" % ( seq_title, i, segments[i] ) )
|
|
22 outfile.close()
|
|
23 elif segments[0]:
|
|
24 outfile = open( outfile_name, 'a' )
|
|
25 outfile.write( "%s\n%s\n" % ( seq_title, segments[0] ) )
|
|
26 outfile.close()
|
|
27
|
|
28 def trim_seq( seq, score, arg, trim_score, threshold ):
|
|
29 seq_method = '454'
|
|
30 trim_pos = 0
|
|
31 # trim after a certain position
|
|
32 if arg.isdigit():
|
|
33 keep_homopolymers = False
|
|
34 trim_pos = int( arg )
|
|
35 if trim_pos > 0 and trim_pos < len( seq ):
|
|
36 seq = seq[0:trim_pos]
|
|
37 else:
|
|
38 keep_homopolymers = arg=='yes'
|
|
39
|
|
40 new_trim_seq = ''
|
|
41 max_segment = 0
|
|
42
|
|
43 for i in range( len( seq ) ):
|
|
44 if i >= len( score ):
|
|
45 score.append(-1)
|
|
46 if int( score[i] ) >= trim_score:
|
|
47 pass_nuc = seq[ i:( i + 1 ) ]
|
|
48 else:
|
|
49 if keep_homopolymers and ( (i == 0 ) or ( seq[ i:( i + 1 ) ].lower() == seq[ ( i - 1 ):i ].lower() ) ):
|
|
50 pass_nuc = seq[ i:( i + 1 ) ]
|
|
51 else:
|
|
52 pass_nuc = ' '
|
|
53 new_trim_seq = '%s%s' % ( new_trim_seq, pass_nuc )
|
|
54 # find the max substrings
|
|
55 segments = new_trim_seq.split()
|
|
56 max_segment = ''
|
|
57 len_max_segment = 0
|
|
58 if threshold == 0:
|
|
59 for seg in segments:
|
|
60 if len_max_segment < len( seg ):
|
|
61 max_segment = '%s,' % seg
|
|
62 len_max_segment = len( seg )
|
|
63 elif len_max_segment == len( seg ):
|
|
64 max_segment = '%s%s,' % ( max_segment, seg )
|
|
65 else:
|
|
66 for seg in segments:
|
|
67 if len( seg ) >= threshold:
|
|
68 max_segment = '%s%s,' % ( max_segment, seg )
|
|
69 return max_segment[ 0:-1 ]
|
|
70
|
|
71 def __main__():
|
|
72
|
|
73 try:
|
|
74 threshold_trim = int( sys.argv[1].strip() )
|
|
75 except:
|
|
76 stop_err( "Minimal quality score must be numeric." )
|
|
77 try:
|
|
78 threshold_report = int( sys.argv[2].strip() )
|
|
79 except:
|
|
80 stop_err( "Minimal length of trimmed reads must be numeric." )
|
|
81 outfile_seq_name = sys.argv[3].strip()
|
|
82 infile_seq_name = sys.argv[4].strip()
|
|
83 infile_score_name = sys.argv[5].strip()
|
|
84 arg = sys.argv[6].strip()
|
|
85
|
|
86 seq_infile_name = infile_seq_name
|
|
87 score_infile_name = infile_score_name
|
|
88
|
|
89
|
|
90 # Determine quailty score format: tabular or fasta format within the first 100 lines
|
|
91 seq_method = None
|
|
92 data_type = None
|
|
93 for i, line in enumerate( file( score_infile_name ) ):
|
|
94 line = line.rstrip( '\r\n' )
|
|
95 if not line or line.startswith( '#' ):
|
|
96 continue
|
|
97 if data_type == None:
|
|
98 if line.startswith( '>' ):
|
|
99 data_type = 'fasta'
|
|
100 continue
|
|
101 elif len( line.split( '\t' ) ) > 0:
|
|
102 fields = line.split()
|
|
103 for score in fields:
|
|
104 try:
|
|
105 int( score )
|
|
106 data_type = 'tabular'
|
|
107 seq_method = 'solexa'
|
|
108 break
|
|
109 except:
|
|
110 break
|
|
111 elif data_type == 'fasta':
|
|
112 fields = line.split()
|
|
113 for score in fields:
|
|
114 try:
|
|
115 int( score )
|
|
116 seq_method = '454'
|
|
117 break
|
|
118 except:
|
|
119 break
|
|
120 if i == 100:
|
|
121 break
|
|
122
|
|
123 if data_type is None:
|
|
124 stop_err( 'This tool can only use fasta data or tabular data.' )
|
|
125 if seq_method is None:
|
|
126 stop_err( 'Invalid data for fasta format.')
|
|
127
|
|
128 if os.path.exists( seq_infile_name ) and os.path.exists( score_infile_name ):
|
|
129 seq = None
|
|
130 score = None
|
|
131 score_found = False
|
|
132
|
|
133 score_file = open( score_infile_name, 'r' )
|
|
134
|
|
135 for i, line in enumerate( open( seq_infile_name ) ):
|
|
136 line = line.rstrip( '\r\n' )
|
|
137 if not line or line.startswith( '#' ):
|
|
138 continue
|
|
139 if line.startswith( '>' ):
|
|
140 if seq:
|
|
141 scores = []
|
|
142 if data_type == 'fasta':
|
|
143 score = None
|
|
144 score_found = False
|
|
145 score_line = 'start'
|
|
146 while not score_found and score_line:
|
|
147 score_line = score_file.readline().rstrip( '\r\n' )
|
|
148 if not score_line or score_line.startswith( '#' ):
|
|
149 continue
|
|
150 if score_line.startswith( '>' ):
|
|
151 if score:
|
|
152 scores = score.split()
|
|
153 score_found = True
|
|
154 score = None
|
|
155 else:
|
|
156 for val in score_line.split():
|
|
157 try:
|
|
158 int( val )
|
|
159 except:
|
|
160 score_file.close()
|
|
161 stop_err( "Non-numerical value '%s' in score file." % val )
|
|
162 if not score:
|
|
163 score = score_line
|
|
164 else:
|
|
165 score = '%s %s' % ( score, score_line )
|
|
166 elif data_type == 'tabular':
|
|
167 score = score_file.readline().rstrip('\r\n')
|
|
168 loc = score.split( '\t' )
|
|
169 for base in loc:
|
|
170 nuc_error = base.split()
|
|
171 try:
|
|
172 nuc_error[0] = int( nuc_error[0] )
|
|
173 nuc_error[1] = int( nuc_error[1] )
|
|
174 nuc_error[2] = int( nuc_error[2] )
|
|
175 nuc_error[3] = int( nuc_error[3] )
|
|
176 big = max( nuc_error )
|
|
177 except:
|
|
178 score_file.close()
|
|
179 stop_err( "Invalid characters in line %d: '%s'" % ( i, line ) )
|
|
180 scores.append( big )
|
|
181 if scores:
|
|
182 new_trim_seq_segments = trim_seq( seq, scores, arg, threshold_trim, threshold_report )
|
|
183 append_to_outfile( outfile_seq_name, seq_title, new_trim_seq_segments )
|
|
184
|
|
185 seq_title = line
|
|
186 seq = None
|
|
187 else:
|
|
188 if not seq:
|
|
189 seq = line
|
|
190 else:
|
|
191 seq = "%s%s" % ( seq, line )
|
|
192 if seq:
|
|
193 scores = []
|
|
194 if data_type == 'fasta':
|
|
195 score = None
|
|
196 while score_line:
|
|
197 score_line = score_file.readline().rstrip( '\r\n' )
|
|
198 if not score_line or score_line.startswith( '#' ) or score_line.startswith( '>' ):
|
|
199 continue
|
|
200 for val in score_line.split():
|
|
201 try:
|
|
202 int( val )
|
|
203 except:
|
|
204 score_file.close()
|
|
205 stop_err( "Non-numerical value '%s' in score file." % val )
|
|
206 if not score:
|
|
207 score = score_line
|
|
208 else:
|
|
209 score = "%s %s" % ( score, score_line )
|
|
210 if score:
|
|
211 scores = score.split()
|
|
212 elif data_type == 'tabular':
|
|
213 score = score_file.readline().rstrip('\r\n')
|
|
214 loc = score.split( '\t' )
|
|
215 for base in loc:
|
|
216 nuc_error = base.split()
|
|
217 try:
|
|
218 nuc_error[0] = int( nuc_error[0] )
|
|
219 nuc_error[1] = int( nuc_error[1] )
|
|
220 nuc_error[2] = int( nuc_error[2] )
|
|
221 nuc_error[3] = int( nuc_error[3] )
|
|
222 big = max( nuc_error )
|
|
223 except:
|
|
224 score_file.close()
|
|
225 stop_err( "Invalid characters in line %d: '%s'" % ( i, line ) )
|
|
226 scores.append( big )
|
|
227 if scores:
|
|
228 new_trim_seq_segments = trim_seq( seq, scores, arg, threshold_trim, threshold_report )
|
|
229 append_to_outfile( outfile_seq_name, seq_title, new_trim_seq_segments )
|
|
230 score_file.close()
|
|
231 else:
|
|
232 stop_err( "Cannot locate sequence file '%s'or score file '%s'." % ( seq_infile_name, score_infile_name ) )
|
|
233
|
|
234 if __name__ == "__main__": __main__()
|