Mercurial > repos > devteam > short_reads_trim_seq
comparison short_reads_trim_seq.py @ 0:8c0b907e6e5b draft
Imported from capsule None
author | devteam |
---|---|
date | Mon, 19 May 2014 10:59:57 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:8c0b907e6e5b |
---|---|
1 #!/usr/bin/env python | |
2 """ | |
3 trim reads based on the quality scores | |
4 input: read file and quality score file | |
5 output: trimmed read file | |
6 """ | |
7 | |
8 import os, sys, math, tempfile, re | |
9 | |
10 assert sys.version_info[:2] >= ( 2, 4 ) | |
11 | |
12 def stop_err( msg ): | |
13 sys.stderr.write( "%s\n" % msg ) | |
14 sys.exit() | |
15 | |
16 def append_to_outfile( outfile_name, seq_title, segments ): | |
17 segments = segments.split( ',' ) | |
18 if len( segments ) > 1: | |
19 outfile = open( outfile_name, 'a' ) | |
20 for i in range( len( segments ) ): | |
21 outfile.write( "%s_%d\n%s\n" % ( seq_title, i, segments[i] ) ) | |
22 outfile.close() | |
23 elif segments[0]: | |
24 outfile = open( outfile_name, 'a' ) | |
25 outfile.write( "%s\n%s\n" % ( seq_title, segments[0] ) ) | |
26 outfile.close() | |
27 | |
28 def trim_seq( seq, score, arg, trim_score, threshold ): | |
29 seq_method = '454' | |
30 trim_pos = 0 | |
31 # trim after a certain position | |
32 if arg.isdigit(): | |
33 keep_homopolymers = False | |
34 trim_pos = int( arg ) | |
35 if trim_pos > 0 and trim_pos < len( seq ): | |
36 seq = seq[0:trim_pos] | |
37 else: | |
38 keep_homopolymers = arg=='yes' | |
39 | |
40 new_trim_seq = '' | |
41 max_segment = 0 | |
42 | |
43 for i in range( len( seq ) ): | |
44 if i >= len( score ): | |
45 score.append(-1) | |
46 if int( score[i] ) >= trim_score: | |
47 pass_nuc = seq[ i:( i + 1 ) ] | |
48 else: | |
49 if keep_homopolymers and ( (i == 0 ) or ( seq[ i:( i + 1 ) ].lower() == seq[ ( i - 1 ):i ].lower() ) ): | |
50 pass_nuc = seq[ i:( i + 1 ) ] | |
51 else: | |
52 pass_nuc = ' ' | |
53 new_trim_seq = '%s%s' % ( new_trim_seq, pass_nuc ) | |
54 # find the max substrings | |
55 segments = new_trim_seq.split() | |
56 max_segment = '' | |
57 len_max_segment = 0 | |
58 if threshold == 0: | |
59 for seg in segments: | |
60 if len_max_segment < len( seg ): | |
61 max_segment = '%s,' % seg | |
62 len_max_segment = len( seg ) | |
63 elif len_max_segment == len( seg ): | |
64 max_segment = '%s%s,' % ( max_segment, seg ) | |
65 else: | |
66 for seg in segments: | |
67 if len( seg ) >= threshold: | |
68 max_segment = '%s%s,' % ( max_segment, seg ) | |
69 return max_segment[ 0:-1 ] | |
70 | |
71 def __main__(): | |
72 | |
73 try: | |
74 threshold_trim = int( sys.argv[1].strip() ) | |
75 except: | |
76 stop_err( "Minimal quality score must be numeric." ) | |
77 try: | |
78 threshold_report = int( sys.argv[2].strip() ) | |
79 except: | |
80 stop_err( "Minimal length of trimmed reads must be numeric." ) | |
81 outfile_seq_name = sys.argv[3].strip() | |
82 infile_seq_name = sys.argv[4].strip() | |
83 infile_score_name = sys.argv[5].strip() | |
84 arg = sys.argv[6].strip() | |
85 | |
86 seq_infile_name = infile_seq_name | |
87 score_infile_name = infile_score_name | |
88 | |
89 | |
90 # Determine quailty score format: tabular or fasta format within the first 100 lines | |
91 seq_method = None | |
92 data_type = None | |
93 for i, line in enumerate( file( score_infile_name ) ): | |
94 line = line.rstrip( '\r\n' ) | |
95 if not line or line.startswith( '#' ): | |
96 continue | |
97 if data_type == None: | |
98 if line.startswith( '>' ): | |
99 data_type = 'fasta' | |
100 continue | |
101 elif len( line.split( '\t' ) ) > 0: | |
102 fields = line.split() | |
103 for score in fields: | |
104 try: | |
105 int( score ) | |
106 data_type = 'tabular' | |
107 seq_method = 'solexa' | |
108 break | |
109 except: | |
110 break | |
111 elif data_type == 'fasta': | |
112 fields = line.split() | |
113 for score in fields: | |
114 try: | |
115 int( score ) | |
116 seq_method = '454' | |
117 break | |
118 except: | |
119 break | |
120 if i == 100: | |
121 break | |
122 | |
123 if data_type is None: | |
124 stop_err( 'This tool can only use fasta data or tabular data.' ) | |
125 if seq_method is None: | |
126 stop_err( 'Invalid data for fasta format.') | |
127 | |
128 if os.path.exists( seq_infile_name ) and os.path.exists( score_infile_name ): | |
129 seq = None | |
130 score = None | |
131 score_found = False | |
132 | |
133 score_file = open( score_infile_name, 'r' ) | |
134 | |
135 for i, line in enumerate( open( seq_infile_name ) ): | |
136 line = line.rstrip( '\r\n' ) | |
137 if not line or line.startswith( '#' ): | |
138 continue | |
139 if line.startswith( '>' ): | |
140 if seq: | |
141 scores = [] | |
142 if data_type == 'fasta': | |
143 score = None | |
144 score_found = False | |
145 score_line = 'start' | |
146 while not score_found and score_line: | |
147 score_line = score_file.readline().rstrip( '\r\n' ) | |
148 if not score_line or score_line.startswith( '#' ): | |
149 continue | |
150 if score_line.startswith( '>' ): | |
151 if score: | |
152 scores = score.split() | |
153 score_found = True | |
154 score = None | |
155 else: | |
156 for val in score_line.split(): | |
157 try: | |
158 int( val ) | |
159 except: | |
160 score_file.close() | |
161 stop_err( "Non-numerical value '%s' in score file." % val ) | |
162 if not score: | |
163 score = score_line | |
164 else: | |
165 score = '%s %s' % ( score, score_line ) | |
166 elif data_type == 'tabular': | |
167 score = score_file.readline().rstrip('\r\n') | |
168 loc = score.split( '\t' ) | |
169 for base in loc: | |
170 nuc_error = base.split() | |
171 try: | |
172 nuc_error[0] = int( nuc_error[0] ) | |
173 nuc_error[1] = int( nuc_error[1] ) | |
174 nuc_error[2] = int( nuc_error[2] ) | |
175 nuc_error[3] = int( nuc_error[3] ) | |
176 big = max( nuc_error ) | |
177 except: | |
178 score_file.close() | |
179 stop_err( "Invalid characters in line %d: '%s'" % ( i, line ) ) | |
180 scores.append( big ) | |
181 if scores: | |
182 new_trim_seq_segments = trim_seq( seq, scores, arg, threshold_trim, threshold_report ) | |
183 append_to_outfile( outfile_seq_name, seq_title, new_trim_seq_segments ) | |
184 | |
185 seq_title = line | |
186 seq = None | |
187 else: | |
188 if not seq: | |
189 seq = line | |
190 else: | |
191 seq = "%s%s" % ( seq, line ) | |
192 if seq: | |
193 scores = [] | |
194 if data_type == 'fasta': | |
195 score = None | |
196 while score_line: | |
197 score_line = score_file.readline().rstrip( '\r\n' ) | |
198 if not score_line or score_line.startswith( '#' ) or score_line.startswith( '>' ): | |
199 continue | |
200 for val in score_line.split(): | |
201 try: | |
202 int( val ) | |
203 except: | |
204 score_file.close() | |
205 stop_err( "Non-numerical value '%s' in score file." % val ) | |
206 if not score: | |
207 score = score_line | |
208 else: | |
209 score = "%s %s" % ( score, score_line ) | |
210 if score: | |
211 scores = score.split() | |
212 elif data_type == 'tabular': | |
213 score = score_file.readline().rstrip('\r\n') | |
214 loc = score.split( '\t' ) | |
215 for base in loc: | |
216 nuc_error = base.split() | |
217 try: | |
218 nuc_error[0] = int( nuc_error[0] ) | |
219 nuc_error[1] = int( nuc_error[1] ) | |
220 nuc_error[2] = int( nuc_error[2] ) | |
221 nuc_error[3] = int( nuc_error[3] ) | |
222 big = max( nuc_error ) | |
223 except: | |
224 score_file.close() | |
225 stop_err( "Invalid characters in line %d: '%s'" % ( i, line ) ) | |
226 scores.append( big ) | |
227 if scores: | |
228 new_trim_seq_segments = trim_seq( seq, scores, arg, threshold_trim, threshold_report ) | |
229 append_to_outfile( outfile_seq_name, seq_title, new_trim_seq_segments ) | |
230 score_file.close() | |
231 else: | |
232 stop_err( "Cannot locate sequence file '%s'or score file '%s'." % ( seq_infile_name, score_infile_name ) ) | |
233 | |
234 if __name__ == "__main__": __main__() |