| 
0
 | 
     1 #!/usr/bin/env python
 | 
| 
 | 
     2 """
 | 
| 
 | 
     3 trim reads based on the quality scores
 | 
| 
 | 
     4 input: read file and quality score file
 | 
| 
 | 
     5 output: trimmed read file
 | 
| 
 | 
     6 """
 | 
| 
 | 
     7 
 | 
| 
 | 
     8 import os, sys, math, tempfile, re
 | 
| 
 | 
     9 
 | 
| 
 | 
    10 assert sys.version_info[:2] >= ( 2, 4 )
 | 
| 
 | 
    11 
 | 
| 
 | 
    12 def stop_err( msg ):
 | 
| 
 | 
    13     sys.stderr.write( "%s\n" % msg )
 | 
| 
 | 
    14     sys.exit()
 | 
| 
 | 
    15 
 | 
| 
 | 
    16 def append_to_outfile( outfile_name, seq_title, segments ):
 | 
| 
 | 
    17     segments = segments.split( ',' )
 | 
| 
 | 
    18     if len( segments ) > 1:
 | 
| 
 | 
    19         outfile = open( outfile_name, 'a' )
 | 
| 
 | 
    20         for i in range( len( segments ) ):
 | 
| 
 | 
    21             outfile.write( "%s_%d\n%s\n" % ( seq_title, i, segments[i] ) )
 | 
| 
 | 
    22         outfile.close()
 | 
| 
 | 
    23     elif segments[0]:
 | 
| 
 | 
    24         outfile = open( outfile_name, 'a' )
 | 
| 
 | 
    25         outfile.write( "%s\n%s\n" % ( seq_title, segments[0] ) )
 | 
| 
 | 
    26         outfile.close()
 | 
| 
 | 
    27 
 | 
| 
 | 
    28 def trim_seq( seq, score, arg, trim_score, threshold ):
 | 
| 
 | 
    29     seq_method = '454'
 | 
| 
 | 
    30     trim_pos = 0
 | 
| 
 | 
    31     # trim after a certain position
 | 
| 
 | 
    32     if arg.isdigit():
 | 
| 
 | 
    33         keep_homopolymers = False
 | 
| 
 | 
    34         trim_pos = int( arg )    
 | 
| 
 | 
    35         if trim_pos > 0 and trim_pos < len( seq ):
 | 
| 
 | 
    36             seq = seq[0:trim_pos]
 | 
| 
 | 
    37     else:
 | 
| 
 | 
    38         keep_homopolymers = arg=='yes'
 | 
| 
 | 
    39         
 | 
| 
 | 
    40     new_trim_seq = ''
 | 
| 
 | 
    41     max_segment = 0
 | 
| 
 | 
    42 
 | 
| 
 | 
    43     for i in range( len( seq ) ):
 | 
| 
 | 
    44         if i >= len( score ):
 | 
| 
 | 
    45             score.append(-1)   
 | 
| 
 | 
    46         if int( score[i] ) >= trim_score:
 | 
| 
 | 
    47             pass_nuc = seq[ i:( i + 1 ) ]
 | 
| 
 | 
    48         else:
 | 
| 
 | 
    49             if keep_homopolymers and ( (i == 0 ) or ( seq[ i:( i + 1 ) ].lower() == seq[ ( i - 1 ):i ].lower() ) ):
 | 
| 
 | 
    50                 pass_nuc = seq[ i:( i + 1 ) ]
 | 
| 
 | 
    51             else:
 | 
| 
 | 
    52                 pass_nuc = ' '    
 | 
| 
 | 
    53         new_trim_seq = '%s%s' % ( new_trim_seq, pass_nuc )
 | 
| 
 | 
    54         # find the max substrings
 | 
| 
 | 
    55         segments = new_trim_seq.split()
 | 
| 
 | 
    56         max_segment = ''
 | 
| 
 | 
    57         len_max_segment = 0
 | 
| 
 | 
    58         if threshold == 0:
 | 
| 
 | 
    59             for seg in segments:
 | 
| 
 | 
    60                 if len_max_segment < len( seg ):
 | 
| 
 | 
    61                     max_segment = '%s,' % seg
 | 
| 
 | 
    62                     len_max_segment = len( seg )
 | 
| 
 | 
    63                 elif len_max_segment == len( seg ):
 | 
| 
 | 
    64                     max_segment = '%s%s,' % ( max_segment, seg )
 | 
| 
 | 
    65         else:
 | 
| 
 | 
    66             for seg in segments:
 | 
| 
 | 
    67                 if len( seg ) >= threshold:
 | 
| 
 | 
    68                     max_segment = '%s%s,' % ( max_segment, seg )
 | 
| 
 | 
    69     return max_segment[ 0:-1 ]
 | 
| 
 | 
    70 
 | 
| 
 | 
    71 def __main__():
 | 
| 
 | 
    72     
 | 
| 
 | 
    73     try:
 | 
| 
 | 
    74         threshold_trim = int( sys.argv[1].strip() )
 | 
| 
 | 
    75     except:
 | 
| 
 | 
    76         stop_err( "Minimal quality score must be numeric." )
 | 
| 
 | 
    77     try:
 | 
| 
 | 
    78         threshold_report = int( sys.argv[2].strip() )
 | 
| 
 | 
    79     except:
 | 
| 
 | 
    80         stop_err( "Minimal length of trimmed reads must be numeric." )
 | 
| 
 | 
    81     outfile_seq_name = sys.argv[3].strip()
 | 
| 
 | 
    82     infile_seq_name = sys.argv[4].strip()
 | 
| 
 | 
    83     infile_score_name = sys.argv[5].strip()
 | 
| 
 | 
    84     arg = sys.argv[6].strip()
 | 
| 
 | 
    85 
 | 
| 
 | 
    86     seq_infile_name = infile_seq_name
 | 
| 
 | 
    87     score_infile_name = infile_score_name
 | 
| 
 | 
    88     
 | 
| 
 | 
    89 
 | 
| 
 | 
    90     # Determine quailty score format: tabular or fasta format within the first 100 lines
 | 
| 
 | 
    91     seq_method = None
 | 
| 
 | 
    92     data_type = None
 | 
| 
 | 
    93     for i, line in enumerate( file( score_infile_name ) ):
 | 
| 
 | 
    94         line = line.rstrip( '\r\n' )
 | 
| 
 | 
    95         if not line or line.startswith( '#' ):
 | 
| 
 | 
    96             continue
 | 
| 
 | 
    97         if data_type == None:
 | 
| 
 | 
    98             if line.startswith( '>' ):
 | 
| 
 | 
    99                 data_type = 'fasta'
 | 
| 
 | 
   100                 continue
 | 
| 
 | 
   101             elif len( line.split( '\t' ) ) > 0:
 | 
| 
 | 
   102                 fields = line.split()
 | 
| 
 | 
   103                 for score in fields:
 | 
| 
 | 
   104                     try:
 | 
| 
 | 
   105                         int( score )
 | 
| 
 | 
   106                         data_type = 'tabular'
 | 
| 
 | 
   107                         seq_method = 'solexa'
 | 
| 
 | 
   108                         break
 | 
| 
 | 
   109                     except:
 | 
| 
 | 
   110                         break
 | 
| 
 | 
   111         elif data_type == 'fasta':
 | 
| 
 | 
   112             fields = line.split()
 | 
| 
 | 
   113             for score in fields:
 | 
| 
 | 
   114                 try: 
 | 
| 
 | 
   115                     int( score )
 | 
| 
 | 
   116                     seq_method = '454'
 | 
| 
 | 
   117                     break
 | 
| 
 | 
   118                 except:
 | 
| 
 | 
   119                     break
 | 
| 
 | 
   120         if i == 100:
 | 
| 
 | 
   121             break
 | 
| 
 | 
   122 
 | 
| 
 | 
   123     if data_type is None:
 | 
| 
 | 
   124         stop_err( 'This tool can only use fasta data or tabular data.' ) 
 | 
| 
 | 
   125     if seq_method is None:
 | 
| 
 | 
   126         stop_err( 'Invalid data for fasta format.')
 | 
| 
 | 
   127     
 | 
| 
 | 
   128     if os.path.exists( seq_infile_name ) and os.path.exists( score_infile_name ):
 | 
| 
 | 
   129         seq = None
 | 
| 
 | 
   130         score = None
 | 
| 
 | 
   131         score_found = False
 | 
| 
 | 
   132         
 | 
| 
 | 
   133         score_file = open( score_infile_name, 'r' )
 | 
| 
 | 
   134 
 | 
| 
 | 
   135         for i, line in enumerate( open( seq_infile_name ) ):
 | 
| 
 | 
   136             line = line.rstrip( '\r\n' )
 | 
| 
 | 
   137             if not line or line.startswith( '#' ):
 | 
| 
 | 
   138                 continue
 | 
| 
 | 
   139             if line.startswith( '>' ):
 | 
| 
 | 
   140                 if seq:
 | 
| 
 | 
   141                     scores = []
 | 
| 
 | 
   142                     if data_type == 'fasta':
 | 
| 
 | 
   143                         score = None
 | 
| 
 | 
   144                         score_found = False
 | 
| 
 | 
   145                         score_line = 'start'
 | 
| 
 | 
   146                         while not score_found and score_line:
 | 
| 
 | 
   147                             score_line = score_file.readline().rstrip( '\r\n' )
 | 
| 
 | 
   148                             if not score_line or score_line.startswith( '#' ):
 | 
| 
 | 
   149                                 continue
 | 
| 
 | 
   150                             if score_line.startswith( '>' ):
 | 
| 
 | 
   151                                 if score:
 | 
| 
 | 
   152                                     scores = score.split()
 | 
| 
 | 
   153                                     score_found = True    
 | 
| 
 | 
   154                                 score = None
 | 
| 
 | 
   155                             else:
 | 
| 
 | 
   156                                 for val in score_line.split():
 | 
| 
 | 
   157                                     try:
 | 
| 
 | 
   158                                         int( val ) 
 | 
| 
 | 
   159                                     except:
 | 
| 
 | 
   160                                         score_file.close()
 | 
| 
 | 
   161                                         stop_err( "Non-numerical value '%s' in score file." % val )
 | 
| 
 | 
   162                                 if not score:
 | 
| 
 | 
   163                                     score = score_line
 | 
| 
 | 
   164                                 else:
 | 
| 
 | 
   165                                     score = '%s %s' % ( score, score_line )                                        
 | 
| 
 | 
   166                     elif data_type == 'tabular':
 | 
| 
 | 
   167                         score = score_file.readline().rstrip('\r\n')
 | 
| 
 | 
   168                         loc = score.split( '\t' )
 | 
| 
 | 
   169                         for base in loc:
 | 
| 
 | 
   170                             nuc_error = base.split()
 | 
| 
 | 
   171                             try:
 | 
| 
 | 
   172                                 nuc_error[0] = int( nuc_error[0] )
 | 
| 
 | 
   173                                 nuc_error[1] = int( nuc_error[1] )
 | 
| 
 | 
   174                                 nuc_error[2] = int( nuc_error[2] )
 | 
| 
 | 
   175                                 nuc_error[3] = int( nuc_error[3] )
 | 
| 
 | 
   176                                 big = max( nuc_error )
 | 
| 
 | 
   177                             except:
 | 
| 
 | 
   178                                 score_file.close()
 | 
| 
 | 
   179                                 stop_err( "Invalid characters in line %d: '%s'" % ( i, line ) )
 | 
| 
 | 
   180                             scores.append( big )
 | 
| 
 | 
   181                     if scores:
 | 
| 
 | 
   182                         new_trim_seq_segments = trim_seq( seq, scores, arg, threshold_trim, threshold_report )
 | 
| 
 | 
   183                         append_to_outfile( outfile_seq_name, seq_title, new_trim_seq_segments )  
 | 
| 
 | 
   184                                 
 | 
| 
 | 
   185                 seq_title = line
 | 
| 
 | 
   186                 seq = None
 | 
| 
 | 
   187             else:
 | 
| 
 | 
   188                 if not seq:
 | 
| 
 | 
   189                     seq = line
 | 
| 
 | 
   190                 else:
 | 
| 
 | 
   191                     seq = "%s%s" % ( seq, line )
 | 
| 
 | 
   192         if seq:
 | 
| 
 | 
   193             scores = []
 | 
| 
 | 
   194             if data_type == 'fasta':
 | 
| 
 | 
   195                 score = None
 | 
| 
 | 
   196                 while score_line:
 | 
| 
 | 
   197                     score_line = score_file.readline().rstrip( '\r\n' )
 | 
| 
 | 
   198                     if not score_line or score_line.startswith( '#' ) or score_line.startswith( '>' ):
 | 
| 
 | 
   199                         continue
 | 
| 
 | 
   200                     for val in score_line.split():
 | 
| 
 | 
   201                         try:
 | 
| 
 | 
   202                             int( val )
 | 
| 
 | 
   203                         except:
 | 
| 
 | 
   204                             score_file.close()
 | 
| 
 | 
   205                             stop_err( "Non-numerical value '%s' in score file." % val )
 | 
| 
 | 
   206                     if not score:
 | 
| 
 | 
   207                         score = score_line
 | 
| 
 | 
   208                     else:
 | 
| 
 | 
   209                         score = "%s %s" % ( score, score_line ) 
 | 
| 
 | 
   210                 if score: 
 | 
| 
 | 
   211                     scores = score.split()
 | 
| 
 | 
   212             elif data_type == 'tabular':
 | 
| 
 | 
   213                 score = score_file.readline().rstrip('\r\n')
 | 
| 
 | 
   214                 loc = score.split( '\t' )
 | 
| 
 | 
   215                 for base in loc:
 | 
| 
 | 
   216                     nuc_error = base.split()
 | 
| 
 | 
   217                     try:
 | 
| 
 | 
   218                         nuc_error[0] = int( nuc_error[0] )
 | 
| 
 | 
   219                         nuc_error[1] = int( nuc_error[1] )
 | 
| 
 | 
   220                         nuc_error[2] = int( nuc_error[2] )
 | 
| 
 | 
   221                         nuc_error[3] = int( nuc_error[3] )
 | 
| 
 | 
   222                         big = max( nuc_error )
 | 
| 
 | 
   223                     except:
 | 
| 
 | 
   224                         score_file.close()
 | 
| 
 | 
   225                         stop_err( "Invalid characters in line %d: '%s'" % ( i, line ) )
 | 
| 
 | 
   226                     scores.append( big )
 | 
| 
 | 
   227             if scores:
 | 
| 
 | 
   228                 new_trim_seq_segments = trim_seq( seq, scores, arg, threshold_trim, threshold_report )
 | 
| 
 | 
   229                 append_to_outfile( outfile_seq_name, seq_title, new_trim_seq_segments )  
 | 
| 
 | 
   230         score_file.close()
 | 
| 
 | 
   231     else:
 | 
| 
 | 
   232         stop_err( "Cannot locate sequence file '%s'or score file '%s'." % ( seq_infile_name, score_infile_name ) )    
 | 
| 
 | 
   233 
 | 
| 
 | 
   234 if __name__ == "__main__": __main__()
 |