Mercurial > repos > devteam > fastq_trimmer_by_quality
changeset 0:5070dd206927 draft
Imported from capsule None
author | devteam |
---|---|
date | Thu, 23 Jan 2014 12:31:18 -0500 (2014-01-23) |
parents | |
children | f695662366e2 |
files | fastq_trimmer_by_quality.py fastq_trimmer_by_quality.xml test-data/empty_file.dat test-data/sanger_full_range_empty_reads.fastqsanger test-data/sanger_full_range_original_sanger.fastqsanger test-data/sanger_full_range_quality_trimmed_out_1.fastqsanger test-data/sanger_full_range_quality_trimmed_out_2.fastqsanger test-data/sanger_full_range_quality_trimmed_out_3.fastqsanger tool_dependencies.xml |
diffstat | 8 files changed, 320 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
# fastq_trimmer_by_quality.py
#Dan Blankenberg
"""Trim the ends of FASTQ reads using a sliding window over quality scores.

For each read, a window of ``window_size`` quality scores is moved inward
from the 5' and/or 3' end in increments of ``window_step``.  The scores in
the window are aggregated (min, max, sum or mean) and compared against a
threshold; trimming at that end stops at the first window for which the
comparison holds.  Optionally, up to ``exclude_count`` scores may be left
out of the window before aggregating.
"""
from itertools import combinations
from operator import eq, ge, gt, le, lt, ne
from optparse import OptionParser


def mean(score_list):
    """Return the arithmetic mean of ``score_list`` as a float.

    Raises ZeroDivisionError for an empty list; ``exclude_and_compare``
    never aggregates an empty window, so callers in this file are safe.
    """
    return float(sum(score_list)) / float(len(score_list))


# Maps the --aggregation_action choice to the callable applied to a window.
ACTION_METHODS = {'min': min, 'max': max, 'sum': sum, 'mean': mean}

# Maps the --score_comparison choice to the matching binary predicate.
_COMPARISONS = {'>': gt, '>=': ge, '==': eq, '<': lt, '<=': le, '!=': ne}


def compare(aggregated_value, operator, threshold_value):
    """Return ``aggregated_value <operator> threshold_value``.

    ``operator`` is one of the strings '>', '>=', '==', '<', '<=' or '!='.

    Raises ValueError for any other operator string.  An unknown operator
    used to fall through and return None, which callers interpreted as
    "condition not met" and therefore trimmed reads silently.
    """
    try:
        comparison = _COMPARISONS[operator]
    except KeyError:
        raise ValueError("Unknown comparison operator: %r" % (operator,))
    return comparison(aggregated_value, threshold_value)


def exclude(value_list, exclude_indexes):
    """Return a copy of ``value_list`` without the positions in ``exclude_indexes``.

    Indexes that fall outside ``value_list`` are ignored.
    """
    skipped = frozenset(exclude_indexes)
    return [val for i, val in enumerate(value_list) if i not in skipped]


def exclude_and_compare(aggregate_action, aggregate_list, operator, threshold_value, exclude_indexes=None):
    """Return True if the window satisfies the comparison, with or without exclusions.

    aggregate_action -- callable that reduces a list of scores to one value
    aggregate_list   -- the quality scores inside the current window
    operator         -- comparison operator string accepted by ``compare``
    threshold_value  -- value the aggregate is compared against
    exclude_indexes  -- optional iterable of index collections; each one is
                        removed from the window in turn and the remainder is
                        tested again

    An empty window, or a window left empty by an exclusion, counts as a match.
    """
    if not aggregate_list or compare(aggregate_action(aggregate_list), operator, threshold_value):
        return True
    for exclude_index in exclude_indexes or ():
        excluded_list = exclude(aggregate_list, exclude_index)
        if not excluded_list or compare(aggregate_action(excluded_list), operator, threshold_value):
            return True
    return False


def _build_exclude_window_indexes(exclude_count, window_size):
    """Return every set of 1..min(exclude_count, window_size) window indexes.

    Each entry is a sorted list of positions inside the window that may be
    left out of the aggregation.  The result is empty when ``exclude_count``
    is zero or negative.
    """
    max_excluded = min(exclude_count, window_size)
    return [
        list(indexes)
        for size in range(1, max_excluded + 1)
        for indexes in combinations(range(window_size), size)
    ]


def _find_5prime_start(quality_list, action, window_size, window_step, operator, threshold_value, exclude_indexes):
    """Return the left edge of the first passing window scanning from the 5' end.

    Returns None when no window passes, i.e. the whole read is to be trimmed.
    The last windows may be shorter than ``window_size`` because the slice is
    truncated at the end of the read.
    """
    for lwindow_position in range(0, len(quality_list), window_step):
        window = quality_list[lwindow_position:lwindow_position + window_size]
        if exclude_and_compare(action, window, operator, threshold_value, exclude_indexes):
            return lwindow_position
    return None


def _find_3prime_end(quality_list, action, window_size, window_step, operator, threshold_value, exclude_indexes):
    """Return the right edge of the first passing window scanning from the 3' end.

    Returns None when no window passes, i.e. the whole read is to be trimmed.

    NOTE(review): unlike the 5' scan, only full-size windows are evaluated
    here, so a read shorter than ``window_size`` is always trimmed to zero
    length from the 3' end.  Kept as-is to preserve existing output.
    """
    rwindow_position = len(quality_list)
    while rwindow_position > 0 and rwindow_position - window_size >= 0:
        window = quality_list[rwindow_position - window_size:rwindow_position]
        if exclude_and_compare(action, window, operator, threshold_value, exclude_indexes):
            return rwindow_position
        rwindow_position -= window_step
    return None


def main():
    """Parse the command line, trim every read of the input and write the result.

    Prints a short summary of the number of reads processed and of the
    zero-length reads that were dropped.
    """
    # Imported here so the helpers above can be imported and tested without
    # the galaxy_sequence_utils package being installed.
    from galaxy_utils.sequence.fastq import fastqReader, fastqWriter

    usage = "usage: %prog [options] input_file output_file"
    parser = OptionParser(usage=usage)
    parser.add_option('-f', '--format', dest='format', type='choice', default='sanger', choices=('sanger', 'cssanger', 'solexa', 'illumina'), help='FASTQ variant type')
    parser.add_option('-s', '--window_size', type="int", dest='window_size', default=1, help='Window size')
    parser.add_option('-t', '--window_step', type="int", dest='window_step', default=1, help='Window step')
    parser.add_option('-e', '--trim_ends', type="choice", dest='trim_ends', default='53', choices=('5', '3', '53', '35'), help='Ends to Trim')
    parser.add_option('-a', '--aggregation_action', type="choice", dest='aggregation_action', default='min', choices=('min', 'max', 'sum', 'mean'), help='Aggregate action for window')
    parser.add_option('-x', '--exclude_count', type="int", dest='exclude_count', default=0, help='Maximum number of bases to exclude from the window during aggregation')
    parser.add_option('-c', '--score_comparison', type="choice", dest='score_comparison', default='>=', choices=('>', '>=', '==', '<', '<=', '!='), help='Keep read when aggregate score is')
    parser.add_option('-q', '--quality_score', type="float", dest='quality_score', default=0.0, help='Quality Score')
    parser.add_option("-k", "--keep_zero_length", action="store_true", dest="keep_zero_length", default=False, help="Keep reads with zero length")
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error("Need to specify an input file and an output file")

    if options.window_size < 1:
        parser.error('You must specify a strictly positive window size')

    if options.window_step < 1:
        parser.error('You must specify a strictly positive step size')

    # Exhaustive list of window index sets that can be excluded from aggregation.
    exclude_window_indexes = _build_exclude_window_indexes(options.exclude_count, options.window_size)
    action = ACTION_METHODS[options.aggregation_action]

    num_reads = None
    num_reads_excluded = 0
    out = fastqWriter(open(args[1], 'wb'), format=options.format)
    try:
        # The input handle is closed on exit, including on error.
        with open(args[0]) as input_handle:
            for num_reads, fastq_read in enumerate(fastqReader(input_handle, format=options.format)):
                # trim_ends is '5', '3', '53' or '35'; each character is one end,
                # processed in the order given.
                for trim_end in options.trim_ends:
                    quality_list = fastq_read.get_decimal_quality_scores()
                    if trim_end == '5':
                        start = _find_5prime_start(quality_list, action, options.window_size, options.window_step, options.score_comparison, options.quality_score, exclude_window_indexes)
                        trimmed = None if start is None else fastq_read.slice(start, None)
                    else:
                        end = _find_3prime_end(quality_list, action, options.window_size, options.window_step, options.score_comparison, options.quality_score, exclude_window_indexes)
                        trimmed = None if end is None else fastq_read.slice(None, end)
                    if trimmed is None:
                        # No window passed: the read is emptied in place.
                        fastq_read.sequence = ''
                        fastq_read.quality = ''
                    else:
                        fastq_read = trimmed
                if options.keep_zero_length or len(fastq_read):
                    out.write(fastq_read)
                else:
                    num_reads_excluded += 1
    finally:
        out.close()

    # Single-argument print() behaves the same under Python 2 and Python 3.
    if num_reads is None:
        print("No valid FASTQ reads could be processed.")
    else:
        print("%i FASTQ reads were processed." % (num_reads + 1))
    if num_reads_excluded:
        print("%i reads of zero length were excluded from the output." % num_reads_excluded)


if __name__ == "__main__":
    main()
<!-- fastq_trimmer_by_quality.xml: Galaxy tool wrapper for fastq_trimmer_by_quality.py -->
<tool id="fastq_quality_trimmer" name="FASTQ Quality Trimmer" version="1.0.0">
  <description>by sliding window</description>
  <requirements>
    <requirement type="package" version="1.0.0">galaxy_sequence_utils</requirement>
  </requirements>
  <!-- The FASTQ variant passed to -f is the dataset extension with the leading 'fastq' removed (e.g. fastqsanger gives sanger). -->
  <command interpreter="python">fastq_trimmer_by_quality.py '$input_file' '$output_file' -f '${input_file.extension[len( 'fastq' ):]}' -s '$window_size'
    -t '$step_size' -e '$trim_ends' -a '$aggregation_action' -x '$exclude_count' -c '$score_comparison' -q '$quality_score'
    #if $keep_zero_length.value:
        -k
    #end if
  </command>
  <inputs>
    <param name="input_file" type="data" format="fastqsanger,fastqcssanger" label="FASTQ File"/>
    <param name="keep_zero_length" label="Keep reads with zero length" type="boolean" truevalue="keep_zero_length" falsevalue="exclude_zero_length" checked="false"/>
    <param name="trim_ends" type="select" label="Trim ends">
      <option value="53" selected="True">5' and 3'</option>
      <option value="5">5' only</option>
      <option value="3">3' only</option>
    </param>
    <param name="window_size" type="integer" value="1" label="Window size"/>
    <param name="step_size" type="integer" value="1" label="Step Size" />
    <param name="exclude_count" label="Maximum number of bases to exclude from the window during aggregation" value="0" type="integer" />
    <param name="aggregation_action" type="select" label="Aggregate action for window">
      <option value="min" selected="True">min score</option>
      <option value="max">max score</option>
      <option value="sum">sum of scores</option>
      <option value="mean">mean of scores</option>
    </param>
    <!-- The comparison operators must be written as XML entities so the document stays well-formed. -->
    <param name="score_comparison" type="select" label="Trim until aggregate score is">
      <sanitizer>
        <valid initial="none">
          <add value="&lt;&gt;=!"/> <!-- only allow lt, gt, e, le, ge, ne for this parameter; will be single-quote escaped on commandline -->
        </valid>
      </sanitizer>
      <option value="&gt;">&gt;</option>
      <option value="&gt;=" selected="true">&gt;=</option>
      <option value="==">==</option>
      <option value="!=">!=</option>
      <option value="&lt;">&lt;</option>
      <option value="&lt;=">&lt;=</option>
    </param>
    <param name="quality_score" label="Quality Score" value="0" type="float" />
  </inputs>
  <outputs>
    <data name="output_file" format="input" />
  </outputs>
  <tests>
    <test>
      <!-- Trim until window size 1 >= 20;both ends -->
      <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" />
      <param name="keep_zero_length" value="exclude_zero_length" />
      <param name="trim_ends" value="53"/>
      <param name="window_size" value="1"/>
      <param name="step_size" value="1"/>
      <param name="exclude_count" value="0"/>
      <param name="aggregation_action" value="min"/>
      <param name="score_comparison" value="&gt;="/>
      <param name="quality_score" value="20"/>
      <output name="output_file" file="sanger_full_range_quality_trimmed_out_1.fastqsanger" />
    </test>
    <test>
      <!-- Trim until window size 1 >= 20; 5' end only -->
      <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" />
      <param name="keep_zero_length" value="exclude_zero_length" />
      <param name="trim_ends" value="5"/>
      <param name="window_size" value="1"/>
      <param name="step_size" value="1"/>
      <param name="exclude_count" value="0"/>
      <param name="aggregation_action" value="min"/>
      <param name="score_comparison" value="&gt;="/>
      <param name="quality_score" value="20"/>
      <output name="output_file" file="sanger_full_range_quality_trimmed_out_2.fastqsanger" />
    </test>
    <test>
      <!-- Trim until window size 1 >= 20; 3' end only -->
      <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" />
      <param name="keep_zero_length" value="exclude_zero_length" />
      <param name="trim_ends" value="3"/>
      <param name="window_size" value="1"/>
      <param name="step_size" value="1"/>
      <param name="exclude_count" value="0"/>
      <param name="aggregation_action" value="min"/>
      <param name="score_comparison" value="&gt;="/>
      <param name="quality_score" value="20"/>
      <output name="output_file" file="sanger_full_range_quality_trimmed_out_3.fastqsanger" />
    </test>
    <test>
      <!-- Trim until window size 2 >= 1;both ends, 1 deviant score -->
      <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" />
      <param name="keep_zero_length" value="exclude_zero_length" />
      <param name="trim_ends" value="53"/>
      <param name="window_size" value="2"/>
      <param name="step_size" value="1"/>
      <param name="exclude_count" value="1"/>
      <param name="aggregation_action" value="min"/>
      <param name="score_comparison" value="&gt;="/>
      <param name="quality_score" value="1"/>
      <output name="output_file" file="sanger_full_range_original_sanger.fastqsanger" />
    </test>
    <test>
      <!-- Trim entire sequences; keep empty reads -->
      <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" />
      <param name="keep_zero_length" value="true" />
      <param name="trim_ends" value="53"/>
      <param name="window_size" value="1"/>
      <param name="step_size" value="1"/>
      <param name="exclude_count" value="0"/>
      <param name="aggregation_action" value="min"/>
      <param name="score_comparison" value="&gt;="/>
      <param name="quality_score" value="999"/>
      <output name="output_file" file="sanger_full_range_empty_reads.fastqsanger" />
    </test>
    <test>
      <!-- Trim entire sequences; discard empty reads -->
      <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" />
      <param name="keep_zero_length"/>
      <param name="trim_ends" value="53"/>
      <param name="window_size" value="1"/>
      <param name="step_size" value="1"/>
      <param name="exclude_count" value="0"/>
      <param name="aggregation_action" value="min"/>
      <param name="score_comparison" value="&gt;="/>
      <param name="quality_score" value="999"/>
      <output name="output_file" file="empty_file.dat" />
    </test>
  </tests>
  <help>
This tool allows you to trim the ends of reads based upon the aggregate value of quality scores found within a sliding window; a sliding window of size 1 is equivalent to 'simple' trimming of the ends.

The user specifies the aggregating action (min, max, sum, mean) to perform on the quality score values found within the sliding window to be used with the user defined comparison operation and comparison value.

The user can provide a maximum count of bases that can be excluded from the aggregation within the window. When set, this tool will first check the aggregation of the entire window, then after removing 1 value, then after removing 2 values, up to the number declared. Setting this value to be equal to or greater than the window size will cause no trimming to occur.

-----

.. class:: warningmark

Trimming a color space read will cause any adapter base to be lost.

------

**Citation**

If you use this tool, please cite `Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. &lt;http://www.ncbi.nlm.nih.gov/pubmed/20562416&gt;`_


  </help>
</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sanger_full_range_empty_reads.fastqsanger Thu Jan 23 12:31:18 2014 -0500 @@ -0,0 +1,8 @@ +@FAKE0001 Original version has PHRED scores from 0 to 93 inclusive (in that order) + ++ + +@FAKE0002 Original version has PHRED scores from 93 to 0 inclusive (in that order) + ++ +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sanger_full_range_original_sanger.fastqsanger Thu Jan 23 12:31:18 2014 -0500 @@ -0,0 +1,8 @@ +@FAKE0001 Original version has PHRED scores from 0 to 93 inclusive (in that order) +ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTAC ++ +!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ +@FAKE0002 Original version has PHRED scores from 93 to 0 inclusive (in that order) +CATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCA ++ +~}|{zyxwvutsrqponmlkjihgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDCBA@?>=<;:9876543210/.-,+*)('&%$#"!
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sanger_full_range_quality_trimmed_out_1.fastqsanger Thu Jan 23 12:31:18 2014 -0500 @@ -0,0 +1,8 @@ +@FAKE0001 Original version has PHRED scores from 0 to 93 inclusive (in that order) +ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTAC ++ +56789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ +@FAKE0002 Original version has PHRED scores from 93 to 0 inclusive (in that order) +CATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCA ++ +~}|{zyxwvutsrqponmlkjihgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDCBA@?>=<;:98765
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sanger_full_range_quality_trimmed_out_2.fastqsanger Thu Jan 23 12:31:18 2014 -0500 @@ -0,0 +1,8 @@ +@FAKE0001 Original version has PHRED scores from 0 to 93 inclusive (in that order) +ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTAC ++ +56789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ +@FAKE0002 Original version has PHRED scores from 93 to 0 inclusive (in that order) +CATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCA ++ +~}|{zyxwvutsrqponmlkjihgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDCBA@?>=<;:9876543210/.-,+*)('&%$#"!
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sanger_full_range_quality_trimmed_out_3.fastqsanger Thu Jan 23 12:31:18 2014 -0500 @@ -0,0 +1,8 @@ +@FAKE0001 Original version has PHRED scores from 0 to 93 inclusive (in that order) +ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTAC ++ +!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ +@FAKE0002 Original version has PHRED scores from 93 to 0 inclusive (in that order) +CATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCA ++ +~}|{zyxwvutsrqponmlkjihgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDCBA@?>=<;:98765
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Thu Jan 23 12:31:18 2014 -0500 @@ -0,0 +1,6 @@ +<?xml version="1.0"?> +<tool_dependency> + <package name="galaxy_sequence_utils" version="1.0.0"> + <repository changeset_revision="195699b1562a" name="package_galaxy_utils_1_0" owner="devteam" prior_installation_required="False" toolshed="http://testtoolshed.g2.bx.psu.edu" /> + </package> +</tool_dependency>