Mercurial > repos > devteam > fastq_trimmer
changeset 0:feb5479a48ff draft
Imported from capsule None
author | devteam |
---|---|
date | Thu, 23 Jan 2014 12:31:36 -0500 (2014-01-23) |
parents | |
children | 3be753901f6e |
files | fastq_trimmer.py fastq_trimmer.xml test-data/empty_file.dat test-data/fastq_trimmer_out1.fastqsanger test-data/sanger_full_range_original_sanger.fastqsanger tool_dependencies.xml |
diffstat | 5 files changed, 186 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
# fastq_trimmer.py
#Dan Blankenberg
"""Trim a number of bases from the 5' and 3' ends of every read in a FASTQ file.

Usage:
    fastq_trimmer.py INPUT OUTPUT LEFT_OFFSET RIGHT_OFFSET OFFSET_TYPE INPUT_TYPE KEEP_ZERO_LENGTH
"""
import sys
from galaxy_utils.sequence.fastq import fastqReader, fastqWriter


def main():
    """Read the command line, trim each read and write the result.

    Positional command-line arguments (``sys.argv[1:]``):
        1. input FASTQ filename
        2. output FASTQ filename
        3. left (5') offset
        4. right (3') offset
        5. ``'offsets_percent'`` to interpret the offsets as a percentage of
           each read's length; any other value means absolute column counts
        6. FASTQ variant handed as ``format`` to the reader and the writer;
           an empty string falls back to ``'sanger'``
        7. ``'keep_zero_length'`` to write reads that were trimmed down to
           length zero; any other value drops them and counts them

    A short summary of the number of processed / excluded reads is printed
    to standard output.

    Raises:
        IndexError: if fewer than seven arguments are supplied.
        ValueError: if an offset cannot be parsed as a number.
    """
    input_filename = sys.argv[1]
    output_filename = sys.argv[2]
    left_offset = sys.argv[3]
    right_offset = sys.argv[4]
    percent_offsets = sys.argv[5] == 'offsets_percent'
    input_type = sys.argv[6] or 'sanger'
    keep_zero_length = sys.argv[7] == 'keep_zero_length'

    # The offsets do not depend on the read, so parse them once up front.
    # This also makes a malformed offset fail before any file is touched.
    if percent_offsets:
        left_fraction = float( left_offset ) / 100.0
        right_fraction = float( right_offset ) / 100.0
    else:
        left_column_offset = int( left_offset )
        right_column_offset = int( right_offset )

    num_reads_excluded = 0
    num_reads = None
    out = fastqWriter( open( output_filename, 'wb' ), format = input_type )
    try:
        # 'with' guarantees the input handle is released, even on a parse error.
        with open( input_filename ) as input_file:
            for num_reads, fastq_read in enumerate( fastqReader( input_file, format = input_type ) ):
                if percent_offsets:
                    # Percent offsets are relative to this read's own length.
                    read_length = float( len( fastq_read ) )
                    left_column_offset = int( round( left_fraction * read_length ) )
                    right_column_offset = int( round( right_fraction * read_length ) )
                # A right offset of 0 has to become an open-ended slice (None);
                # negating it would give an end index of 0 and empty the read.
                if right_column_offset > 0:
                    slice_end = -right_column_offset
                else:
                    slice_end = None
                fastq_read = fastq_read.slice( left_column_offset, slice_end )
                if keep_zero_length or len( fastq_read ):
                    out.write( fastq_read )
                else:
                    num_reads_excluded += 1
    finally:
        # Always close the writer so the output is released on errors too.
        out.close()

    # Single-argument print( ... ) behaves the same under Python 2 and 3.
    if num_reads is None:
        print( "No valid fastq reads could be processed." )
    else:
        # enumerate() is zero-based, hence the + 1.
        print( "%i fastq reads were processed." % ( num_reads + 1 ) )
    if num_reads_excluded:
        print( "%i reads of zero length were excluded from the output." % num_reads_excluded )


if __name__ == "__main__":
    main()
<!-- fastq_trimmer.xml: Galaxy tool definition that runs fastq_trimmer.py -->
<tool id="fastq_trimmer" name="FASTQ Trimmer" version="1.0.0">
  <description>by column</description>
  <requirements>
    <requirement type="package" version="1.0.0">galaxy_sequence_utils</requirement>
  </requirements>
  <!--
    Argument order must match what fastq_trimmer.py reads from sys.argv:
    input, output, left offset, right offset, offset type, FASTQ variant, zero-length policy.
    The FASTQ variant is the input's extension with the leading 'fastq' sliced off
    (e.g. 'fastqsanger' gives 'sanger').
  -->
  <command interpreter="python">fastq_trimmer.py '$input_file' '$output_file' '${offset_type['left_column_offset']}' '${offset_type['right_column_offset']}' '${offset_type['base_offset_type']}' '${input_file.extension[len( 'fastq' ):]}' '$keep_zero_length'</command>
  <inputs>
    <param name="input_file" type="data" format="fastqsanger,fastqcssanger" label="FASTQ File"/>
    <conditional name="offset_type">
      <!-- The markup in the help attribute is entity-escaped so the file stays well-formed XML. -->
      <param name="base_offset_type" type="select" label="Define Base Offsets as" help="Use Absolute for fixed length reads (Illumina, SOLiD)&lt;br&gt;Use Percentage for variable length reads (Roche/454)">
        <option value="offsets_absolute" selected="true">Absolute Values</option>
        <option value="offsets_percent">Percentage of Read Length</option>
      </param>
      <when value="offsets_absolute">
        <param name="left_column_offset" label="Offset from 5' end" value="0" type="integer" help="Values start at 0, increasing from the left">
          <!-- min="0" accepts zero, so the message says "not negative" rather than "positive". -->
          <validator type="in_range" message="Base Offsets must not be negative" min="0" max="inf"/>
          <validator type="expression" message="An integer is required.">int( float( value ) ) == float( value )</validator>
        </param>
        <param name="right_column_offset" label="Offset from 3' end" value="0" type="integer" help="Values start at 0, increasing from the right">
          <validator type="in_range" message="Base Offsets must not be negative" min="0" max="inf"/>
          <validator type="expression" message="An integer is required.">int( float( value ) ) == float( value )</validator>
        </param>
      </when>
      <when value="offsets_percent">
        <param name="left_column_offset" label="Offset from 5' end" value="0" type="float">
          <validator type="in_range" message="Base Offsets must be between 0 and 100" min="0" max="100"/>
        </param>
        <param name="right_column_offset" label="Offset from 3' end" value="0" type="float">
          <validator type="in_range" message="Base Offsets must be between 0 and 100" min="0" max="100"/>
        </param>
      </when>
    </conditional>
    <!-- Boolean params take their default from 'checked'; 'selected' belongs to select options. -->
    <param name="keep_zero_length" label="Keep reads with zero length" type="boolean" truevalue="keep_zero_length" falsevalue="exclude_zero_length" checked="false"/>
  </inputs>
  <outputs>
    <data name="output_file" format="input" />
  </outputs>
  <tests>
    <test>
      <!-- Do nothing trim -->
      <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" />
      <param name="base_offset_type" value="offsets_absolute"/>
      <param name="left_column_offset" value="0"/>
      <param name="right_column_offset" value="0"/>
      <param name="keep_zero_length" value="keep_zero_length" />
      <output name="output_file" file="sanger_full_range_original_sanger.fastqsanger" />
    </test>
    <!-- Trim to empty File -->
    <test>
      <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" />
      <param name="base_offset_type" value="offsets_absolute"/>
      <param name="left_column_offset" value="30"/>
      <param name="right_column_offset" value="64"/>
      <param name="keep_zero_length" value="exclude_zero_length" />
      <output name="output_file" file="empty_file.dat" />
    </test>
    <test>
      <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" />
      <param name="base_offset_type" value="offsets_percent"/>
      <param name="left_column_offset" value="50"/>
      <param name="right_column_offset" value="50"/>
      <param name="keep_zero_length" value="exclude_zero_length" />
      <output name="output_file" file="empty_file.dat" />
    </test>
    <!-- Trim to 4 inner-most bases -->
    <test>
      <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" />
      <param name="base_offset_type" value="offsets_absolute"/>
      <param name="left_column_offset" value="45"/>
      <param name="right_column_offset" value="45"/>
      <param name="keep_zero_length" value="exclude_zero_length" />
      <output name="output_file" file="fastq_trimmer_out1.fastqsanger" />
    </test>
    <test>
      <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" />
      <param name="base_offset_type" value="offsets_percent"/>
      <param name="left_column_offset" value="47.87"/>
      <param name="right_column_offset" value="47.87"/>
      <param name="keep_zero_length" value="exclude_zero_length" />
      <output name="output_file" file="fastq_trimmer_out1.fastqsanger" />
    </test>
  </tests>
  <!-- CDATA keeps the '<' and '>' characters of the quality strings and the citation link literal. -->
  <help><![CDATA[
This tool allows you to trim the ends of reads.

You can specify either absolute or percent-based offsets. Offsets are calculated, starting at 0, from the respective end to be trimmed. When using the percent-based method, offsets are rounded to the nearest integer.

For example, if you have a read of length 36::

  @Some FASTQ Sanger Read
  CAATATGTNCTCACTGATAAGTGGATATNAGCNCCA
  +
  =@@.@;B-%?8>CBA@>7@7BBCA4-48%<;;%<B@

And you set absolute offsets of 2 and 9::

  @Some FASTQ Sanger Read
  ATATGTNCTCACTGATAAGTGGATA
  +
  @.@;B-%?8>CBA@>7@7BBCA4-4

Or you set percent offsets of 6% and 20% (corresponds to absolute offsets of 2,7 for a read length of 36)::

  @Some FASTQ Sanger Read
  ATATGTNCTCACTGATAAGTGGATATN
  +
  @.@;B-%?8>CBA@>7@7BBCA4-48%

-----

.. class:: warningmark

Trimming a color space read will cause any adapter base to be lost.

------

**Citation**

If you use this tool, please cite `Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. <http://www.ncbi.nlm.nih.gov/pubmed/20562416>`_


  ]]></help>
</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/fastq_trimmer_out1.fastqsanger Thu Jan 23 12:31:36 2014 -0500 @@ -0,0 +1,8 @@ +@FAKE0001 Original version has PHRED scores from 0 to 93 inclusive (in that order) +CGTA ++ +NOPQ +@FAKE0002 Original version has PHRED scores from 93 to 0 inclusive (in that order) +ATGC ++ +QPON
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sanger_full_range_original_sanger.fastqsanger Thu Jan 23 12:31:36 2014 -0500 @@ -0,0 +1,8 @@ +@FAKE0001 Original version has PHRED scores from 0 to 93 inclusive (in that order) +ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTAC ++ +!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ +@FAKE0002 Original version has PHRED scores from 93 to 0 inclusive (in that order) +CATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCA ++ +~}|{zyxwvutsrqponmlkjihgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDCBA@?>=<;:9876543210/.-,+*)('&%$#"!
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Thu Jan 23 12:31:36 2014 -0500 @@ -0,0 +1,6 @@ +<?xml version="1.0"?> +<tool_dependency> + <package name="galaxy_sequence_utils" version="1.0.0"> + <repository changeset_revision="195699b1562a" name="package_galaxy_utils_1_0" owner="devteam" prior_installation_required="False" toolshed="http://testtoolshed.g2.bx.psu.edu" /> + </package> +</tool_dependency>