changeset 0:feb5479a48ff draft

Imported from capsule None
author devteam
date Thu, 23 Jan 2014 12:31:36 -0500 (2014-01-23)
parents
children 3be753901f6e
files fastq_trimmer.py fastq_trimmer.xml test-data/empty_file.dat test-data/fastq_trimmer_out1.fastqsanger test-data/sanger_full_range_original_sanger.fastqsanger tool_dependencies.xml
diffstat 5 files changed, 186 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fastq_trimmer.py	Thu Jan 23 12:31:36 2014 -0500
@@ -0,0 +1,41 @@
+#Dan Blankenberg
+import sys
+from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
+
+def main():
+    input_filename = sys.argv[1]
+    output_filename = sys.argv[2]
+    left_offset = sys.argv[3]
+    right_offset = sys.argv[4]
+    percent_offsets = sys.argv[5] == 'offsets_percent'
+    input_type = sys.argv[6] or 'sanger'
+    keep_zero_length = sys.argv[7] == 'keep_zero_length'
+    
+    out = fastqWriter( open( output_filename, 'wb' ), format = input_type )
+    num_reads_excluded = 0
+    num_reads = None
+    for num_reads, fastq_read in enumerate( fastqReader( open( input_filename ), format = input_type ) ):
+        if percent_offsets:
+            left_column_offset = int( round( float( left_offset ) / 100.0 * float( len( fastq_read ) ) ) )
+            right_column_offset = int( round( float( right_offset ) / 100.0 * float( len( fastq_read ) ) ) )
+        else:
+            left_column_offset = int( left_offset )
+            right_column_offset = int( right_offset )
+        if right_column_offset > 0:
+            right_column_offset = -right_column_offset
+        else:
+            right_column_offset = None
+        fastq_read = fastq_read.slice( left_column_offset, right_column_offset )
+        if keep_zero_length or len( fastq_read ):
+            out.write( fastq_read )
+        else:
+            num_reads_excluded += 1
+    out.close()
+    if num_reads is None:
+        print "No valid fastq reads could be processed."
+    else:
+        print "%i fastq reads were processed." % ( num_reads + 1 )
+    if num_reads_excluded:
+        print "%i reads of zero length were excluded from the output." % num_reads_excluded
+
+if __name__ == "__main__": main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fastq_trimmer.xml	Thu Jan 23 12:31:36 2014 -0500
@@ -0,0 +1,123 @@
+<tool id="fastq_trimmer" name="FASTQ Trimmer" version="1.0.0">
+  <description>by column</description>
+  <requirements>
+    <requirement type="package" version="1.0.0">galaxy_sequence_utils</requirement>
+  </requirements>
+  <command interpreter="python">fastq_trimmer.py '$input_file' '$output_file' '${offset_type['left_column_offset']}' '${offset_type['right_column_offset']}' '${offset_type['base_offset_type']}' '${input_file.extension[len( 'fastq' ):]}' '$keep_zero_length'</command>
+  <inputs>
+    <param name="input_file" type="data" format="fastqsanger,fastqcssanger" label="FASTQ File"/>
+    <conditional name="offset_type">
+      <param name="base_offset_type" type="select" label="Define Base Offsets as" help="Use Absolute for fixed length reads (Illumina, SOLiD)&lt;br&gt;Use Percentage for variable length reads (Roche/454)">
+        <option value="offsets_absolute" selected="true">Absolute Values</option>
+        <option value="offsets_percent">Percentage of Read Length</option>
+      </param>
+      <when value="offsets_absolute">
+        <param name="left_column_offset" label="Offset from 5' end" value="0" type="integer" help="Values start at 0, increasing from the left">
+          <validator type="in_range" message="Base Offsets must not be negative" min="0" max="inf"/>
+          <validator type="expression" message="An integer is required.">int( float( value ) ) == float( value )</validator>
+        </param>
+        <param name="right_column_offset" label="Offset from 3' end" value="0" type="integer" help="Values start at 0, increasing from the right">
+          <validator type="in_range" message="Base Offsets must not be negative" min="0" max="inf"/>
+          <validator type="expression" message="An integer is required.">int( float( value ) ) == float( value )</validator>
+        </param>
+      </when>
+      <when value="offsets_percent">
+        <param name="left_column_offset" label="Offset from 5' end" value="0" type="float">
+          <validator type="in_range" message="Base Offsets must be between 0 and 100" min="0" max="100"/>
+        </param>
+        <param name="right_column_offset" label="Offset from 3' end" value="0" type="float">
+          <validator type="in_range" message="Base Offsets must be between 0 and 100" min="0" max="100"/>
+        </param>
+      </when>
+    </conditional>
+    <param name="keep_zero_length" label="Keep reads with zero length" type="boolean" truevalue="keep_zero_length" falsevalue="exclude_zero_length" checked="False"/>
+  </inputs>
+  <outputs>
+    <data name="output_file" format="input" />
+  </outputs>
+  <tests>
+    <test>
+      <!-- Do nothing trim -->
+      <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" />
+      <param name="base_offset_type" value="offsets_absolute"/>
+      <param name="left_column_offset" value="0"/>
+      <param name="right_column_offset" value="0"/>
+      <param name="keep_zero_length" value="keep_zero_length" />
+      <output name="output_file" file="sanger_full_range_original_sanger.fastqsanger" />
+    </test>
+    <!-- Trim to empty File -->
+    <test>
+      <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" />
+      <param name="base_offset_type" value="offsets_absolute"/>
+      <param name="left_column_offset" value="30"/>
+      <param name="right_column_offset" value="64"/>
+      <param name="keep_zero_length" value="exclude_zero_length" />
+      <output name="output_file" file="empty_file.dat" />
+    </test>
+    <test>
+      <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" />
+      <param name="base_offset_type" value="offsets_percent"/>
+      <param name="left_column_offset" value="50"/>
+      <param name="right_column_offset" value="50"/>
+      <param name="keep_zero_length" value="exclude_zero_length" />
+      <output name="output_file" file="empty_file.dat" />
+    </test>
+    <!-- Trim to 4 inner-most bases -->
+    <test>
+      <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" />
+      <param name="base_offset_type" value="offsets_absolute"/>
+      <param name="left_column_offset" value="45"/>
+      <param name="right_column_offset" value="45"/>
+      <param name="keep_zero_length" value="exclude_zero_length" />
+      <output name="output_file" file="fastq_trimmer_out1.fastqsanger" />
+    </test>
+    <test>
+      <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" />
+      <param name="base_offset_type" value="offsets_percent"/>
+      <param name="left_column_offset" value="47.87"/>
+      <param name="right_column_offset" value="47.87"/>
+      <param name="keep_zero_length" value="exclude_zero_length" />
+      <output name="output_file" file="fastq_trimmer_out1.fastqsanger" />
+    </test>
+  </tests>
+  <help>
+This tool allows you to trim the ends of reads.
+
+You can specify either absolute or percent-based offsets. Offsets are calculated, starting at 0, from the respective end to be trimmed. When using the percent-based method, offsets are rounded to the nearest integer. 
+
+For example, if you have a read of length 36::
+  
+  @Some FASTQ Sanger Read
+  CAATATGTNCTCACTGATAAGTGGATATNAGCNCCA
+  +
+  =@@.@;B-%?8&gt;CBA@&gt;7@7BBCA4-48%&lt;;;%&lt;B@
+  
+And you set absolute offsets of 2 and 9::
+  
+  @Some FASTQ Sanger Read
+  ATATGTNCTCACTGATAAGTGGATA
+  +
+  @.@;B-%?8&gt;CBA@&gt;7@7BBCA4-4
+  
+Or you set percent offsets of 6% and 20% (corresponds to absolute offsets of 2,7 for a read length of 36)::
+  
+  @Some FASTQ Sanger Read
+  ATATGTNCTCACTGATAAGTGGATATN
+  +
+  @.@;B-%?8&gt;CBA@&gt;7@7BBCA4-48%
+  
+-----
+
+.. class:: warningmark
+
+Trimming a color space read will cause any adapter base to be lost.
+
+------
+
+**Citation**
+
+If you use this tool, please cite `Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. &lt;http://www.ncbi.nlm.nih.gov/pubmed/20562416&gt;`_
+
+
+  </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fastq_trimmer_out1.fastqsanger	Thu Jan 23 12:31:36 2014 -0500
@@ -0,0 +1,8 @@
+@FAKE0001 Original version has PHRED scores from 0 to 93 inclusive (in that order)
+CGTA
++
+NOPQ
+@FAKE0002 Original version has PHRED scores from 93 to 0 inclusive (in that order)
+ATGC
++
+QPON
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sanger_full_range_original_sanger.fastqsanger	Thu Jan 23 12:31:36 2014 -0500
@@ -0,0 +1,8 @@
+@FAKE0001 Original version has PHRED scores from 0 to 93 inclusive (in that order)
+ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTAC
++
+!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~
+@FAKE0002 Original version has PHRED scores from 93 to 0 inclusive (in that order)
+CATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCA
++
+~}|{zyxwvutsrqponmlkjihgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDCBA@?>=<;:9876543210/.-,+*)('&%$#"!
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Thu Jan 23 12:31:36 2014 -0500
@@ -0,0 +1,6 @@
+<?xml version="1.0"?>
+<tool_dependency>
+  <package name="galaxy_sequence_utils" version="1.0.0">
+      <repository changeset_revision="195699b1562a" name="package_galaxy_utils_1_0" owner="devteam" prior_installation_required="False" toolshed="http://testtoolshed.g2.bx.psu.edu" />
+    </package>
+</tool_dependency>