view fastq_groomer.xml @ 6:3213285ee5d2 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tool_collections/galaxy_sequence_utils/fastq_groomer commit bb5df9e62585e12f08dfc0a9f86eec8e205b4845
author iuc
date Fri, 04 Oct 2024 10:33:13 +0000
parents 4fda2184efa5
children
line wrap: on
line source

<tool id="fastq_groomer" name="FASTQ Groomer" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <!-- Short description displayed next to the tool name -->
    <description>convert between various FASTQ quality formats</description>
    <!-- Shared definitions live in macros.xml. Presumably it supplies the @TOOL_VERSION@,
         @VERSION_SUFFIX@ and @PROFILE@ tokens used on the tool tag as well as the
         "requirements" macro expanded below; confirm against macros.xml. -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <!-- EDAM ontology annotations. NOTE(review): the identifiers are opaque here; look them
         up in the EDAM ontology before changing them. -->
    <edam_topics>
        <edam_topic>topic_0622</edam_topic>
    </edam_topics>
    <edam_operations>
        <edam_operation>operation_0233</edam_operation>
    </edam_operations>
    <!-- Software dependencies, defined by the "requirements" macro -->
    <expand macro="requirements"/>
    <command><![CDATA[
## Compression of the input dataset is signalled to the groomer by appending
## the matching suffix (.gz / .bz2) to the input quality type.
#set $suffix = ""
#if $input_file.extension.endswith(".gz")
    #set $suffix = ".gz"
#elif $input_file.extension.endswith(".bz2")
    #set $suffix = ".bz2"
#end if

## Trailing arguments: output type, quality encoding, summary switch and,
## in advanced mode only, the identifier-fixing flag.
#if $options_type.options_type_selector == 'basic'
    ## Basic mode keeps colour space input in colour space; everything else
    ## becomes sanger. Encoding is ASCII and the input is always summarised.
    #if str($input_type) == 'cssanger'
        #set $groom_args = 'cssanger ascii summarize_input'
    #else
        #set $groom_args = 'sanger ascii summarize_input'
    #end if
#else
    #set $groom_args = ' '.join([str($options_type.output_type), str($options_type.force_quality_encoding), str($options_type.summarize_input), str($options_type.fix_id)])
#end if

gx-fastq-groomer '$input_file' ${input_type}${suffix} '$output_file' $groom_args
    ]]></command>
    <inputs>
        <!-- Dataset to groom: plain, gzip-compressed or bzip2-compressed FASTQ -->
        <param name="input_file"
               type="data"
               format="fastq,fastq.gz,fastq.bz2"
               label="File to groom"/>
        <!-- Quality score variant of the input; passed to the groomer as the input type -->
        <param name="input_type"
               type="select"
               label="Input FASTQ quality scores type">
            <option value="solexa">Solexa</option>
            <option value="illumina">Illumina 1.3-1.7</option>
            <option value="sanger" selected="true">Sanger &amp; Illumina 1.8+</option>
            <option value="cssanger">Color Space Sanger</option>
        </param>
        <!-- Basic mode exposes no further parameters; advanced mode exposes the output type,
             quality encoding, input summary and identifier fixing -->
        <conditional name="options_type">
            <param name="options_type_selector"
                   type="select"
                   label="Advanced Options">
                <option value="basic" selected="true">Hide Advanced Options</option>
                <option value="advanced">Show Advanced Options</option>
            </param>
            <when value="basic"/>
            <when value="advanced">
                <!-- Target quality variant, optionally with a compression suffix -->
                <param name="output_type"
                       type="select"
                       label="Output FASTQ quality scores type"
                       help="Galaxy tools are designed to work with the Sanger Quality score format">
                    <option value="solexa">Solexa</option>
                    <option value="illumina">Illumina 1.3-1.7</option>
                    <option value="sanger" selected="true">Sanger</option>
                    <option value="cssanger">Color Space Sanger</option>
                    <option value="solexa.gz">Solexa (gz compressed)</option>
                    <option value="illumina.gz">Illumina 1.3-1.7 (gz compressed)</option>
                    <option value="sanger.gz">Sanger (gz compressed - recommended)</option>
                    <option value="cssanger.gz">Color Space Sanger (gz compressed)</option>
                    <option value="solexa.bz2">Solexa (bz2 compressed)</option>
                    <option value="illumina.bz2">Illumina 1.3-1.7 (bz2 compressed)</option>
                    <option value="sanger.bz2">Sanger (bz2 compressed)</option>
                    <option value="cssanger.bz2">Color Space Sanger (bz2 compressed)</option>
                </param>
                <!-- Encoding of the quality line in the output -->
                <param name="force_quality_encoding"
                       type="select"
                       label="Force Quality Score encoding">
                    <option value="None">Use Source Encoding</option>
                    <option value="ascii" selected="true">ASCII</option>
                    <option value="decimal">Decimal</option>
                </param>
                <!-- Whether the groomer summarises the input data -->
                <param name="summarize_input"
                       type="select"
                       label="Summarize input data">
                    <option value="summarize_input" selected="true">Summarize Input</option>
                    <option value="dont_summarize_input">Do not Summarize Input (faster)</option>
                </param>
                <!-- Checked by default; its truevalue/falsevalue is placed on the command line as is -->
                <param name="fix_id"
                       type="boolean"
                       checked="true"
                       truevalue="--fix-id"
                       falsevalue="--no-fix-id"
                       label="Fix inconsistent identifiers"/>
            </when>
        </conditional>
    </inputs>
    <outputs>
        <!-- Groomed FASTQ. The default datatype (fastqsanger) matches what the command
             writes in basic mode for any input type other than cssanger. -->
        <data name="output_file" format="fastqsanger">
            <change_format>
                <!-- cssanger input: the basic-mode command emits cssanger output.
                     NOTE(review): this entry also matches in advanced mode; the output_type
                     entries below are presumably meant to override it, so keep this order.
                     Confirm against Galaxy's change_format handling. -->
                <when input="input_type" value="cssanger" format="fastqcssanger" />
                <!-- Advanced mode: the datatype follows the selected output type,
                     including its compression suffix -->
                <when input="options_type.output_type" value="solexa" format="fastqsolexa" />
                <when input="options_type.output_type" value="illumina" format="fastqillumina" />
                <when input="options_type.output_type" value="sanger" format="fastqsanger" />
                <when input="options_type.output_type" value="cssanger" format="fastqcssanger" />
                <when input="options_type.output_type" value="solexa.gz" format="fastqsolexa.gz" />
                <when input="options_type.output_type" value="illumina.gz" format="fastqillumina.gz" />
                <when input="options_type.output_type" value="sanger.gz" format="fastqsanger.gz" />
                <when input="options_type.output_type" value="cssanger.gz" format="fastqcssanger.gz" />
                <when input="options_type.output_type" value="solexa.bz2" format="fastqsolexa.bz2" />
                <when input="options_type.output_type" value="illumina.bz2" format="fastqillumina.bz2" />
                <when input="options_type.output_type" value="sanger.bz2" format="fastqsanger.bz2" />
                <when input="options_type.output_type" value="cssanger.bz2" format="fastqcssanger.bz2" />
            </change_format>
        </data>
    </outputs>
    <tests>
        <!-- Test fix-id by default: basic mode passes no identifier flag to the groomer -->
        <test>
            <param name="input_file" value="fastq_invalid-line3" ftype="fastq" />
            <param name="input_type" value="sanger" />
            <conditional name="options_type">
                <param name="options_type_selector" value="basic" />
            </conditional>
            <output name="output_file" file="fastq_invalid-line3_fixed" />
        </test>
        <!-- Test fix-id by setting the option. Boolean parameters are set with true/false in
             tests, not with the command-line strings from truevalue/falsevalue. -->
        <test>
            <param name="input_file" value="fastq_invalid-line3" ftype="fastq" />
            <param name="input_type" value="sanger" />
            <conditional name="options_type">
                <param name="options_type_selector" value="advanced" />
                <param name="fix_id" value="true" />
            </conditional>
            <output name="output_file" file="fastq_invalid-line3_fixed" />
        </test>
        <!-- Test fix-id / option not set; expect failure -->
        <test expect_failure="true">
            <param name="input_file" value="fastq_invalid-line3" ftype="fastq" />
            <param name="input_type" value="sanger" />
            <conditional name="options_type">
                <param name="options_type_selector" value="advanced" />
                <param name="fix_id" value="false" />
            </conditional>
        </test>
        <!-- These tests include test files adapted from supplemental material in Cock PJ, Fields CJ, Goto N, Heuer ML, Rice PM. The Sanger FASTQ file format for sequences with quality scores, and the Solexa/Illumina FASTQ variants. Nucleic Acids Res. 2009 Dec 16. -->
        <!-- Test basic options: output is sanger, or cssanger for colour space input -->
        <test>
            <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastq" />
            <param name="input_type" value="sanger" />
            <conditional name="options_type">
                <param name="options_type_selector" value="basic" />
            </conditional>
            <output name="output_file" file="sanger_full_range_original_sanger.fastqsanger" />
        </test>
        <test>
            <param name="input_file" value="sanger_full_range_as_cssanger.fastqcssanger" ftype="fastq" />
            <param name="input_type" value="cssanger" />
            <conditional name="options_type">
                <param name="options_type_selector" value="basic" />
            </conditional>
            <output name="output_file" file="sanger_full_range_as_cssanger.fastqcssanger" />
        </test>
        <test>
            <param name="input_file" value="illumina_full_range_original_illumina.fastqillumina" ftype="fastq" />
            <param name="input_type" value="illumina" />
            <conditional name="options_type">
                <param name="options_type_selector" value="basic" />
            </conditional>
            <output name="output_file" file="illumina_full_range_as_sanger.fastqsanger" />
        </test>
        <test>
            <param name="input_file" value="solexa_full_range_original_solexa.fastqsolexa" ftype="fastq" />
            <param name="input_type" value="solexa" />
            <conditional name="options_type">
                <param name="options_type_selector" value="basic" />
            </conditional>
            <output name="output_file" file="solexa_full_range_as_sanger.fastqsanger" />
        </test>
        <test>
            <param name="input_file" value="sanger_full_range_as_illumina.fastqillumina" ftype="fastq" />
            <param name="input_type" value="sanger" />
            <conditional name="options_type">
                <param name="options_type_selector" value="basic" />
            </conditional>
            <output name="output_file" file="sanger_full_range_as_illumina.fastqillumina" />
        </test>
        <!-- Test grooming from illumina -->
        <test>
            <param name="input_file" value="illumina_full_range_original_illumina.fastqillumina" ftype="fastq" />
            <param name="input_type" value="illumina" />
            <param name="options_type_selector" value="advanced" />
            <param name="output_type" value="illumina" />
            <param name="force_quality_encoding" value="None" />
            <param name="summarize_input" value="summarize_input" />
            <output name="output_file" file="illumina_full_range_original_illumina.fastqillumina" />
        </test>
        <test>
            <param name="input_file" value="illumina_full_range_original_illumina.fastqillumina" ftype="fastq" />
            <param name="input_type" value="illumina" />
            <param name="options_type_selector" value="advanced" />
            <param name="output_type" value="sanger" />
            <param name="force_quality_encoding" value="None" />
            <param name="summarize_input" value="summarize_input" />
            <output name="output_file" file="illumina_full_range_as_sanger.fastqsanger" />
        </test>
        <test>
            <param name="input_file" value="illumina_full_range_original_illumina.fastqillumina" ftype="fastq" />
            <param name="input_type" value="illumina" />
            <param name="options_type_selector" value="advanced" />
            <param name="output_type" value="solexa" />
            <param name="force_quality_encoding" value="None" />
            <param name="summarize_input" value="summarize_input" />
            <output name="output_file" file="illumina_full_range_as_solexa.fastqsolexa" />
        </test>
        <test>
            <param name="input_file" value="illumina_full_range_original_illumina.fastqillumina" ftype="fastq" />
            <param name="input_type" value="illumina" />
            <param name="options_type_selector" value="advanced" />
            <param name="output_type" value="cssanger" />
            <param name="force_quality_encoding" value="None" />
            <param name="summarize_input" value="summarize_input" />
            <output name="output_file" file="illumina_full_range_as_cssanger.fastqcssanger" />
        </test>
        <!-- Test grooming from sanger -->
        <test>
            <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastq" />
            <param name="input_type" value="sanger" />
            <param name="options_type_selector" value="advanced" />
            <param name="output_type" value="sanger" />
            <param name="force_quality_encoding" value="None" />
            <param name="summarize_input" value="summarize_input" />
            <output name="output_file" file="sanger_full_range_original_sanger.fastqsanger" />
        </test>
        <test>
            <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastq" />
            <param name="input_type" value="sanger" />
            <param name="options_type_selector" value="advanced" />
            <param name="output_type" value="illumina" />
            <param name="force_quality_encoding" value="None" />
            <param name="summarize_input" value="summarize_input" />
            <output name="output_file" file="sanger_full_range_as_illumina.fastqillumina" />
        </test>
        <test>
            <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastq" />
            <param name="input_type" value="sanger" />
            <param name="options_type_selector" value="advanced" />
            <param name="output_type" value="solexa" />
            <param name="force_quality_encoding" value="None" />
            <param name="summarize_input" value="summarize_input" />
            <output name="output_file" file="sanger_full_range_as_solexa.fastqsolexa" />
        </test>
        <test>
            <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastq" />
            <param name="input_type" value="sanger" />
            <param name="options_type_selector" value="advanced" />
            <param name="output_type" value="cssanger" />
            <param name="force_quality_encoding" value="None" />
            <param name="summarize_input" value="summarize_input" />
            <output name="output_file" file="sanger_full_range_as_cssanger.fastqcssanger" />
        </test>
        <!-- Test grooming from solexa -->
        <test>
            <param name="input_file" value="solexa_full_range_original_solexa.fastqsolexa" ftype="fastq" />
            <param name="input_type" value="solexa" />
            <param name="options_type_selector" value="advanced" />
            <param name="output_type" value="solexa" />
            <param name="force_quality_encoding" value="None" />
            <param name="summarize_input" value="summarize_input" />
            <output name="output_file" file="solexa_full_range_original_solexa.fastqsolexa" />
        </test>
        <test>
            <param name="input_file" value="solexa_full_range_original_solexa.fastqsolexa" ftype="fastq" />
            <param name="input_type" value="solexa" />
            <param name="options_type_selector" value="advanced" />
            <param name="output_type" value="illumina" />
            <param name="force_quality_encoding" value="None" />
            <param name="summarize_input" value="summarize_input" />
            <output name="output_file" file="solexa_full_range_as_illumina.fastqillumina" />
        </test>
        <test>
            <param name="input_file" value="solexa_full_range_original_solexa.fastqsolexa" ftype="fastq" />
            <param name="input_type" value="solexa" />
            <param name="options_type_selector" value="advanced" />
            <param name="output_type" value="sanger" />
            <param name="force_quality_encoding" value="None" />
            <param name="summarize_input" value="summarize_input" />
            <output name="output_file" file="solexa_full_range_as_sanger.fastqsanger" />
        </test>
        <test>
            <param name="input_file" value="solexa_full_range_original_solexa.fastqsolexa" ftype="fastq" />
            <param name="input_type" value="solexa" />
            <param name="options_type_selector" value="advanced" />
            <param name="output_type" value="cssanger" />
            <param name="force_quality_encoding" value="None" />
            <param name="summarize_input" value="summarize_input" />
            <output name="output_file" file="solexa_full_range_as_cssanger.fastqcssanger" />
        </test>
        <!-- Test grooming from cssanger -->
        <test>
            <param name="input_file" value="sanger_full_range_as_cssanger.fastqcssanger" ftype="fastq" />
            <param name="input_type" value="cssanger" />
            <param name="options_type_selector" value="advanced" />
            <param name="output_type" value="cssanger" />
            <param name="force_quality_encoding" value="None" />
            <param name="summarize_input" value="summarize_input" />
            <output name="output_file" file="sanger_full_range_as_cssanger.fastqcssanger" />
        </test>
        <test>
            <param name="input_file" value="sanger_full_range_as_cssanger.fastqcssanger" ftype="fastq" />
            <param name="input_type" value="cssanger" />
            <param name="options_type_selector" value="advanced" />
            <param name="output_type" value="sanger" />
            <param name="force_quality_encoding" value="None" />
            <param name="summarize_input" value="summarize_input" />
            <output name="output_file" file="sanger_full_range_original_sanger.fastqsanger" />
        </test>
        <test>
            <param name="input_file" value="sanger_full_range_as_cssanger.fastqcssanger" ftype="fastq" />
            <param name="input_type" value="cssanger" />
            <param name="options_type_selector" value="advanced" />
            <param name="output_type" value="illumina" />
            <param name="force_quality_encoding" value="None" />
            <param name="summarize_input" value="summarize_input" />
            <output name="output_file" file="sanger_full_range_as_illumina.fastqillumina" />
        </test>
        <test>
            <param name="input_file" value="sanger_full_range_as_cssanger.fastqcssanger" ftype="fastq" />
            <param name="input_type" value="cssanger" />
            <param name="options_type_selector" value="advanced" />
            <param name="output_type" value="solexa" />
            <param name="force_quality_encoding" value="None" />
            <param name="summarize_input" value="summarize_input" />
            <output name="output_file" file="sanger_full_range_as_solexa.fastqsolexa" />
        </test>
        <test>
            <param name="input_file" value="sanger_full_range_as_cssanger_adapter_base_with_quality_score.fastqcssanger_fake_score" ftype="fastq" />
            <param name="input_type" value="cssanger" />
            <param name="options_type_selector" value="advanced" />
            <param name="output_type" value="cssanger" />
            <param name="force_quality_encoding" value="None" />
            <param name="summarize_input" value="summarize_input" />
            <output name="output_file" file="sanger_full_range_as_cssanger.fastqcssanger" />
        </test>
        <!-- Test fastq with line wrapping -->
        <test>
            <param name="input_file" value="wrapping_original_sanger.fastqsanger" ftype="fastq" />
            <param name="input_type" value="sanger" />
            <conditional name="options_type">
                <param name="options_type_selector" value="advanced" />
                <param name="output_type" value="sanger" />
                <param name="force_quality_encoding" value="None" />
                <param name="summarize_input" value="summarize_input" />
            </conditional>
            <output name="output_file" file="wrapping_as_sanger.fastqsanger" />
        </test>
        <test>
            <param name="input_file" value="wrapping_original_sanger.fastqsanger" ftype="fastq" />
            <param name="input_type" value="sanger" />
            <conditional name="options_type">
                <param name="options_type_selector" value="advanced" />
                <param name="output_type" value="illumina" />
                <param name="force_quality_encoding" value="None" />
                <param name="summarize_input" value="summarize_input" />
            </conditional>
            <output name="output_file" file="wrapping_as_illumina.fastqillumina" />
        </test>
        <test>
            <param name="input_file" value="wrapping_original_sanger.fastqsanger" ftype="fastq" />
            <param name="input_type" value="sanger" />
            <conditional name="options_type">
                <param name="options_type_selector" value="advanced" />
                <param name="output_type" value="solexa" />
                <param name="force_quality_encoding" value="None" />
                <param name="summarize_input" value="summarize_input" />
            </conditional>
            <output name="output_file" file="wrapping_as_solexa.fastqsolexa" />
        </test>
        <!-- Test forcing quality score encoding -->
        <!-- Sanger, range 0 - 93 -->
        <test>
            <param name="input_file" value="sanger_full_range_as_decimal_sanger.fastqsanger" ftype="fastq" />
            <param name="input_type" value="sanger" />
            <conditional name="options_type">
                <param name="options_type_selector" value="advanced" />
                <param name="output_type" value="sanger" />
                <param name="force_quality_encoding" value="ascii" />
                <param name="summarize_input" value="summarize_input" />
            </conditional>
            <output name="output_file" file="sanger_full_range_original_sanger.fastqsanger" />
        </test>
        <test>
            <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastq" />
            <param name="input_type" value="sanger" />
            <conditional name="options_type">
                <param name="options_type_selector" value="advanced" />
                <param name="output_type" value="sanger" />
                <param name="force_quality_encoding" value="decimal" />
                <param name="summarize_input" value="summarize_input" />
            </conditional>
            <output name="output_file" file="sanger_full_range_as_decimal_sanger.fastqsanger" />
        </test>
        <test>
            <param name="input_file" value="sanger_full_range_as_tab_decimal_sanger.fastqsanger" ftype="fastq" />
            <param name="input_type" value="sanger" />
            <conditional name="options_type">
                <param name="options_type_selector" value="advanced" />
                <param name="output_type" value="sanger" />
                <param name="force_quality_encoding" value="ascii" />
                <param name="summarize_input" value="summarize_input" />
            </conditional>
            <output name="output_file" file="sanger_full_range_original_sanger.fastqsanger" />
        </test>
        <!-- Solexa, range -5 - 62 -->
        <test>
            <param name="input_file" value="solexa_full_range_as_decimal_solexa.fastqsolexa" ftype="fastq" />
            <param name="input_type" value="solexa" />
            <conditional name="options_type">
                <param name="options_type_selector" value="advanced" />
                <param name="output_type" value="solexa" />
                <param name="force_quality_encoding" value="ascii" />
                <param name="summarize_input" value="summarize_input" />
            </conditional>
            <output name="output_file" file="solexa_full_range_original_solexa.fastqsolexa" />
        </test>
        <test>
            <param name="input_file" value="solexa_full_range_original_solexa.fastqsolexa" ftype="fastq" />
            <param name="input_type" value="solexa" />
            <conditional name="options_type">
                <param name="options_type_selector" value="advanced" />
                <param name="output_type" value="solexa" />
                <param name="force_quality_encoding" value="decimal" />
                <param name="summarize_input" value="summarize_input" />
            </conditional>
            <output name="output_file" file="solexa_full_range_as_decimal_solexa.fastqsolexa" />
        </test>
        <!-- compressed formats: gzip and bzip2 input groomed with basic options -->
        <test>
            <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger.gz" ftype="fastq.gz" />
            <param name="input_type" value="sanger" />
            <conditional name="options_type">
                <param name="options_type_selector" value="basic" />
            </conditional>
            <output name="output_file" file="sanger_full_range_original_sanger.fastqsanger" decompress="true" />
        </test>
        <test>
            <param name="input_file" value="sanger_full_range_as_cssanger.fastqcssanger.bz2" ftype="fastq.bz2" />
            <param name="input_type" value="cssanger" />
            <conditional name="options_type">
                <param name="options_type_selector" value="basic" />
            </conditional>
            <output name="output_file" file="sanger_full_range_as_cssanger.fastqcssanger" decompress="true" />
        </test>
    </tests>
    <help><![CDATA[
**What it does**

This tool offers several conversion options relating to the FASTQ format.

When using *Basic* options, the output will be *sanger* formatted or *cssanger* formatted (when the input is Color Space Sanger). Inconsistent identifiers are fixed by default.

When converting, if a quality score falls outside of the target score range, it will be coerced to the closest available value (i.e. the minimum or maximum).

When converting between Solexa and the other formats, quality scores are mapped between Solexa and PHRED scales using the equations found in `Cock PJ, Fields CJ, Goto N, Heuer ML, Rice PM. The Sanger FASTQ file format for sequences with quality scores, and the Solexa/Illumina FASTQ variants. Nucleic Acids Res. 2009 Dec 16.`_

When converting between color space (csSanger) and base/sequence space (Sanger, Illumina, Solexa) formats, adapter bases are lost or gained; if gained, the base 'G' is used as the adapter. You cannot convert a color space read to base space if there is no adapter present in the color space sequence. Any masked or ambiguous nucleotides in base space will be converted to 'N's when determining color space encoding.

-----

**Quality Score Comparison**

::

    SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS
    ...............................IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
    ..........................XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
    !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~
    |                         |    |        |                              |                     |
   33                        59   64       73                            104                   126

   S - Sanger       Phred+33,  94 values  (0, 93) (0 to 60 expected in raw reads)
   I - Illumina 1.3 Phred+64,  63 values  (0, 62) (0 to 40 expected in raw reads)
   X - Solexa       Solexa+64, 68 values (-5, 62) (-5 to 40 expected in raw reads)

Diagram adapted from http://en.wikipedia.org/wiki/FASTQ_format

.. class:: infomark

Output from Illumina 1.8+ pipelines is Sanger encoded.

------

.. _Cock PJ, Fields CJ, Goto N, Heuer ML, Rice PM. The Sanger FASTQ file format for sequences with quality scores, and the Solexa/Illumina FASTQ variants. Nucleic Acids Res. 2009 Dec 16.: https://doi.org/10.1093/nar/gkp1137
    ]]></help>
    <!-- Publication to cite when this tool is used.
         NOTE(review): presumably the "Manipulation of FASTQ data with Galaxy" paper
         (Bioinformatics, 2010); confirm that the DOI resolves to it. -->
    <citations>
        <citation type="doi">10.1093/bioinformatics/btq281</citation>
    </citations>
</tool>