view fasta_merge_files_and_filter_unique_sequences.xml @ 3:1c12ec822e1b draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/fasta_merge_files_and_filter_unique_sequences commit b4c90f4b5d7e9b233f1150dbc9e5dcbe156809e8
author galaxyp
date Mon, 24 Jul 2017 17:12:56 -0400
parents 7892a1fd1648
children c94c2b84221e
line wrap: on
line source

<tool id="fasta_merge_files_and_filter_unique_sequences" name="FASTA Merge Files and Filter Unique Sequences" version="1.2.0">
    <!-- Short summary of what the tool does. -->
    <description>Concatenate FASTA database files together</description>
    <!-- Package dependency for the `python` call in the command section, pinned to 2.7.12.
         NOTE(review): presumably the bundled script targets Python 2; confirm before changing the pin. -->
    <requirements>
        <requirement type="package" version="2.7.12">python</requirement>
    </requirements>
    <!--
        Command line for the merge/filter script. Positional arguments, in order:
        output dataset path, uniqueness criterion, accession regex, then every input
        FASTA as its own single-quoted argument.
        In 'merge' mode input_fastas is a multi-dataset selection and is iterated
        directly; otherwise the single dataset is wrapped in a list so that one loop
        serves both modes.
    -->
    <command><![CDATA[
        python '$__tool_directory__/fasta_merge_files_and_filter_unique_sequences.py'
        '$output' $uniqueness_criterion '$accession_parser'

        #if $batchmode.processmode != 'merge':
          #set $fasta_files = [ $batchmode.input_fastas ]
        #else:
          #set $fasta_files = $batchmode.input_fastas
        #end if
        #for $fasta_file in $fasta_files:
            '$fasta_file'
        #end for
    ]]></command>
    <inputs>
        <!-- Chooses how input_fastas is offered: exactly one dataset ('individual')
             or a selection of several datasets ('merge'). -->
        <conditional name="batchmode">
            <param name="processmode"
                   type="select"
                   display="radio"
                   label="Run in batch mode?"
                   help="The 'merge all' mode produces one output FASTA for all input FASTA files. The individual mode generates one FASTA file for each set of input FASTAs. For example, if the tool is given 2 collections of 10 FASTAs, it will merge the collections pairwise to create an output collection of 10 FASTAs.">
                <option value="individual" selected="true">Merge individual FASTAs (output collection if input is collection)</option>
                <option value="merge">Merge all FASTAs (always output a single FASTA)</option>
            </param>
            <when value="individual">
                <param name="input_fastas" type="data" format="fasta" label="FASTA file" />
            </when>
            <when value="merge">
                <param name="input_fastas" type="data" format="fasta" multiple="true" label="FASTA file" />
            </when>
        </conditional>
        <!-- Passed to the script as its second positional argument. -->
        <param name="uniqueness_criterion" type="select" label="How are sequences judged to be unique?">
            <option value="sequence" selected="true">Accession and Sequence</option>
            <option value="accession">Accession Only</option>
        </param>
        <!-- Regex handed to the script inside single quotes. Backslash and single quote
             are removed from the valid character set and mapped to the placeholders
             __backslash__ and __sq__, so neither character reaches the command line verbatim.
             NOTE(review): presumably the script translates the placeholders back; confirm there. -->
        <param name="accession_parser"
               type="text"
               label="Accession Parsing Regex"
               value="^&gt;([^ ]+).*$"
               help="Regular expression with 1 capture group; the capture group is the accession (which must be unique)">
          <sanitizer>
            <valid>
              <add preset="string.printable"/>
              <remove value="&#92;" />
              <remove value="&apos;" />
            </valid>
            <mapping initial="none">
              <add source="&#92;" target="__backslash__" />
              <add source="&apos;" target="__sq__"/>
            </mapping>
          </sanitizer>
        </param>
    </inputs>
    <outputs>
        <!-- The single FASTA dataset whose path is passed to the script as '$output'. -->
        <data name="output"
              format="fasta"
              label="Merged and Filtered FASTA from ${on_string}"/>
    </outputs>
    <tests>
        <!-- Both cases merge 1.fa and 2.fa with a parser that stops the accession at the
             first space or pipe; they differ only in the uniqueness criterion. -->
        <!-- Case 1: uniqueness by accession and sequence. -->
        <test>
          <conditional name="batchmode">
            <param name="processmode" value="merge" />
            <param name="input_fastas" value="1.fa,2.fa" ftype="fasta" />
          </conditional>
          <param name="uniqueness_criterion" value="sequence" />
          <param name="accession_parser" value="^&gt;([^ |]+).*$" />
          <output name="output" file="res-sequence.fa" ftype="fasta" />
          <assert_stdout>
            <has_line line="Skipping protein '&gt;one_2' with duplicate sequence (first seen as '&gt;one')" />
            <has_line line="Skipping protein '&gt;two_2' with duplicate sequence (first seen as '&gt;two')" />
            <has_line line="Skipping protein '&gt;three_2|456' with duplicate accession" />
            <has_line line="Skipping protein '&gt;three_2 789' with duplicate accession" />
          </assert_stdout>
        </test>
        <!-- Case 2: uniqueness by accession only. -->
        <test>
          <conditional name="batchmode">
            <param name="processmode" value="merge" />
            <param name="input_fastas" value="1.fa,2.fa" ftype="fasta" />
          </conditional>
          <param name="uniqueness_criterion" value="accession" />
          <param name="accession_parser" value="^&gt;([^ |]+).*$" />
          <output name="output" file="res-accession.fa" ftype="fasta" />
          <assert_stdout>
            <has_line line="Skipping protein '&gt;three_2|456' with duplicate accession" />
            <has_line line="Skipping protein '&gt;three_2 789' with duplicate accession" />
          </assert_stdout>
        </test>
    </tests>
    <help>
<![CDATA[
**What it does**

Concatenate FASTA database files together.

If the uniqueness criterion is "Accession and Sequence", only the first appearance of each unique sequence will appear in the output.
Otherwise, duplicate sequences are allowed, but only the first appearance of each accession will appear in the output.

The default accession parser will treat everything in the header before the first space as the accession.

------

**Citation**

If you use this tool in Galaxy, please cite the GalaxyP developers at: https://github.com/galaxyproteomics/

]]>
    </help>
</tool>