view tools/ncbi_blast_plus/ncbi_blastdbcmd_wrapper.xml @ 25:cc9baedbc059 draft

planemo upload for repository https://github.com/peterjc/galaxy_blast/tree/master/tools/ncbi_blast_plus commit 038641dc56c4f5880eedf536ba8313786c71dc0b-dirty
author peterjc
date Fri, 15 Sep 2017 10:44:01 -0400
parents 3b4efe26da79
children 82e833f02332
line wrap: on
line source

<tool id="ncbi_blastdbcmd_wrapper" name="NCBI BLAST+ blastdbcmd entry(s)" version="@WRAPPER_VERSION@">
    <description>Extract sequence(s) from BLAST database</description>
    <macros>
        <!-- Name of the BLAST+ executable wrapped by this tool; presumably consumed
             by macros defined in ncbi_macros.xml (TODO confirm) -->
        <token name="@BINARY@">blastdbcmd</token>
        <!-- Shared definitions: the preamble, input_conditional_choose_db_type and
             blast_citations macros, and the @WRAPPER_VERSION@ and @REFERENCES@ tokens
             used in this file, are expected to come from this import -->
        <import>ncbi_macros.xml</import>
    </macros>
    <expand macro="preamble" />
    <command detect_errors="aggressive" strict="true">
## The command is a Cheetah template which allows some Python based syntax.
## Lines starting hash hash are comments. Galaxy will turn newlines into spaces
##
## Overall shape of the generated command line:
##   blastdbcmd (database options) (entry selection) (output handling)
## where the output handling is either a plain -out argument, or a pipe
## through sed which strips the BLAST assigned identifier prefixes.
blastdbcmd -dbtype $db_opts.db_type -db "${db_opts.database.fields.path}"

##TODO: What about -ctrl_a and -target_only as advanced options?

#if $id_opts.id_type=="file":
##Identifiers come from a history dataset, one per line.
-entry_batch '$id_opts.entries'
#else:
##Perform some simple search/replaces to remove whitespace
##and make it comma separated. Quoted so don't escape pipes.
-entry "$id_opts.entries.replace('\r',',').replace('\n',',').replace(' ','').replace(',,',',').replace(',,',',').strip(',')"
#end if

##When building a BLAST database, to ensure unique IDs makeblastdb will
##do things like turning a FASTA entry with ID of ERP44 into lcl|ERP44
##(if using -parse_seqids) or simply assign it an ID using the record
##number like gnl|BL_ORD_ID|123 (to cope with duplicate IDs in the FASTA
##file). In -parse_seqids mode, a duplicate FASTA ID gives an error.
##
##The BLAST plain text and XML output will contain these BLAST IDs, but
##the tabular output does not (at least, not in BLAST 2.2.25+).
##Therefore in general, Galaxy users won't care about the (internal)
##BLAST identifiers.
##
##The blastdbcmd FASTA output will also contain these IDs, but in the
##context of the BLAST tabular output they are not helpful. Therefore
##to recover the original ID as used in the FASTA file for makeblastdb
##we need a little post processing.
##
##We remove the NCBI's lcl|... or gnl|BL_ORD_ID|123 prefixes
##using sed, however the exact syntax differs for Mac OS X's sed
##
##Both sed expressions are anchored to the start of the line so that only
##the leading identifier of a FASTA header is rewritten, and so that the
##two platform branches produce the same output for the same input.
##NOTE(review): sys.platform is evaluated where this template is rendered,
##which is assumed to match the machine running the job - confirm for
##mixed-platform clusters.

#if str($outfmt)=="blastid":
##Keep the BLAST assigned identifiers: let blastdbcmd write the file itself.
-out '$seq'
#else if sys.platform == "darwin":
##Extended regular expression syntax (-E): bare pipe is alternation,
##backslash pipe is a literal pipe character.
| sed -E 's/^&gt;(lcl\||gnl\|BL_ORD_ID\|[0-9]* )/&gt;/1' &gt; '$seq'
#else:
##Basic regular expression syntax: backslash pipe is alternation,
##bare pipe is a literal pipe character.
| sed 's/^&gt;\(lcl|\|gnl|BL_ORD_ID|[0-9]* \)/&gt;/1' &gt; '$seq'
#end if
    </command>
    <inputs>
        <!-- Database type / database selection; expected to provide the db_opts
             conditional that the command template reads -->
        <expand macro="input_conditional_choose_db_type" />

        <!-- Identifiers are either read from a history dataset or typed in directly;
             both branches expose the value as id_opts.entries -->
        <conditional name="id_opts">
            <param name="id_type" type="select" label="Type of identifier list">
                <option value="file">From file</option>
                <option value="prompt">User entered</option>
            </param>
            <when value="file">
                <param name="entries"
                       argument="-entry_batch"
                       type="data"
                       format="txt,tabular"
                       label="Sequence identifier(s)"
                       help="Plain text file with one ID per line (i.e. single column tabular file)" />
            </when>
            <when value="prompt">
                <param name="entries"
                       argument="-entry"
                       type="text"
                       optional="false"
                       area="true"
                       size="10x30"
                       label="Sequence identifier(s)"
                       help="Comma or new line separated list" />
            </when>
        </conditional>

        <!-- "original" pipes the FASTA through sed to strip BLAST assigned prefixes;
             "blastid" writes the blastdbcmd output unchanged -->
        <param name="outfmt" type="select" label="Output format">
            <option value="original">FASTA with original identifiers</option>
            <option value="blastid">FASTA with BLAST assigned identifiers</option>
        </param>
    </inputs>
    <outputs>
        <!-- Single FASTA dataset; the label embeds the name field of the chosen database -->
        <data name="seq"
              format="fasta"
              label="Sequences from ${db_opts.database.fields.name}" />
    </outputs>
    <tests>
        <!-- Protein database, entry value "all", original identifiers -->
        <test>
            <param name="db_opts|db_type" value="prot" />
            <param name="db_opts|database" value="four_human_proteins" />
            <param name="id_opts|id_type" value="prompt" />
            <param name="id_opts|entries" value="all" />
            <param name="outfmt" value="original" />
            <output name="seq"
                    file="four_human_proteins.fasta"
                    ftype="fasta" />
        </test>

        <!-- Nucleotide database, entry value "all", original identifiers -->
        <test>
            <!-- This used to recover the original FASTA file, but had GI numbers -->
            <param name="db_opts|db_type" value="nucl" />
            <param name="db_opts|database" value="rhodopsin_nucs" />
            <param name="id_opts|id_type" value="prompt" />
            <param name="id_opts|entries" value="all" />
            <param name="outfmt" value="original" />
            <output name="seq"
                    file="rhodopsin_nucs.no_gi.fasta"
                    ftype="fasta" />
        </test>

        <!-- Single nucleotide record requested by its short identifier -->
        <test>
            <param name="db_opts|db_type" value="nucl" />
            <param name="db_opts|database" value="rhodopsin_nucs" />
            <param name="id_opts|id_type" value="prompt" />
            <param name="id_opts|entries" value="U59921.1" />
            <param name="outfmt" value="original" />
            <output name="seq"
                    file="rhodopsin_bufo.fasta"
                    ftype="fasta" />
        </test>

        <!-- Same expected record as the previous test, requested by the full
             pipe-delimited identifier -->
        <test>
            <param name="db_opts|db_type" value="nucl" />
            <param name="db_opts|database" value="rhodopsin_nucs" />
            <param name="id_opts|id_type" value="prompt" />
            <param name="id_opts|entries" value="gi|2734705|gb|U59921.1|BBU59921" />
            <param name="outfmt" value="original" />
            <output name="seq"
                    file="rhodopsin_bufo.fasta"
                    ftype="fasta" />
        </test>
    </tests>
    <help>

**What it does**

Extracts FASTA formatted sequences from a BLAST database
using the NCBI BLAST+ blastdbcmd command line tool.

.. class:: warningmark

**BLAST assigned identifiers**

When a BLAST database is constructed from a FASTA file, the
original identifiers can be replaced with BLAST assigned
identifiers, partly to ensure uniqueness. For example, sometimes
a prefix of 'lcl|' is added (lcl is short for local),
or an arbitrary name starting 'gnl|BL_ORD_ID|' is created.

If you are using the tabular output from BLAST, it will contain
the original identifiers - not the BLAST assigned identifiers
suitable for use with the blastdbcmd tool.

The XML and plain text output from BLAST do contain the BLAST
assigned identifiers. However, getting a list of BLAST assigned
identifiers out of those formats isn't straightforward.

-------

**References**

If you use this Galaxy tool in work leading to a scientific publication please
cite the following papers:

@REFERENCES@
    </help>
    <expand macro="blast_citations" />
</tool>