<!-- Mercurial > repos > iuc > data_manager_hisat2_index_builder -->

<tool id="hisat2_index_builder_data_manager" name="HISAT2 index" tool_type="manage_data" version="2.1.0" profile="19.05">
    <description>builder</description>
    <!-- Package dependency: hisat2 pinned at 2.1.0, the same version string as the tool's own version attribute. -->
    <requirements>
        <requirement type="package" version="2.1.0">hisat2</requirement>
    </requirements>
    <!--
        Cheetah command template.
        When advanced parameters are enabled it symlinks the optional GTF and SNP
        datasets into the working directory and runs the HISAT2 extraction scripts
        on them. It then calls hisat2_index_builder.py with the selected all_fasta
        entry and one double-quoted string of indexer options.
    -->
    <command detect_errors="exit_code"><![CDATA[
        ## Optional GTF annotation: write splice_sites.txt and exon.txt for the indexer.
        #if $advanced.adv_param_select == 'yes' and $advanced.gtf_input:
            ln -s '${advanced.gtf_input}' gtf_file.gtf &&
            hisat2_extract_splice_sites.py gtf_file.gtf > splice_sites.txt &&
            hisat2_extract_exons.py gtf_file.gtf > exon.txt &&
        #end if
        ## Optional SNP list: pick the VCF or UCSC extractor by datatype; both write
        ## output files using the base name "extracted".
        #if $advanced.adv_param_select == 'yes' and $advanced.snps:
            ln -s '${advanced.snps}' snps.tabular &&
            #if $advanced.snps.is_of_type('vcf'):
                hisat2_extract_snps_haplotypes_VCF.py '${all_fasta_source.fields.path}' snps.tabular extracted &&
            #else
                hisat2_extract_snps_haplotypes_UCSC.py '${all_fasta_source.fields.path}' snps.tabular extracted &&
            #end if
        #end if
        ## NOTE(review): the hisat2-build manual describes the bmax and bmaxdivn options
        ## as overriding each other; both are always passed below when advanced
        ## parameters are enabled, so bmax may have no effect. Confirm this is intended.
        ##
        ## The indexer options form ONE double-quoted shell word. The pwd substitutions
        ## are deliberately not wrapped in their own double quotes: nested quotes would
        ## close the outer quote and leave the substitution unquoted.
        python '$__tool_directory__/hisat2_index_builder.py' --output '${out_file}'
            --fasta_filename '${all_fasta_source.fields.path}'
            --fasta_dbkey '${all_fasta_source.fields.dbkey}'
            --fasta_description '${all_fasta_source.fields.name}'
            --data_table_name hisat2_indexes
            --indexer_options "-p \${GALAXY_SLOTS:-1}
            #if $advanced.adv_param_select == 'yes':
                --noauto
                #if $advanced.snps:
                    --snps `pwd`/extracted.snp
                    --haplotype `pwd`/extracted.haplotype
                #end if
                #if $advanced.gtf_input:
                    --ss `pwd`/splice_sites.txt
                    --exon `pwd`/exon.txt
                #end if
                --bmax $advanced.bmax
                --bmaxdivn $advanced.bmaxdivn
                --dcv $advanced.dcv
                --offrate $advanced.offrate
            #end if
            "
        ]]>
    </command>
    <inputs>
        <!-- Reference genome: one entry of the all_fasta data table. -->
        <param name="all_fasta_source" type="select" label="Source FASTA Sequence">
            <options from_data_table="all_fasta"/>
        </param>

        <!-- Indexer tuning; the "yes" branch exposes the values the command passes on as indexer options. -->
        <conditional name="advanced" label="Advanced parameters">
            <param name="adv_param_select" type="select" label="Advanced parameters">
                <option value="no">Use defaults</option>
                <option value="yes">Fine-tune indexing parameters</option>
            </param>
            <when value="no"/>
            <when value="yes">
                <param argument="--bmax"
                       type="integer"
                       value="4"
                       label="Maximum number of suffixes allowed in a block"/>
                <param argument="--bmaxdivn"
                       type="integer"
                       value="4"
                       label="Maximum number of suffixes allowed in a block, expressed as a fraction of the length of the reference"/>
                <param argument="--dcv"
                       type="integer"
                       min="2"
                       max="4096"
                       value="1024"
                       label="Period for the difference-cover sample"
                       help="A larger period yields less memory overhead, but may make suffix sorting slower, especially if repeats are present. Must be a power of 2 no greater than 4096"/>
                <param argument="--offrate"
                       type="integer"
                       value="4"
                       label="Mark rows in the Burrows-Wheeler transform"
                       help="To map alignments back to positions on the reference sequences, it's necessary to annotate (&quot;mark&quot;) some or all of the Burrows-Wheeler rows with their corresponding location on the genome. This parameter governs how many rows get marked: the indexer will mark every 2^&lt;int&gt; rows. Marking more rows makes reference-position lookups faster, but requires more memory to hold the annotations at runtime. The default is 4 (every 16th row is marked; for human genome, annotations occupy about 680 megabytes)"/>
                <!-- Optional datasets; each one triggers an extraction step in the command when supplied. -->
                <param name="snps"
                       type="data"
                       format="tabular,vcf"
                       optional="true"
                       label="Provide a list of SNPs in the UCSC dbSNP or VCF format"
                       help="If you include SNPs or splice sites and exons, building an index on the human genome will consume up to 200GB RAM as index building involves a graph construction"/>
                <param name="gtf_input"
                       type="data"
                       format="gtf"
                       optional="true"
                       label="Provide a GTF file for HISAT2 to extract splice sites from"
                       help="If you include SNPs or splice sites and exons, building an index on the human genome will consume up to 200GB RAM as index building involves a graph construction"/>
            </when>
        </conditional>

        <!-- Free-text fields; not referenced in the command template. NOTE(review): presumably read by the builder script from the data manager JSON; confirm. -->
        <param name="sequence_name" type="text" value="" label="Name of sequence"/>
        <param name="sequence_id" type="text" value="" label="ID for sequence"/>
    </inputs>
    <!-- Single output dataset in data_manager_json format; the command hands its path to hisat2_index_builder.py as the output file. -->
    <outputs>
        <data name="out_file" format="data_manager_json" />
    </outputs>
    <tests>
        <!-- Selects the phiX174 FASTA entry, sets no advanced parameters, and compares out_file with the stored JSON. -->
        <test>
            <param name="all_fasta_source" value="phiX174" />
            <output name="out_file" file="hisat2_data_manager.json" />
        </test>
    </tests>
    <help>
<![CDATA[
.. class:: infomark

**Notice:** If you leave the sequence name or ID blank, a value will be generated automatically.

What is HISAT2?
---------------

`HISAT2 <http://ccb.jhu.edu/software/hisat2>`__ is a fast and sensitive alignment
program for mapping next-generation sequencing reads (both DNA and RNA) against
the general human population (as well as against a single reference genome).
Based on an extension of BWT for graphs (`BWT <http://dl.acm.org/citation.cfm?id=2674828>`__),
we designed and implemented a graph FM index (GFM), an original approach and
its first implementation to the best of our knowledge. In addition to using one
global GFM index that represents the general population, HISAT2 uses a large set
of small GFM indexes that collectively cover the whole genome (each index
representing a genomic region of 56 Kbp, with 55,000 indexes needed to cover
the human population). These small indexes (called local indexes), combined
with several alignment strategies, enable rapid and accurate alignment of
sequencing reads. This new indexing scheme is called a Hierarchical Graph
FM index (HGFM). In addition to spliced alignment, HISAT2 handles reads
involving indels and supports a paired-end alignment mode. Multiple processors
can be used simultaneously to achieve greater alignment speed. HISAT2 outputs
alignments in `SAM <http://samtools.sourceforge.net/SAM1.pdf>`__ format, enabling
interoperation with a large number of other tools (e.g. `SAMtools <http://samtools.sourceforge.net>`__,
`GATK <http://www.broadinstitute.org/gsa/wiki/index.php/The_Genome_Analysis_Toolkit>`__)
that use SAM. HISAT2 is distributed under the `GPLv3 license <http://www.gnu.org/licenses/gpl-3.0.html>`__,
and it runs on the command line under Linux, Mac OS X and Windows.
]]>
    </help>
    <citations>
        <!-- HISAT (2015): the original spliced-aligner publication. -->
        <citation type="doi">10.1038/nmeth.3317</citation>
        <!-- HISAT2 (2019): the graph-based index this tool builds. -->
        <citation type="doi">10.1038/s41587-019-0201-4</citation>
    </citations>
</tool>
<!--
author	iuc
date	Mon, 18 Nov 2019 22:22:10 +0000
parents	a9a26db6a455
children
-->