view repeatmasker.xml @ 12:07acb3a096d7 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/repeatmasker commit 7a5f368a5859e659aa36d0358bb96ca12574e2cc
author iuc
date Mon, 24 Apr 2023 10:40:41 +0000
parents fa311361317b
children
line wrap: on
line source

<tool id="repeatmasker_wrapper" name="RepeatMasker" version="@TOOL_VERSION@+@VERSION_SUFFIX@" profile="20.01">
    <description>screen DNA sequences for interspersed repeats and low complexity regions</description>
    <!-- The macros expanded below are imported from macros.xml; the version tokens
         used in the tool tag are presumably defined there as well. -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="xrefs" />
    <expand macro="edam_ontology" />
    <expand macro="requirements" />
    <!-- Use the same executable name that the command block looks up
         ("RepeatMasker", case-sensitive); -v makes it report its version. -->
    <version_command>RepeatMasker -v</version_command>
    <command detect_errors="exit_code"><![CDATA[
    ## Locate the RepeatMasker executable. The "|| true" keeps a failed lookup
    ## from short-circuiting the && chain, so the explicit error below is
    ## actually reached and printed.
    RM_PATH=\$(command -v RepeatMasker || true) &&
    if [ -z "\$RM_PATH" ] ; then echo "Failed to find RepeatMasker in PATH (\$PATH)" >&2 ; exit 1 ; fi &&

    ## Unless RM_LIB_PATH is already set in the job environment, default it to
    ## the Libraries directory located relative to the executable.
    if [ -z "\$RM_LIB_PATH" ] ; then RM_LIB_PATH="\$(dirname "\$RM_PATH")/../share/RepeatMasker/Libraries" ; fi &&
    #if $repeat_source.source_type == "dfam_up":
      ## A user-supplied Dfam HDF5 file: link it as RepeatMaskerLib.h5 inside a
      ## job-local lib/ directory and use that directory as the library path.
      mkdir lib/ &&
      ln -s '${repeat_source.dfam_lib}' lib/RepeatMaskerLib.h5 &&
      RM_LIB_PATH="\$(pwd)/lib" &&
    #end if

    ## Give the input a fixed name so that all output file names are predictable.
    ln -s '${input_fasta}' rm_input.fasta &&

    RepeatMasker -dir "\$(pwd)"
    -libdir "\$RM_LIB_PATH"
    #if $repeat_source.source_type == "library":
      -lib '${repeat_source.repeat_lib}'
      -cutoff '${repeat_source.cutoff}'
    #else if $repeat_source.source_type == "dfam":
      #if $repeat_source.species_source.species_from_list == 'yes':
        -species $repeat_source.species_source.species_list
      #else
        -species '${repeat_source.species_source.species_name}'
      #end if
    #else if $repeat_source.source_type == "dfam_up":
        -species '${repeat_source.species_name}'
    #end if
    -parallel \${GALAXY_SLOTS:-1}
    ## Each boolean parameter below expands to its flag when checked and to an
    ## empty string otherwise.
    ${gff}
    ${excln}
    ${advanced.is_only}
    ${advanced.is_clip}
    ${advanced.no_is}
    ${advanced.rodspec}
    ${advanced.primspec}
    ${advanced.nolow}
    ${advanced.noint}
    ${advanced.norna}
    ${advanced.alu}
    ${advanced.div}
    ${advanced.search_speed}
    -frag ${advanced.frag}
    ## -maxsize ${advanced.maxsize}
    #if str($advanced.gc):
      -gc ${advanced.gc}
    #end if
    ${advanced.gccalc}
    ${advanced.nocut}
    ${advanced.keep_alignments}
    ${advanced.invert_alignments}
    ${advanced.xout}
    ${xsmall}
    ${advanced.poly}
    rm_input.fasta &&
    ## Collect the result files. These are skipped when only IS clipping was
    ## requested, matching the output filters.
    #if $advanced.is_only != '-is_only':
      mv rm_input.fasta.masked '${output_masked_genome}' &&
      ## Turn the space-aligned .out report into a tab-separated table and
      ## replace its first two lines with a single tab-separated header row.
      sed -E 's/^ *// ; s/ *$//; s/\+ //; s/ +/\t/g ;  1,2c SW score\t% div.\t% del.\t% ins.\tquery sequence\tpos in  query: begin\tend\t(left)\trepeat\tclass/family\tpos in repeat: begin\tend\t(left)\tID' rm_input.fasta.out >'${output_log}' &&
      mv rm_input.fasta.tbl '${output_table}' &&
      #if $gff == '-gff':
        mv rm_input.fasta.out.gff '${output_gff}' &&
      #end if
      #if $advanced.keep_alignments == '-ali':
        mv rm_input.fasta.align '${output_alignment}' &&
      #end if
      #if $advanced.poly == '-poly':
        sed -E 's/^ *// ; s/ *$//; s/\+ //; s/ +/\t/g' rm_input.fasta.polyout >'${output_polymorphic}' &&
      #end if
    #end if
    ## The catalogue may be present gzip-compressed or plain; handle both.
    if [ -f 'rm_input.fasta.cat.gz' ]; then
      zcat 'rm_input.fasta.cat.gz' > '${output_repeat_catalog}';
    else
      mv rm_input.fasta.cat '${output_repeat_catalog}';
    fi
    ]]>
  </command>

  <inputs>
    <param name="input_fasta" type="data" format="fasta" label="Genomic DNA" />
    <!-- Where the repeat definitions come from: the bundled Dfam, a Dfam HDF5
         file from the history, or a custom FASTA library. -->
    <conditional name="repeat_source">
      <param label="Repeat library source" name="source_type" type="select" help="To use RepBase, choose 'Custom library of repeats' and select a fasta version of this non-free database.">
        <option selected="true" value="dfam">DFam (curated only, bundled with RepeatMasker)</option>
        <option value="dfam_up">DFam (full/specific version)</option>
        <option value="library">Custom library of repeats</option>
      </param>
      <when value="dfam">
        <!-- The species is either picked from a fixed list or typed in freely. -->
        <conditional name="species_source">
          <param label="Select species name from a list?" name="species_from_list" type="select">
            <option value="yes" selected="true">Yes</option>
            <option value="no">No</option>
          </param>
          <when value="yes">
            <param name="species_list" type="select" label="Species">
              <option value="human" selected="true">Human (Homo sapiens)</option>
              <option value="rodent">Rodent (Order Rodentia)</option>
              <option value="mouse">Mouse (Mus musculus)</option>
              <option value="rattus">Rat (Rattus sp.)</option>
              <option value="danio">Danio (zebra fish)</option>
              <option value="drosophila">Fruit fly (Drosophila melanogaster)</option>
              <option value="elegans">Caenorhabditis elegans (nematode)</option>
            </param>
          </when>
          <when value="no">
            <param name="species_name" type="text" value="human" label="Repeat source species" help="Source species (or clade name) used to select repeats from DFam" />
          </when>
        </conditional>
      </when>
      <when value="dfam_up">
          <param name="dfam_lib" type="data" format="h5" label="DFam library" help="The full DFam library can be downloaded from https://www.dfam.org/releases/current/families/Dfam.h5.gz" />
          <param name="species_name" type="text" value="human" label="Repeat source species" help="Source species (or clade name) used to select repeats from DFam" />
      </when>
      <when value="library">
        <param name="repeat_lib" type="data" format="fasta" label="Custom library of repeats" />
        <param name="cutoff" type="integer" argument="-cutoff" value="225" label="Cutoff score for masking repeats" />
      </when>
    </conditional>
    <param type="boolean" argument="-gff" truevalue="-gff" falsevalue="" label="Output annotation of repeats in GFF format" checked="false" />
    <param argument="-excln" type="boolean" truevalue="-excln" falsevalue="" label="Ignore stretches of Ns when computing statistics" checked="true" help="Scaffolds are sometimes joined with stretches of 25 or more Ns. This option ignores them when calculating repeat statistics" />
    <param type="boolean" argument="-xsmall" truevalue="-xsmall" falsevalue="" checked="false" label="Perform softmasking instead of hardmasking" help="Output repetitive regions as lowercase, non-repetitive regions as uppercase (instead of replacing repetitive regions with 'N's)" />
    <!-- Boolean flags in this section are inserted verbatim into the command
         line, so each truevalue must be the exact RepeatMasker option. -->
    <section name="advanced" title="Advanced options" expanded="false">
      <param argument="-is_only" type="boolean" truevalue="-is_only" falsevalue="" checked="false" label="Only clip E coli insertion elements" />
      <param argument="-is_clip" type="boolean" truevalue="-is_clip" falsevalue="" checked="false" label="Clip IS elements before analysis" help="Normally RepeatMasker will report on IS element, with this option selected it will clip them before analysis" />
      <param argument="-no_is" type="boolean" truevalue="-no_is" falsevalue="" checked="false" label="Skip bacterial insertion element check" />
      <param argument="-rodspec" type="boolean" truevalue="-rodspec" falsevalue="" checked="false" label="Only check for rodent specific repeats" help="If this option is selected a check for rodent specific repeats is done instead of a full RepeatMasker run" />
      <param argument="-primspec" type="boolean" truevalue="-primspec" falsevalue="" checked="false" label="Only check for primate specific repeats" help="If this option is selected a check for primate specific repeats is done instead of a full RepeatMasker run" />
      <param argument="-nolow" type="boolean" truevalue="-nolow" falsevalue="" checked="false" label="No low complexity masking" help="Skip masking of simple tandem repeats and low complexity regions." />
      <param argument="-noint" type="boolean" truevalue="-noint" falsevalue="" checked="false" label="No interspersed repeat masking" help="Only mask simple repeats, skip masking of interspersed repeats." />
      <param argument="-norna" type="boolean" truevalue="-norna" falsevalue="" checked="false" label="No repeat-like-RNA masking" help="Skip masking of small pol III transcribed RNA (these are masked by default because they resemble SINEs)" />
      <param argument="-alu" type="boolean" truevalue="-alu" falsevalue="" checked="false" label="Limit masking to (primate) Alu repeats" />
      <!-- NOTE(review): RepeatMasker's own help describes -div as taking a number
           (maximum percent divergence), but this flag is passed without a value.
           Confirm the behaviour when checked before relying on it. -->
      <param argument="-div" type="boolean" truevalue="-div" falsevalue="" checked="false" label="Limit masking to less diverged (younger) repeats" />
      <param type="select" name="search_speed" label="Search speed vs sensitivity trade-off">
        <option value="">Default</option>
        <option value="-q">Quick (5-10% less sensitive, 3-4 times speedup)</option>
        <option value="-qq">Rush (10% less sensitive)</option>
        <option value="-s">Slow (0-5% more sensitive, 2.5 times slowdown)</option>
      </param>
      <param type="integer" argument="-frag" value="40000" label="Maximum contiguous sequence searched" help="Maximum length of sequence that is searched without fragmenting" />
      <!-- -maxsize option is in the help, but not in the code of repeatmasker-->
      <!--param type="integer" argument="-maxsize" value="4000000" label="Maximum length for IS or repeat clipped sequences" /-->
      <param type="integer" argument="-gc" optional="true" label="Select matrices for this GC%" help="Valid values are a percentage or -1 to choose the default" />
      <param type="boolean" argument="-gccalc" truevalue="-gccalc" falsevalue="" checked="false" label="Calculate GC % for all sequences" help="By default RepeatMasker skips calculating GC % for small sequences" />
      <param type="boolean" argument="-nocut" truevalue="-nocut" falsevalue="" checked="false" label="Skips cutting of repeats" />
      <param name="xout" type="boolean" argument="-x" truevalue="-x" falsevalue="" checked="false" label="Mask with X instead of N characters" />
      <param name="keep_alignments" type="boolean" argument="-ali" truevalue="-ali" falsevalue="" checked="false" label="Output alignments file" />
      <param name="invert_alignments" type="boolean" argument="-inv" truevalue="-inv" falsevalue="" checked="false" label="Invert alignments in alignment file" help="Show alignments in the orientation of the repeat sequence, not the query sequence" />
      <param type="boolean" argument="-poly" truevalue="-poly" falsevalue="" checked="false" label="Output list of potentially polymorphic microsatellites" />
    </section>
  </inputs>
  <!-- Output datasets. Every output except the repeat catalogue is filtered out
       when "is_only" is checked; the alignment, polymorphic-repeat and GFF
       outputs additionally require their own option to be enabled. -->
  <outputs>
    <data name="output_masked_genome" format="fasta" label="RepeatMasker masked sequence on ${on_string}">
      <filter>not advanced['is_only']</filter>
    </data>
    <data name="output_log" format="tabular" label="RepeatMasker output log on ${on_string}">
      <filter>not advanced['is_only']</filter>
    </data>
    <data name="output_table" format="txt" label="RepeatMasker repeat statistics on ${on_string}">
      <filter>not advanced['is_only']</filter>
    </data>
    <!-- No filter: the catalogue is always declared as an output. -->
    <data name="output_repeat_catalog" format="txt" label="RepeatMasker repeat catalogue on ${on_string}" />
    <data name="output_alignment" format="txt" label="RepeatMasker alignment on ${on_string}">
      <filter>not advanced['is_only'] and advanced['keep_alignments']</filter>
    </data>
    <data name="output_polymorphic" format="tabular" label="RepeatMasker possible polymorphic repeats on ${on_string}">
      <filter>not advanced['is_only'] and advanced['poly']</filter>
    </data>
    <data name="output_gff" format="gff" label="RepeatMasker repeat annotation on ${on_string}">
      <filter>not advanced['is_only'] and gff is True</filter>
    </data>
  </outputs>
  <tests>
    <!-- Custom FASTA repeat library with default options: four outputs. -->
    <test expect_num_outputs="4">
      <param name="input_fasta" value="small.fasta" ftype="fasta" />
      <param name="source_type" value="library" />
      <param name="repeat_lib" value="repeats.fasta" ftype="fasta" />
      <output name="output_masked_genome" file="small.fasta.masked" />
      <output name="output_table" file="small.fasta.stats" lines_diff="6" />
      <output name="output_repeat_catalog" file="small.fasta.cat" lines_diff="2" />
      <output name="output_log" file="small.fasta.log" lines_diff="2"/>
    </test>
    <!-- Custom library with GFF, alignment and polymorphic outputs enabled:
         all seven outputs are expected. -->
    <test expect_num_outputs="7">
      <param name="input_fasta" value="small.fasta" ftype="fasta" />
      <param name="source_type" value="library" />
      <param name="gff" value="-gff" />
      <param name="keep_alignments" value="-ali" />
      <param name="poly" value="-poly" />
      <param name="repeat_lib" value="repeats.fasta" ftype="fasta" />
      <output name="output_masked_genome" file="small.fasta.masked" />
      <output name="output_table" file="small.fasta.stats" lines_diff="6" />
      <output name="output_repeat_catalog" file="small.fasta.cat" lines_diff="2" />
      <output name="output_log" file="small.fasta.log" lines_diff="2"/>
      <output name="output_alignment" file="small.fasta.align" />
      <output name="output_polymorphic" file="small.fasta.poly" />
      <output name="output_gff" file="small.fasta.gff" lines_diff="4" />
    </test>
    <!-- Bundled Dfam with the species chosen from the list (human). -->
    <test expect_num_outputs="4">
      <param name="input_fasta" value="small.fasta" ftype="fasta" />
      <param name="source_type" value="dfam" />
      <param name="species_list" value="human" />
      <output name="output_masked_genome" file="small_dfam.fasta.masked" />
      <output name="output_table" file="small_dfam.fasta.stats" lines_diff="2" />
      <output name="output_repeat_catalog" file="small_dfam.fasta.cat" lines_diff="2" />
      <output name="output_log" file="small_dfam.fasta.log" lines_diff="2"/>
    </test>
    <!-- Dfam HDF5 library supplied as a dataset, with a free-text species name. -->
    <test expect_num_outputs="4">
      <param name="input_fasta" value="small.fasta" ftype="fasta" />
      <param name="source_type" value="dfam_up" />
      <param name="dfam_lib" value="Dfam_partial_test.h5" ftype="h5" />
      <param name="species_name" value="rodent" />
      <output name="output_masked_genome" file="small_dfam_up.fasta.masked" />
      <output name="output_table" file="small_dfam_up.fasta.stats" lines_diff="2" />
      <output name="output_repeat_catalog" file="small_dfam_up.fasta.cat" lines_diff="2" />
      <output name="output_log" file="small_dfam_up.fasta.log" lines_diff="2"/>
    </test>
    <!-- Bundled Dfam with a non-default species from the list (rattus). -->
    <test expect_num_outputs="4">
      <param name="input_fasta" value="small.fasta" ftype="fasta" />
      <param name="source_type" value="dfam" />
      <param name="species_list" value="rattus" />
      <output name="output_masked_genome" file="small_dfam_rattus.fasta.masked" />
      <output name="output_table" file="small_dfam_rattus.fasta.stats" lines_diff="2" />
      <output name="output_repeat_catalog" file="small_dfam_rattus.fasta.cat" lines_diff="2" />
      <output name="output_log" file="small_dfam_rattus.fasta.log" lines_diff="2"/>
    </test>
  </tests>
  <help><![CDATA[
RepeatMasker is a program that screens DNA for interspersed repeats and low
complexity DNA sequences. The database of repeats to screen for can be the
Dfam library bundled with RepeatMasker, a Dfam HDF5 library from your
history, or a custom library provided as a FASTA file, for example one
obtained from RepBase_. If RepBase is used, the RepBaseRepeatMaskerEdition
file should be downloaded and unpacked, and a FASTA version of the enclosed
repeat library ('RMRBSeqs.embl', which is in EMBL format) should be
uploaded to Galaxy for use with this tool.

Further documentation is available on the RepeatMasker homepage_.

.. _RepBase: http://www.girinst.org/repbase/
.. _homepage: http://www.repeatmasker.org/webrepeatmaskerhelp.html
    ]]>
  </help>
  <!-- Citation entries are expanded from a macro, presumably defined in the imported macros.xml. -->
  <expand macro="citations" />
</tool>