view repeatmasker.xml @ 2:d05014300627 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/repeat_masker commit cbba6947f1380751e1db3fa5b043af630523fd86
author iuc
date Fri, 04 May 2018 07:59:35 -0400
parents 26c5f217aad7
children 2f097bbdd0b4
line wrap: on
line source

<tool id="repeatmasker_wrapper" name="RepeatMasker" version="4.0.7+galaxy1" profile="17.01">
  <!-- Short description text for the tool. -->
  <description>RepeatMasker</description>

  <!-- Single package dependency: repeatmasker, pinned to version 4.0.7. -->
  <requirements>
    <requirement type="package" version="4.0.7">repeatmasker</requirement>
  </requirements>

  <!--
    Command template (Cheetah directives mixed with shell).
      1. Create a job-local lib/ directory, symlink every file from the
         packaged Libraries directory into it and export it as
         REPEATMASKER_LIB_DIR.
      2. For the RepBase source, copy the uploaded EMBL file into lib/.
      3. Run RepeatMasker on a symlink of the input named rm_input.fasta,
         writing results to the working directory.
      4. Move / reformat the result files onto the Galaxy output datasets.
    The catalogue (.cat) file is collected whether it was left plain or
    gzip-compressed.
  -->
  <command detect_errors="exit_code"><![CDATA[
    ## Stage the library directory used for this job.
    RM_LIB_PATH=\$(dirname \$(which RepeatMasker))/../share/RepeatMasker/Libraries &&
    mkdir lib &&
    export REPEATMASKER_LIB_DIR=\$(pwd)/lib &&
      for file in \$(ls \$RM_LIB_PATH) ; do  ln -s \$RM_LIB_PATH/\$file lib/\$file ; done &&
    #if $repeat_source.source_type == "repbase":
      ## The uploaded RepBase file is stored under the (hidden) target file name.
      cp '${repeat_source.repbase_file}' 'lib/${repeat_source.repbase_file_name}' &&
    #end if
    ## A fixed input name gives every result file a predictable rm_input.fasta.* name.
    ln -s '${input_fasta}' rm_input.fasta &&
    RepeatMasker -dir \$(pwd)
    #if $repeat_source.source_type == "library":
      -lib '${repeat_source.repeat_lib}'
      -cutoff '${repeat_source.cutoff}'
    #else if $repeat_source.source_type == "repbase":
      #if $repeat_source.species_source.species_from_list == 'yes':
        ## The select value already contains the complete "-species NAME" fragment.
        $repeat_source.species_source.species_list
      #else
        -species '${repeat_source.species_source.species_name}'
      #end if
    #end if
    -parallel \${GALAXY_SLOTS:-1}
    ## Boolean and select parameters below expand to their flag text or to nothing.
    ${gff}
    ${excln}
    ${advanced.is_only}
    ${advanced.is_clip}
    ${advanced.no_is}
    ${advanced.rodspec}
    ${advanced.primspec}
    ${advanced.nolow}
    ${advanced.noint}
    ${advanced.norna}
    ${advanced.alu}
    ${advanced.div}
    ${advanced.search_speed}
    -frag ${advanced.frag}
    ## -maxsize ${advanced.maxsize}
    ## -gc is optional: only pass it when a value was entered.
    #if str($advanced.gc):
      -gc ${advanced.gc}
    #end if
    ${advanced.gccalc}
    ${advanced.nocut}
    ${advanced.keep_alignments}
    ${advanced.invert_alignments}
    ${advanced.xout}
    ${advanced.xsmall}
    ${advanced.poly}
    rm_input.fasta &&
    ## Everything except the catalogue is only collected when -is_only is off.
    #if $advanced.is_only != '-is_only':
      mv rm_input.fasta.masked '${output_masked_genome}' &&
      ## Turn the space-aligned .out report into a tab-separated table with a one-line header.
      sed -r 's/^ *// ; s/ *$//; s/\+ //; s/ +/\t/g ;  1,2c SW score\t% div.\t% del.\t% ins.\tquery sequence\tpos in  query: begin\tend\t(left)\trepeat\tclass/family\tpos in repeat: begin\tend\t(left)\tID' rm_input.fasta.out >'${output_log}' &&
      mv rm_input.fasta.tbl '${output_table}' &&
      #if $gff == '-gff':
        mv rm_input.fasta.out.gff '${output_gff}' &&
      #end if
      #if $advanced.keep_alignments == '-ali':
        mv rm_input.fasta.align '${output_alignment}' &&
      #end if
      #if $advanced.poly == '-poly':
        sed -r 's/^ *// ; s/ *$//; s/\+ //; s/ +/\t/g' rm_input.fasta.polyout >'${output_polymorphic}' &&
      #end if
    #end if
    ## NOTE(review): RepeatMasker is understood to gzip the .cat file when it is large;
    ## accept either form so a successful run does not fail at collection time.
    if [ -f rm_input.fasta.cat.gz ]; then
      gunzip -c rm_input.fasta.cat.gz > '${output_repeat_catalog}';
    else
      mv rm_input.fasta.cat '${output_repeat_catalog}';
    fi
    ]]>
  </command>

  <inputs>
    <!-- FASTA dataset to analyse; the command template symlinks it as rm_input.fasta before running RepeatMasker. -->
    <param name="input_fasta" type="data" format="fasta" label="Genomic DNA" />
    <!--
      Source of the repeat sequences to screen against:
        repbase : a user-supplied RepBase EMBL file plus a species / clade selection
        library : a custom FASTA library of repeats plus a masking cutoff score
    -->
    <conditional name="repeat_source">
      <param label="Repeat library source" name="source_type" type="select">
        <option selected="true" value="repbase">RepBase</option>
        <option value="library">Custom library of repeats</option>
      </param>
      <when value="repbase">
        <!-- The command template copies this dataset into the job-local lib/ directory under repbase_file_name. -->
        <param name="repbase_file" type="data" format="embl" label="RepBase (RMRBSeqs.embl) file" />
        <param name="repbase_file_name" type="hidden" value="RMRBSeqs.embl"/> <!-- This is an ugly hack to allow testing with a fake repbase -->
        <conditional name="species_source">
          <param label="Select species name from a list?" name="species_from_list" type="select">
            <option value="yes" selected="true">Yes</option>
            <option value="no">No</option>
          </param>
          <when value="yes">
            <!-- Each option value is a complete "-species NAME" fragment that the command template inserts verbatim. -->
            <!-- Only one option is marked selected (the first, anopheles): a single-choice select has exactly one default. -->
            <param name="species_list" type="select" label="Species">
              <option value="-species anopheles" selected="true">anopheles</option>
              <option value="-species arabidopsis">arabidopsis</option>
              <option value="-species artiodactyl">artiodactyl</option>
              <option value="-species aspergillus">aspergillus</option>
              <option value="-species carnivore">carnivore</option>
              <option value="-species cat">cat</option>
              <option value="-species chicken">chicken</option>
              <option value="-species 'ciona intestinalis'">ciona intestinalis</option>
              <option value="-species 'ciona savignyi'">ciona savignyi</option>
              <option value="-species cow">cow</option>
              <option value="-species danio">danio</option>
              <!-- NOTE(review): the value "diatoaea" differs from the label "diatomea"; confirm which spelling RepeatMasker accepts before changing either. -->
              <option value="-species diatoaea">diatomea</option>
              <option value="-species dog">dog</option>
              <option value="-species drosophila">drosophila</option>
              <option value="-species elegans">elegans</option>
              <option value="-species fugu">fugu</option>
              <option value="-species fungi">fungi</option>
              <option value="-species human">human</option>
              <option value="-species maize">maize</option>
              <option value="-species mammal">mammal</option>
              <option value="-species mouse">mouse</option>
              <option value="-species pig">pig</option>
              <option value="-species rat">rat</option>
              <option value="-species rice">rice</option>
              <option value="-species rodentia">rodentia</option>
              <option value="-species ruminantia">ruminantia</option>
              <option value="-species wheat">wheat</option>
            </param>
          </when>
          <when value="no">
            <!-- Free-text species / clade name; the command template passes it single-quoted after -species. -->
            <param name="species_name" type="text" value="homo sapiens" label="Repeat source species" help="Source species (or clade name) used to select repeats from RepBase" />
          </when>
        </conditional>
      </when>
      <when value="library">
        <!-- Passed to RepeatMasker as -lib and -cutoff respectively. -->
        <param name="repeat_lib" type="data" format="fasta" label="Custom library of repeats" />
        <param name="cutoff" type="integer" argument="-cutoff" value="225" label="Cutoff score for masking repeats" />
      </when>
    </conditional>
    <!-- Top-level switches: each expands to its flag when checked and to an empty string otherwise. -->
    <param argument="-gff"
           type="boolean"
           truevalue="-gff"
           falsevalue=""
           checked="false"
           label="Output annotation of repeats in GFF format" />
    <param argument="-excln"
           type="boolean"
           truevalue="-excln"
           falsevalue=""
           checked="true"
           label="Ignore stretches of Ns when computing statistics"
           help="Scaffolds are sometimes joined with stretches of 25 or more Ns. This option ignores them when calculating repeat statistics" />
    <!--
      Advanced RepeatMasker options. Boolean parameters expand to their
      truevalue (the literal command-line flag) or to an empty string. The
      command template and the output filters test is_only, keep_alignments
      and poly, so those names and truevalues must stay in sync with them.
    -->
    <section name="advanced" title="Advanced options" expanded="false">
      <param argument="-is_only" type="boolean" truevalue="-is_only" falsevalue="" checked="false" label="Only clip E coli insertion elements" />
      <param argument="-is_clip" type="boolean" truevalue="-is_clip" falsevalue="" checked="false" label="Clip IS elements before analysis" help="Normally RepeatMasker will report on IS element, with this option selected it will clip them before analysis" />
      <param argument="-no_is" type="boolean" truevalue="-no_is" falsevalue="" checked="false" label="Skip bacterial insertion element check" />
      <param argument="-rodspec" type="boolean" truevalue="-rodspec" falsevalue="" checked="false" label="Only check for rodent specific repeats" help="If this option is selected, a check for rodent specific repeats is done instead of a full RepeatMasker run" />
      <param argument="-primspec" type="boolean" truevalue="-primspec" falsevalue="" checked="false" label="Only check for primate specific repeats" help="If this option is selected, a check for primate specific repeats is done instead of a full RepeatMasker run" />
      <param argument="-nolow" type="boolean" truevalue="-nolow" falsevalue="" checked="false" label="No low complexity masking" help="Skip masking of simple tandem repeats and low complexity regions." />
      <param argument="-noint" type="boolean" truevalue="-noint" falsevalue="" checked="false" label="No interspersed repeat masking" help="Only mask simple repeats, skip masking of interspersed repeats." />
      <param argument="-norna" type="boolean" truevalue="-norna" falsevalue="" checked="false" label="No repeat-like-RNA masking" help="Skip masking of small pol III transcribed RNA (these are masked by default because they resemble SINEs)" />
      <param argument="-alu" type="boolean" truevalue="-alu" falsevalue="" checked="false" label="Limit masking to (primate) Alu repeats" />
      <!-- NOTE(review): RepeatMasker's usage text appears to document -div as taking a numeric divergence cutoff; emitted here as a bare flag it may swallow the following argument. Confirm against the RepeatMasker help before relying on this option. -->
      <param argument="-div" type="boolean" truevalue="-div" falsevalue="" checked="false" label="Limit masking to less diverged (younger) repeats" />
      <param type="select" name="search_speed" label="Search speed vs sensitivity trade-off">
        <option value="">Default</option>
        <option value="-q">Quick (5-10% less sensitive, 3-4 times speedup)</option>
        <option value="-qq">Rush (10% less sensitive)</option>
        <option value="-s">Slow (0-5% more sensitive, 2.5 times slowdown)</option>
      </param>
      <param type="integer" argument="-frag" value="40000" label="Maximum contiguous sequence searched" help="Maximum length of sequence that is searched without fragmenting" />
      <!-- -maxsize option is in the help, but not in the code of repeatmasker-->
      <!--param type="integer" argument="-maxsize" value="4000000" label="Maximum length for IS or repeat clipped sequences" /-->
      <!-- Optional: the command template only emits -gc when a value was entered. -->
      <param type="integer" argument="-gc" optional="true" label="Select matrices for this GC%" help="Valid values are a percentage or -1 to choose the default" />
      <!-- The emitted flag must be spelled exactly like the argument: -gccalc. -->
      <param type="boolean" argument="-gccalc" truevalue="-gccalc" falsevalue="" checked="false" label="Calculate GC % for all sequences" help="By default RepeatMasker skips calculating GC % for small sequences" />
      <param type="boolean" argument="-nocut" truevalue="-nocut" falsevalue="" checked="false" label="Skips cutting of repeats" />
      <param name="xout" type="boolean" argument="-x" truevalue="-x" falsevalue="" checked="false" label="Mask with X instead of N characters" />
      <param name="keep_alignments" type="boolean" argument="-ali" truevalue="-ali" falsevalue="" checked="false" label="Output alignments file" />
      <param name="invert_alignments" type="boolean" argument="-inv" truevalue="-inv" falsevalue="" checked="false" label="Invert alignments in alignment file" help="Show alignments in the orientation of the repeat sequence, not the query sequence" />
      <param type="boolean" argument="-xsmall" truevalue="-xsmall" falsevalue="" checked="false" label="Output repetitive regions as lowercase, non-repetitive regions as uppercase" />
      <param type="boolean" argument="-poly" truevalue="-poly" falsevalue="" checked="false" label="Output list of potentially polymorphic microsatellites" />
    </section>
  </inputs>
  <outputs>
    <!-- Core results: filtered out when only IS-element clipping (is_only) is requested. -->
    <data name="output_masked_genome"
          format="fasta"
          label="RepeatMasker masked sequence on ${on_string}">
      <filter>not advanced['is_only']</filter>
    </data>
    <data name="output_log"
          format="tabular"
          label="RepeatMasker output log on ${on_string}">
      <filter>not advanced['is_only']</filter>
    </data>
    <data name="output_table"
          format="txt"
          label="RepeatMasker repeat statistics on ${on_string}">
      <filter>not advanced['is_only']</filter>
    </data>

    <!-- The catalogue has no filter and is therefore always created. -->
    <data name="output_repeat_catalog"
          format="txt"
          label="RepeatMasker repeat catalogue on ${on_string}" />

    <!-- Optional results: each also requires its own switch to be enabled. -->
    <data name="output_alignment"
          format="txt"
          label="RepeatMasker alignment on ${on_string}">
      <filter>not advanced['is_only'] and advanced['keep_alignments']</filter>
    </data>
    <data name="output_polymorphic"
          format="tabular"
          label="RepeatMasker possible polymorphic repeats on ${on_string}">
      <filter>not advanced['is_only'] and advanced['poly']</filter>
    </data>
    <data name="output_gff"
          format="gff"
          label="RepeatMasker repeat annotation on ${on_string}">
      <filter>not advanced['is_only'] and gff is True</filter>
    </data>
  </outputs>
  <tests>
    <!-- Test 1: custom repeat library with default options; expects the four unconditional/core outputs. -->
    <test expect_num_outputs="4">
      <param name="input_fasta" ftype="fasta" value="small.fasta" />
      <param name="source_type" value="library" />
      <param name="repeat_lib" ftype="fasta" value="repeats.fasta" />
      <output name="output_masked_genome" file="small.fasta.masked" />
      <output name="output_table" file="small.fasta.stats" lines_diff="2" />
      <output name="output_repeat_catalog" file="small.fasta.cat" />
      <output name="output_log" file="small.fasta.log" />
    </test>

    <!-- Test 2: custom repeat library with GFF, alignment and polymorphic outputs switched on; expects all seven outputs. -->
    <test expect_num_outputs="7">
      <param name="input_fasta" ftype="fasta" value="small.fasta" />
      <param name="source_type" value="library" />
      <param name="repeat_lib" ftype="fasta" value="repeats.fasta" />
      <param name="gff" value="-gff" />
      <!-- <param name="show" value="yes" /> -->
      <param name="keep_alignments" value="-ali" />
      <param name="poly" value="-poly" />
      <output name="output_masked_genome" file="small.fasta.masked" />
      <output name="output_table" file="small.fasta.stats" lines_diff="4" />
      <output name="output_repeat_catalog" file="small.fasta.cat" />
      <output name="output_log" file="small.fasta.log" />
      <output name="output_alignment" file="small.fasta.align" />
      <output name="output_polymorphic" file="small.fasta.poly" />
      <output name="output_gff" file="small.fasta.gff" lines_diff="4" />
    </test>

    <!-- Test 3: RepBase source using a fake EMBL file stored under an overridden file name, species picked from the list. -->
    <test expect_num_outputs="4">
      <param name="input_fasta" ftype="fasta" value="small.fasta" />
      <param name="source_type" value="repbase" />
      <param name="repbase_file" value="fake_repbase.embl" />
      <param name="repbase_file_name" value="fake.embl" />
      <param name="species_list" value="anopheles" />
      <output name="output_masked_genome" file="small.fasta.masked" />
      <output name="output_table" file="small_repbase.fasta.stats" lines_diff="2" />
      <output name="output_repeat_catalog" file="small.fasta.cat" />
      <output name="output_log" file="small_repbase.fasta.log" />
    </test>
  </tests>
  <help><![CDATA[
RepeatMasker is a program that screens DNA for interspersed repeats and low
complexity DNA sequences. The database of repeats to screen for can be
provided as a FASTA file or downloaded from RepBase_. If the RepBase option is
chosen, the RepBaseRepeatMaskerEdition file should be downloaded and
unpacked, and the enclosed EMBL format file ('RMRBSeqs.embl') should
be uploaded to Galaxy for use with this tool.

Further documentation is available on the RepeatMasker homepage_.

.. _RepBase: http://www.girinst.org/repbase/
.. _homepage: http://www.repeatmasker.org/webrepeatmaskerhelp.html
    ]]>
  </help>
  <!-- Citation for the RepeatMasker software itself, given as an inline BibTeX entry. -->
  <citations>
    <citation type="bibtex">
      @misc{RepeatMasker,
        title = {RepeatMasker Open-4.0},
        howpublished = {\url{http://www.repeatmasker.org}},
        author = {Smit, AFA and Hubley, R and Green, P.},
        year = {2013-2015}}
    </citation>
  </citations>
</tool>