Mercurial > repos > bgruening > repeat_masker

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/RepeatMasker.xml	Sun Apr 29 07:03:52 2018 -0400
@@ -0,0 +1,321 @@
+<tool id="repeatmasker_wrapper" name="RepeatMasker" version="0.1.2">
+    <description>Masks different kind of repeats</description>
+    <requirements>
+        <requirement type="binary">RepeatMasker</requirement>
+    </requirements>
+    <command>
+<![CDATA[
+## The command is a Cheetah template which allows some Python based syntax.
+## Lines starting hash hash are comments. Galaxy will turn newlines into spaces
+
+## create temp directory
+#import tempfile, os
+#set $dirname = os.path.abspath( tempfile.mkdtemp() )
+#set $input_filename = os.path.split( str($query) )[-1]
+#set $output_basename = os.path.join( $dirname, $input_filename )
+
+
+RepeatMasker
+-parallel \${GALAXY_SLOTS:-8}
+
+$nolow
+$noint
+$norna
+
+#if str($species)!="all":
+    $species
+#end if
+
+
+-dir $dirname
+
+#if $adv_opts.adv_opts_selector=="advanced":
+
+    #if str($adv_opts.gc)!="0":
+        -gc $adv_opts.gc
+    #end if
+
+    $adv_opts.gccalc
+
+    #set $output_files_list = str($adv_opts.output_files).split(',')
+    #if "gff" in $output_files_list:
+        -gff
+    #end if
+    #if "html" in $output_files_list:
+        -html
+    #end if
+
+    $adv_opts.slow_search
+    $adv_opts.quick_search
+    $adv_opts.rush_search
+    $adv_opts.only_alus
+    $adv_opts.is_only
+
+#else:
+    ## Set defaults
+    -gff
+
+## End of advanced options:
+#end if
+
+$query
+
+2>&1;
+
+## Copy the output files to galaxy
+## AgR: if there are no repeats, the output files may not exist.
+## This causes the job to fail, so touch files to ensure they exist.
+#if $adv_opts.adv_opts_selector=="advanced":
+
+    #if "summary" in $output_files_list:
+        ## Write out the summary file (default)
+        #set $summary_file = $output_basename + '.tbl'
+        touch $summary_file;
+        cp $summary_file $output_summary;
+    #end if
+
+    #if "gff" in $output_files_list:
+        ## Write out the gff file (default)
+        #set $gff_file = $output_basename + '.out.gff'
+        touch $gff_file;
+        cp $gff_file $output_gff;
+    #end if
+
+    #if "html" in $output_files_list:
+        ## Write out the html file
+        #set $html_file = $output_basename + '.out.html'
+        touch $html_file;
+        cp $html_file $output_html;
+    #end if
+
+#else:
+
+    ## Write out the summary file (default)
+    #set $summary_file = $output_basename + '.tbl'
+    touch $summary_file;
+    cp $summary_file $output_summary;
+
+    ## Write out the gff file (default)
+    #set $gff_file = $output_basename + '.out.gff'
+    touch $gff_file;
+    cp $gff_file $output_gff;
+
+
+## End of advanced options:
+#end if
+
+## Write out mask sequence file
+#set $mask_sequence_file = $output_basename + '.masked'
+touch $mask_sequence_file;
+cp $mask_sequence_file $output_mask;
+
+## Write out standard file (default)
+## The default '.out' file from RepeatMasker has a 3-line header and spaces rather
+## than tabs. Remove the header and replace the whitespaces with tab
+#set $standard_file = $output_basename + '.out'
+tail -n +4 $standard_file | tr -s ' ' '\t' > $output_std;
+
+## Delete all temporary files
+rm $dirname -r
+
+]]>
+    </command>
+    <inputs>
+        <param name="query" type="data" format="fasta" label="Nucleotide query sequence(s)"/>
+
+        <param name="nolow" type="boolean" label="No low complexity DNA" truevalue="-nolow" falsevalue="" checked="false" help="Does not mask low_complexity DNA or simple repeats."/>
+        <param name="noint" type="boolean" label="No interspersed repeats" truevalue="-noint" falsevalue="" checked="false" help="Only masks low complex/simple repeats (no interspersed repeats)."/>
+
+        <param name="norna" type="boolean" label="No small RNA genes" truevalue="-norna" falsevalue="" checked="false" help="Does not mask small RNA (pseudo) genes."/>
+
+        <!--
+            Specify the species or clade of the input sequence. The species name
+            must be a valid NCBI Taxonomy Database species name and be contained
+            in the RepeatMasker repeat database. The following collection is not complete.
+        -->
+        <param name="species" type="select" label="Species" help="The list is not complete, if you need other species contact your administrator.">
+            <option value="-species anopheles">anopheles</option>
+            <option value="-species arabidopsis">arabidopsis</option>
+            <option value="-species artiodactyl">artiodactyl</option>
+            <option value="-species aspergillus">aspergillus</option>
+            <option value="-species carnivore">carnivore</option>
+            <option value="-species cat">cat</option>
+            <option value="-species chicken">chicken</option>
+            <option value="-species 'ciona intestinalis'">ciona intestinalis</option>
+            <option value="-species 'ciona savignyi'">ciona savignyi</option>
+            <option value="-species cow">cow</option>
+            <option value="-species danio">danio</option>
+            <option value="-species diatoaea">diatoaea</option>
+            <option value="-species dog">dog</option>
+            <option value="-species drosophila">drosophila</option>
+            <option value="-species elegans">elegans</option>
+            <option value="-species fugu">fugu</option>
+            <option value="-species fungi" selected="true">fungi</option>
+            <option value="-species human">human</option>
+            <option value="-species maize">maize</option>
+            <option value="-species mammal">mammal</option>
+            <option value="-species mouse">mouse</option>
+            <option value="-species pig">pig</option>
+            <option value="-species rat">rat</option>
+            <option value="-species rice">rice</option>
+            <option value="-species rodentia">rodentia</option>
+            <option value="-species ruminantia">ruminantia</option>
+            <option value="-species wheat">wheat</option>
+        </param>
+
+        <conditional name="adv_opts">
+            <param name="adv_opts_selector" type="select" label="Advanced Options">
+              <option value="basic" selected="True">Hide Advanced Options</option>
+              <option value="advanced">Show Advanced Options</option>
+            </param>
+            <when value="basic" />
+            <when value="advanced">
+              <param name="is_only" type="boolean" label="Mask only E coli insertion elements" truevalue="-is_only" falsevalue="" checked="false" help="Only clips E coli insertion elements out of fasta and .qual files."/>
+              <param name="slow_search" type="boolean" label="Slow search" truevalue="-s" falsevalue="" checked="false" help="0-5% more sensitive, 2-3 times slower than default."/>
+              <param name="quick_search" type="boolean" label="Quick search" truevalue="-q" falsevalue="" checked="false" help="5-10% less sensitive, 2-5 times faster than default."/>
+              <param name="rush_search" type="boolean" label="Rush search" truevalue="-qq" falsevalue="" checked="false" help="about 10% less sensitive, 4->10 times faster than default."/>
+              <param name="only_alus" type="boolean" label="Only Alus" truevalue="-alu" falsevalue="" checked="false" help="Only masks Alus (and 7SLRNA, SVA and LTR5)(only for primate DNA)."/>
+              <param name="gccalc" type="boolean" label="Use GC depended matrices, automaticly" truevalue="-gccalc" falsevalue="" checked="true" help="RepeatMasker calculates the GC content even for batch files/small seqs"/>
+
+              <param name="output_files" type="select" multiple="true" label="Additional output files">
+                  <option selected="true" value="summary">Summary file</option>
+                  <option value="gff">GFF file</option>
+                  <option value="html">HTML file</option>
+                  <option value="mask">Mask FastA file</option>
+              </param>
+
+              <param name="gc" type="integer" value="0" label="Use GC depended matrices" help="Use matrices calculated for 'number' percentage background GC level">
+                    <validator type="in_range" min="0" />
+                    <validator type="in_range" max="100" />
+              </param>
+
+            </when>
+        </conditional>
+
+    </inputs>
+    <outputs>
+        <data name="output_std" format="tabular" label="${tool.name} on ${on_string}: Standard" />
+        <data name="output_mask" format="fasta" label="${tool.name} on ${on_string}: Mask sequence">
+            <filter>
+                    (adv_opts['adv_opts_selector'] == 'advanced' and 'mask' in adv_opts['output_files'])
+            </filter>
+        </data>
+        <data name="output_summary" format="txt" label="${tool.name} on ${on_string}: Summary">
+            <filter>(
+                    (adv_opts['adv_opts_selector'] == 'advanced' and 'summary' in adv_opts['output_files'])
+                    or
+                    (adv_opts['adv_opts_selector'] == 'basic')
+                    )
+            </filter>
+        </data>
+        <data name="output_html" format="html" label="${tool.name} on ${on_string}: HTML">
+            <filter>(adv_opts['adv_opts_selector'] == 'advanced' and 'html' in adv_opts['output_files'])</filter>
+        </data>
+        <data name="output_gff" format="gff" label="${tool.name} on ${on_string}: GFF">
+            <filter>
+                    (adv_opts['adv_opts_selector'] == 'advanced' and 'gff' in adv_opts['output_files'])
+            </filter>
+        </data>
+    </outputs>
+    <help>
+<![CDATA[
+
+.. class:: warningmark
+
+**What it does**
+
+RepeatMasker is a program that screens DNA sequences for *interspersed repeats*
+and *low complexity* DNA sequences. The output of the program is a detailed
+annotation of the repeats that are present in the query sequence as well as a
+modified version of the query sequence in which all the annotated repeats have
+been masked (default: replaced by Ns).
+
+-----
+
+**How to read the results**
+
+
+
+The annotation file contains the cross_match output lines. It lists all best matches
+(above a set minimum score) between the query sequence and any of the sequences in
+the repeat database or with low complexity DNA. The term "best matches" reflects
+that a match is not shown if its domain is over 80% contained within the domain
+of a higher scoring match, where the "domain" of a match is the region in
+the query sequence that is defined by the alignment start and stop. These domains
+have been masked in the returned masked sequence file. In the output, matches are
+ordered by query name, and for each query by position of the start of the alignment.
+
+Example:
+
+======== ========= ========= ========= ========== =========== ========= ========= ============ =============== =================== ================ ============== ======= ==
+SW score perc div. perc del. perc ins. query seq. q-pos begin q-pos end (left)    w complement matching repeat repeat class/family repeat-pos begin repeat-pos end (left)  ID
+======== ========= ========= ========= ========== =========== ========= ========= ============ =============== =================== ================ ============== ======= ==
+    1306 15.6      6.2       0.0       HSU08988   6563        6781      \(22462)  C            MER7A           DNA/MER2_type       336              103            \(0)    1
+   12204 10.0      2.4       1.8       HSU08988   6782        7714      \(21529)  C            TIGGER1         DNA/MER2_type       2418             1493           \(0)    2
+     279  3.0      0.0       0.0       HSU08988   7719        7751      \(21492)  +            (TTTTA)n        Simple_repeat       1                33             \(0)    3
+    1765 13.4      6.5       1.8       HSU08988   7752        8022      \(21221)  C            AluSx           SINE/Alu            289              1              \(23)   4
+   12204 10.0      2.4       1.8       HSU08988   8023        8694      \(20549)  C            TIGGER1         DNA/MER2_type       1493             827            \(925)  5
+    1984 11.1      0.3       0.7       HSU08988   8695        9000      \(20243)  C            AluSg           SINE/Alu            305              1              \(5)    6
+   12204 10.0      2.4       1.8       HSU08988   9001        9695      \(19548)  C            TIGGER1         DNA/MER2_type       827              2              \(1591) 7
+     711 21.2      1.4       0.0       HSU08988   9696        9816      \(19427)  C            MER7A           DNA/MER2_type       122              2              \(224)  8
+======== ========= ========= ========= ========== =========== ========= ========= ============ =============== =================== ================ ============== ======= ==
+
+This is a sequence in which a Tigger1 DNA transposon has integrated into a MER7 DNA transposon copy.
+Subsequently two Alus integrated in the Tigger1 sequence. The simple repeat is derived from the
+poly A of the Alu element. The first line is interpreted like this:
+
+:Table description:
+
+1. **1306** = Smith-Waterman score of the match, usually complexity adjusted
+        The SW scores are not always directly comparable. Sometimes
+        the complexity adjustment has been turned off, and a variety of
+        scoring-matrices are used.
+
+#. **15.6** = % substitutions in matching region compared to the consensus
+#. **6.2** = % of bases opposite a gap in the query sequence (deleted bp)
+#. **0.0** = % of bases opposite a gap in the repeat consensus (inserted bp)
+#. **HSU08988** = name of query sequence
+#. **6563** = starting position of match in query sequence
+#. **7714** = ending position of match in query sequence
+#. **(22462)** = no. of bases in query sequence past the ending position of match
+#. **C**       = match is with the Complement of the consensus sequence in the database
+#. **MER7A**   = name of the matching interspersed repeat
+#. **DNA/MER2_type** = the class of the repeat, in this case a DNA transposon fossil of the MER2 group (see below for list and references)
+#. **2418**    = starting position of match in database sequence (using top-strand numbering)
+#. **1465**    = ending position of match in database sequence
+#. **(0)**     = no. of bases in (complement of) the repeat consensus sequence prior to beginning of the match (so 0 means that the match extended all the way to the end of the repeat consensus sequence)
+#. **1**    = Identifier
+
+An asterisk (\*) in the final column (no example shown) indicates that there is
+a higher-scoring match whose domain partly (<80%) includes the domain of this match.
+
+Note that the SW score and divergence numbers for the three Tigger1 lines are identical.
+This is because the information is derived from a single alignment (the Alus were deleted
+from the query before the alignment with the Tigger element was performed).
+The program makes educated guesses about many fragments if they are derived from
+the same element (e.g. it knows that the MER7A fragments represent one insert).
+In a next version I can identify each element with a unique ID, if interest exists
+(this could help to represent repeats cleaner in graphic displays).
+
+
+-------
+
+**License and citation**
+
+This tool uses `RepeatMasker`_, `Tandem Repeats Finder`_ and `Repbase Update`_, which are licensed separately. Please cite:
+
+- |Smit1996|
+- |Benson1999|_
+- |Jurka2000|_.
+
+.. _RepeatMasker: http://www.repeatmasker.org/
+.. _Tandem Repeats Finder: http://tandem.bu.edu/trf/trf.html
+.. _Repbase Update: http://www.girinst.org/repbase/
+.. |Smit1996| replace:: Smit, A.F.A., Hubley, R., Green, P. (1996-2010) RepeatMasker Open-4.0
+.. |Benson1999| replace:: Benson, G. (1999) Tandem repeats finder: a program to analyze DNA sequences. *Nucleic Acids Res.* 27(2), 573-580
+.. _Benson1999: http://nar.oxfordjournals.org/content/27/2/573
+.. |Jurka2000| replace:: Jurka, J. (2000) Repbase Update: a database and an electronic journal of repetitive elements. *Trends Genet.* 16(9), 418-420
+.. _Jurka2000: http://www.cell.com/trends/genetics/abstract/S0168-9525(00)02093-X
+]]>
+    </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/readme.rst	Sun Apr 29 07:03:52 2018 -0400
@@ -0,0 +1,63 @@
+===============================
+Galaxy wrapper for RepeatMasker
+===============================
+
+This wrapper is copyright 2013 by Björn Grüning.
+
+This is a wrapper for the command line tool of RepeatMasker from the Institute for Systems Biology.
+http://www.repeatmasker.org/
+
+
+Smit, AFA, Hubley, R & Green, P. RepeatMasker Open-3.0.
+1996-2010 <http://www.repeatmasker.org>.
+
+
+Additional Information:
+Using RepeatMasker to identify repetitive elements in genomic sequences.
+http://www.ncbi.nlm.nih.gov/pubmed/19274634
+
+============
+Installation
+============
+
+To install RepeatMasker, please use the following instructions:
+
+http://www.repeatmasker.org/RMDownload.html
+
+To install the wrapper copy the file RepeatMasker.xml in the galaxy tools
+folder and modify the tools_conf.xml file to make the tool available to Galaxy.
+Add a line like the following:
+
+Add the tool definition to your tool_conf.xml file under Galaxy root.
+	<tool file="RepeatMasker/RepeatMasker.xml" />
+
+=======
+History
+=======
+
+- v1.1: Initial public release
+- v0.1.1: patch from Simon Guest, to create empty files if no repeat is found
+- v0.1.2: remove trailing semicolon, redirect all output to stdout
+
+===============================
+Wrapper Licence (MIT/BSD style)
+===============================
+
+Permission to use, copy, modify, and distribute this software and its
+documentation with or without modifications and for any purpose and
+without fee is hereby granted, provided that any copyright notices
+appear in all copies and that both those copyright notices and this
+permission notice appear in supporting documentation, and that the
+names of the contributors or copyright holders not be used in
+advertising or publicity pertaining to distribution of the software
+without specific prior permission.
+
+THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
+WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE
+CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT
+OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
+OR PERFORMANCE OF THIS SOFTWARE.
+