changeset 2:d05014300627 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/repeat_masker commit cbba6947f1380751e1db3fa5b043af630523fd86
author iuc
date Fri, 04 May 2018 07:59:35 -0400
parents 26c5f217aad7
children 2f097bbdd0b4
files repeatmasker.xml test-data/fake_repbase.embl test-data/small.fasta.stats test-data/small_repbase.fasta.log test-data/small_repbase.fasta.stats
diffstat 5 files changed, 135 insertions(+), 30 deletions(-) [+]
line wrap: on
line diff
--- a/repeatmasker.xml	Wed May 02 20:17:55 2018 -0400
+++ b/repeatmasker.xml	Fri May 04 07:59:35 2018 -0400
@@ -1,4 +1,4 @@
-<tool id="repeatmasker_wrapper" name="RepeatMasker" version="4.0.7" profile="17.01">
+<tool id="repeatmasker_wrapper" name="RepeatMasker" version="4.0.7+galaxy1" profile="17.01">
   <description>RepeatMasker</description>
 
   <requirements>
@@ -11,7 +11,7 @@
     export REPEATMASKER_LIB_DIR=\$(pwd)/lib &&
       for file in \$(ls \$RM_LIB_PATH) ; do  ln -s \$RM_LIB_PATH/\$file lib/\$file ; done &&
     #if $repeat_source.source_type == "repbase":
-      cp '${repeat_source.repbase_file}' lib/RMRBSeqs.embl &&
+      cp '${repeat_source.repbase_file}' 'lib/${repeat_source.repbase_file_name}' &&
     #end if
     ln -s '${input_fasta}' rm_input.fasta &&
     RepeatMasker -dir \$(pwd)
@@ -26,31 +26,31 @@
       #end if
     #end if
     -parallel \${GALAXY_SLOTS:-1}
-    '${gff}'
-    '${ignore_n_stretches}'
-    '${advanced.is_only}'
-    '${advanced.is_clip}'
-    '${advanced.no_is}'
-    '${advanced.rodspec}'
-    '${advanced.primspec}'
-    '${advanced.nolow}'
-    '${advanced.noint}'
-    '${advanced.norna}'
-    '${advanced.alu}'
-    '${advanced.div}'
-    '${advanced.search_speed}'
-    '${advanced.frag}'
-    '${advanced.maxsize}'
-    #if $advanced.gc is not None:
-      '${advanced.gc}'
+    ${gff}
+    ${excln}
+    ${advanced.is_only}
+    ${advanced.is_clip}
+    ${advanced.no_is}
+    ${advanced.rodspec}
+    ${advanced.primspec}
+    ${advanced.nolow}
+    ${advanced.noint}
+    ${advanced.norna}
+    ${advanced.alu}
+    ${advanced.div}
+    ${advanced.search_speed}
+    -frag ${advanced.frag}
+    ## -maxsize ${advanced.maxsize}
+    #if str($advanced.gc):
+      -gc ${advanced.gc}
     #end if
-    '${advanced.gccalc}'
-    '${advanced.nocut}'
-    '${advanced.keep_alignments}'
-    '${advanced.invert_alignments}'
-    '${advanced.xout}'
-    '${advanced.xsmall}'
-    '${advanced.poly}'
+    ${advanced.gccalc}
+    ${advanced.nocut}
+    ${advanced.keep_alignments}
+    ${advanced.invert_alignments}
+    ${advanced.xout}
+    ${advanced.xsmall}
+    ${advanced.poly}
     rm_input.fasta &&
     #if $advanced.is_only != '-is_only':
       mv rm_input.fasta.masked '${output_masked_genome}' &&
@@ -79,6 +79,7 @@
       </param>
       <when value="repbase">
         <param name="repbase_file" type="data" format="embl" label="RepBase (RMRBSeqs.embl) file" />
+        <param name="repbase_file_name" type="hidden" value="RMRBSeqs.embl"/> <!-- This is an ugly hack to allow testing with a fake repbase -->
         <conditional name="species_source">
           <param label="Select species name from a list?" name="species_from_list" type="select">
             <option value="yes" selected="true">Yes</option>
@@ -126,7 +127,7 @@
       </when>
     </conditional>
     <param type="boolean" argument="-gff" truevalue="-gff" falsevalue="" label="Output annotation of repeats in GFF format" checked="false" />
-    <param name="ignore_n_stretches" type="boolean" argument="-excln" falsevalue="" label="Ignore stretches of Ns when computing statistics" checked="true" help="Scaffolds are sometimes joined with stretches of 25 or more Ns. This option ignores them when calculating repeat statistics" />
+    <param argument="-excln" type="boolean" truevalue="-excln" falsevalue="" label="Ignore stretches of Ns when computing statistics" checked="true" help="Scaffolds are sometimes joined with stretches of 25 or more Ns. This option ignores them when calculating repeat statistics" />
     <section name="advanced" title="Advanced options" expanded="false">
       <param argument="-is_only" type="boolean" truevalue="-is_only" falsevalue="" checked="false" label="Only clip E coli insertion elements" />
       <param argument="-is_clip" type="boolean" truevalue="-is_clip" falsevalue="" checked="false" label="Clip IS elements before analysis" help="Normally RepeatMasker will report on IS element, with this option selected it will clip them before analysis" />
@@ -145,8 +146,9 @@
         <option value="-s">Slow (0-5% more sensitive, 2.5 times slowdown)</option>
       </param>
       <param type="integer" argument="-frag" value="40000" label="Maximum contiguous sequence searched" help="Maximum length of sequence that is searched without fragmenting" /> <!-- emitted on the command line as "-frag VALUE" -->
-      <param type="integer" argument="-maxsize" value="4000000" label="Maximum length for IS or repeat clipped sequences" />
-      <param type="integer" argument="-gc" optional="True" label="Select matrices for this GC%" help="Valid values are a percentage or -1 to choose the default" />
+      <!-- The -maxsize option is listed in RepeatMasker's help text but is not implemented in its code, so the parameter below is commented out -->
+      <!--param type="integer" argument="-maxsize" value="4000000" label="Maximum length for IS or repeat clipped sequences" /-->
+      <param type="integer" argument="-gc" optional="true" label="Select matrices for this GC%" help="Valid values are a percentage or -1 to choose the default" />
       <param type="boolean" argument="-gccalc" truevalue="-gccalc" falsevalue="" checked="false" label="Calculate GC % for all sequences" help="By default RepeatMasker skips calculating GC % for small sequences" /> <!-- truevalue is inserted verbatim into the command line, so it must spell the -gccalc flag exactly -->
       <param type="boolean" argument="-nocut" truevalue="-nocut" falsevalue="" checked="false" label="Skips cutting of repeats" />
       <param name="xout" type="boolean" argument="-x" truevalue="-x" falsevalue="" checked="false" label="Mask with X instead of N characters" />
@@ -203,6 +205,17 @@
       <output name="output_polymorphic" file="small.fasta.poly" />
       <output name="output_gff" file="small.fasta.gff" lines_diff="4" />
     </test>
+    <test expect_num_outputs="4"> <!-- Exercises the "repbase" repeat source using a small stand-in EMBL library -->
+      <param name="input_fasta" value="small.fasta" ftype="fasta" />
+      <param name="source_type" value="repbase" />
+      <param name="repbase_file" value="fake_repbase.embl" />
+      <param name="repbase_file_name" value="fake.embl" /> <!-- overrides the hidden default (RMRBSeqs.embl), so the library is copied to lib/fake.embl -->
+      <param name="species_list" value="anopheles" />
+      <output name="output_masked_genome" file="small.fasta.masked" />
+      <output name="output_table" file="small_repbase.fasta.stats" lines_diff="2" />
+      <output name="output_repeat_catalog" file="small.fasta.cat" />
+      <output name="output_log" file="small_repbase.fasta.log" />
+    </test>
   </tests>
   <help><![CDATA[
 RepeatMasker is a program that screens DNA for interspersed repeats and low
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fake_repbase.embl	Fri May 04 07:59:35 2018 -0400
@@ -0,0 +1,21 @@
+CC Fake repbase-like embl file, using data from DfamConsensus.embl*
+CC ****************************************************************
+XX
+ID   ACROBAT1     repeatmasker; DNA;  ???;  768 BP.
+CC   consensus - See RepBase for additional annotations.
+XX
+SQ   Sequence 768 BP; 178 A; 194 C; 215 G; 181 T; 0 other;
+     ggtgatgctg ccaacttact gatttagtgt atgatggtgt ttttgaggtg ctccagtggc   60
+     ttctgtttct atcagctgtc cctcctgttc agctactgac ggggtggtgc gtaacggcaa   120
+     aagcaccgcc ggacatcagc gctatctctg ctctcactgc cgtaaaacat ggcaactgca   180
+     gttcacttac accgcttctc aacccggtac gcaccagaaa atcattgata tggccatgaa   240
+     tggcgttgga tgccgggcaa cagcccgcat tatgggcgtt ggcctcaaca cgattttacg   300
+     tcacttaaaa aactcaggcc gcagtcggta acctcgcgca tacagccggg cagtgacgtc   360
+     atcgtctgcg cggaaatgga cgaacagtgg ggctatgtcg gggctaaatc gcgccagcgc   420
+     tggctgtttt acgcgtatga cagtctccgg aagacggttg ttgcgcacgt attcggtgaa   480
+     cgcactatgg cgacgctggg gcgtcttatg agcctgctgt caccctttga cgtggtgata   540
+     tggatgacgg atggctggcc gctgtatgaa tcccgcctga agggaaagct gcacgtaatc   600
+     agcaagcgat atacgcagcg aattgagcgg cataacctga atctgaggca gcacctggca   660
+     cggctgggac ggaagtcgct gtcgttctca aaatcggtgg agctgcatga caaagtcatc   720
+     gggcattatc tgaacataaa acactatcaa taagttggag tcattacc                768
+//
--- a/test-data/small.fasta.stats	Wed May 02 20:17:55 2018 -0400
+++ b/test-data/small.fasta.stats	Fri May 04 07:59:35 2018 -0400
@@ -41,7 +41,8 @@
 
 * most repeats fragmented by insertions or deletions
   have been counted as one element
-                                                      
+  Runs of >=20 X/Ns in query were excluded in % calcs
+
 
 The query species was assumed to be homo          
 RepeatMasker Combined Database: Dfam_Consensus-20170127
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/small_repbase.fasta.log	Fri May 04 07:59:35 2018 -0400
@@ -0,0 +1,10 @@
+SW score	% div.	% del.	% ins.	query sequence	pos in  query: begin	end	(left)	repeat	class/family	pos in repeat: begin	end	(left)	ID
+
+18	0.0	0.0	0.0	scaffold_1	613	632	(13588)	(GT)n	Simple_repeat	1	20	(0)	1
+16	18.3	2.2	2.2	scaffold_1	780	824	(13396)	(ATAATA)n	Simple_repeat	1	45	(0)	2
+12	23.9	4.5	0.0	scaffold_1	2231	2274	(11946)	(CAGA)n	Simple_repeat	1	46	(0)	3
+15	18.4	10.2	0.0	scaffold_1	4853	4901	(9319)	(TC)n	Simple_repeat	1	54	(0)	4
+13	19.1	1.8	7.7	scaffold_1	6230	6284	(7936)	(TAATTAA)n	Simple_repeat	1	52	(0)	5
+15	28.3	0.0	3.5	scaffold_1	6548	6606	(7614)	(GACA)n	Simple_repeat	1	57	(0)	6
+67	2.9	1.4	0.0	scaffold_1	11981	12050	(2170)	(CT)n	Simple_repeat	1	71	(0)	7
+19	15.4	2.8	0.0	scaffold_1	12078	12113	(2107)	(CT)n	Simple_repeat	1	37	(0)	7
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/small_repbase.fasta.stats	Fri May 04 07:59:35 2018 -0400
@@ -0,0 +1,60 @@
+==================================================
+file name: rm_input.fasta           
+sequences:             1
+total length:      14220 bp  (14220 bp excl N/X-runs) 
+GC level:         39.94 %
+bases masked:        378 bp ( 2.66 %)
+==================================================
+               number of      length   percentage
+               elements*    occupied  of sequence
+--------------------------------------------------
+Retroelements            0            0 bp    0.00 %
+   SINEs:                0            0 bp    0.00 %
+   Penelope              0            0 bp    0.00 %
+   LINEs:                0            0 bp    0.00 %
+    CRE/SLACS            0            0 bp    0.00 %
+     L2/CR1/Rex          0            0 bp    0.00 %
+     R1/LOA/Jockey       0            0 bp    0.00 %
+     R2/R4/NeSL          0            0 bp    0.00 %
+     RTE/Bov-B           0            0 bp    0.00 %
+     L1/CIN4             0            0 bp    0.00 %
+   LTR elements:         0            0 bp    0.00 %
+     BEL/Pao             0            0 bp    0.00 %
+     Ty1/Copia           0            0 bp    0.00 %
+     Gypsy/DIRS1         0            0 bp    0.00 %
+       Retroviral        0            0 bp    0.00 %
+
+DNA transposons          0            0 bp    0.00 %
+   hobo-Activator        0            0 bp    0.00 %
+   Tc1-IS630-Pogo        0            0 bp    0.00 %
+   En-Spm                0            0 bp    0.00 %
+   MuDR-IS905            0            0 bp    0.00 %
+   PiggyBac              0            0 bp    0.00 %
+   Tourist/Harbinger     0            0 bp    0.00 %
+   Other (Mirage,        0            0 bp    0.00 %
+    P-element, Transib)
+
+Rolling-circles          0            0 bp    0.00 %
+
+Unclassified:            0            0 bp    0.00 %
+
+Total interspersed repeats:           0 bp    0.00 %
+
+
+Small RNA:               0            0 bp    0.00 %
+
+Satellites:              0            0 bp    0.00 %
+Simple repeats:          7          378 bp    2.66 %
+Low complexity:          0            0 bp    0.00 %
+==================================================
+
+* most repeats fragmented by insertions or deletions
+  have been counted as one element
+  Runs of >=20 X/Ns in query were excluded in % calcs
+
+
+The query species was assumed to be anopheles genus
+RepeatMasker Combined Database: Dfam_Consensus-20170127
+                          
+run with rmblastn version 2.2.27+
+