Mercurial > repos > artbio > small_rna_maps

diff small_rna_maps.xml @ 1:615fa2171a34 draft
"planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_maps commit c3d728b98db4987821feae40952d9797c97eaf5a"
author: artbio
date: Fri, 04 Oct 2019 04:33:08 -0400
parents: 0a06985c0894
children: 59d93aa7cc20
--- a/small_rna_maps.xml	Tue Aug 22 12:06:58 2017 -0400
+++ b/small_rna_maps.xml	Fri Oct 04 04:33:08 2019 -0400
@@ -1,99 +1,368 @@
-<tool id="small_rna_maps" name="small_rna_maps" version="1.0.1">
+<tool id="small_rna_maps" name="small_rna_maps" version="2.15.0">
   <description></description>
   <requirements>
         <requirement type="package" version="1.11.2=py27_0">numpy</requirement>
-        <requirement type="package" version="0.11.2.1=py27_0">pysam</requirement>
-        <requirement type="package" version="1.3.2=r3.3.1_0">r-optparse</requirement>
-        <requirement type="package" version="0.6_28=r3.3.1_0">r-latticeextra</requirement>
-        <requirement type="package" version="2.2.1=r3.3.1_0">r-gridextra</requirement>
+        <requirement type="package" version="0.15.3=py27hda2845c_1">pysam</requirement>
+        <requirement type="package" version="1.6.4=r36h6115d3f_0">r-optparse</requirement>
+        <requirement type="package" version="0.6_28=r36h6115d3f_1002">r-latticeextra</requirement>
+        <requirement type="package" version="2.3=r36h6115d3f_1002">r-gridextra</requirement>
+        <requirement type="package" version="1.4.3=r36h29659fb_0">r-reshape2</requirement>
+        <requirement type="package" version="0.6.6">sambamba</requirement>
+        <requirement type="package" version="1.9=h10a08f8_12">samtools</requirement>
+        <requirement type="package" version="64.2=he1b5a44_1">icu</requirement>
   </requirements>
   <stdio>
       <exit_code range="1:" level="fatal" description="Tool exception" />
   </stdio>
   <command detect_errors="exit_code"><![CDATA[
-      #for $file in $inputs
-          samtools index '$file' &&
-      #end for
-      python '$__tool_directory__'/small_rna_maps.py 
-          --inputs 
-      #for $file in $inputs
-           '$file'
-      #end for
-          --sample_names
-      #for $sample in $inputs
-          '$sample.name'
-      #end for
-          --plot_methods Counts '$extra_plot'
-          --outputs '$output_tab' '$extra_output_tab' &&
-      Rscript '$__tool_directory__'/small_rna_maps.r
-          --first_dataframe '$output_tab' 
-          --extra_dataframe '$extra_output_tab'
-          --extra_plot_method '$extra_plot'
-          --output_pdf '$output_pdf'
+          #import json
+          #import os
+          #for $file in $inputs
+              sambamba view -t \${GALAXY_SLOTS} -F "not unmapped and sequence_length >= ${minsize} and sequence_length <= ${maxsize}" -f bam '$file' -o '$file.element_identifier' &&
+              samtools index '$file.element_identifier' &&
+          #end for
+          python '$__tool_directory__'/small_rna_maps.py
+              --inputs ${ ' '.join(['"%s"' % x.element_identifier for x in $inputs]) }
+              #set $labels = list()
+              #for $file in $inputs:
+                  $labels.append(str($file.element_identifier))
+              #end for
+              --sample_names ${ ' '.join(['"%s"' % x for x in $labels]) }
+              --minsize $minsize
+              --maxsize $maxsize
+              #if str($plots_options.plots_options_selector ) == "two_plot":
+                  --plot_methods '${plots_options.first_plot}' '${plots_options.extra_plot}'
+                  --outputs '$output_tab' '$extra_output_tab' &&
+              #elif str($plots_options.plots_options_selector ) == "global":
+                  --plot_methods 'Size'
+                  --outputs '$output_tab' &&
+              #elif str($plots_options.plots_options_selector ) == "cluster":
+                  --plot_methods 'Counts'
+                  --outputs '$output_tab'
+                  --cluster ${plots_options.cluster}
+                  --bed '$output_bed'
+                  --bed_skipsize ${plots_options.skip_size}
+                  --bed_skipcounts ${plots_options.skip_counts}
+                  --bed_skipdensity ${plots_options.skip_density}
+                  ${plots_options.strandness} &&
+              #else:
+                  --plot_methods '${plots_options.first_plot}'
+                  --outputs '$output_tab' &&
+              #end if
+              
+
+          Rscript '$__tool_directory__'/small_rna_maps.r
+              --first_dataframe '$output_tab'
+              --extra_dataframe '$extra_output_tab'
+              #if len(str($normalization)) != 1:
+              --normalization "${ ' '.join( [factor for factor in $normalization.split()]) }"
+              #else:
+              --normalization "${ ' '.join( ["1" for factor in $inputs] )}"
+              #end if
+              #if $ylimits_cond.ylimits == "no":
+                  --ymin '' --ymax ''
+              #else:
+                  --ymin '${ylimits_cond.ymin}' --ymax '${ylimits_cond.ymax}'
+              #end if
+              #if str($plots_options.plots_options_selector ) == "two_plot":
+                  --first_plot_method '${plots_options.first_plot}'
+                  --extra_plot_method '${plots_options.extra_plot}'
+              #elif str($plots_options.plots_options_selector ) == "global":
+                  --first_plot_method 'Size'
+                  --extra_plot_method ''
+                  --global '${plots_options.mergestrands}'
+              #else:
+                  --first_plot_method '${plots_options.first_plot}'
+                  --extra_plot_method ''
+              #end if
+              --output_pdf '$output_pdf'
   ]]></command>
  <inputs>
-   <param name="inputs" type="data" format="bam" label="Select multiple alignments to parse" multiple="True"/> 
-   <param name="extra_plot" type="select" label="select the type of extra plot in addition to read map">
-       <option value="Coverage">Coverage</option> 
-       <option value="Mean">Mean Sizes</option> 
-       <option value="Median">Median Sizes</option>
-       <option value="Size">Size Distributions</option>
-    </param>
+    <param name="inputs" type="data" format="bam" label="Select a alignment files to parse" multiple="true"
+           help="maps from these bam inputs will be collected in a single pdf output" />
+    <param name="normalization" type="text" label="Enter a size/normalization factor."
+           help="Enter normalisation factors separated by space eg [0.75 1.23 1.1], no normalization if no values,
+                 ignored if a single sample"
+           value="1"/>
+    <param name="minsize" type="integer" label="Minimal size of reads for inclusion in analysis"
+           value="19" help="default value: 19" />
+    <param name="maxsize" type="integer" label="Maximal size of reads for inclusion in analysis"
+           value="29" help="default value: 29" />
+    <conditional name="plots_options">
+        <param name="plots_options_selector" type="select" display="radio" label="Plot Options">
+            <option value="one_plot">Just one plot per chromosome</option>
+            <option value="two_plot" selected="True">Two plots per chromosome</option>
+            <option value="global">Global read size distributions of aligned reads</option>
+            <option value="cluster">Map read clusters</option>
+        </param>
+        <when value="two_plot">
+            <param name="first_plot" type="select" display="radio" label="Select the type of the top plot">
+                <option value="Counts">Counts</option>
+                <option value="Coverage">Coverage</option>
+                <option value="Mean">Mean Sizes</option>
+                <option value="Median">Median Sizes</option>
+                <option value="Size">Size Distributions</option>
+            </param>
+            <param name="extra_plot" type="select" display="radio" label="Select the type of the bottom plot">
+                <option value="Counts">Counts</option>
+                <option value="Coverage">Coverage</option>
+                <option value="Mean">Mean Sizes</option>
+                <option value="Median">Median Sizes</option>
+                <option value="Size">Size Distributions</option>
+            </param>
+        </when>
+        <when value="one_plot">
+            <param name="first_plot" type="select" display="radio" label="select the type of plot">
+                <option value="Counts">Counts</option>
+                <option value="Coverage">Coverage</option>
+                <option value="Mean">Mean Sizes</option>
+                <option value="Median">Median Sizes</option>
+                <option value="Size">Size Distributions</option>
+            </param>
+        </when>
+        <when value="global">
+            <param name="first_plot" type="hidden" value="Size"/>
+            <param name="mergestrands" type="select" display="radio" label="Whether forward and reverse aligned reads should be merged or not in the histogram">
+                <option value="nomerge">Do not merge</option>
+                <option value="merge">Merge forward and reverse reads</option>
+            </param>
+        </when>
+        <when value="cluster">
+            <param name="first_plot" type="hidden" value="Counts"/>
+            <param name="cluster" type="integer" label="Clustering distance in nucleotides" value="1"
+                   help="Sets the distance (in nt) below which reads are clustered to a single median position" />
+            <param name="strandness" argument="--nostrand" type="boolean" truevalue="--nostrand" falsevalue="" checked="false"
+                   label="Ignore polarity of reads ?" help="Set if you wish to cluster reads regardless of whether they are forward or reverse"/>
+            <param name="skip_size" type="integer" label="do not report clusters whose size is less than the specified value" value="1"
+                   help="Cluster size threshod (in nucleotides) for reporting. Set to 1 (default) reports all clusters, including singlets" />
+            <param name="skip_counts" type="integer" label="do not report cluster with a number of reads lower than the specified value" value="1"
+                   help="Number-of-reads threshod (in nucleotides) for cluster reporting. Set to 1 (default) reports all clusters, irrespective of their counts" />
+             <param name="skip_density" type="float" label="do not report cluster with density equal or less than the specified value" value="0"
+                   help="Density threshod (in reads per nucleotides) for reporting. Set to 0 (default) reports all cluster densities" />
+        </when>
+    </conditional>
+    <conditional name="ylimits_cond">
+        <param name="ylimits" type="boolean" truevalue="yes" falsevalue="no" checked="false" label="Do you wish to set an y axis limit to the plots?"
+               help="This limit won't be applied to size distribution plots"/>
+        <when value="yes">
+            <param name="ymin" type="float" label="Enter minimum value" value="0.0" help="e.g. '-5.0'"/>
+            <param name="ymax" type="float" label="Enter maximum value" value="0.0" help="e.g. '5.0'"/>
+        </when>
+        <when value="no">
+        </when>
+    </conditional>
  </inputs>
 
  <outputs>
-   <data format="tabular" name="output_tab" label="Read Count dataframe" />
-   <data format="tabular" name="extra_output_tab" label="$extra_plot dataframe" />
-   <data format="pdf" name="output_pdf" label="PDF file" />
-
+    <data format="tabular" name="output_tab" label="$plots_options.first_plot dataframe" />
+    <data format="bed" name="output_bed" label="bed file for clusters" >
+        <filter>plots_options['plots_options_selector'] == 'cluster'</filter>
+    </data>
+    <data format="tabular" name="extra_output_tab" label="$plots_options.extra_plot dataframe">
+        <filter>plots_options['plots_options_selector'] == 'two_plot'</filter>
+    </data>
+    <data format="pdf" name="output_pdf" label="small RNA maps" />
 </outputs>
 
     <tests>
-        <test>
-            <param name="inputs" value="input1.bam,input2.bam" ftype="bam"/>
-            <param name="extra_plot" value="Mean" />
-            <output file="readmap.tab" name="output_tab" />
-            <output file="mean.tab" name="extra_output_tab" />
-            <output file="mean.pdf" name="output_pdf" />
+        <test> <!-- 0 -->
+            <param name="inputs" value="input1.bam,input_new2.bam" ftype="bam" />
+            <param name="normalization" value="1 2" />
+            <param name="plots_options_selector" value="one_plot" />
+            <param name="first_plot" value="Counts" />
+            <output file="input1_input2new_norm_1_2_counts.tab" name="output_tab" />
+            <output file="input1_input2new_norm_1_2_single_plot_counts.pdf" name="output_pdf" />
+        </test>
+        <test> <!-- 1 -->
+            <param name="inputs" value="input1.bam" ftype="bam" />
+            <param name="normalization" value="1.0" />
+            <param name="ylimits" value="yes" />
+            <param name="ymin" value="-5" />
+            <param name="ymax" value="5" />
+            <param name="plots_options_selector" value="one_plot" />
+            <param name="first_plot" value="Counts" />
+            <output file="input1_counts_yminneg5_5.tab" name="output_tab" />
+            <output file="input1_yminneg5_5_single_plot_counts.pdf" name="output_pdf" />
+        </test>
+        <test> <!-- 2 -->
+            <param name="inputs" value="input1.bam" ftype="bam" />
+            <param name="normalization" value="1.0" />
+            <param name="plots_options_selector" value="cluster" />
+            <param name="first_plot" value="Counts" />
+            <param name="cluster" value="5" />
+            <param name="skip_size" value="1" />
+            <param name="strandness" value="false" />
+            <output file="clustering.tab" name="output_tab" />
+            <output file="clustering.pdf" name="output_pdf" />
+            <output file="bed1.bed" name="output_bed" />
         </test>
-        <test>
-            <param name="inputs" value="input1.bam,input1.bam" ftype="bam"/>
-            <param name="extra_plot" value="Mean" />
-            <output file="doubled_readmap.tab" name="output_tab" />
-            <output file="double_mean.tab" name="extra_output_tab" />
-            <output file="doubled_mean.pdf" name="output_pdf" />
+        <test> <!-- 3 -->
+            <param name="inputs" value="input1.bam" ftype="bam" />
+            <param name="normalization" value="1.0" />
+            <param name="plots_options_selector" value="cluster" />
+            <param name="first_plot" value="Counts" />
+            <param name="cluster" value="5" />
+            <param name="skip_size" value="1" />
+            <param name="strandness" value="true" />
+            <output file="clustering_unstranded.tab" name="output_tab" />
+            <output file="clustering_unstranded.pdf" name="output_pdf" />
+            <output file="bed2.bed" name="output_bed" />
+        </test>
+        <test> <!-- 4 -->
+            <param name="inputs" value="input1.bam" ftype="bam" />
+            <param name="normalization" value="1.0" />
+            <param name="plots_options_selector" value="cluster" />
+            <param name="first_plot" value="Counts" />
+            <param name="cluster" value="5" />
+            <param name="skip_size" value="2" />
+            <param name="strandness" value="false" />
+            <output file="clustering.tab" name="output_tab" />
+            <output file="clustering.pdf" name="output_pdf" />
+            <output file="bed3.bed" name="output_bed" />
+        </test>
+        <test> <!-- 5 -->
+            <param name="inputs" value="input1.bam" ftype="bam" />
+            <param name="normalization" value="1.0" />
+            <param name="plots_options_selector" value="cluster" />
+            <param name="first_plot" value="Counts" />
+            <param name="cluster" value="5" />
+            <param name="skip_size" value="2" />
+            <param name="skip_counts" value="3" />
+            <param name="skip_density" value="1.0" />
+            <param name="strandness" value="false" />
+            <output file="clustering.tab" name="output_tab" />
+            <output file="clustering.pdf" name="output_pdf" />
+            <output file="bed4.bed" name="output_bed" />
         </test>
-        <test>
-            <param name="inputs" value="input1.bam,input2.bam" ftype="bam"/>
-            <param name="extra_plot" value="Median" />
-            <output file="readmap.tab" name="output_tab" />
-            <output file="median.tab" name="extra_output_tab" />
-            <output file="median.pdf" name="output_pdf" />
+        <test> <!-- 6 -->
+            <param name="inputs" value="input1.bam" ftype="bam" />
+            <param name="normalization" value="1.0" />
+            <param name="plots_options_selector" value="cluster" />
+            <param name="first_plot" value="Counts" />
+            <param name="cluster" value="5" />
+            <param name="skip_size" value="2" />
+            <param name="skip_counts" value="2" />
+            <param name="skip_density" value="0.4" />
+            <param name="strandness" value="true" />
+            <output file="clustering_unstranded.tab" name="output_tab" />
+            <output file="clustering_unstranded.pdf" name="output_pdf" />
+            <output file="bed5.bed" name="output_bed" />
+        </test>
+        <test> <!-- 7 -->
+            <param name="inputs" value="input1.bam" ftype="bam" />
+            <param name="normalization" value="1.0" />
+            <param name="plots_options_selector" value="one_plot" />
+            <param name="first_plot" value="Size" />
+            <output file="input1_min20_max30_size.tab" name="output_tab" />
+            <output file="input1_min20_max30_single_plot_size.pdf" name="output_pdf" />
+        </test>
+        <test> <!-- 8 -->
+            <param name="inputs" value="input1.bam" ftype="bam" />
+            <param name="normalization" value="1.0" />
+            <param name="plots_options_selector" value="one_plot" />
+            <param name="first_plot" value="Mean" />
+            <output file="input1_mean.tab" name="output_tab" />
+            <output file="input1__single_plot_mean.pdf" name="output_pdf" />
+        </test>
+        <test> <!-- 9 -->
+            <param name="inputs" value="input1.bam" ftype="bam" />
+            <param name="normalization" value="1.0" />
+            <param name="plots_options_selector" value="one_plot" />
+            <param name="first_plot" value="Median" />
+            <output file="input1_median.tab" name="output_tab" />
+            <output file="input1_single_plot_median.pdf" name="output_pdf" />
         </test>
-        <test>
-            <param name="inputs" value="input1.bam,input2.bam" ftype="bam"/>
-            <param name="extra_plot" value="Coverage" />
-            <output file="readmap.tab" name="output_tab" />
-            <output file="coverage.tab" name="extra_output_tab" />
-            <output file="coverage.pdf" name="output_pdf" />
+        <test> <!-- 10 -->
+            <param name="inputs" value="input1.bam,input2.bam" ftype="bam" />
+            <param name="normalization" value="1.0 2.0" />
+            <param name="plots_options_selector" value="one_plot" />
+            <param name="first_plot" value="Counts" />
+            <output file="input1_input2_norm_1_2_counts.tab" name="output_tab" />
+            <output file="input1_input2_norm_1_2_single_plot_counts.pdf" name="output_pdf" />
+        </test>
+        <test> <!-- 11 -->
+            <param name="inputs" value="input1.bam,input2.bam" ftype="bam" />
+            <param name="normalization" value="1.0 1.0" />
+            <param name="ylimits" value="yes" />
+            <param name="ymin" value="-5" />
+            <param name="ymax" value="5" />
+            <param name="plots_options_selector" value="two_plot" />
+            <param name="first_plot" value="Counts" />
+            <param name="extra_plot" value="Size" />
+            <output file="input1_input2_counts.tab" name="output_tab" />
+            <output file="input1_input2_size.tab" name="extra_output_tab" />
+            <output file="input1_input2_double_plot_counts_size_ylimneg5_5.pdf" name="output_pdf" />
         </test>
-        <test>
-            <param name="inputs" value="input1.bam,input2.bam" ftype="bam"/>
-            <param name="extra_plot" value="Size" />
-            <output file="readmap.tab" name="output_tab" />
-            <output file="size.tab" name="extra_output_tab" />
-            <output file="sizes.pdf" name="output_pdf" />
+        <test> <!-- 12 -->
+            <param name="inputs" value="input_single_chr.bam,input_single_chr.bam,input_single_chr.bam,input_single_chr.bam,input_single_chr.bam,input_single_chr.bam" ftype="bam" />
+            <param name="normalization" value="1.0 1.0 1.0 1.0 1.0 1.0" />
+            <param name="plots_options_selector" value="one_plot" />
+            <param name="first_plot" value="Coverage" />
+            <output file="input_single_chr_x_6_single_plot_coverage.tab" name="output_tab" />
+            <output file="input_single_chr_x_6_single_plot_coverage.pdf" name="output_pdf" />
+        </test>
+        <test> <!-- 13 -->
+            <param name="inputs" value="input1.bam,input2.bam" ftype="bam" />
+            <param name="normalization" value="1.0 1.0" />
+            <param name="plots_options_selector" value="global" />
+            <param name="mergestrands" value="nomerge" />
+            <param name="first_plot" value="Size" />
+            <output file="size.tab" name="output_tab" />
+            <output file="global_nomerge.pdf" name="output_pdf" />
+        </test>
+        <test> <!-- 14 -->
+            <param name="inputs" value="input1.bam,input2.bam" ftype="bam" />
+            <param name="normalization" value="1.0 1.0" />
+            <param name="plots_options_selector" value="global" />
+            <param name="mergestrands" value="merge" />
+            <param name="first_plot" value="Size" />
+            <output file="size.tab" name="output_tab" />
+            <output file="global_merge.pdf" name="output_pdf" />
         </test>
     </tests>
-
-
 <help>
 
 **What it does**
 
-Generate read count maps from alignment BAM files, using pysam and lattice.
+Plots mapping statistics of read alignments along reference chromosomes or genes or arbitrary regions :
+
+ - counts
+ - mean sizes
+ - median sizes
+ - coverage depth
+ - size distribution
+
+Read counts, mean sizes and median sizes are computed by counting the number of 5' end of reads
+in each position of a chromosome reference.
+Coverage depths are computed from the input bam alignment files using the python pysam module.
+
+The metrics mentioned above can be plotted either separately:
+
+.. image:: one_plot.png
+
+Or in all possible pairwise combinations:
+
+.. image:: two_plot.png
 
-In addition to the read counts (lower graphs), median size, mean size and coverage depth of reads(lower graphs) mapping at a given position are plotted.	
+For comparison purposes, values from bam alignment files can be normalized by a size factor
+before plotting (Normalisation field)
+
+*Cluster mode*
+
+Cluster of read alignments are aggregated along regions of *variable* lengths. The Clustering
+algorithm works as follows:
+
+A read is clustered with the following read on the genomic reference if the two reads are
+separated by at maximum the clustering distance (set in nucleotides). If clustered, the step is
+repeated with the following read until clustering fails. A new cluster is then searched.
+
+For clustering procedure, one has the possibility to consider the polarity of reads (only forward
+reads or reverse reads can be clustered separately), or to ignore this polarity.
+
+Cluster reads are plotted as for single reads, their coordinate being the median of extrem coordinates of the cluster.
+
+In addition, cluster are reported in a bed file, where clusters can be filtered out upon various parameters,
+cluster size, cluster read number or cluster read density (number of reads divided by the length of the cluster).
 
 **Inputs**
 
@@ -101,13 +370,18 @@
 
   - single-read
   - sorted
-  - mapping to the same reference
+  - mapped to the same reference
+
+.. class:: warningmark
+
+This tools follows a "map-reduce" procedure: multiple inputs, that can be arranged as a data collection,
+are visualised side by side in a single pdf file.
+ 
+
 
 **Output**
 
-A pdf file generated by the R package lattice
-
-One or two dataframes used to plot data
+A pdf file generated by the R package lattice and one or two dataframes used to plot the data.
 
 </help>
 
@@ -124,4 +398,3 @@
   }</citation>
 </citations>
 </tool>
-
author	artbio
date	Fri, 04 Oct 2019 04:33:08 -0400
parents	0a06985c0894
children	59d93aa7cc20