Mercurial > repos > elixir-it > covacs_variant_recalibrator

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bed_macros.xml	Fri Nov 09 06:04:05 2018 -0500
@@ -0,0 +1,22 @@
+<macros>
+  <!-- bed_loc: lets the user choose the interval (BED) file, either an entry
+       of the covacs_bed data table or a dataset from the current history. -->
+  <macro name="bed_loc">
+    <conditional name="bed_source">
+      <param name="bed_source_selector"
+             type="select"
+             label="Will you select a bed file from your history or use a built-in bed?"
+             help=" bed file over which operate">
+        <option value="cached">Use a built-in bed</option>
+        <option value="history">Use a bed from history as reference</option>
+      </param>
+      <!-- Built-in BED: options are read from the covacs_bed data table,
+           sorted on column 2. -->
+      <when value="cached">
+        <param name="bed_cached"
+               type="select"
+               label="Using reference bed"
+               help="Select bed from the list">
+          <options from_data_table="covacs_bed">
+            <filter type="sort_by" column="2" />
+            <validator type="no_options" message="No bed are available" />
+          </options>
+          <validator type="no_options" message="A built-in bed file is not available"/>
+        </param>
+      </when>
+      <!-- History BED: an optional dataset in bed format. -->
+      <when value="history">
+        <param name="bed_history"
+               type="data"
+               format="bed"
+               label="Use the following dataset as reference bed "
+               help="You can upload a bed file to the history and use it"
+               optional="true" />
+      </when>
+    </conditional>
+  </macro>
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/covacs_VariantRecalibrator.xml	Fri Nov 09 06:04:05 2018 -0500
@@ -0,0 +1,223 @@
+ <tool id="covacs_VarianRecalibrator" name="covacs_VariantRecalibrator" version="3.8">
+  <description>GATK VariantRecalibrator wrapper Version = 3.8</description>
+  <macros>
+	<import>bed_macros.xml</import>
+	<import>vcf_macros.xml</import>
+  </macros>
+  <requirements>
+  	<requirement type="package" version="3.8" >gatk</requirement>
+  </requirements>
+  <!-- Job script (Cheetah template):
+         1. make sure GenomeAnalysisTK.jar is unpacked next to the conda prefix,
+         2. symlink the selected inputs under fixed names,
+         3. run GATK VariantRecalibrator, sending its stderr to the "log" output.
+       detect_errors="exit_code" makes a non-zero exit status fail the job; stderr
+       cannot serve that purpose here because it is redirected into the log dataset. -->
+  <command detect_errors="exit_code">
+    <![CDATA[
+        ## Unpack the GATK archive if the jar is missing.
+        ## The helper script has a bash shebang, so it is run with bash rather than sh.
+        bash '$__tool_directory__/mv_untar_gatk.sh' &&
+
+        ## Symlink the inputs under fixed names for GATK.
+        #if $bed_source.bed_source_selector == "history" and $bed_source.bed_history
+        ln -s '$bed_source.bed_history' region.bed &&
+        #end if
+
+        ln -s '$input1' input1.vcf &&
+
+        ## GATK tool call
+        java -jar "\$CONDA_PREFIX/../../GenomeAnalysisTK.jar" -T VariantRecalibrator
+
+        ## Interval restriction (-L): history dataset or covacs_bed entry.
+        ## Omitted when "history" is selected without a dataset.
+        #if $bed_source.bed_source_selector == "history" and $bed_source.bed_history
+        -L region.bed
+        #end if
+        #if $bed_source.bed_source_selector == "cached"
+        -L '$bed_source.bed_cached.fields.path'
+        #end if
+
+        -ip $ip
+
+        ## Reference genome chosen from covacs_gatk_indexes.loc
+        -R '$ref_file.fields.path'
+
+        ## VCF input
+        -input input1.vcf
+
+        ## One --resource argument per entry of the "resource" repeat.
+        #for $r in $resource#
+            #if $r.vcf_source.vcf_source_selector == "history" and $r.vcf_source.vcf_history
+            --resource:${r.nameresource},known=${r.known.value},training=${r.training.value},truth=${r.truth.value},prior=${r.prior.value} '${r.vcf_source.vcf_history}'
+            #end if
+            #if $r.vcf_source.vcf_source_selector == "cached"
+            --resource:${r.nameresource},known=${r.known.value},training=${r.training.value},truth=${r.truth.value},prior=${r.prior.value} '${r.vcf_source.vcf_cached.fields.path}'
+            #end if
+        #end for
+
+        -mode $mode_type.mode
+
+        ## The Gaussian and bad-variant settings are only exposed in INDEL mode.
+        #if $mode_type.mode == "INDEL"
+        --minNumBadVariants $mode_type.minNumBadVariants
+        --maxGaussians $mode_type.maxGaussian
+        -mNG $mode_type.mNG
+        #end if
+
+        ## One -an argument per entry of the "an" repeat.
+        #for $a in $an#
+        -an ${a.an_name.value}
+        #end for
+
+        ## One -tranche argument per entry of the "tranches_name" repeat.
+        #for $t in $tranches_name#
+        -tranche ${t.tranches.value}
+        #end for
+
+        ## Outputs are written directly to the Galaxy dataset paths.
+        -recalFile '$recal'
+        -tranchesFile '$tranches'
+
+        2> '$log'
+    ]]>
+  </command>
+  <inputs>
+    <!-- Reference genome, listed from the covacs_gatk_indexes data table. -->
+    <param name="ref_file" type="select" label="Using indexed reference genome" help="Select indexed genome from the list">
+      <options from_data_table="covacs_gatk_indexes">
+        <filter type="sort_by" column="2" />
+        <validator type="no_options" message="No indexes are available" />
+      </options>
+      <validator type="no_options" message="A built-in reference genome is not available for the build associated with the selected input file"/>
+    </param>
+    <!-- Required: the command always symlinks this dataset and passes it as -input,
+         so it must not be optional. -->
+    <param format="vcf" name="input1" label="VCF of raw input variants to be recalibrated" type="data" />
+    <expand macro="bed_loc"/>
+    <param name="ip" type="integer" value="100" help="Amount of padding (in bp) to add to each interval"/>
+    <!-- Each entry becomes one resource argument on the GATK command line. -->
+    <repeat name="resource" title="-resource" help="A list of sites for which to apply a prior probability of being correct but which aren't used by the algorithm (training and truth sets are required to run)">
+      <expand macro="vcf_loc"/>
+      <param name="nameresource" label="name of the resource" type="select" >
+        <option value="hapmap">hapmap</option>
+        <option value="omni">omni</option>
+        <option value="1000G">1000G</option>
+        <option value="mills">mills</option>
+        <option value="dbsnp">dbsnp</option>
+      </param>
+      <param name="known" type="select" display="radio" help="Known - The program only uses known sites for reporting purposes (to indicate whether variants are already known or novel)" >
+        <option value="true">true</option>
+        <option value="false">false</option>
+      </param>
+      <param name="training" type="select" display="radio" help="Training - The program builds the Gaussian mixture model using input variants that overlap with these training sites.">
+        <option value="true">true</option>
+        <option value="false">false</option>
+      </param>
+      <param name="truth" type="select" display="radio" help="Truth - The program uses these truth sites to determine where to set the cutoff in VQSLOD sensitivity">
+        <option value="true">true</option>
+        <option value="false">false</option>
+      </param>
+      <param name="prior" value="10.0" min="0" max="100.0" type="float"/>
+    </repeat>
+    <!-- Each entry becomes one -an argument. MQ is offered because the
+         recommended CoVaCS command in the help section uses it. -->
+    <repeat name="an" title="-an" help="Annotation which should used for calculations">
+      <param name="an_name" label="annotation name" type="select"  help="The name of the annotation which should used for calculations">
+        <option value="DP">DP</option>
+        <option value="QD">QD</option>
+        <option value="MQ">MQ</option>
+        <option value="MQRankSum">MQRankSum</option>
+        <option value="ReadPosRankSum">ReadPosRankSum</option>
+        <option value="FS">FS</option>
+      </param>
+    </repeat>
+    <!-- Each entry becomes one -tranche argument. -->
+    <repeat name="tranches_name" title="tranches" help="The levels of truth sensitivity at which to slice the data. (in percent, that is 1.0 for 1 percent)">
+      <param name="tranches" value="98.0" min="0" max="100.0" type="float"/>
+    </repeat>
+    <conditional name="mode_type">
+      <param name="mode" type="select" display="radio" help=" Recalibration mode to employ (SNP|INDEL)">
+        <option value="SNP">snp</option>
+        <option value="INDEL">INDEL</option>
+      </param>
+      <!-- SNP mode has no extra parameters; the empty case is declared explicitly
+           so every select option has a matching <when>. -->
+      <when value="SNP"/>
+      <when value="INDEL">
+        <param name="maxGaussian" type="integer" value="4"/>
+        <param name="minNumBadVariants" type="integer" value="5000"/>
+        <param name="mNG" type="integer" value="2"/>
+      </when>
+    </conditional>
+  </inputs>
+  <!-- The command writes straight to the dataset paths ($recal, $tranches, $log);
+       no file named "recal" or "tranches" is created in the working directory,
+       so the outputs carry no from_work_dir attribute. -->
+  <outputs>
+    <data format="txt" name="recal" label="${tool.name} on ${on_string}:recal"/>
+    <data format="txt" name="tranches" label="${tool.name} on ${on_string}:tranches"/>
+    <data format="txt" name="log" label="log"/>
+  </outputs>
+  <help>
+.. class:: warningmark
+
+**IMPORTANT**: before the wrapper can run, the Galaxy admin user has to download GATK version 3.8 from the Broad Institute site https://software.broadinstitute.org/gatk/download/archive and then move it into the conda_prefix folder; the path of the conda_prefix is written in the galaxy.ini (or .yml) file.
+
+		**More information** at https://software.broadinstitute.org/gatk/documentation/tooldocs/3.8-0/org_broadinstitute_gatk_tools_walkers_variantrecalibration_VariantRecalibrator.php
+
+-----
+
+**Implemented options** VariantRecalibrator:
+
+**-L**                   : One or more genomic intervals over which to operate(file.bed)
+
+**-ip**                  Amount of padding (in bp) to add to each interval
+
+**--resource:NAME,known=true/false,training=true/false,truth=true/false,prior=float $file**   :A list of sites for which to apply a prior probability of being correct but which aren't used by the algorithm (training and truth sets are required to run)
+
+**-mode**   : Recalibration mode to employ (SNP|INDEL)
+
+**-an** : annotations which should be used for calculations
+
+**-tranche**     The levels of truth sensitivity at which to slice the data. (in percent, that is 1.0 for 1 percent)
+
+**in case of indels mode**
+
+**--minNumBadVariants**   : Minimum number of bad variants
+
+**--maxGaussians**        : Max number of Gaussians for the positive model
+
+**-mNG**                  : Max number of Gaussians for the negative model
+
+**OUTPUTS**
+
+-recalFile
+
+-tranchesFile
+
+-----
+
+.. class:: infomark
+
+**Recommended CoVaCS command**
+
+**-ip** 100
+
+**-R** genome.fa
+
+**-input** VCF
+
+**-resource**:hapmap,known=false,training=true,truth=true,prior=15.0 hapmap.vcf
+
+**-resource**:omni,known=false,training=true,truth=true,prior=12.0 omni.vcf
+
+**-resource**:1000G,known=false,training=false,truth=false,prior=8.0 1000G.vcf
+
+**-resource**:dbsnp,known=true,training=false,truth=false,prior=2.0 dbsnp.vcf
+
+**-mode** SNP
+
+**-an** DP **-an** QD **-an** MQ **-an** MQRankSum **-an** ReadPosRankSum **-an** FS
+
+**-tranche** 100.0 **-tranche** 99.5 **-tranche** 99.0 **-tranche** 98.5 **-tranche** 90.0
+
+  </help>
+  <citations>
+        <citation type="doi">10.1186/s12864-018-4508-1</citation>
+  </citations>
+</tool>
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mv_untar_gatk.sh	Fri Nov 09 06:04:05 2018 -0500
@@ -0,0 +1,9 @@
+#!/bin/bash
+# Make sure GenomeAnalysisTK.jar exists two directories above $CONDA_PREFIX.
+# If the jar is missing, unpack the GenomeAnalysis*.tar.gz archive found in
+# that same directory; otherwise just report that GATK is present.
+#
+# The check uses the POSIX `[ ... ]` test instead of `[[ ... ]]` so the script
+# also works when it is started as `sh mv_untar_gatk.sh` on systems where sh
+# is not bash. Paths are quoted so a prefix containing spaces still works.
+gatk_dir="$CONDA_PREFIX/../.."
+
+if [ ! -f "$gatk_dir/GenomeAnalysisTK.jar" ]; then
+	# The archive name carries a version suffix, so the glob part stays unquoted.
+	tar -zxvf "$gatk_dir"/GenomeAnalysis*.tar.gz -C "$gatk_dir/"
+else
+	echo GATK is present
+fi
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/covacs_bed.loc.sample	Fri Nov 09 06:04:05 2018 -0500
@@ -0,0 +1,17 @@
+#This is a sample file distributed with Galaxy that enables the CoVaCS tools
+#to use built-in bed files. You will need
+#to create these data files and then create a covacs_bed.loc file
+#similar to this one (store it in this directory) that points to
+#the locations in which those files are stored. The covacs_bed.loc
+#file has this format (longer white space characters are TAB characters):
+#
+#<unique_id>   <dbkey>   <display_name>   <file_path>
+#
+#
+#Note that for backwards compatibility with workflows, the unique ID of
+#an entry must be the path that was in the original loc file, because that
+#is the value stored in the workflow for that parameter. That is why the
+#hg19 entry above looks odd. New genomes can be better-looking.
+#
+hg19	hg19	hg19-padded	/export/BED/S07084713_Padded.bed
+hgbed	hg19	hg19-bed-test	/export/BED/chr22.bed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/covacs_gatk_indexes.loc.sample	Fri Nov 09 06:04:05 2018 -0500
@@ -0,0 +1,36 @@
+#This is a sample file distributed with Galaxy that enables tools
+#to use a directory of all covacs wrapper that need a gatk reference. You will need
+#to create these data files and then create a covacs_gatk_indexes.loc file
+#similar to this one (store it in this directory) that points to
+#the directories in which those files are stored. The covacs_gatk_indexes.loc
+#file has this format (longer white space characters are TAB characters):
+#
+#<unique_build_id>   <dbkey>   <display_name>   <file_path>
+#
+#So, for example, if you had phiX indexed stored in
+#/depot/data2/galaxy/phiX/base/,
+#then the covacs_gatk_indexes.loc entry would look like this:
+#
+#phiX174   phiX   phiX Pretty   /depot/data2/galaxy/phiX/base/phiX.fa
+#
+#and your /depot/data2/galaxy/phiX/base/ directory
+#would contain phiX.dict, phiX.fa.fai files.
+#
+#
+#Your covacs_gatk_indexes.loc file should include an entry per line for each
+#index set you have stored. The "file" in the path does not actually
+#exist, but it is the prefix for the actual index files.  For example:
+#
+#phiX174                                phiX    phiX174                 /depot/data2/galaxy/phiX/base/phiX.fa
+#hg18canon                              hg18    hg18 Canonical  /depot/data2/galaxy/hg18/base/hg18canon.fa
+#hg18full                               hg18    hg18 Full               /depot/data2/galaxy/hg18/base/hg18full.fa
+#/orig/path/hg19.fa             hg19    hg19                    /depot/data2/galaxy/hg19/base/hg19.fa
+#...etc...
+#
+#Note that for backwards compatibility with workflows, the unique ID of
+#an entry must be the path that was in the original loc file, because that
+#is the value stored in the workflow for that parameter. That is why the
+#hg19 entry above looks odd. New genomes can be better-looking.
+#
+hg38	hg38	hg38_GDC	/export/gatkhg38pl/GRCh38.d1.vd1.fa
+hg19	hg19	hg19	/export/gatk_hg19_index_bundle/ucsc.hg19.fasta
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/covacs_vcf.loc.sample	Fri Nov 09 06:04:05 2018 -0500
@@ -0,0 +1,29 @@
+#This is a sample file distributed with Galaxy that enables the CoVaCS tools
+#to use built-in vcf resource files. You will need
+#to create these data files and then create a covacs_vcf.loc file
+#similar to this one (store it in this directory) that points to
+#the locations in which those files are stored. The covacs_vcf.loc
+#file has this format (longer white space characters are TAB characters):
+#
+#<unique_id>	<display_name>	<file_path>
+#
+#So, for example, if you had vcf file  stored in
+#/export/resource/,
+#then the covacs_vcf.loc entry would look like this:
+#
+#hapmap	hapmap   /export/resource/hapmap.vcf
+#
+#and your /export/resource directory
+#would contain hapmap.vcf.
+#
+#
+#Your covacs_vcf.loc file should include an entry per line for each
+#index set you have stored. The "file" in the path does not actually
+#exist, but it is the prefix for the actual index files.
+#Note that for backwards compatibility with workflows, the unique ID of
+#an entry must be the path that was in the original loc file, because that
+#is the value stored in the workflow for that parameter.
+#
+hapmap	hapmap	/export/resources/hapmap.vcf
+1000G	1000G	/export/resources/1000G.vcf
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample	Fri Nov 09 06:04:05 2018 -0500
@@ -0,0 +1,17 @@
+<tables>
+  <!-- BED interval files for the CoVaCS wrappers -->
+  <table name="covacs_bed" comment_char="#">
+    <columns> value, dbkey, name, path</columns>
+    <file path="tool-data/covacs_bed.loc" />
+  </table>
+  <!-- Reference genome index files for the CoVaCS GATK wrappers -->
+  <table name="covacs_gatk_indexes" comment_char="#">
+    <columns> value, dbkey, name, path</columns>
+    <file path="tool-data/covacs_gatk_indexes.loc" />
+  </table>
+  <!-- VCF resource files for the CoVaCS VariantRecalibrator wrapper -->
+  <table name="covacs_vcf" comment_char="#">
+    <columns> value, name, path</columns>
+    <file path="tool-data/covacs_vcf.loc" />
+  </table>
+</tables>