changeset 0:6d8aa1176a94 draft

Uploaded
author elixir-it
date Fri, 09 Nov 2018 06:03:28 -0500
parents
children 3a37867409fe
files covacs_Select_Filtration.xml mv_untar_gatk.sh tool-data/covacs_gatk_indexes.loc.sample tool_data_table_conf.xml.sample
diffstat 4 files changed, 132 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/covacs_Select_Filtration.xml	Fri Nov 09 06:03:28 2018 -0500
@@ -0,0 +1,75 @@
+ <tool id="covacs_Select_Filtration" name="covacs_Select_filtration" version="3.8">
+  <description>SelectVariants and VariantFiltration wrapper for covacs; use it when covacs_VariantRecalibrator fails with a "not enough SNPs or indels" error</description>
+  <macros> <!-- empty: no macros are defined or imported here -->
+  </macros>
+  <requirements>
+    <requirement version="3.8" type="package">gatk</requirement>
+  </requirements>
+  <command detect_errors="exit_code">
+    <![CDATA[
+	## Unpack the GATK archive two levels above the conda prefix if the jar is not there yet (the script has a bash shebang).
+	bash '$__tool_directory__/mv_untar_gatk.sh' &&
+
+	## Expose the input under a fixed .vcf name; -f overwrites an existing link instead of failing.
+
+	ln -sf '$input1' input1.vcf &&
+	
+	## Step 1: SelectVariants keeps only the records of the chosen type; its stderr starts the log dataset.
+	java -jar "\$CONDA_PREFIX/../../GenomeAnalysisTK.jar"
+	-T SelectVariants
+	-R '$ref_file.fields.path'
+	-V input1.vcf
+	-selectType $TYPE
+	-o variants_recal.indels.vcf 2> '$log' &&
+	## Step 2: VariantFiltration tags records matching the expression; its stderr is appended to the same log.
+	java -jar "\$CONDA_PREFIX/../../GenomeAnalysisTK.jar"
+	-T VariantFiltration
+	-R '$ref_file.fields.path'
+	-V variants_recal.indels.vcf
+	--filterExpression "DP<8 || QD < 2.0 || FS > 200.0 || ReadPosRankSum < -20.0"
+	--filterName "filter_LQ_$TYPE"
+	-o variants_recal.filtered.small.panel.region.vcf
+
+	2>> '$log'
+	]]>
+  </command>
+  <inputs> <!-- every parameter below is substituted into the command line, so none of them is optional -->
+    <param name="input1" type="data" format="vcf" label="input VCF" />
+    <param name="ref_file" type="select" label="Using reference genome" help="Select genome from the list">
+       <options from_data_table="covacs_gatk_indexes">
+         <filter type="sort_by" column="2" />
+         <validator type="no_options" message="No indexes are available" />
+       </options>
+       <validator type="no_options" message="A built-in reference genome is not available for the build associated with the selected input file"/>
+    </param>
+    <param name="TYPE" type="select" label="Variant type to select" help="Passed to SelectVariants as -selectType and appended to the filter name">
+        <option value="INDEL">INDEL</option>
+        <option value="SNP" selected="true">SNP</option>
+    </param>
+  </inputs>
+  <outputs> <!-- recal: SelectVariants result; filtered: VariantFiltration result; log: stderr of both GATK calls -->
+    <data format="vcf" name="recal" from_work_dir="variants_recal.indels.vcf" label="SelectVariants on ${on_string} $TYPE :recal"/>
+    <data format="vcf" name="filtered" from_work_dir="variants_recal.filtered.small.panel.region.vcf" label="VariantFiltration on ${on_string} $TYPE :filtered"/>
+    <data format="txt" name="log" label="log"/>
+  </outputs>
+  <help>
+ **IMPORTANT** before this wrapper can run, the admin user has to download GATK version 3.8 from the Broad Institute site https://software.broadinstitute.org/gatk/download/archive and then move the archive into the conda_prefix folder; the path of the conda_prefix is written in the galaxy.ini (or .yml) file
+
+		**More information** at https://software.broadinstitute.org/gatk/documentation/tooldocs/3.8-0/org_broadinstitute_gatk_tools_walkers_variantutils_SelectVariants.php and https://software.broadinstitute.org/gatk/documentation/tooldocs/3.8-0/org_broadinstitute_gatk_tools_walkers_filters_VariantFiltration.php
+
+**Implemented options** SelectVariants <![CDATA[ &]]> VariantFiltration
+
+-R 		Reference sequence file
+
+-V 		vcf input
+
+other options are fixed based on covacs pipeline
+
+**description** this step can be used if covacs_indel_snp does not have enough data to create the model; it calls SelectVariants and VariantFiltration, applying the filter expression --filterExpression <![CDATA["DP<8 || QD < 2.0 || FS > 200.0 || ReadPosRankSum < -20.0"]]> to filter the variants, both SNPs and indels
+
+  </help>
+  <citations>
+        <citation type="doi">10.1186/s12864-018-4508-1</citation>
+  </citations>
+</tool>
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mv_untar_gatk.sh	Fri Nov 09 06:03:28 2018 -0500
@@ -0,0 +1,9 @@
+#!/bin/bash
+#if the .jar file is not present in the conda_prefix the script search the tar.gz in the conda_prefix of the vm
+#and untar the archive
+if [[ ! -f $CONDA_PREFIX/../../GenomeAnalysisTK.jar ]] ; then
+	tar -zxvf $CONDA_PREFIX/../../GenomeAnalysis*.tar.gz -C $CONDA_PREFIX/../../ 
+	
+else
+	echo GATK is present
+fi
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/covacs_gatk_indexes.loc.sample	Fri Nov 09 06:03:28 2018 -0500
@@ -0,0 +1,36 @@
+#This is a sample file distributed with Galaxy that enables all covacs wrappers
+#that need a GATK reference to locate the reference genome files. You will need
+#to create these data files and then create a covacs_gatk_indexes.loc file
+#similar to this one (store it in this directory) that points to
+#the directories in which those files are stored. The covacs_gatk_indexes.loc
+#file has this format (longer white space characters are TAB characters):
+#
+#<unique_build_id>   <dbkey>   <display_name>   <file_path>
+#
+#So, for example, if you had the phiX reference stored in
+#/depot/data2/galaxy/phiX/base/,
+#then the covacs_gatk_indexes.loc entry would look like this:
+#
+#phiX174   phiX   phiX Pretty   /depot/data2/galaxy/phiX/base/phiX.fa
+#
+#and your /depot/data2/galaxy/phiX/base/ directory
+#would contain the phiX.fa, phiX.fa.fai and phiX.dict files.
+#
+#
+#Your covacs_gatk_indexes.loc file should include an entry per line for each
+#reference you have stored. The path must point to the FASTA file itself,
+#because the wrappers pass it to GATK as the -R argument.  For example:
+#
+#phiX174                                phiX    phiX174                 /depot/data2/galaxy/phiX/base/phiX.fa
+#hg18canon                              hg18    hg18 Canonical  /depot/data2/galaxy/hg18/base/hg18canon.fa
+#hg18full                               hg18    hg18 Full               /depot/data2/galaxy/hg18/base/hg18full.fa
+#/orig/path/hg19.fa             hg19    hg19                    /depot/data2/galaxy/hg19/base/hg19.fa
+#...etc...
+#
+#Note that for backwards compatibility with workflows, the unique ID of
+#an entry must be the path that was in the original loc file, because that
+#is the value stored in the workflow for that parameter. That is why the
+#hg19 entry above looks odd. New genomes can be better-looking.
+#
+hg38	hg38	hg38_GDC	/export/gatkhg38pl/GRCh38.d1.vd1.fa
+hg19	hg19	hg19	/export/gatk_hg19_index_bundle/ucsc.hg19.fasta
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample	Fri Nov 09 06:03:28 2018 -0500
@@ -0,0 +1,12 @@
+<tables>
+    <!-- Location of the BED files for covacs -->
+    <table comment_char="#" name="covacs_bed">
+        <columns>value, dbkey, name, path</columns>
+        <file path="tool-data/covacs_bed.loc"/>
+    </table>
+    <!-- Location of the reference genome index files for the covacs GATK wrappers -->
+    <table comment_char="#" name="covacs_gatk_indexes">
+        <columns>value, dbkey, name, path</columns>
+        <file path="tool-data/covacs_gatk_indexes.loc"/>
+    </table>
+</tables>