Mercurial > repos > elixir-it > covacs_variant_recalibrator
changeset 0:3d969c748317 draft
Uploaded
author | elixir-it |
---|---|
date | Fri, 09 Nov 2018 06:04:05 -0500 |
parents | |
children | afcf3ae6dc62 |
files | bed_macros.xml covacs_VariantRecalibrator.xml mv_untar_gatk.sh tool-data/covacs_bed.loc.sample tool-data/covacs_gatk_indexes.loc.sample tool-data/covacs_vcf.loc.sample tool_data_table_conf.xml.sample |
diffstat | 7 files changed, 353 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bed_macros.xml	Fri Nov 09 06:04:05 2018 -0500
@@ -0,0 +1,37 @@
+<macros>
+    <!-- bed_loc: choose the interval (BED) file either from the covacs_bed
+         data table or from a dataset in the history. -->
+    <macro name="bed_loc">
+        <conditional name="bed_source">
+            <param name="bed_source_selector"
+                   type="select"
+                   label="Will you select a bed file from your history or use a built-in bed?"
+                   help=" bed file over which operate">
+                <option value="cached">Use a built-in bed</option>
+                <option value="history">Use a bed from history as reference</option>
+            </param>
+            <!-- Built-in BED: the choices come from the covacs_bed data table. -->
+            <when value="cached">
+                <param name="bed_cached"
+                       type="select"
+                       label="Using reference bed"
+                       help="Select bed from the list">
+                    <options from_data_table="covacs_bed">
+                        <filter type="sort_by" column="2" />
+                        <validator type="no_options" message="No bed are available" />
+                    </options>
+                    <validator type="no_options" message="A built-in bed file is not available"/>
+                </param>
+            </when>
+            <!-- History BED: declared optional, so it may be left unset. -->
+            <when value="history">
+                <param name="bed_history"
+                       type="data"
+                       format="bed"
+                       label="Use the following dataset as reference bed "
+                       help="You can upload a bed file to the history and use it"
+                       optional="true" />
+            </when>
+        </conditional>
+    </macro>
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/covacs_VariantRecalibrator.xml	Fri Nov 09 06:04:05 2018 -0500
@@ -0,0 +1,223 @@
+<!-- Galaxy wrapper around GATK 3.8 VariantRecalibrator for the CoVaCS tools.
+     NOTE(review): the tool id is spelled "covacs_VarianRecalibrator"; it is left
+     unchanged here because renaming an id would break existing references. -->
+<tool id="covacs_VarianRecalibrator" name="covacs_VariantRecalibrator" version="3.8">
+    <description>GATK VariantRecalibrator wrapper Version = 3.8</description>
+    <macros>
+        <import>bed_macros.xml</import>
+        <!-- NOTE(review): vcf_macros.xml (presumably the source of the vcf_loc
+             macro expanded below) is not in this changeset's file list. -->
+        <import>vcf_macros.xml</import>
+    </macros>
+    <requirements>
+        <requirement type="package" version="3.8" >gatk</requirement>
+    </requirements>
+    <command>
+        <![CDATA[
+        ## Unpack the GATK archive if the jar is not in place yet.
+        ## The helper has a bash shebang, so it is started with bash explicitly.
+        bash '$__tool_directory__/mv_untar_gatk.sh' &&
+
+        ## Symlink the inputs into the working directory under fixed names.
+        #if $bed_source.bed_source_selector == "history" and $bed_source.bed_history
+            ln -s '$bed_source.bed_history' region.bed &&
+        #end if
+
+        ln -s '$input1' input1.vcf &&
+
+        ## GATK tool call
+        java -jar "\$CONDA_PREFIX/../../GenomeAnalysisTK.jar" -T VariantRecalibrator
+
+        ## Intervals: -L is passed for a history BED or a built-in BED.
+        #if $bed_source.bed_source_selector == "history" and $bed_source.bed_history
+            -L region.bed
+        #end if
+        #if $bed_source.bed_source_selector == "cached"
+            -L '$bed_source.bed_cached.fields.path'
+        #end if
+
+        -ip $ip
+
+        ## Reference genome chosen from covacs_gatk_indexes.loc
+        -R '$ref_file.fields.path'
+
+        ## VCF input parameter
+        -input input1.vcf
+
+        ## One --resource argument per entry of the "resource" repeat.
+        #for $r in $resource#
+            #if $r.vcf_source.vcf_source_selector == "history" and $r.vcf_source.vcf_history
+                --resource:${r.nameresource},known=${r.known.value},training=${r.training.value},truth=${r.truth.value},prior=${r.prior.value} '${r.vcf_source.vcf_history}'
+            #end if
+            #if $r.vcf_source.vcf_source_selector == "cached"
+                --resource:${r.nameresource},known=${r.known.value},training=${r.training.value},truth=${r.truth.value},prior=${r.prior.value} '${r.vcf_source.vcf_cached.fields.path}'
+            #end if
+        #end for
+
+        ## Recalibration mode (SNP or INDEL)
+        -mode $mode_type.mode
+
+        ## The extra model settings are only passed in INDEL mode.
+        #if $mode_type.mode == "INDEL"
+            --minNumBadVariants $mode_type.minNumBadVariants
+            --maxGaussians $mode_type.maxGaussian
+            -mNG $mode_type.mNG
+        #end if
+
+        ## One -an argument per entry of the "an" repeat.
+        #for $a in $an#
+            -an ${a.an_name.value}
+        #end for
+
+        ## One -tranche argument per entry of the "tranches_name" repeat.
+        #for $t in $tranches_name#
+            -tranche ${t.tranches.value}
+        #end for
+
+        ## Outputs; stderr of the GATK call is redirected into the log dataset.
+        -recalFile '$recal'
+        -tranchesFile '$tranches'
+
+        2> '$log'
+        ]]>
+    </command>
+    <inputs>
+        <!-- Reference genome, listed from the covacs_gatk_indexes data table. -->
+        <param name="ref_file" type="select" label="Using indexed reference genome" help="Select indexed genome from the list">
+            <options from_data_table="covacs_gatk_indexes">
+                <filter type="sort_by" column="2" />
+                <validator type="no_options" message="No indexes are available" />
+            </options>
+            <validator type="no_options" message="A built-in reference genome is not available for the build associated with the selected input file"/>
+        </param>
+        <!-- Required: the command always links this dataset and passes it to GATK. -->
+        <param format="vcf" name="input1" label="VCF of raw input variants to be recalibrated" type="data" />
+        <expand macro="bed_loc"/>
+        <param name="ip" type="integer" value="100" help="Amount of padding (in bp) to add to each interval"/>
+        <repeat name="resource" title="-resource" help="A list of sites for which to apply a prior probability of being correct but which aren't used by the algorithm (training and truth sets are required to run)">
+            <expand macro="vcf_loc"/>
+            <param name="nameresource" label="name of the resource" type="select" >
+                <option value="hapmap">hapmap</option>
+                <option value="omni">omni</option>
+                <option value="1000G">1000G</option>
+                <option value="mills">mills</option>
+                <option value="dbsnp">dbsnp</option>
+            </param>
+            <param name="known" type="select" display="radio" help="Known - The program only uses known sites for reporting purposes (to indicate whether variants are already known or novel)" >
+                <option value="true">true</option>
+                <option value="false">false</option>
+            </param>
+            <param name="training" type="select" display="radio" help="Training - The program builds the Gaussian mixture model using input variants that overlap with these training sites.">
+                <option value="true">true</option>
+                <option value="false">false</option>
+            </param>
+            <param name="truth" type="select" display="radio" help="Truth - The program uses these truth sites to determine where to set the cutoff in VQSLOD sensitivity">
+                <option value="true">true</option>
+                <option value="false">false</option>
+            </param>
+            <param name="prior" value="10.0" min="0" max="100.0" type="float"/>
+        </repeat>
+        <repeat name="an" title="-an" help="Annotation which should used for calculations">
+            <param name="an_name" label="annotation name" type="select" help="The name of the annotation which should used for calculations">
+                <option value="DP">DP</option>
+                <option value="QD">QD</option>
+                <!-- MQ is part of the recommended command shown in the help below. -->
+                <option value="MQ">MQ</option>
+                <option value="MQRankSum">MQRankSum</option>
+                <option value="ReadPosRankSum">ReadPosRankSum</option>
+                <option value="FS">FS</option>
+            </param>
+        </repeat>
+        <repeat name="tranches_name" title="tranches" help="The levels of truth sensitivity at which to slice the data. (in percent, that is 1.0 for 1 percent)">
+            <param name="tranches" value="98.0" min="0" max="100.0" type="float"/>
+        </repeat>
+        <conditional name="mode_type">
+            <param name="mode" type="select" display="radio" help=" Recalibration mode to employ (SNP|INDEL)">
+                <option value="SNP">snp</option>
+                <option value="INDEL">INDEL</option>
+            </param>
+            <!-- SNP mode has no extra parameters; its empty case is declared explicitly. -->
+            <when value="SNP"/>
+            <when value="INDEL">
+                <param name="maxGaussian" type="integer" value="4"/>
+                <param name="minNumBadVariants" type="integer" value="5000"/>
+                <param name="mNG" type="integer" value="2"/>
+            </when>
+        </conditional>
+    </inputs>
+    <outputs>
+        <!-- The command writes directly to $recal and $tranches and creates no
+             files named "recal" or "tranches", so no from_work_dir is declared. -->
+        <data format="txt" name="recal" label="${tool.name} on ${on_string}:recal"/>
+        <data format="txt" name="tranches" label="${tool.name} on ${on_string}:tranches"/>
+        <data format="txt" name="log" label="log"/>
+    </outputs>
+    <help>
+.. class:: warningmark
+
+**IMPORTANT** to get the wrapper ready to start the admin user have to download gatk GATK version 3.8 from the broadinstitute site https://software.broadinstitute.org/gatk/download/archive and then move it in the conda_prefix folder, the path of the conda_prefix is written in the galaxy.ini(or .yml) file
+
+ **more informations** at https://software.broadinstitute.org/gatk/documentation/tooldocs/3.8-0/org_broadinstitute_gatk_tools_walkers_variantrecalibration_VariantRecalibrator.php
+
+-----
+
+**Implemented options** VariantRecalibrator:
+
+**-L** : One or more genomic intervals over which to operate(file.bed)
+
+**-ip** Amount of padding (in bp) to add to each interval
+
+**--resource:NAME,known=true/false,training=true/false,truth=true/false,prior=float $file** :A list of sites for which to apply a prior probability of being correct but which aren't used by the algorithm (training and truth sets are required to run)
+
+**-mode** : Recalibration mode to employ (SNP|INDEL)
+
+**-an** : annotations which should used for calculations
+
+**-tranche** The levels of truth sensitivity at which to slice the data. (in percent, that is 1.0 for 1 percent)
+
+**in case of indels mode**
+
+**--minNumBadVariants** : Minimum number of bad variants
+
+**--maxGaussians** : Max number of Gaussians for the positive model
+
+**-mNG** : Max number of Gaussians for the negative model
+
+**OUTPUTS**
+
+-recalFile
+
+-tranchesFile
+
+-----
+
+.. class:: infomark
+
+**Recommended CoVaCS command**
+
+**-ip** 100
+
+**-R** genome.fa
+
+**-input** VCF
+
+**-resource**:hapmap,known=false,training=true,truth=true,prior=15.0 hapmap.vcf
+
+**-resource**:omni,known=false,training=true,truth=true,prior=12.0 omni.vcf
+
+**-resource**:1000G,known=false,training=false,truth=false,prior=8.0 1000G.vcf
+
+**-resource**:dbsnp,known=true,training=false,truth=false,prior=2.0 dbsnp.vcf
+
+**-mode** SNP
+
+**-an** DP **-an** QD **-an** MQ **-an** MQRankSum **-an** ReadPosRankSum **-an** FS
+
+**-tranche** 100.0 **-tranche** 99.5 **-tranche** 99.0 **-tranche** 98.5 **-tranche** 90.0
+
+    </help>
+    <citations>
+        <citation type="doi">10.1186/s12864-018-4508-1</citation>
+    </citations>
+</tool>
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mv_untar_gatk.sh	Fri Nov 09 06:04:05 2018 -0500
@@ -0,0 +1,18 @@
+#!/bin/bash
+# Make sure GenomeAnalysisTK.jar is available two directories above
+# $CONDA_PREFIX: when the jar is missing, unpack the GenomeAnalysis*.tar.gz
+# archive found in that same directory.
+#
+# This script may be started through `sh` instead of bash, so it sticks to
+# POSIX constructs; `[[ ... ]]` is a bash extension that a plain POSIX `sh`
+# does not provide.
+
+# Stop with a clear message instead of resolving the paths against "/".
+: "${CONDA_PREFIX:?CONDA_PREFIX is not set}"
+gatk_dir="$CONDA_PREFIX/../.."
+
+if [ ! -f "$gatk_dir/GenomeAnalysisTK.jar" ]; then
+    tar -zxvf "$gatk_dir"/GenomeAnalysis*.tar.gz -C "$gatk_dir/"
+else
+    echo GATK is present
+fi
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/covacs_bed.loc.sample	Fri Nov 09 06:04:05 2018 -0500
@@ -0,0 +1,17 @@
+#This is a sample file distributed with Galaxy that enables tools
+#to use a directory of BED files for CoVaCS sequence data. You will need
+#to create these data files and then create a covacs_bed.loc file
+#similar to this one (store it in this directory) that points to
+#the locations in which those files are stored. The covacs_bed.loc
+#file has this format (longer white space characters are TAB characters):
+#
+#<unique_id>	<dbkey>	<display_name>	<file_path>
+#
+#
+#Note that for backwards compatibility with workflows, the unique ID of
+#an entry must be the path that was in the original loc file, because that
+#is the value stored in the workflow for that parameter. Entries that are
+#new can simply use a short, readable ID, as the two entries below do.
+#
+hg19	hg19	hg19-padded	/export/BED/S07084713_Padded.bed
+hgbed	hg19	hg19-bed-test	/export/BED/chr22.bed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/covacs_gatk_indexes.loc.sample	Fri Nov 09 06:04:05 2018 -0500
@@ -0,0 +1,36 @@
+#This is a sample file distributed with Galaxy that enables all the
+#covacs wrappers that need a GATK reference to find one. You will need
+#to create these data files and then create a covacs_gatk_indexes.loc file
+#similar to this one (store it in this directory) that points to
+#the directories in which those files are stored. The covacs_gatk_indexes.loc
+#file has this format (longer white space characters are TAB characters):
+#
+#<unique_build_id>	<dbkey>	<display_name>	<file_path>
+#
+#So, for example, if you had the indexed phiX reference stored in
+#/depot/data2/galaxy/phiX/base/,
+#then the covacs_gatk_indexes.loc entry would look like this:
+#
+#phiX174	phiX	phiX Pretty	/depot/data2/galaxy/phiX/base/phiX.fa
+#
+#and your /depot/data2/galaxy/phiX/base/ directory
+#would contain phiX.dict, phiX.fa.fai files.
+#
+#
+#Your covacs_gatk_indexes.loc file should include an entry per line for each
+#reference you have stored. The path is handed to GATK as the reference
+#(-R), so it should name the FASTA file kept next to its indexes. For example:
+#
+#phiX174	phiX	phiX174	/depot/data2/galaxy/phiX/base/phiX.fa
+#hg18canon	hg18	hg18 Canonical	/depot/data2/galaxy/hg18/base/hg18canon.fa
+#hg18full	hg18	hg18 Full	/depot/data2/galaxy/hg18/base/hg18full.fa
+#/orig/path/hg19.fa	hg19	hg19	/depot/data2/galaxy/hg19/base/hg19.fa
+#...etc...
+#
+#Note that for backwards compatibility with workflows, the unique ID of
+#an entry must be the path that was in the original loc file, because that
+#is the value stored in the workflow for that parameter. That is why the
+#hg19 entry above looks odd. New genomes can be better-looking.
+#
+hg38	hg38	hg38_GDC	/export/gatkhg38pl/GRCh38.d1.vd1.fa
+hg19	hg19	hg19	/export/gatk_hg19_index_bundle/ucsc.hg19.fasta
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/covacs_vcf.loc.sample	Fri Nov 09 06:04:05 2018 -0500
@@ -0,0 +1,29 @@
+#This is a sample file distributed with Galaxy that enables tools
+#to use a directory of VCF files for CoVaCS sequence data. You will need
+#to create these data files and then create a covacs_vcf.loc file
+#similar to this one (store it in this directory) that points to
+#the locations in which those files are stored. The covacs_vcf.loc
+#file has this format (longer white space characters are TAB characters):
+#
+#<unique_id>	<display_name>	<file_path>
+#
+#So, for example, if you had a VCF file stored in
+#/export/resources/,
+#then the covacs_vcf.loc entry would look like this:
+#
+#hapmap	hapmap	/export/resources/hapmap.vcf
+#
+#and your /export/resources directory
+#would contain hapmap.vcf.
+#
+#
+#Your covacs_vcf.loc file should include an entry per line for each
+#VCF file you have stored. The path is passed to the tool as the
+#resource file, so it must point to the actual VCF file.
+#Note that for backwards compatibility with workflows, the unique ID of
+#an entry must be the path that was in the original loc file, because that
+#is the value stored in the workflow for that parameter.
+#
+hapmap	hapmap	/export/resources/hapmap.vcf
+1000G	1000G	/export/resources/1000G.vcf
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample	Fri Nov 09 06:04:05 2018 -0500
@@ -0,0 +1,17 @@
+<tables>
+    <!-- covacs_bed: BED interval files, read from tool-data/covacs_bed.loc -->
+    <table name="covacs_bed" comment_char="#">
+        <columns> value, dbkey, name, path</columns>
+        <file path="tool-data/covacs_bed.loc" />
+    </table>
+    <!-- covacs_gatk_indexes: reference genomes for the covacs GATK wrappers -->
+    <table name="covacs_gatk_indexes" comment_char="#">
+        <columns> value, dbkey, name, path</columns>
+        <file path="tool-data/covacs_gatk_indexes.loc" />
+    </table>
+    <!-- covacs_vcf: VCF files for covacs VariantRecalibrator (no dbkey column) -->
+    <table name="covacs_vcf" comment_char="#">
+        <columns> value, name, path</columns>
+        <file path="tool-data/covacs_vcf.loc" />
+    </table>
+</tables>