Mercurial > repos > jjohnson > snpsift_dbnsfp_generic

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/readme.rst	Mon Nov 10 14:17:47 2014 -0500
@@ -0,0 +1,11 @@
+These are Galaxy tools for SnpEff, a variant annotation and effect prediction tool by Pablo Cingolani.
+It annotates and predicts the effects of variants on genes (such as amino acid changes).
+( http://snpeff.sourceforge.net/ )
+
+This repository contains a tool_dependencies.xml file that will attempt to automatically install SnpEff and SnpSift.
+
+SnpEff citation:
+"A program for annotating and predicting the effects of single nucleotide polymorphisms, SnpEff: SNPs in the genome of Drosophila melanogaster strain w1118; iso-2; iso-3.", Cingolani P, Platts A, Wang le L, Coon M, Nguyen T, Wang L, Land SJ, Lu X, Ruden DM. Fly (Austin). 2012 Apr-Jun;6(2):80-92. PMID: 22728672 [PubMed - in process]
+
+SnpSift citation:
+"Using Drosophila melanogaster as a model for genotoxic chemical mutational studies with a new program, SnpSift", Cingolani, P., et al., Frontiers in Genetics, 3, 2012.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/repository_dependencies.xml	Mon Nov 10 14:17:47 2014 -0500
@@ -0,0 +1,4 @@
+<?xml version="1.0"?>
+<repositories description="This requires the SnpEff dbnsfp datatype definitions.">
+    <repository name="snpsift_dbnsfp_datatypes" owner="jjohnson" toolshed="https://testtoolshed.g2.bx.psu.edu" changeset_revision="df236b5e2985" />
+</repositories>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/snpEff_macros.xml	Mon Nov 10 14:17:47 2014 -0500
@@ -0,0 +1,32 @@
+<macros>
+    <xml name="requirements"> <!-- shared macro; the tool file pulls it in with <expand macro="requirements" /> -->
+        <requirements>
+            <requirement type="package" version="4.0">snpEff</requirement> <!-- same name and version as the package declared in tool_dependencies.xml -->
+        </requirements>
+    </xml>
+    <xml name="stdio"> <!-- shared macro: every non-zero exit code (negative or positive) is a fatal job error -->
+        <stdio>
+            <exit_code range=":-1" level="fatal" description="Error: Cannot open file" />
+            <exit_code range="1:" level="fatal" description="Error" />
+        </stdio>
+    </xml>
+  <token name="@EXTERNAL_DOCUMENTATION@">
+
+For details about this tool, please go to:
+	http://snpeff.sourceforge.net/SnpEff_manual.html
+
+  </token>
+  <token name="@CITATION_SECTION@">------
+
+**Citation**
+
+For the underlying tool, please cite the following two publications:
+
+SnpEff citation:
+"A program for annotating and predicting the effects of single nucleotide polymorphisms, SnpEff: SNPs in the genome of Drosophila melanogaster strain w1118; iso-2; iso-3.", Cingolani P, Platts A, Wang le L, Coon M, Nguyen T, Wang L, Land SJ, Lu X, Ruden DM. Fly (Austin). 2012 Apr-Jun;6(2):80-92. PMID: 22728672 [PubMed - in process]
+
+SnpSift citation:
+"Using Drosophila melanogaster as a model for genotoxic chemical mutational studies with a new program, SnpSift", Cingolani, P., et al., Frontiers in Genetics, 3, 2012.
+
+  </token>
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/snpSift_dbnsfp.xml	Mon Nov 10 14:17:47 2014 -0500
@@ -0,0 +1,297 @@
+<tool id="snpSift_dbnsfp_generic" name="SnpSift dbNSFP" version="4.0.0">
+    <description>Add Annotations from dbNSFP and similar annotation DBs</description>
+    <macros>
+        <import>snpEff_macros.xml</import> <!-- provides the "requirements" and "stdio" macros plus the help tokens -->
+    </macros>
+    <expand macro="requirements" />
+    <command>
+        ## Run SnpSift dbnsfp against either a cached database or one taken from the history.
+        java -Xmx6G -jar \$SNPEFF_JAR_PATH/SnpSift.jar dbnsfp -v
+        #if $db.dbsrc == 'cached' :
+          -db $db.dbnsfp
+        #else :
+          -db "${db.dbnsfpdb.extra_files_path}/${db.dbnsfpdb.metadata.bgzip}"
+        #end if
+        ## Both branches of the "db" conditional define "annotations", so -f is emitted once here.
+        #if $db.annotations and $db.annotations.__str__ != '':
+          -f "$db.annotations"
+        #end if
+        $input > $output
+        ## Always show the filtered log, then restore the java exit status so grep cannot mask or fake a failure.
+        2> tmp.err; status=\$?; grep -v file tmp.err; (exit \$status)
+    </command>
+    <inputs>
+        <param name="input" type="data" format="vcf" label="Variant input file in VCF format"/>
+        <conditional name="db">
+            <param name="dbsrc" type="select" label="dbNSFP ">
+                <option value="cached">Locally installed dbNSFP database </option>
+                <option value="history">dbNSFP database from your history</option>
+            </param>
+            <when value="cached"> <!-- snpsift_dbnsfp table columns: 0=dbkey, 1=build, 2=name, 3=value (path), 4=annotations -->
+                <param name="dbnsfp" type="select" label="Genome">
+                    <options from_data_table="snpsift_dbnsfp">
+                        <column name="name" index="1"/>
+                        <column name="value" index="3"/>
+                    </options>
+                </param>
+                <param name="annotations" type="select" multiple="true" display="checkboxes" label="Annotate with">
+                    <options from_data_table="snpsift_dbnsfp"> <!-- one checkbox per comma-separated entry of the selected database's annotations column -->
+                        <column name="name" index="4"/>
+                        <column name="value" index="4"/>
+                        <filter type="param_value" ref="dbnsfp" column="3" /> <!-- the value of "dbnsfp" is column 3, so match rows on that column -->
+                        <filter type="multiple_splitter" column="4" separator=","/>
+                    </options>
+                </param>
+            </when>
+            <when value="history">
+                <param name="dbnsfpdb" type="data" format="snpsiftdbnsfp" label="DbNSFP"/>
+                <param name="annotations" type="select" multiple="true" display="checkboxes" label="Annotate with">
+                    <options>
+                        <filter type="data_meta" ref="dbnsfpdb" key="annotation" /> <!-- choices come from the dataset's "annotation" metadata -->
+                    </options>
+                </param>
+            </when>
+        </conditional>
+    </inputs>
+    <expand macro="stdio" />
+    <outputs>
+        <data format="vcf" name="output" /> <!-- annotated VCF: the command redirects SnpSift's standard output into this dataset -->
+    </outputs>
+    <tests>
+        <test> <!-- annotate the sample VCF from a history database and check that the chr4 SNP receives its SIFT score -->
+            <param name="input" ftype="vcf" value="test_annotate_in.vcf"/>
+            <param name="dbsrc" value="history"/>
+            <param name="dbnsfpdb" value="test_dbnsfpdb.tabular" ftype="dbnsfp.tabular" />
+            <param name="annotations" value="aaref,aaalt,genename,aapos,SIFT_score"/>
+            <output name="output">
+                <assert_contents>
+                    <has_text text="dbNSFP_SIFT_score=0.15" />
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help>
+
+The dbNSFP is an integrated database of functional predictions from multiple algorithms (SIFT, Polyphen2, LRT and MutationTaster, PhyloP and GERP++, etc.).
+It contains variant annotations such as:
+
+
+  1000Gp1_AC
+    Alternative allele counts in the whole 1000 genomes phase 1 (1000Gp1) data
+  1000Gp1_AF
+    Alternative allele frequency in the whole 1000Gp1 data
+  1000Gp1_AFR_AC
+    Alternative allele counts in the 1000Gp1 African descendent samples
+  1000Gp1_AFR_AF
+    Alternative allele frequency in the 1000Gp1 African descendent samples
+  1000Gp1_AMR_AC
+    Alternative allele counts in the 1000Gp1 American descendent samples
+  1000Gp1_AMR_AF
+    Alternative allele frequency in the 1000Gp1 American descendent samples
+  1000Gp1_ASN_AC
+    Alternative allele counts in the 1000Gp1 Asian descendent samples
+  1000Gp1_ASN_AF
+    Alternative allele frequency in the 1000Gp1 Asian descendent samples
+  1000Gp1_EUR_AC
+    Alternative allele counts in the 1000Gp1 European descendent samples
+  1000Gp1_EUR_AF
+    Alternative allele frequency in the 1000Gp1 European descendent samples
+  aaalt
+    Alternative amino acid. "." if the variant is a splicing site SNP (2bp on each end of an intron)
+  aapos
+    Amino acid position as to the protein. "-1" if the variant is a splicing site SNP (2bp on each end of an intron)
+  aapos_SIFT
+    ENSP id and amino acid positions corresponding to SIFT scores. Multiple entries separated by ";"
+  aapos_FATHMM
+    ENSP id and amino acid positions corresponding to FATHMM scores. Multiple entries separated by ";"
+  aaref
+    Reference amino acid. "." if the variant is a splicing site SNP (2bp on each end of an intron)
+  alt
+    Alternative nucleotide allele (as on the + strand)
+  Ancestral_allele
+    Ancestral allele (based on 1000 genomes reference data)
+  cds_strand
+    Coding sequence (CDS) strand (+ or -)
+  chr
+    Chromosome number
+  codonpos
+    Position on the codon (1, 2 or 3)
+  Ensembl_geneid
+    Ensembl gene ID
+  Ensembl_transcriptid
+    Ensembl transcript IDs (separated by ";")
+  ESP6500_AA_AF
+    Alternative allele frequency in the African American samples of the NHLBI GO Exome Sequencing Project (ESP6500 data set)
+  ESP6500_EA_AF
+    Alternative allele frequency in the European American samples of the NHLBI GO Exome Sequencing Project (ESP6500 data set)
+  FATHMM_pred
+    If a FATHMM_score is &lt;=-1.5 (or rankscore &lt;=0.81415) the corresponding non-synonymous SNP is predicted as "D(AMAGING)"; otherwise it is predicted as "T(OLERATED)". Multiple predictions separated by ";"
+  FATHMM_rankscore
+    FATHMMori scores were ranked among all FATHMMori scores in dbNSFP. The rankscore is the ratio of the rank of the score over the total number of FATHMMori scores in dbNSFP. If there are multiple scores, only the most damaging (largest) rankscore is presented. The scores range from 0 to 1
+  FATHMM_score
+    FATHMM default score (FATHMMori)
+  fold-degenerate
+    Degenerate type (0, 2 or 3)
+  genename
+    Gene name; if the non-synonymous SNP can be assigned to multiple genes, gene names are separated by ";"
+  GERP++_NR
+    GERP++ neutral rate
+  GERP++_RS
+    GERP++ RS score, the larger the score, the more conserved the site
+  GERP++_RS_rankscore
+    GERP++ RS scores were ranked among all GERP++ RS scores in dbNSFP. The rankscore is the ratio of the rank of the score over the total number of GERP++ RS scores in dbNSFP
+  hg18_pos(1-coor)
+    Physical position on the chromosome as to hg18 (1-based coordinate)
+  Interpro_domain
+    Domain or conserved site on which the variant locates
+  LR_pred
+    Prediction of our LR based ensemble prediction score, "T(olerated)" or "D(amaging)". The score cutoff between "D" and "T" is 0.5. The rankscore cutoff between "D" and "T" is 0.82268
+  LR_rankscore
+    LR scores were ranked among all LR scores in dbNSFP. The rankscore is the ratio of the rank of the score over the total number of LR scores in dbNSFP. The scores range from 0 to 1
+  LR_score
+    Our logistic regression (LR) based ensemble prediction score, which incorporated 10 scores (SIFT, PolyPhen-2 HDIV, PolyPhen-2 HVAR, GERP++, MutationTaster, Mutation Assessor, FATHMM, LRT, SiPhy, PhyloP) and the maximum frequency observed in the 1000 genomes populations. Larger value means the SNV is more likely to be damaging. Scores range from 0 to 1
+  LRT_Omega
+    Estimated nonsynonymous-to-synonymous-rate ratio (Omega, reported by LRT)
+  LRT_converted_rankscore
+    LRTori scores were first converted as LRTnew=1-LRTori*0.5 if Omega&lt;1, or LRTnew=LRTori*0.5 if Omega&gt;=1. Then LRTnew scores were ranked among all LRTnew scores in dbNSFP. The rankscore is the ratio of the rank over the total number of the scores in dbNSFP. The scores range from 0.00166 to 0.85682
+  LRT_pred
+    LRT prediction, D(eleterious), N(eutral) or U(nknown), which is not solely determined by the score
+  LRT_score
+    The original LRT two-sided p-value (LRTori), ranges from 0 to 1
+  MutationAssessor_pred
+    MutationAssessor's functional impact of a variant
+  MutationAssessor_rankscore
+    MAori scores were ranked among all MAori scores in dbNSFP. The rankscore is the ratio of the rank of the score over the total number of MAori scores in dbNSFP. The scores range from 0 to 1
+  MutationAssessor_score
+    MutationAssessor functional impact combined score (MAori)
+  MutationTaster_converted_rankscore
+    The MTori scores were first converted: if the prediction is "A" or "D" MTnew=MTori; if the prediction is "N" or "P", MTnew=1-MTori. Then MTnew scores were ranked among all MTnew scores in dbNSFP. The rankscore is the ratio of the rank of the score over the total number of MTnew scores in dbNSFP. The scores range from 0.0931 to 0.80722
+  MutationTaster_pred
+    MutationTaster prediction
+  MutationTaster_score
+    MutationTaster p-value (MTori), ranges from 0 to 1
+  phastCons46way_placental
+    phastCons conservation score based on the multiple alignments of 33 placental mammal genomes (including human). The larger the score, the more conserved the site
+  phastCons46way_placental_rankscore
+    phastCons46way_placental scores were ranked among all phastCons46way_placental scores in dbNSFP. The rankscore is the ratio of the rank of the score over the total number of phastCons46way_placental scores in dbNSFP
+  phastCons46way_primate
+    phastCons conservation score based on the multiple alignments of 10 primate genomes (including human). The larger the score, the more conserved the site
+  phastCons46way_primate_rankscore
+    phastCons46way_primate scores were ranked among all phastCons46way_primate scores in dbNSFP. The rankscore is the ratio of the rank of the score over the total number of phastCons46way_primate scores in dbNSFP
+  phastCons100way_vertebrate
+    phastCons conservation score based on the multiple alignments of 100 vertebrate genomes (including human). The larger the score, the more conserved the site
+  phastCons100way_vertebrate_rankscore
+    phastCons100way_vertebrate scores were ranked among all phastCons100way_vertebrate scores in dbNSFP. The rankscore is the ratio of the rank of the score over the total number of phastCons100way_vertebrate scores in dbNSFP
+  phyloP46way_placental
+    phyloP (phylogenetic p-values) conservation score based on the multiple alignments of 33 placental mammal genomes (including human). The larger the score, the more conserved the site
+  phyloP46way_placental_rankscore
+    phyloP46way_placental scores were ranked among all phyloP46way_placental scores in dbNSFP. The rankscore is the ratio of the rank of the score over the total number of phyloP46way_placental scores in dbNSFP
+  phyloP46way_primate
+    phyloP (phylogenetic p-values) conservation score based on the multiple alignments of 10 primate genomes (including human). The larger the score, the more conserved the site
+  phyloP46way_primate_rankscore
+    phyloP46way_primate scores were ranked among all phyloP46way_primate scores in dbNSFP. The rankscore is the ratio of the rank of the score over the total number of phyloP46way_primate scores in dbNSFP
+  phyloP100way_vertebrate
+    phyloP (phylogenetic p-values) conservation score based on the multiple alignments of 100 vertebrate genomes (including human). The larger the score, the more conserved the site
+  phyloP100way_vertebrate_rankscore
+    phyloP100way_vertebrate scores were ranked among all phyloP100way_vertebrate scores in dbNSFP. The rankscore is the ratio of the rank of the score over the total number of phyloP100way_vertebrate scores in dbNSFP
+  Polyphen2_HDIV_pred
+    Polyphen2 prediction based on HumDiv
+  Polyphen2_HDIV_rankscore
+    Polyphen2 HDIV scores were first ranked among all HDIV scores in dbNSFP. The rankscore is the ratio of the rank of the score over the total number of the scores in dbNSFP. If there are multiple scores, only the most damaging (largest) rankscore is presented. The scores range from 0.02656 to 0.89917
+  Polyphen2_HDIV_score
+    Polyphen2 score based on HumDiv, i.e. hdiv_prob. The score ranges from 0 to 1. Multiple entries separated by ";"
+  Polyphen2_HVAR_pred
+    Polyphen2 prediction based on HumVar
+  Polyphen2_HVAR_rankscore
+    Polyphen2 HVAR scores were first ranked among all HVAR scores in dbNSFP. The rankscore is the ratio of the rank of the score over the total number of the scores in dbNSFP. If there are multiple scores, only the most damaging (largest) rankscore is presented. The scores range from 0.01281 to 0.9711
+  Polyphen2_HVAR_score
+    Polyphen2 score based on HumVar, i.e. hvar_prob. The score ranges from 0 to 1. Multiple entries separated by ";"
+  pos(1-coor)
+    Physical position on the chromosome as to hg19 (1-based coordinate)
+  RadialSVM_pred
+    Prediction of our SVM based ensemble prediction score, "T(olerated)" or "D(amaging)". The score cutoff between "D" and "T" is 0. The rankscore cutoff between "D" and "T" is 0.83357
+  RadialSVM_rankscore
+    RadialSVM scores were ranked among all RadialSVM scores in dbNSFP. The rankscore is the ratio of the rank of the score over the total number of RadialSVM scores in dbNSFP. The scores range from 0 to 1
+  RadialSVM_score
+    Our support vector machine (SVM) based ensemble prediction score, which incorporated 10 scores (SIFT, PolyPhen-2 HDIV, PolyPhen-2 HVAR, GERP++, MutationTaster, Mutation Assessor, FATHMM, LRT, SiPhy, PhyloP) and the maximum frequency observed in the 1000 genomes populations. Larger value means the SNV is more likely to be damaging. Scores range from -2 to 3 in dbNSFP
+  ref
+    Reference nucleotide allele (as on the + strand)
+  refcodon
+    Reference codon
+  Reliability_index
+    Number of observed component scores (except the maximum frequency in the 1000 genomes populations) for RadialSVM and LR. Ranges from 1 to 10. As RadialSVM and LR scores are calculated based on imputed data, the less missing component scores, the higher the reliability of the scores and predictions
+  SIFT_converted_rankscore
+    SIFTori scores were first converted to SIFTnew=1-SIFTori, then ranked among all SIFTnew scores in dbNSFP. The rankscore is the ratio of the rank of the SIFTnew score over the total number of SIFTnew scores in dbNSFP. If there are multiple scores, only the most damaging (largest) rankscore is presented. The rankscores range from 0.02654 to 0.87932
+  SIFT_pred
+    If SIFTori is smaller than 0.05 (rankscore&gt;0.55) the corresponding non-synonymous SNP is predicted as "D(amaging)"; otherwise it is predicted as "T(olerated)". Multiple predictions separated by ";"
+  SIFT_score
+    SIFT score (SIFTori). Scores range from 0 to 1. The smaller the score the more likely the SNP has damaging effect. Multiple scores separated by ";"
+  SiPhy_29way_logOdds
+    SiPhy score based on 29 mammals genomes. The larger the score, the more conserved the site
+  SiPhy_29way_pi
+    The estimated stationary distribution of A, C, G and T at the site, using SiPhy algorithm based on 29 mammals genomes
+  SLR_test_statistic
+    SLR test statistic for testing natural selection on codons. A negative value indicates negative selection, and a positive value indicates positive selection. Larger magnitude of the value suggests stronger evidence
+  Uniprot_aapos
+    Amino acid position as to Uniprot. Multiple entries separated by ";"
+  Uniprot_acc
+    Uniprot accession number. Multiple entries separated by ";"
+  Uniprot_id
+    Uniprot ID number. Multiple entries separated by ";"
+  UniSNP_ids
+    rs numbers from UniSNP, which is a cleaned version of dbSNP build 129, in format: rs number1;rs number2;...
+
+
+
+The procedure for preparing the dbNSFP data for use in SnpSift dbnsfp is in the SnpSift documentation:
+http://snpeff.sourceforge.net/SnpSift.html#dbNSFP
+
+A couple of dbNSFP databases are prebuilt for SnpSift at:
+http://sourceforge.net/projects/snpeff/files/databases/dbNSFP/
+
+
+
+
+**Uploading Your Own Annotations for any Genome**
+
+The website for dbNSFP databases releases is:
+https://sites.google.com/site/jpopgen/dbNSFP
+
+But annotations are only provided for the human hg18, hg19, and hg38 genome builds.
+
+However, any dbNSFP-like tabular file can be used with SnpSift dbnsfp if it meets these requirements:
+
+  - The first line of the file must be column headers that name the annotations.
+  - The first 4 columns are required and must be:
+
+    1. #chr		- chromosome
+    2. pos(1-coor)	- position in chromosome
+    3. ref		- reference base
+    4. alt		- alternate base
+
+
+For example:
+
+::
+
+	#chr	pos(1-coor)	ref	alt	aaref	aaalt	genename	SIFT_score
+	  4	 100239319	 T	 A	  H	  L	 ADH1B	           0
+	  4	 100239319	 T	 C	  H	  R	 ADH1B	           0.15
+	  4	 100239319	 T	 G	  H	  P	 ADH1B	           0
+
+
+The custom galaxy datatypes for dbNSFP can automatically convert the specially formatted tabular file for use by SnpSift dbNSFP:
+  1. Upload the tabular file, set the datatype as: **"dbnsfp.tabular"**
+  2. Edit the history dataset attributes (pencil icon): Use "Convert Format" to convert the **"dbnsfp.tabular"** to the correct format for SnpSift dbnsfp: **"snpsiftdbnsfp"**.
+
+The procedure for preparing the dbNSFP data for use in SnpSift dbnsfp is in the SnpSift documentation.
+
+
+@EXTERNAL_DOCUMENTATION@
+	http://snpeff.sourceforge.net/SnpSift.html#dbNSFP
+
+@CITATION_SECTION@
+
+
+    </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_annotate_in.vcf	Mon Nov 10 14:17:47 2014 -0500
@@ -0,0 +1,10 @@
+##fileformat=VCFv4.1
+##samtoolsVersion=0.1.18 (r982:295)
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Raw read depth">
+##SnpEffVersion="3.5 (build 2014-02-12), by Pablo Cingolani"
+##SnpEffCmd="SnpEff  -i vcf -o vcf -upDownStreamLen 5000 -spliceSiteSize 1 -stats /Users/jj/gxt/gxt/database/files/004/dataset_4998.dat GRCh37.71 /Users/jj/gxt/gxt/database/files/004/dataset_4996.dat "
+##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted effects for this variant.Format: 'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_Change| Amino_Acid_length | Gene_Name | Transcript_BioType | Gene_Coding | Transcript_ID | Exon_Rank  | Genotype_Number [ | ERRORS | WARNINGS ] )' ">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	/data/sequencing/output/biotec4/mapping/L774.q1.s.bam	/data/sequencing/output/biotec4/mapping/L775.q1.s.bam
+chr4	100239319	rs1229984	T	C	94.3	.	DP=29;EFF=EXON(MODIFIER|||||ADH1B|processed_transcript|CODING|ENST00000504498|3|1),EXON(MODIFIER|||||ADH1B|retained_intron|CODING|ENST00000515694|3|1),NON_SYNONYMOUS_CODING(MODERATE|MISSENSE|cAc/cGc|H48R|375|ADH1B|protein_coding|CODING|ENST00000305046|3|1),NON_SYNONYMOUS_CODING(MODERATE|MISSENSE|cAc/cGc|H8R|335|ADH1B|protein_coding|CODING|ENST00000394887|3|1),UTR_3_PRIME(MODIFIER||2729|||ADH1B|nonsense_mediated_decay|CODING|ENST00000506651|4|1)
+chr12	32491626	rs1471909	G	A	124.0	.	DP=22;EFF=DOWNSTREAM(MODIFIER||532|||BICD1|retained_intron|CODING|ENST00000552160||1),INTRON(MODIFIER||||835|BICD1|protein_coding|CODING|ENST00000548411|7|1),INTRON(MODIFIER||||975|BICD1|protein_coding|CODING|ENST00000281474|7|1),INTRON(MODIFIER|||||BICD1|nonsense_mediated_decay|CODING|ENST00000395758|7|1),INTRON(MODIFIER|||||BICD1|retained_intron|CODING|ENST00000552226|1|1)
+chrX	153010066	rs11803	C	T	73.8	.	DP=34;EFF=DOWNSTREAM(MODIFIER||4008||221|ABCD1|protein_coding|CODING|ENST00000443684||1),INTRAGENIC(MODIFIER|||||ABCD1||CODING|||1),INTRON(MODIFIER|||||U52111.14|antisense|NON_CODING|ENST00000434284|1|1),UTR_3_PRIME(MODIFIER||877||745|ABCD1|protein_coding|CODING|ENST00000218104|10|1)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_dbnsfp_out.vcf	Mon Nov 10 14:17:47 2014 -0500
@@ -0,0 +1,15 @@
+##fileformat=VCFv4.1
+##samtoolsVersion=0.1.18 (r982:295)
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Raw read depth">
+##SnpEffVersion="3.5 (build 2014-02-12), by Pablo Cingolani"
+##SnpEffCmd="SnpEff  -i vcf -o vcf -upDownStreamLen 5000 -spliceSiteSize 1 -stats /Users/jj/gxt/gxt/database/files/004/dataset_4998.dat GRCh37.71 /Users/jj/gxt/gxt/database/files/004/dataset_4996.dat "
+##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted effects for this variant.Format: 'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_Change| Amino_Acid_length | Gene_Name | Transcript_BioType | Gene_Coding | Transcript_ID | Exon_Rank  | Genotype_Number [ | ERRORS | WARNINGS ] )' ">
+##SnpSiftVersion="SnpSift 3.5 (build 2014-02-12), by Pablo Cingolani"
+##SnpSiftCmd="SnpSift dbnsfp /Users/jj/gxt/gxt/database/files/005/dataset_5011_files/dbNSFP.gz -f aaref,aaalt,genename,aapos,SIFT_score /Users/jj/gxt/gxt/database/files/005/dataset_5006.dat "
+##INFO=<ID=dbNSFP_aapos,Number=A,Type=Integer,Description="Field 'aapos' from dbNSFP">
+##INFO=<ID=dbNSFP_genename,Number=A,Type=String,Description="Field 'genename' from dbNSFP">
+##INFO=<ID=dbNSFP_aaref,Number=A,Type=Character,Description="Field 'aaref' from dbNSFP">
+##INFO=<ID=dbNSFP_aaalt,Number=A,Type=Character,Description="Field 'aaalt' from dbNSFP">
+##INFO=<ID=dbNSFP_SIFT_score,Number=A,Type=Float,Description="Field 'SIFT_score' from dbNSFP">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	/data/sequencing/output/biotec4/mapping/L774.q1.s.bam	/data/sequencing/output/biotec4/mapping/L775.q1.s.bam
+chr4	100239319	rs1229984	T	C	94.3	.	DP=29;EFF=EXON(MODIFIER|||||ADH1B|processed_transcript|CODING|ENST00000504498|3|1),EXON(MODIFIER|||||ADH1B|retained_intron|CODING|ENST00000515694|3|1),NON_SYNONYMOUS_CODING(MODERATE|MISSENSE|cAc/cGc|H48R|375|ADH1B|protein_coding|CODING|ENST00000305046|3|1),NON_SYNONYMOUS_CODING(MODERATE|MISSENSE|cAc/cGc|H8R|335|ADH1B|protein_coding|CODING|ENST00000394887|3|1),UTR_3_PRIME(MODIFIER||2729|||ADH1B|nonsense_mediated_decay|CODING|ENST00000506651|4|1);dbNSFP_aapos=48|8|48;dbNSFP_genename=ADH1B;dbNSFP_aaref=H;dbNSFP_aaalt=R;dbNSFP_SIFT_score=0.15
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_dbnsfpdb.tabular	Mon Nov 10 14:17:47 2014 -0500
@@ -0,0 +1,7 @@
+#chr	pos(1-coor)	ref	alt	aaref	aaalt	hg18_pos(1-coor)	genename	Uniprot_acc	Uniprot_id	Uniprot_aapos	Interpro_domain	cds_strand	refcodon	SLR_test_statistic 	codonpos	fold-degenerate	Ancestral_allele	Ensembl_geneid	Ensembl_transcriptid	aapos	aapos_SIFT	aapos_FATHMM	SIFT_score	SIFT_converted_rankscore	SIFT_pred	Polyphen2_HDIV_score	Polyphen2_HDIV_rankscore	Polyphen2_HDIV_pred	Polyphen2_HVAR_score	Polyphen2_HVAR_rankscore	Polyphen2_HVAR_pred	LRT_score	LRT_converted_rankscore	LRT_pred	MutationTaster_score	MutationTaster_converted_rankscore	MutationTaster_pred	MutationAssessor_score	MutationAssessor_rankscore	MutationAssessor_pred	FATHMM_score	FATHMM_rankscore	FATHMM_pred	RadialSVM_score	RadialSVM_rankscore	RadialSVM_pred	LR_score	LR_rankscore	LR_pred	Reliability_index	CADD_raw	CADD_raw_rankscore	CADD_phred	GERP++_NR	GERP++_RS	GERP++_RS_rankscore	phyloP46way_primate	phyloP46way_primate_rankscore	phyloP46way_placental	phyloP46way_placental_rankscore	phyloP100way_vertebrate	phyloP100way_vertebrate_rankscore	phastCons46way_primate	phastCons46way_primate_rankscore	phastCons46way_placental	phastCons46way_placental_rankscore	phastCons100way_vertebrate	phastCons100way_vertebrate_rankscore	SiPhy_29way_pi	SiPhy_29way_logOdds	SiPhy_29way_logOdds_rankscore	LRT_Omega	UniSNP_ids	1000Gp1_AC	1000Gp1_AF	1000Gp1_AFR_AC	1000Gp1_AFR_AF	1000Gp1_EUR_AC	1000Gp1_EUR_AF	1000Gp1_AMR_AC	1000Gp1_AMR_AF	1000Gp1_ASN_AC	1000Gp1_ASN_AF	ESP6500_AA_AF	ESP6500_EA_AF
+1	69134	A	C	E	A	58997	OR4F5	Q8NH21	OR4F5_HUMAN	15	.	+	GAA	.	2	0	.	ENSG00000186092	ENST00000534990;ENST00000335137	63;15	ENSP00000334393:E15A	ENSP00000334393:E15A	0.03	0.62326	D	0.043	0.20261	B	0.037	0.21917	B	0.263780	0.15411	U	0.998654	0.22851	N	2.635	0.83312	M	7.42	0.00438	T	-0.9897	0.32693	T	0.0017	0.00524	T	10	-0.186758	0.06340	3.092	2.31	2.31	0.28768	0.327000	0.21459	1.014000	0.39417	0.296000	0.19083	0.475000	0.33008	0.951000	0.38953	0.000000	0.05858	1.0:0.0:0.0:0.0	8.5094	0.33208	0.481469	.	.	.	.	.	.	.	.	.	.	.	.	.
+1	69134	A	G	E	G	58997	OR4F5	Q8NH21	OR4F5_HUMAN	15	.	+	GAA	.	2	0	.	ENSG00000186092	ENST00000534990;ENST00000335137	63;15	ENSP00000334393:E15G	ENSP00000334393:E15G	0.09	0.49607	T	0.0	0.02656	B	0.001	0.04013	B	0.263780	0.15411	N	0.998383	0.23043	N	2.055	0.67517	M	7.4	0.00444	T	-0.9720	0.37103	T	0.0013	0.00412	T	10	-0.469020	0.04445	1.834	2.31	2.31	0.28768	0.327000	0.21459	1.014000	0.39417	0.296000	0.19083	0.475000	0.33008	0.951000	0.38953	0.000000	0.05858	1.0:0.0:0.0:0.0	8.5094	0.33208	0.481469	.	.	.	.	.	.	.	.	.	.	.	.	.
+1	69134	A	T	E	V	58997	OR4F5	Q8NH21	OR4F5_HUMAN	15	.	+	GAA	.	2	0	.	ENSG00000186092	ENST00000534990;ENST00000335137	63;15	ENSP00000334393:E15V	ENSP00000334393:E15V	0.03	0.62326	D	0.308	0.31100	B	0.18	0.34346	B	0.263780	0.15411	U	0.996706	0.23838	N	2.57	0.82056	M	7.39	0.00446	T	-1.0029	0.28902	T	0.0017	0.00524	T	10	0.661265	0.14645	7.544	2.31	2.31	0.28768	0.327000	0.21459	1.014000	0.39417	0.296000	0.19083	0.475000	0.33008	0.951000	0.38953	0.000000	0.05858	1.0:0.0:0.0:0.0	8.5094	0.33208	0.481469	.	.	.	.	.	.	.	.	.	.	.	.	.
+4	100239319	T	A	H	L	100458342	ADH1B	A8MYN5	.	8	.	-	CAC	-1.2513	2	0	C	ENSG00000196616	ENST00000305046;ENST00000394887;ENST00000412614	48;8;48	ENSP00000306606:H48L	ENSP00000306606:H48L;ENSP00000378351:H8L	0	0.87932	D	0.021	0.17268	B	0.009	0.13407	B	0.001009	0.40818	N	0.962682	0.39176	D	1.11	0.37507	L	3.73;3.49	0.05139	T;T	-1.0300	0.19614	T	0.0167	0.06508	T	10	2.521929	0.44866	14.39	4.41	3.57	0.40892	-0.215000	0.12644	0.318000	0.23185	2.012000	0.40932	0.849000	0.48306	0.053000	0.19242	0.997000	0.39634	0.0:0.8397:0.0:0.1603	10.5345	0.44996	0.280785	.	.	.	.	.	.	.	.	.	.	.	.	.
+4	100239319	T	C	H	R	100458342	ADH1B	A8MYN5	.	8	.	-	CAC	-1.2513	2	0	C	ENSG00000196616	ENST00000305046;ENST00000394887;ENST00000412614	48;8;48	ENSP00000306606:H48R	ENSP00000306606:H48R;ENSP00000378351:H8R	0.15	0.41790	T	0.0	0.02656	B	0.0	0.01281	B	0.001009	0.40818	N	0.848429	0.29591	P	.	.	.	3.8;3.52	0.04970	T;T	-0.9979	0.30416	T	0.0000	0.00012	T	8	0.445290	0.12164	6.418	4.41	3.57	0.40892	-0.215000	0.12644	0.318000	0.23185	2.012000	0.40932	0.849000	0.48306	0.053000	0.19242	0.997000	0.39634	0.0:0.8397:0.0:0.1603	10.5345	0.44996	0.280785	rs1229984;rs1789884;rs11537716;rs17028836;rs17856968;rs52797169;rs57624638;rs1229984	1723	0.7889194139194139	492	1.0	744	0.9815303430079155	332	0.9171270718232044	155	0.270979020979021	0.983886	0.953023
+4	100239319	T	G	H	P	100458342	ADH1B	A8MYN5	.	8	.	-	CAC	-1.2513	2	0	C	ENSG00000196616	ENST00000305046;ENST00000394887;ENST00000412614	48;8;48	ENSP00000306606:H48P	ENSP00000306606:H48P;ENSP00000378351:H8P	0	0.87932	D	0.0	0.02656	B	0.002	0.06405	B	0.001009	0.40818	N	0.961891	0.39133	D	0.42	0.16602	N	3.72;3.47	0.05258	T;T	-0.9816	0.34750	T	0.0130	0.04815	T	10	2.210304	0.39003	13.35	4.41	3.57	0.40892	-0.215000	0.12644	0.318000	0.23185	2.012000	0.40932	0.849000	0.48306	0.053000	0.19242	0.997000	0.39634	0.0:0.8397:0.0:0.1603	10.5345	0.44996	0.280785	.	.	.	.	.	.	.	.	.	.	.	.	.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/snpsift_dbnsfp.loc.sample	Mon Nov 10 14:17:47 2014 -0500
@@ -0,0 +1,3 @@
+#id	build	description	path	annotations
+#GRCh37_dbNSFP2.4	GRCh37	GRCh37 dbNSFP2.4	/depot/snpeff/	SIFT_pred,Uniprot_acc
+#GRCh38_dbNSFP2.7	GRCh38	GRCh38 dbNSFP2.7	/depot/snpeff/	SIFT_pred,Uniprot_acc
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample	Mon Nov 10 14:17:47 2014 -0500
@@ -0,0 +1,7 @@
+<tables>
+    <table name="snpsift_dbnsfp" comment_char="#"> <!-- data table referenced by the from_data_table options in snpSift_dbnsfp.xml -->
+        <columns>dbkey, build, name, value, annotations</columns> <!-- same field order as the header of tool-data/snpsift_dbnsfp.loc.sample: id, build, description, path, annotations -->
+        <file path="tool-data/snpsift_dbnsfp.loc" />
+    </table>
+</tables>
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Mon Nov 10 14:17:47 2014 -0500
@@ -0,0 +1,6 @@
+<?xml version="1.0"?>
+<tool_dependency>
+    <package name="snpEff" version="4.0">
+        <repository name="package_snpeff_4_0" owner="jjohnson" toolshed="https://testtoolshed.g2.bx.psu.edu" changeset_revision="4ac635fc1781" />
+    </package>
+</tool_dependency>