# HG changeset patch
# User Jim Johnson <jj@umn.edu>
# Date 1414062385 18000
# Node ID 1739678def32a1012b546f9a03d77c5fc98a68d6
# Parent  49b5bd3dc316811e58d78e36fe2d1f06297cf8c6
Add vcfCheck and test cases

diff -r 49b5bd3dc316 -r 1739678def32 snpSift_vcfCheck.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/snpSift_vcfCheck.xml	Thu Oct 23 06:06:25 2014 -0500
@@ -0,0 +1,43 @@
+<tool id="snpSift_vcfCheck" name="SnpSift vcfCheck" version="4.0.0">
+    <description>basic checks for Vcf specification compliance</description>
+    <!-- The "requirements" and "stdio" macros expanded in this file come from snpEff_macros.xml (imported below). -->
+    <expand macro="requirements" />
+    <macros>
+        <import>snpEff_macros.xml</import>
+    </macros>
+    <!-- Run SnpSift vcfCheck on the input dataset and redirect its stdout into the output dataset. -->
+    <command>
+      java -Xmx2G -jar \$SNPEFF_JAR_PATH/SnpSift.jar vcfCheck '$input' > '$output'
+    </command>
+    <inputs>
+        <param format="vcf" name="input" type="data" label="Variant input file in VCF format to check"/>
+    </inputs>
+    <outputs>
+        <data format="vcf" name="output" />
+    </outputs>
+    <expand macro="stdio" />
+    <tests>
+        <!-- Run the checker on a VCF fixture; the test passes if the output contains the text "Errors". -->
+        <test>
+            <!-- "value" is resolved relative to the test-data directory, so only the file name is given. -->
+            <param name="input" ftype="vcf" value="test_bad.vcf"/>
+            <output name="output">
+                <assert_contents>
+                    <has_text text="Errors" />
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help>
+
+Perform some basic check ups on VCF files to spot common problems.
+
+SnpSift vcfCheck checks for some common problems where VCF files are not following the specification. Given that many common VCF problems cause analysis tools and pipelines to behave unexpectedly, this command is intended as a simple debugging tool. 
+
+@EXTERNAL_DOCUMENTATION@
+	http://snpeff.sourceforge.net/SnpSift.html#vcfCheck
+
+@CITATION_SECTION@
+
+    </help>
+</tool>
diff -r 49b5bd3dc316 -r 1739678def32 test-data/test_bad.vcf
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_bad.vcf	Thu Oct 23 06:06:25 2014 -0500
@@ -0,0 +1,11 @@
+##fileformat=VCFv4.1
+##samtoolsVersion=0.1.18 (r982:295)
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Raw read depth">
+##SnpEffVersion="3.5 (build 2014-02-12), by Pablo Cingolani"
+##SnpEffCmd="SnpEff  -i vcf -o vcf -upDownStreamLen 5000 -spliceSiteSize 1 -stats /Users/jj/gxt/gxt/database/files/004/dataset_4998.dat GRCh37.71 /Users/jj/gxt/gxt/database/files/004/dataset_4996.dat "
+##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted effects for this variant.Format: 'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_Change| Amino_Acid_length | Gene_Name | Transcript_BioType | Gene_Coding | Transcript_ID | Exon_Rank  | Genotype_Number [ | ERRORS | WARNINGS ] )' ">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
+chr4	100239319	rs1229984	T	C	94.3	.	DP=29;EFF=EXON(MODIFIER|||||ADH1B|processed_transcript|CODING|ENST00000504498|3|1),EXON(MODIFIER|||||ADH1B|retained_intron|CODING|ENST00000515694|3|1),NON_SYNONYMOUS_CODING(MODERATE|MISSENSE|cAc/cGc|H48R|375|ADH1B|protein_coding|CODING|ENST00000305046|3|1),NON_SYNONYMOUS_CODING(MODERATE|MISSENSE|cAc/cGc|H8R|335|ADH1B|protein_coding|CODING|ENST00000394887|3|1),UTR_3_PRIME(MODIFIER||2729|||ADH1B|nonsense_mediated_decay|CODING|ENST00000506651|4|1)
+chr12	32491626	rs1471909	G	A	124.0	.	DP=22;EFF=DOWNSTREAM(MODIFIER||532|||BICD1|retained_intron|CODING|ENST00000552160||1),INTRON(MODIFIER||||835|BICD1|protein_coding|CODING|ENST00000548411|7|1),INTRON(MODIFIER||||975|BICD1|protein_coding|CODING|ENST00000281474|7|1),INTRON(MODIFIER|||||BICD1|nonsense_mediated_decay|CODING|ENST00000395758|7|1),INTRON(MODIFIER|||||BICD1|retained_intron|CODING|ENST00000552226|1|1)
+chr12	3249626	rs1471909	G	A	124.0	.	DP=22;EFF=DOWNSTREAM(MODIFIER||532|||BICD1|retained_intron|CODING|ENST00000552160||1),INTRON(MODIFIER||||835|BICD1|protein_coding|CODING|ENST00000548411|7|1),INTRON(MODIFIER||||975|BICD1|protein_coding|CODING|ENST00000281474|7|1),INTRON(MODIFIER|||||BICD1|nonsense_mediated_decay|CODING|ENST00000395758|7|1),INTRON(MODIFIER|||||BICD1|retained_intron|CODING|ENST00000552226|1|1)
+chrX	153010066	rs11803	C	T	73.8	.	DP=34;EFF=DOWNSTREAM(MODIFIER||4008||221|ABCD1|protein_coding|CODING|ENST00000443684||1),INTRAGENIC(MODIFIER|||||ABCD1||CODING|||1),INTRON(MODIFIER|||||U52111.14|antisense|NON_CODING|ENST00000434284|1|1),UTR_3_PRIME(MODIFIER||877||745|ABCD1|protein_coding|CODING|ENST00000218104|10|1)
diff -r 49b5bd3dc316 -r 1739678def32 test-data/test_rmEff.vcf
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_rmEff.vcf	Thu Oct 23 06:06:25 2014 -0500
@@ -0,0 +1,10 @@
+##fileformat=VCFv4.1
+##samtoolsVersion=0.1.18 (r982:295)
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Raw read depth">
+##SnpEffVersion="3.5 (build 2014-02-12), by Pablo Cingolani"
+##SnpEffCmd="SnpEff  -i vcf -o vcf -upDownStreamLen 5000 -spliceSiteSize 1 -stats /Users/jj/gxt/gxt/database/files/004/dataset_4998.dat GRCh37.71 /Users/jj/gxt/gxt/database/files/004/dataset_4996.dat "
+##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted effects for this variant.Format: 'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_Change| Amino_Acid_length | Gene_Name | Transcript_BioType | Gene_Coding | Transcript_ID | Exon_Rank  | Genotype_Number [ | ERRORS | WARNINGS ] )' ">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
+chr4	100239319	rs1229984	T	C	94.3	.	DP=29
+chr12	32491626	rs1471909	G	A	124.0	.	DP=22
+chrX	153010066	rs11803	C	T	73.8	.	DP=34
diff -r 49b5bd3dc316 -r 1739678def32 test-data/test_rmInfo.vcf
--- a/test-data/test_rmInfo.vcf	Tue Oct 21 12:17:25 2014 -0500
+++ b/test-data/test_rmInfo.vcf	Thu Oct 23 06:06:25 2014 -0500
@@ -4,7 +4,7 @@
 ##SnpEffVersion="3.5 (build 2014-02-12), by Pablo Cingolani"
 ##SnpEffCmd="SnpEff  -i vcf -o vcf -upDownStreamLen 5000 -spliceSiteSize 1 -stats /Users/jj/gxt/gxt/database/files/004/dataset_4998.dat GRCh37.71 /Users/jj/gxt/gxt/database/files/004/dataset_4996.dat "
 ##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted effects for this variant.Format: 'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_Change| Amino_Acid_length | Gene_Name | Transcript_BioType | Gene_Coding | Transcript_ID | Exon_Rank  | Genotype_Number [ | ERRORS | WARNINGS ] )' ">
-#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	/data/sequencing/output/biotec4/mapping/L774.q1.s.bam	/data/sequencing/output/biotec4/mapping/L775.q1.s.bam
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
 chr4	100239319	rs1229984	T	C	94.3	.	DP=29;EFF=EXON(MODIFIER|||||ADH1B|processed_transcript|CODING|ENST00000504498|3|1),EXON(MODIFIER|||||ADH1B|retained_intron|CODING|ENST00000515694|3|1),NON_SYNONYMOUS_CODING(MODERATE|MISSENSE|cAc/cGc|H48R|375|ADH1B|protein_coding|CODING|ENST00000305046|3|1),NON_SYNONYMOUS_CODING(MODERATE|MISSENSE|cAc/cGc|H8R|335|ADH1B|protein_coding|CODING|ENST00000394887|3|1),UTR_3_PRIME(MODIFIER||2729|||ADH1B|nonsense_mediated_decay|CODING|ENST00000506651|4|1)
 chr12	32491626	rs1471909	G	A	124.0	.	DP=22;EFF=DOWNSTREAM(MODIFIER||532|||BICD1|retained_intron|CODING|ENST00000552160||1),INTRON(MODIFIER||||835|BICD1|protein_coding|CODING|ENST00000548411|7|1),INTRON(MODIFIER||||975|BICD1|protein_coding|CODING|ENST00000281474|7|1),INTRON(MODIFIER|||||BICD1|nonsense_mediated_decay|CODING|ENST00000395758|7|1),INTRON(MODIFIER|||||BICD1|retained_intron|CODING|ENST00000552226|1|1)
 chrX	153010066	rs11803	C	T	73.8	.	DP=34;EFF=DOWNSTREAM(MODIFIER||4008||221|ABCD1|protein_coding|CODING|ENST00000443684||1),INTRAGENIC(MODIFIER|||||ABCD1||CODING|||1),INTRON(MODIFIER|||||U52111.14|antisense|NON_CODING|ENST00000434284|1|1),UTR_3_PRIME(MODIFIER||877||745|ABCD1|protein_coding|CODING|ENST00000218104|10|1)