# HG changeset patch # User Jim Johnson # Date 1413911845 18000 # Node ID 49b5bd3dc316811e58d78e36fe2d1f06297cf8c6 # Parent 796388c291d395e5f78e44f9a013932673f9c309 Add rmInfo tool diff -r 796388c291d3 -r 49b5bd3dc316 snpSift_rmInfo.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/snpSift_rmInfo.xml Tue Oct 21 12:17:25 2014 -0500 @@ -0,0 +1,56 @@ + + remove INFO field annotations + + + snpEff_macros.xml + + + java -Xmx2G -jar \$SNPEFF_JAR_PATH/SnpSift.jar rmInfo $input ' '.join($info_fields.split(',')) > $output + + + + + Separate multiple INFO fields with a comma, e.g.: EFF,DP + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +This command removes INFO fields from a VCF file (i.e. removes annotations). + +Removing INFO fields is usually done because you want to re-annotate the VCF file, thus removing old INFO fields in order to add new ones later. + +SnpEff & SnpSift only add annotations and do not change current ones. So, in order to re-annotate a file, you should first remove the old annotations and then re-annotate. +The reason for this behavior is simply that replacing annotation values is considered bad practice. Imagine that you have a VCF entry in your re-annotated file having the value "AA=1": How do you know if this is from the old annotations or from the new ones? This confusion often leads to problems in downstream steps of your pipelines, so it's better to avoid the problem by first removing all the previous annotations and then adding the new ones. 
+ +@EXTERNAL_DOCUMENTATION@ + http://snpeff.sourceforge.net/SnpSift.html#rmInfo + +@CITATION_SECTION@ + + + diff -r 796388c291d3 -r 49b5bd3dc316 test-data/test_rmInfo.vcf --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_rmInfo.vcf Tue Oct 21 12:17:25 2014 -0500 @@ -0,0 +1,10 @@ +##fileformat=VCFv4.1 +##samtoolsVersion=0.1.18 (r982:295) +##INFO= +##SnpEffVersion="3.5 (build 2014-02-12), by Pablo Cingolani" +##SnpEffCmd="SnpEff -i vcf -o vcf -upDownStreamLen 5000 -spliceSiteSize 1 -stats /Users/jj/gxt/gxt/database/files/004/dataset_4998.dat GRCh37.71 /Users/jj/gxt/gxt/database/files/004/dataset_4996.dat " +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT /data/sequencing/output/biotec4/mapping/L774.q1.s.bam /data/sequencing/output/biotec4/mapping/L775.q1.s.bam +chr4 100239319 rs1229984 T C 94.3 . DP=29;EFF=EXON(MODIFIER|||||ADH1B|processed_transcript|CODING|ENST00000504498|3|1),EXON(MODIFIER|||||ADH1B|retained_intron|CODING|ENST00000515694|3|1),NON_SYNONYMOUS_CODING(MODERATE|MISSENSE|cAc/cGc|H48R|375|ADH1B|protein_coding|CODING|ENST00000305046|3|1),NON_SYNONYMOUS_CODING(MODERATE|MISSENSE|cAc/cGc|H8R|335|ADH1B|protein_coding|CODING|ENST00000394887|3|1),UTR_3_PRIME(MODIFIER||2729|||ADH1B|nonsense_mediated_decay|CODING|ENST00000506651|4|1) +chr12 32491626 rs1471909 G A 124.0 . DP=22;EFF=DOWNSTREAM(MODIFIER||532|||BICD1|retained_intron|CODING|ENST00000552160||1),INTRON(MODIFIER||||835|BICD1|protein_coding|CODING|ENST00000548411|7|1),INTRON(MODIFIER||||975|BICD1|protein_coding|CODING|ENST00000281474|7|1),INTRON(MODIFIER|||||BICD1|nonsense_mediated_decay|CODING|ENST00000395758|7|1),INTRON(MODIFIER|||||BICD1|retained_intron|CODING|ENST00000552226|1|1) +chrX 153010066 rs11803 C T 73.8 . 
DP=34;EFF=DOWNSTREAM(MODIFIER||4008||221|ABCD1|protein_coding|CODING|ENST00000443684||1),INTRAGENIC(MODIFIER|||||ABCD1||CODING|||1),INTRON(MODIFIER|||||U52111.14|antisense|NON_CODING|ENST00000434284|1|1),UTR_3_PRIME(MODIFIER||877||745|ABCD1|protein_coding|CODING|ENST00000218104|10|1)