diff snpSift_extractFields.xml @ 4:baf6602903e1

Uploaded
author jjohnson
date Wed, 09 Dec 2015 14:03:26 -0500
parents
children 824f78c0d0df
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/snpSift_extractFields.xml	Wed Dec 09 14:03:26 2015 -0500
@@ -0,0 +1,221 @@
+<tool id="snpSift_extractFields" name="SnpSift Extract Fields" version="@WRAPPER_VERSION@.0">
+    <options sanitize="False" />
+    <description>from a VCF file inot a tabular file</description>
+    <macros>
+        <import>snpSift_macros.xml</import>
+    </macros>
+    <expand macro="requirements" />
+    <expand macro="stdio" />
+    <expand macro="version_command" />
+    <command><![CDATA[
+        cat "$input"
+        #if $one_effect_per_line:
+          | \$SNPEFF_JAR_PATH/scripts/vcfEffOnePerLine.pl
+        #end if 
+        | java -Xmx6G -jar \$SNPEFF_JAR_PATH/SnpSift.jar extractFields 
+        #if $separator:
+          -s '$separator'  
+        #end if
+        #if $empty_text:
+          -e '$empty_text' 
+        #end if
+        -
+        #echo ' '.join(['"%s"' % x for x in $extract.split()])
+        > $output
+]]>
+    </command>
+    <inputs>
+        <param format="vcf" name="input" type="data" label="Variant input file in VCF format"/>
+        <param name="extract" type="text" label="Extract" size="160" help="Need help? See below a few examples." />
+        <param name="one_effect_per_line" type="boolean" truevalue="yes" falsevalue="no" checked="false" label="One effect per line" help="When variants have more than one effect, lists one effect per line, while all other parameters in the line are repeated across mutiple lines" />
+        <param name="separator" type="text" value="" optional="true" label="multiple field separator" size="1" help="Separate multiple fields in one column with this character, e.g. a comma, rather than a column for each of the multiple values">
+        </param>
+        <param name="empty_text" type="text" value="" optional="true" label="empty field text" size="10" help="Represent empty fields with this value, rather than leaving them blank" >
+        </param>
+    </inputs>
+    <outputs>
+        <data format="tabular" name="output" />
+    </outputs>
+    <tests>
+        <test>
+            <param name="input" ftype="vcf" value="test_rmInfo.vcf"/>
+            <param name="extract" value="CHROM POS REF ALT EFF[*].EFFECT"/>
+            <output name="output">
+                <assert_contents>
+                <has_text text="INTRAGENIC" />
+                <not_has_text text="DOWNSTREAM,INTRAGENIC,INTRON,UTR_3_PRIME" />
+                </assert_contents>
+            </output>
+        </test>
+
+        <test>
+            <param name="input" ftype="vcf" value="test_rmInfo.vcf"/>
+            <param name="extract" value="CHROM POS REF ALT EFF[*].EFFECT"/>
+            <param name="separator" value=","/>
+            <output name="output">
+                <assert_contents>
+		<has_text text="DOWNSTREAM,INTRAGENIC,INTRON,UTR_3_PRIME" />
+                </assert_contents>
+            </output>
+        </test>
+
+    </tests>
+    <help><![CDATA[
+
+**SnpSift Extract Fields**
+
+Extract fields from a VCF file to a TXT, tab separated format, that you can easily load in R, XLS, etc.
+
+http://snpeff.sourceforge.net/SnpSift.html#Extract
+
+You can also use sub-fields and genotype fields / sub-fields such as:
+
+  ::
+
+    Standard VCF fields:
+        CHROM
+        POS
+        ID
+        REF
+        ALT
+        FILTER 
+    INFO fields:
+        AF
+        AC
+        DP
+        MQ
+        etc. (any info field available) 
+    SnpEff 'ANN' fields:
+        "ANN[*].ALLELE" (alias GENOTYPE)
+        "ANN[*].EFFECT" (alias ANNOTATION): Effect in Sequence ontology terms (e.g. 'missense_variant', 'synonymous_variant', 'stop_gained', etc.)
+        "ANN[*].IMPACT:" { HIGH, MODERATE, LOW, MODIFIER }
+        "ANN[*].GENE:" Gene name (e.g. 'PSD3')
+        "ANN[*].GENEID:" Gene ID
+        "ANN[*].FEATURE
+        " ANN[*].FEATUREID (alias TRID: Transcript ID)
+        "ANN[*].BIOTYPE:" Biotype, as described by the annotations (e.g. 'protein_coding')
+        "ANN[*].RANK:" Exon or Intron rank (i.e. exon number in a transcript)
+        "ANN[*].HGVS_C" (alias HGVS_DNA, CODON): Variant in HGVS (DNA) notation
+        "ANN[*].HGVS_P" (alias HGVS, HGVS_PROT, AA): Variant in HGVS (protein) notation
+        "ANN[*].CDNA_POS" (alias POS_CDNA)
+        "ANN[*].CDNA_LEN" (alias LEN_CDNA)
+        "ANN[*].CDS_POS" (alias POS_CDS)
+        "ANN[*].CDS_LEN" (alias LEN_CDS)
+        "ANN[*].AA_POS" (alias POS_AA)
+        "ANN[*].AA_LEN" (alias LEN_AA)
+        "ANN[*].DISTANCE"
+        "ANN[*].ERRORS" (alias WARNING, INFOS) 
+    SnpEff 'EFF' fields (this is for older SnpEff/SnpSift versions, new version use 'ANN' field):
+        "EFF[*].EFFECT"
+        "EFF[*].IMPACT"
+        "EFF[*].FUNCLASS"
+        "EFF[*].CODON"
+        "EFF[*].AA"
+        "EFF[*].AA_LEN"
+        "EFF[*].GENE"
+        "EFF[*].BIOTYPE"
+        "EFF[*].CODING"
+        "EFF[*].TRID"
+        "EFF[*].RANK" 
+    SnpEff 'LOF' fields:
+        "LOF[*].GENE"
+        "LOF[*].GENEID"
+        "LOF[*].NUMTR"
+        "LOF[*].PERC" 
+    SnpEff' NMD' fields:
+        "NMD[*].GENE"
+        "NMD[*].GENEID"
+        "NMD[*].NUMTR"
+        "NMD[*].PERC" 
+
+
+Some examples:
+
+  - *Extracting chromosome, position, ID and allele frequency from a VCF file:*
+
+    **CHROM POS ID AF**
+
+    The result will look something like: 
+
+      ::
+
+        #CHROM        POS        ID            AF
+        1             69134                    0.086
+        1             69496      rs150690004   0.001
+
+
+  - *Extracting genotype fields:*
+
+    **CHROM POS ID THETA GEN[0].GL[1] GEN[1].GL GEN[3].GL[*] GEN[*].GT**
+
+    This means to extract:
+
+      - CHROM POS ID: regular fields (as in the previous example)
+      - THETA : This one is from INFO
+      - GEN[0].GL[1] : Second likelihood from first genotype
+      - GEN[1].GL : The whole GL fiels (all entries without separating them)
+      - GEN[3].GL[*] : All likelihoods form genotype 3 (this time they will be tab separated, as opposed to the previous one).
+      - GEN[*].GT : Genotype subfields (GT) from ALL samples (tab separated). 
+
+    The result will look something like: 
+
+      ::
+
+        #CHROM  POS     ID              THETA   GEN[0].GL[1]    GEN[1].GL               GEN[3].GL[*]            GEN[*].GT
+        1       10583   rs58108140      0.0046  -0.47           -0.24,-0.44,-1.16       -0.48   -0.48   -0.48   0|0     0|0     0|0     0|1     0|0     0|1     0|0     0|0     0|1 
+        1       10611   rs189107123     0.0077  -0.48           -0.24,-0.44,-1.16       -0.48   -0.48   -0.48   0|0     0|1     0|0     0|0     0|0     0|0     0|0     0|0     0|0 
+        1       13302   rs180734498     0.0048  -0.58           -2.45,-0.00,-5.00       -0.48   -0.48   -0.48   0|0     0|1     0|0     0|0     0|0     1|0     0|0     0|1     0|0 
+
+  - *Extracting fields with multiple values:*
+    (notice that there are multiple effect columns per line because there are mutiple effects per variant)
+
+    **CHROM POS REF ALT ANN[*].EFFECT**
+
+    The result will look something like: 
+
+      :: 
+
+        #CHROM	POS	REF	ALT	ANN[*].EFFECT
+        22	17071756	T	C	3_prime_UTR_variant	downstream_gene_variant
+        22	17072035	C	T	missense_variant	downstream_gene_variant
+        22	17072258	C	A	missense_variant	downstream_gene_variant
+
+  - *Extracting fields with multiple values using a comma as a multipe field separator:*
+
+    **CHROM POS REF ALT ANN[*].EFFECT ANN[*].HGVS_P**
+
+    The result will look something like: 
+
+      :: 
+
+        #CHROM	POS	REF	ALT	ANN[*].EFFECT	ANN[*].HGVS_P
+        22	17071756	T	C	3_prime_UTR_variant,downstream_gene_variant	.,.
+        22	17072035	C	T	missense_variant,downstream_gene_variant	p.Gly469Glu,.
+        22	17072258	C	A	missense_variant,downstream_gene_variant	p.Gly395Cys,.
+
+
+  - *Extracting fields with multiple values, one effect per line:*
+
+    **CHROM POS REF ALT ANN[*].EFFECT**
+
+    The result will look something like: 
+
+      :: 
+
+        #CHROM	POS	REF	ALT	ANN[*].EFFECT
+        22	17071756	T	C	3_prime_UTR_variant
+        22	17071756	T	C	downstream_gene_variant
+        22	17072035	C	T	missense_variant
+        22	17072035	C	T	downstream_gene_variant
+        22	17072258	C	A	missense_variant
+        22	17072258	C	A	downstream_gene_variant
+
+
+@EXTERNAL_DOCUMENTATION@
+	http://snpeff.sourceforge.net/SnpSift.html#Extract
+
+@CITATION_SECTION@
+
+]]>
+    </help>
+</tool>