4
|
1 <tool id="snpSift_extractFields" name="SnpSift Extract Fields" version="@WRAPPER_VERSION@.0">
|
|
<!-- Parameter sanitization is switched off for the whole tool, presumably so
     that field expressions containing brackets and asterisks (for example
     ANN[*].EFFECT) reach the command line unmodified. Because of this, the
     command template is responsible for quoting user-supplied text. -->
<options sanitize="False" />
<description>from a VCF file into a tabular file</description>
<!-- Shared definitions are imported from snpSift_macros.xml; the macros
     expanded below and the @...@ tokens used elsewhere in this file are
     expected to be defined there (not visible from this file). -->
<macros>
    <import>snpSift_macros.xml</import>
</macros>
<expand macro="requirements" />
<expand macro="stdio" />
<expand macro="version_command" />
|
|
<!-- Pipeline: cat the VCF, optionally split multi-effect records into one
     effect per line, then run SnpSift extractFields reading the VCF from
     stdin ("-") and redirect the table to the output dataset. -->
<command><![CDATA[
    ## Parameter sanitization is disabled for this tool (sanitize="False"), so
    ## all user-supplied text is placed inside single quotes and any embedded
    ## single quote is rewritten as '\'' (close quote, escaped quote, reopen).
    ## This keeps the shell from interpreting anything the user typed.
    cat '$input'
    #if $one_effect_per_line:
        | \$SNPEFF_JAR_PATH/scripts/vcfEffOnePerLine.pl
    #end if
    | java -Xmx6G -jar \$SNPEFF_JAR_PATH/SnpSift.jar extractFields
    #if $separator:
        #set $quoted_separator = str($separator).replace("'", "'\\''")
        -s '$quoted_separator'
    #end if
    #if $empty_text:
        #set $quoted_empty_text = str($empty_text).replace("'", "'\\''")
        -e '$quoted_empty_text'
    #end if
    -
    ## One shell argument per whitespace-separated field expression.
    #echo ' '.join(["'%s'" % x.replace("'", "'\\''") for x in $extract.split()])
    > '$output'
]]>
</command>
|
|
<inputs>
    <!-- VCF dataset that is streamed into the extraction pipeline. -->
    <param format="vcf" name="input" type="data" label="Variant input file in VCF format"/>
    <!-- Whitespace-separated list of field expressions; each one is passed to
         SnpSift extractFields as a separate argument. -->
    <param name="extract" type="text" label="Extract" size="160" help="Need help? See below a few examples." />
    <!-- When checked, the VCF is first piped through vcfEffOnePerLine.pl. -->
    <param name="one_effect_per_line" type="boolean" truevalue="yes" falsevalue="no" checked="false" label="One effect per line" help="When variants have more than one effect, lists one effect per line, while all other parameters in the line are repeated across multiple lines" />
    <!-- Optional; passed as the -s option only when non-empty. -->
    <param name="separator" type="text" value="" optional="true" label="multiple field separator" size="1" help="Separate multiple fields in one column with this character, e.g. a comma, rather than a column for each of the multiple values"/>
    <!-- Optional; passed as the -e option only when non-empty. -->
    <param name="empty_text" type="text" value="" optional="true" label="empty field text" size="10" help="Represent empty fields with this value, rather than leaving them blank"/>
</inputs>
|
|
<outputs>
    <!-- Tabular dataset that receives the stdout of the extraction pipeline. -->
    <data name="output" format="tabular"/>
</outputs>
|
|
<tests>
    <!-- No separator given: the effects must not show up as one
         comma-joined value. -->
    <test>
        <param name="input" value="test_rmInfo.vcf" ftype="vcf"/>
        <param name="extract" value="CHROM POS REF ALT EFF[*].EFFECT"/>
        <output name="output">
            <assert_contents>
                <has_text text="INTRAGENIC"/>
                <not_has_text text="DOWNSTREAM,INTRAGENIC,INTRON,UTR_3_PRIME"/>
            </assert_contents>
        </output>
    </test>

    <!-- Comma separator given: the effects are expected as a single
         comma-joined value. -->
    <test>
        <param name="input" value="test_rmInfo.vcf" ftype="vcf"/>
        <param name="extract" value="CHROM POS REF ALT EFF[*].EFFECT"/>
        <param name="separator" value=","/>
        <output name="output">
            <assert_contents>
                <has_text text="DOWNSTREAM,INTRAGENIC,INTRON,UTR_3_PRIME"/>
            </assert_contents>
        </output>
    </test>

</tests>
|
|
63 <help><![CDATA[
|
|
64
|
|
65 **SnpSift Extract Fields**
|
|
66
|
|
67 Extract fields from a VCF file to a TXT, tab separated format, that you can easily load in R, XLS, etc.
|
|
68
|
|
69 http://snpeff.sourceforge.net/SnpSift.html#Extract
|
|
70
|
|
71 You can also use sub-fields and genotype fields / sub-fields such as:
|
|
72
|
|
73 ::
|
|
74
|
|
75 Standard VCF fields:
|
|
76 CHROM
|
|
77 POS
|
|
78 ID
|
|
79 REF
|
|
80 ALT
|
|
81 FILTER
|
|
82 INFO fields:
|
|
83 AF
|
|
84 AC
|
|
85 DP
|
|
86 MQ
|
|
87 etc. (any info field available)
|
|
88 SnpEff 'ANN' fields:
|
|
89 "ANN[*].ALLELE" (alias GENOTYPE)
|
|
90 "ANN[*].EFFECT" (alias ANNOTATION): Effect in Sequence ontology terms (e.g. 'missense_variant', 'synonymous_variant', 'stop_gained', etc.)
|
|
91 "ANN[*].IMPACT": { HIGH, MODERATE, LOW, MODIFIER }
|
|
92 "ANN[*].GENE": Gene name (e.g. 'PSD3')
|
|
93 "ANN[*].GENEID": Gene ID
|
|
94 "ANN[*].FEATURE"
|
|
95 "ANN[*].FEATUREID" (alias TRID: Transcript ID)
|
|
96 "ANN[*].BIOTYPE": Biotype, as described by the annotations (e.g. 'protein_coding')
|
|
97 "ANN[*].RANK": Exon or Intron rank (i.e. exon number in a transcript)
|
|
98 "ANN[*].HGVS_C" (alias HGVS_DNA, CODON): Variant in HGVS (DNA) notation
|
|
99 "ANN[*].HGVS_P" (alias HGVS, HGVS_PROT, AA): Variant in HGVS (protein) notation
|
|
100 "ANN[*].CDNA_POS" (alias POS_CDNA)
|
|
101 "ANN[*].CDNA_LEN" (alias LEN_CDNA)
|
|
102 "ANN[*].CDS_POS" (alias POS_CDS)
|
|
103 "ANN[*].CDS_LEN" (alias LEN_CDS)
|
|
104 "ANN[*].AA_POS" (alias POS_AA)
|
|
105 "ANN[*].AA_LEN" (alias LEN_AA)
|
|
106 "ANN[*].DISTANCE"
|
|
107 "ANN[*].ERRORS" (alias WARNING, INFOS)
|
|
108 SnpEff 'EFF' fields (this is for older SnpEff/SnpSift versions; newer versions use the 'ANN' field):
|
|
109 "EFF[*].EFFECT"
|
|
110 "EFF[*].IMPACT"
|
|
111 "EFF[*].FUNCLASS"
|
|
112 "EFF[*].CODON"
|
|
113 "EFF[*].AA"
|
|
114 "EFF[*].AA_LEN"
|
|
115 "EFF[*].GENE"
|
|
116 "EFF[*].BIOTYPE"
|
|
117 "EFF[*].CODING"
|
|
118 "EFF[*].TRID"
|
|
119 "EFF[*].RANK"
|
|
120 SnpEff 'LOF' fields:
|
|
121 "LOF[*].GENE"
|
|
122 "LOF[*].GENEID"
|
|
123 "LOF[*].NUMTR"
|
|
124 "LOF[*].PERC"
|
|
125 SnpEff 'NMD' fields:
|
|
126 "NMD[*].GENE"
|
|
127 "NMD[*].GENEID"
|
|
128 "NMD[*].NUMTR"
|
|
129 "NMD[*].PERC"
|
|
130
|
|
131
|
|
132 Some examples:
|
|
133
|
|
134 - *Extracting chromosome, position, ID and allele frequency from a VCF file:*
|
|
135
|
|
136 **CHROM POS ID AF**
|
|
137
|
|
138 The result will look something like:
|
|
139
|
|
140 ::
|
|
141
|
|
142 #CHROM POS ID AF
|
|
143 1 69134 0.086
|
|
144 1 69496 rs150690004 0.001
|
|
145
|
|
146
|
|
147 - *Extracting genotype fields:*
|
|
148
|
|
149 **CHROM POS ID THETA GEN[0].GL[1] GEN[1].GL GEN[3].GL[*] GEN[*].GT**
|
|
150
|
|
151 This means to extract:
|
|
152
|
|
153 - CHROM POS ID: regular fields (as in the previous example)
|
|
154 - THETA : This one is from INFO
|
|
155 - GEN[0].GL[1] : Second likelihood from first genotype
|
|
156 - GEN[1].GL : The whole GL field (all entries without separating them)
|
|
157 - GEN[3].GL[*] : All likelihoods from genotype 3 (this time they will be tab separated, as opposed to the previous one).
|
|
158 - GEN[*].GT : Genotype subfields (GT) from ALL samples (tab separated).
|
|
159
|
|
160 The result will look something like:
|
|
161
|
|
162 ::
|
|
163
|
|
164 #CHROM POS ID THETA GEN[0].GL[1] GEN[1].GL GEN[3].GL[*] GEN[*].GT
|
|
165 1 10583 rs58108140 0.0046 -0.47 -0.24,-0.44,-1.16 -0.48 -0.48 -0.48 0|0 0|0 0|0 0|1 0|0 0|1 0|0 0|0 0|1
|
|
166 1 10611 rs189107123 0.0077 -0.48 -0.24,-0.44,-1.16 -0.48 -0.48 -0.48 0|0 0|1 0|0 0|0 0|0 0|0 0|0 0|0 0|0
|
|
167 1 13302 rs180734498 0.0048 -0.58 -2.45,-0.00,-5.00 -0.48 -0.48 -0.48 0|0 0|1 0|0 0|0 0|0 1|0 0|0 0|1 0|0
|
|
168
|
|
169 - *Extracting fields with multiple values:*
|
|
170 (notice that there are multiple effect columns per line because there are multiple effects per variant)
|
|
171
|
|
172 **CHROM POS REF ALT ANN[*].EFFECT**
|
|
173
|
|
174 The result will look something like:
|
|
175
|
|
176 ::
|
|
177
|
|
178 #CHROM POS REF ALT ANN[*].EFFECT
|
|
179 22 17071756 T C 3_prime_UTR_variant downstream_gene_variant
|
|
180 22 17072035 C T missense_variant downstream_gene_variant
|
|
181 22 17072258 C A missense_variant downstream_gene_variant
|
|
182
|
|
183 - *Extracting fields with multiple values using a comma as a multiple field separator:*
|
|
184
|
|
185 **CHROM POS REF ALT ANN[*].EFFECT ANN[*].HGVS_P**
|
|
186
|
|
187 The result will look something like:
|
|
188
|
|
189 ::
|
|
190
|
|
191 #CHROM POS REF ALT ANN[*].EFFECT ANN[*].HGVS_P
|
|
192 22 17071756 T C 3_prime_UTR_variant,downstream_gene_variant .,.
|
|
193 22 17072035 C T missense_variant,downstream_gene_variant p.Gly469Glu,.
|
|
194 22 17072258 C A missense_variant,downstream_gene_variant p.Gly395Cys,.
|
|
195
|
|
196
|
|
197 - *Extracting fields with multiple values, one effect per line:*
|
|
198
|
|
199 **CHROM POS REF ALT ANN[*].EFFECT**
|
|
200
|
|
201 The result will look something like:
|
|
202
|
|
203 ::
|
|
204
|
|
205 #CHROM POS REF ALT ANN[*].EFFECT
|
|
206 22 17071756 T C 3_prime_UTR_variant
|
|
207 22 17071756 T C downstream_gene_variant
|
|
208 22 17072035 C T missense_variant
|
|
209 22 17072035 C T downstream_gene_variant
|
|
210 22 17072258 C A missense_variant
|
|
211 22 17072258 C A downstream_gene_variant
|
|
212
|
|
213
|
|
214 @EXTERNAL_DOCUMENTATION@
|
|
215 http://snpeff.sourceforge.net/SnpSift.html#Extract
|
|
216
|
|
217 @CITATION_SECTION@
|
|
218
|
|
219 ]]>
|
|
220 </help>
|
|
221 </tool>
|