Mercurial > repos > niels > vcfflatten
changeset 0:8204b0000c8f draft default tip
Uploaded
author | niels |
---|---|
date | Mon, 19 Aug 2019 06:20:43 -0400 |
parents | |
children | |
files | VCF-2-TSV.sh vcf_flatten.xml |
diffstat | 2 files changed, 220 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/bin/bash
#
# VCF-2-TSV.sh
#
# Take a VCF file and split the INFO and FORMAT columns into one column per
# declared field, producing a tab-separated table with a one-line header.
#
# Usage:
#   VCF-2-TSV.sh <input.vcf> <output.tsv> <details.txt>
#
#   input.vcf    VCF file to flatten
#   output.tsv   flattened table (first 7 VCF columns, then INFO_<id>
#                columns, then FORMAT columns for every sample)
#   details.txt  one line per INFO/FORMAT field declared in the VCF header,
#                with its description
#
# The list of declared fields is also echoed to stdout.
#
# NOTE: `set -e` is deliberately not used; every step that can fail is
# checked explicitly and the exit status of the script is that of main.

#######################################
# Flatten a VCF file into a TSV table and a field-description list.
# Arguments:
#   $1 - path of the VCF file to read
#   $2 - path of the TSV table to write (truncated/created)
#   $3 - path of the field-description file to write (truncated/created)
# Outputs:
#   Writes the declared INFO/FORMAT fields and descriptions to stdout.
# Returns:
#   0 on success, non-zero if the input is unreadable, an output cannot be
#   created, or awk fails.
#######################################
vcf_flatten() {
  local inputfile=$1
  local outfile=$2
  local outfile2=$3

  if [[ ! -r "$inputfile" ]]; then
    printf 'VCF-2-TSV: cannot read input file: %s\n' "$inputfile" >&2
    return 1
  fi

  # Create both outputs up front so they exist even when the VCF declares no
  # INFO/FORMAT fields or contains no records.
  : > "$outfile" || return 1
  : > "$outfile2" || return 1

  # The output paths travel through the environment instead of being pasted
  # into the awk program text, so quotes or backslashes in a path cannot
  # change the program. The input is fed on stdin so that a path containing
  # "=" is never mistaken by awk for a variable assignment operand.
  VCF_FLATTEN_TSV="$outfile" VCF_FLATTEN_DETAILS="$outfile2" awk '
    BEGIN {
      FS = "\t"
      OFS = "\t"
      tsv = ENVIRON["VCF_FLATTEN_TSV"]
      details = ENVIRON["VCF_FLATTEN_DETAILS"]
      n_info = 0
      n_format = 0
    }

    # Return the ID of a ##INFO / ##FORMAT meta line ("" when there is none).
    # Spaces in the ID are replaced by underscores.
    function meta_id(line,    id) {
      if (!match(line, /[<,]ID=[^,>]+/)) {
        return ""
      }
      # Skip the 4 characters of the matched "<ID=" or ",ID=" prefix.
      id = substr(line, RSTART + 4, RLENGTH - 4)
      gsub(/ /, "_", id)
      return id
    }

    # Return the quoted Description value of a meta line without the quotes.
    # Commas and "=" inside the description are preserved.
    function meta_description(line) {
      if (!match(line, /Description="([^"\\]|\\.)*"/)) {
        return ""
      }
      # 13 = length of the leading Description=" ; 14 also drops the
      # closing quote.
      return substr(line, RSTART + 13, RLENGTH - 14)
    }

    ### Parse header: INFO declarations
    /^##INFO=</ {
      id = meta_id($0)
      if (id != "") {
        n_info++
        info_id[n_info] = id
        info_descr[n_info] = meta_description($0)
        print "infofield: ", id, info_descr[n_info] > details
      }
      next
    }

    ### Parse header: FORMAT declarations
    /^##FORMAT=</ {
      id = meta_id($0)
      if (id != "") {
        n_format++
        format_id[n_format] = id
        format_descr[n_format] = meta_description($0)
        print "Formatfield: ", id, format_descr[n_format] > details
      }
      next
    }

    # Any other meta line (##fileformat, ##FILTER, ##contig, ...) is ignored,
    # even when its text happens to contain "INFO" or "FORMAT".
    /^##/ { next }

    ### Make new header line from the #CHROM line
    /^#/ {
      # General columns; the leading "#" of "#CHROM" is dropped.
      printf "%s\t%s\t%s\t%s\t%s\t%s\t%s", substr($1, 2), $2, $3, $4, $5, $6, $7 > tsv

      # INFO columns
      for (i = 1; i <= n_info; i++) {
        printf "\tINFO_%s", info_id[i] > tsv
      }

      # FORMAT columns per sample; for a single-sample file (exactly 10
      # columns) the sample name is left out of the column names.
      for (s = 10; s <= NF; s++) {
        for (k = 1; k <= n_format; k++) {
          if (NF == 10) {
            printf "\tFORMAT_%s", format_id[k] > tsv
          } else {
            printf "\t%s_FORMAT_%s", $s, format_id[k] > tsv
          }
        }
      }

      printf "\n" > tsv
      next
    }

    # Blank lines carry no record.
    $0 == "" { next }

    ### Parse rest of file: one output row per record
    {
      ### GENERAL FIELDS
      printf "%s\t%s\t%s\t%s\t%s\t%s\t%s", $1, $2, $3, $4, $5, $6, $7 > tsv

      ### INFO FIELDS
      # Index the key/value pairs of this record once, then emit exactly one
      # cell per declared field so the columns always line up.
      split("", info_val)
      n_pairs = split($8, pairs, ";")
      for (p = 1; p <= n_pairs; p++) {
        eq = index(pairs[p], "=")
        if (eq > 0) {
          # Split on the first "=" only, so values may contain "=".
          key = substr(pairs[p], 1, eq - 1)
          val = substr(pairs[p], eq + 1)
        } else {
          key = pairs[p]
          val = ""
        }
        if (val == "") {
          val = key          # flags are shown by their own name
        }
        if (!(key in info_val)) {
          info_val[key] = val
        }
      }
      for (i = 1; i <= n_info; i++) {
        cell = ""
        if (info_id[i] in info_val) {
          cell = info_val[info_id[i]]
        }
        printf "\t%s", cell > tsv
      }

      ### FORMAT FIELDS
      # Column 9 gives the order of the format keys used by the sample
      # columns of this record.
      split("", format_pos)
      n_keys = split($9, format_keys, ":")
      for (k = 1; k <= n_keys; k++) {
        if (!(format_keys[k] in format_pos)) {
          format_pos[format_keys[k]] = k
        }
      }
      for (s = 10; s <= NF; s++) {
        n_vals = split($s, sample_vals, ":")
        for (i = 1; i <= n_format; i++) {
          cell = ""
          # Trailing sample values may be omitted in a VCF; those cells
          # stay empty.
          if ((format_id[i] in format_pos) && format_pos[format_id[i]] <= n_vals) {
            cell = sample_vals[format_pos[format_id[i]]]
          }
          printf "\t%s", cell > tsv
        }
      }

      printf "\n" > tsv
    }

    END {
      # Echo the declared fields to stdout, in declaration order.
      for (i = 1; i <= n_info; i++) {
        print "infofield: ", info_id[i], info_descr[i]
      }
      for (j = 1; j <= n_format; j++) {
        print "formatfield: ", format_id[j], format_descr[j]
      }
    }
  ' < "$inputfile"
}

#######################################
# Script entry point: validate the argument count and run the conversion.
# Arguments:
#   $1 - input VCF, $2 - output TSV, $3 - output field details
# Returns:
#   2 on usage error, otherwise the status of vcf_flatten.
#######################################
main() {
  if (( $# < 3 )); then
    printf 'Usage: %s <input.vcf> <output.tsv> <details.txt>\n' "${0##*/}" >&2
    return 2
  fi
  vcf_flatten "$1" "$2" "$3"
}

main "$@"
<tool id="vcfflatten" name="VCF-flatten" version="1">

    <!-- Galaxy wrapper for VCF-2-TSV.sh: flattens a VCF file into a
         tab-delimited table plus a list of the declared INFO/FORMAT fields. -->

    <description> Converts a VCF file to a tab-delimited file with one value per column </description>

    <!-- The interpreter is named in the command itself (the `interpreter`
         attribute is deprecated) and every path is single-quoted so that
         dataset paths containing spaces are passed as one argument. -->
    <command><![CDATA[
        bash '$__tool_directory__/VCF-2-TSV.sh' '$infile' '$outfile' '$outfile2'
    ]]></command>

    <inputs>
        <param name="infile" type="data" label="Select VCF file " />
    </inputs>

    <outputs>
        <data format="tabular" name="outfile" label="VCF-flatten on ${on_string}"/>
        <data format="txt" name="outfile2" label="VCF-details on ${on_string}"/>
    </outputs>

    <!-- The help text is wrapped in CDATA because the VCF example contains
         literal "<" and ">" characters. -->
    <help><![CDATA[

**What it does**

This tool takes a VCF file and converts it to a tab-delimited file with a 1-line header and separate columns for all INFO and FORMAT fields declared in the VCF header.
It also creates a second output listing every INFO and FORMAT field together with its description.

INFO columns are named ``INFO_<field>``. FORMAT columns are named ``<sample>_FORMAT_<field>``; when the VCF contains a single sample the sample name is omitted (``FORMAT_<field>``).
Fields that are absent from a record are left as empty cells, and flag fields (INFO entries without a value) are shown by their own name.

An example input::

    ##fileformat=VCFv4.1
    ##fileDate=20090805
    ##source=myImputationProgramV3.1
    ##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta
    ##contig=<ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d618ff66beb2da,species="Homo sapiens",taxonomy=x>
    ##phasing=partial
    ##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
    ##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
    ##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">
    ##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">
    ##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">
    ##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">
    ##FILTER=<ID=q10,Description="Quality below 10">
    ##FILTER=<ID=s50,Description="Less than 50% of samples have data">
    ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
    ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
    ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
    ##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">
    #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003
    20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,.
    20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3
    20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4
    20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2
    20 1234567 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3


will become (columns are tab-separated; empty cells are not visible below)::

    CHROM POS ID REF ALT QUAL FILTER INFO_NS INFO_DP INFO_AF INFO_AA INFO_DB INFO_H2 NA00001_FORMAT_GT NA00001_FORMAT_GQ NA00001_FORMAT_DP NA00001_FORMAT_HQ NA00002_FORMAT_GT NA00002_FORMAT_GQ NA00002_FORMAT_DP NA00002_FORMAT_HQ NA00003_FORMAT_GT NA00003_FORMAT_GQ NA00003_FORMAT_DP NA00003_FORMAT_HQ
    20 14370 rs6054257 G A 29 PASS 3 14 0.5 DB H2 0|0 48 1 51,51 1|0 48 8 51,51 1/1 43 5 .,.
    20 17330 . T A 3 q10 3 11 0.017 0|0 49 3 58,50 0|1 3 5 65,3 0/0 41 3
    20 1110696 rs6040355 A G,T 67 PASS 2 10 0.333,0.667 T DB 1|2 21 6 23,27 2|1 2 0 18,2 2/2 35 4
    20 1230237 . T . 47 PASS 3 13 T 0|0 54 7 56,60 0|0 48 4 51,51 0/0 61 2
    20 1234567 microsat1 GTC G,GTCT 50 PASS 3 9 G 0/1 35 4 0/2 17 2 1/1 40 3

    ]]></help>
</tool>