Mercurial > repos > niels > vcfflatten
diff VCF-2-TSV.sh @ 0:8204b0000c8f draft default tip
Uploaded
author | niels |
---|---|
date | Mon, 19 Aug 2019 06:20:43 -0400 |
parents | |
children |
line wrap: on
line diff
#!/bin/bash
#
# VCF-2-TSV.sh - flatten a VCF file into a tab-separated table.
#
# Every ID declared in a ##INFO header line becomes one "INFO_<ID>" column and
# every ID declared in a ##FORMAT header line becomes one column per sample
# ("FORMAT_<ID>" when the file has exactly one sample column, otherwise
# "<sample>_FORMAT_<ID>").
#
# Usage: VCF-2-TSV.sh <input.vcf> <output.tsv> <fields.txt>
#   $1  VCF file to read ("-" lets awk read standard input)
#   $2  TSV file to write: one header row, then one row per VCF record
#   $3  text file listing every declared INFO/FORMAT field and its description
#
# A summary of the declared fields is also printed to standard output.
# Exit status: 0 on success, 2 on a usage error, non-zero if a file cannot be
# read or written.
#
# Note: "set -e" is deliberately not used; failures are reported explicitly
# and the status of main (the last command) is the status of the script.

set -u

# Print a one-line usage message to stderr.
usage() {
  printf 'Usage: %s <input.vcf> <output.tsv> <fields.txt>\n' "${0##*/}" >&2
}

#######################################
# Convert one VCF file to TSV.
# Arguments: $1 - input VCF, $2 - output TSV, $3 - field description file
# Outputs:   writes $2 and $3; prints the field summary to stdout
# Returns:   0 on success, non-zero on error
#######################################
vcf_to_tsv() {
  local inputfile=$1
  local outfile=$2
  local outfile2=$3

  if [[ "$inputfile" != "-" && ! -r "$inputfile" ]]; then
    printf 'error: cannot read input file: %s\n' "$inputfile" >&2
    return 1
  fi

  # Create/truncate both outputs up front so they exist even when the input
  # declares no fields or holds no records (awk only creates a file on the
  # first write to it).
  if ! : > "$outfile" || ! : > "$outfile2"; then
    printf 'error: cannot write output files\n' >&2
    return 1
  fi

  # Output paths are handed to awk through the environment: splicing them
  # into the program text breaks on quotes/backslashes in a path, and
  # "-v var=value" would interpret backslash escapes in the value.
  VCF2TSV_OUT="$outfile" VCF2TSV_FIELDS="$outfile2" awk '
    # Return the ID of a ##INFO/##FORMAT header line, i.e. the text after the
    # second "=" of its first comma-separated part; the first space (if any)
    # is replaced by "_".
    function header_id(line,    parts, kv) {
      split(line, parts, ",")
      split(parts[1], kv, "=")
      sub(/ /, "_", kv[3])
      return kv[3]
    }

    # Return the Description value of a header line. Commas and "=" inside
    # the quoted text are kept. When further keys follow the closing quote
    # only the quoted string is returned; otherwise the remainder of the line
    # (quoted text plus the closing ">") is returned.
    function header_description(line,    start, rest, n, i, c) {
      start = index(line, "Description=")
      if (start == 0)
        return ""
      rest = substr(line, start + length("Description="))
      if (substr(rest, 1, 1) != "\"")
        return rest
      n = length(rest)
      for (i = 2; i <= n; i++) {
        c = substr(rest, i, 1)
        if (c == "\\") {        # skip the escaped character as well
          i++
          continue
        }
        if (c == "\"")
          break
      }
      if (i <= n && substr(rest, i + 1, 1) == ",")
        return substr(rest, 1, i)
      return rest
    }

    BEGIN {
      FS = "\t"
      OFS = "\t"
      out = ENVIRON["VCF2TSV_OUT"]
      fields_out = ENVIRON["VCF2TSV_FIELDS"]
      n_info = 0
      n_format = 0
    }

    # Blank lines carry no record.
    $0 == "" { next }

    ### Meta-information lines: collect the declared INFO and FORMAT fields.
    # Only lines that START with ##INFO= / ##FORMAT= count, so other header
    # lines that merely mention those words are ignored.
    substr($0, 1, 2) == "##" {
      if (index($0, "##INFO=") == 1) {
        n_info++
        info_id[n_info] = header_id($0)
        info_desc[n_info] = header_description($0)
        print "infofield: ", info_id[n_info], info_desc[n_info] > fields_out
      } else if (index($0, "##FORMAT=") == 1) {
        n_format++
        format_id[n_format] = header_id($0)
        format_desc[n_format] = header_description($0)
        print "Formatfield: ", format_id[n_format], format_desc[n_format] > fields_out
      }
      next
    }

    ### Column header line (#CHROM ...): write the TSV header row.
    substr($0, 1, 1) == "#" {
      # General columns; the leading "#" is dropped from the first name.
      printf("%s\t%s\t%s\t%s\t%s\t%s\t%s", substr($1, 2), $2, $3, $4, $5, $6, $7) > out

      # One column per declared INFO field.
      for (i = 1; i <= n_info; i++)
        printf("\t%s", "INFO_" info_id[i]) > out

      # One column per declared FORMAT field and sample; the sample name is
      # only used as a prefix when there is more than one sample column.
      single_sample = (NF == 10)
      for (j = 10; j <= NF; j++)
        for (k = 1; k <= n_format; k++) {
          if (single_sample)
            printf("\t%s", "FORMAT_" format_id[k]) > out
          else
            printf("\t%s", $j "_FORMAT_" format_id[k]) > out
        }

      printf("\n") > out
      next
    }

    ### Data lines.
    {
      # General columns.
      printf("%s\t%s\t%s\t%s\t%s\t%s\t%s", $1, $2, $3, $4, $5, $6, $7) > out

      # INFO: build a key -> cell lookup once per record. The value is all
      # text after the FIRST "=", so values containing "=" stay intact; a
      # key without a value (flag) is reported as the key itself. Only the
      # first occurrence of a repeated key is used so that every declared
      # field yields exactly one cell.
      split("", info_cell)
      n_entries = split($8, entries, ";")
      for (e = 1; e <= n_entries; e++) {
        eq = index(entries[e], "=")
        if (eq == 0) {
          key = entries[e]
          value = ""
        } else {
          key = substr(entries[e], 1, eq - 1)
          value = substr(entries[e], eq + 1)
        }
        if (!(key in info_cell))
          info_cell[key] = (value == "" ? key : value)
      }
      for (i = 1; i <= n_info; i++)
        printf("\t%s", (info_id[i] in info_cell) ? info_cell[info_id[i]] : "") > out

      # FORMAT: column 9 gives the order of the values in each sample column.
      split("", format_pos)
      n_keys = split($9, format_keys, ":")
      for (k = 1; k <= n_keys; k++)
        if (!(format_keys[k] in format_pos))
          format_pos[format_keys[k]] = k

      for (j = 10; j <= NF; j++) {
        split($j, sample_values, ":")
        for (i = 1; i <= n_format; i++) {
          name = format_id[i]
          # A field absent from this record, or dropped at the end of the
          # sample column, yields an empty cell.
          printf("\t%s", (name in format_pos) ? sample_values[format_pos[name]] : "") > out
        }
      }
      printf("\n") > out
    }

    ### Summary of the declared fields on stdout, in header order.
    END {
      for (i = 1; i <= n_info; i++)
        print "infofield: ", info_id[i], info_desc[i]
      for (i = 1; i <= n_format; i++)
        print "formatfield: ", format_id[i], format_desc[i]
    }
  ' "$inputfile"
}

# Validate the command line and run the conversion.
main() {
  if (( $# < 3 )); then
    usage
    return 2
  fi
  vcf_to_tsv "$1" "$2" "$3"
}

main "$@"