Mercurial > repos > niels > vcfflatten
view VCF-2-TSV.sh @ 0:8204b0000c8f draft default tip
Uploaded
author | niels |
---|---|
date | Mon, 19 Aug 2019 06:20:43 -0400 |
parents | |
children |
line wrap: on
line source
#!/bin/bash
#
# VCF-2-TSV.sh - flatten a VCF file into a tab-separated table.
#
# Usage: VCF-2-TSV.sh <input.vcf> <output.tsv> <fields.txt>
#
#   $1  input VCF file
#   $2  output TSV: the first seven VCF columns, then one column per INFO
#       field declared in the header ("INFO_<ID>"), then one column per
#       declared FORMAT field for every sample ("<sample>_FORMAT_<ID>", or
#       just "FORMAT_<ID>" when the header line has exactly 10 columns).
#   $3  output listing of the declared INFO/FORMAT fields with their
#       descriptions, in header order.
#
# The same field listing (INFO fields first, then FORMAT fields) is also
# written to stdout once the whole input has been processed.
#
# Exit status: 0 on success, 2 on a usage error, 1 (or the status of awk)
# on I/O errors.

# No "set -e": every failure is checked explicitly, and the exit status of the
# script is the status of the final "main" call.
set -u

# Print a one-line usage message to stderr.
usage() {
  printf 'Usage: %s <input.vcf> <output.tsv> <fields.txt>\n' "${0##*/}" >&2
}

#######################################
# Convert one VCF file to the flattened TSV representation.
# Arguments: $1 - input VCF, $2 - output TSV, $3 - output field listing
# Outputs:   field listing on stdout, diagnostics on stderr
# Returns:   0 on success, non-zero on error
#######################################
vcf_to_tsv() {
  local inputfile=$1 outfile=$2 outfile2=$3

  if [[ ! -r "$inputfile" ]]; then
    printf 'error: cannot read input file: %s\n' "$inputfile" >&2
    return 1
  fi

  # Create/truncate both outputs up front so they exist even when the input
  # has no header or no records, and so an unwritable path fails early.
  if ! : > "$outfile"; then
    printf 'error: cannot write output file: %s\n' "$outfile" >&2
    return 1
  fi
  if ! : > "$outfile2"; then
    printf 'error: cannot write output file: %s\n' "$outfile2" >&2
    return 1
  fi

  # File names are handed to awk with -v instead of being spliced into the
  # program text, so quotes in a path cannot corrupt the program. (awk still
  # interprets backslash escapes in -v values.) The input is supplied on
  # stdin so that a name containing "=" is never taken for an awk assignment.
  awk -v outfile="$outfile" -v outfile2="$outfile2" '
    # Value of the ID key of a "##INFO=<...>" or "##FORMAT=<...>" line, with
    # the first space replaced by an underscore; "" when there is no ID key.
    function header_id(line,    id) {
      if (!match(line, /[<,]ID=[^,>]*/))
        return ""
      id = substr(line, RSTART + 4, RLENGTH - 4)
      sub(/ /, "_", id)
      return id
    }

    # Value of the Description key including its surrounding double quotes.
    # Commas and equals signs inside the quotes are kept; a backslash
    # escapes the following character. Falls back to the bare value when
    # the description is not quoted, and to "" when the key is missing.
    function header_description(line,    rest, len, i, ch) {
      if (match(line, /[<,]Description="/)) {
        rest = substr(line, RSTART + RLENGTH - 1)   # starts at opening quote
        len = length(rest)
        for (i = 2; i <= len; i++) {
          ch = substr(rest, i, 1)
          if (ch == "\\")
            i++
          else if (ch == "\"")
            break
        }
        return substr(rest, 1, i)
      }
      if (match(line, /[<,]Description=[^,>]*/))
        return substr(line, RSTART + 13, RLENGTH - 13)
      return ""
    }

    BEGIN {
      FS = "\t"
      OFS = "\t"
      n_info = 0
      n_format = 0
    }

    ### Meta-information lines: collect the declared INFO and FORMAT fields.
    # Only real "##INFO=<" / "##FORMAT=<" declarations count; other header
    # lines that merely mention those words are ignored.
    /^##/ {
      if ($0 ~ /^##INFO=</) {
        id = header_id($0)
        if (id != "") {
          n_info++
          info_id[n_info] = id
          info_descr[n_info] = header_description($0)
          print "infofield: ", id, info_descr[n_info] > outfile2
        }
      } else if ($0 ~ /^##FORMAT=</) {
        id = header_id($0)
        if (id != "") {
          n_format++
          format_id[n_format] = id
          format_descr[n_format] = header_description($0)
          print "Formatfield: ", id, format_descr[n_format] > outfile2
        }
      }
      next
    }

    ### Column header line: build the new header.
    /^#/ {
      # General columns (leading "#" stripped from the first name).
      printf "%s\t%s\t%s\t%s\t%s\t%s\t%s", substr($1, 2), $2, $3, $4, $5, $6, $7 > outfile
      # One column per declared INFO field.
      for (i = 1; i <= n_info; i++)
        printf("\t%s", "INFO_" info_id[i]) > outfile
      # One column per declared FORMAT field for every sample; the sample
      # name prefix is omitted when there is only a single sample column.
      for (j = 10; j <= NF; j++)
        for (k = 1; k <= n_format; k++) {
          if (NF == 10)
            printf("\t%s", "FORMAT_" format_id[k]) > outfile
          else
            printf("\t%s", $j "_FORMAT_" format_id[k]) > outfile
        }
      printf "\n" > outfile
      next
    }

    # Blank lines carry no record.
    $0 == "" { next }

    ### Data lines.
    {
      printf "%s\t%s\t%s\t%s\t%s\t%s\t%s", $1, $2, $3, $4, $5, $6, $7 > outfile

      # INFO: index the key=value items of this record. The split happens
      # at the first "=" only, so values containing "=" stay intact. A flag
      # (key without value) is reported by its own name. Only the first
      # occurrence of a key is used so a repeated key cannot shift columns.
      split("", info_value)
      n_items = split($8, items, ";")
      for (i = 1; i <= n_items; i++) {
        eq = index(items[i], "=")
        if (eq > 0) {
          key = substr(items[i], 1, eq - 1)
          value = substr(items[i], eq + 1)
        } else {
          key = items[i]
          value = ""
        }
        if (!(key in info_value))
          info_value[key] = (value == "" ? key : value)
      }
      for (i = 1; i <= n_info; i++) {
        cell = ""
        if (info_id[i] in info_value)
          cell = info_value[info_id[i]]
        printf("\t%s", cell) > outfile
      }

      # FORMAT: column 9 gives the order of the per-sample values in this
      # record; map every key to its position, then emit the values of each
      # sample in the order of the declared FORMAT fields.
      split("", format_pos)
      n_keys = split($9, keys, ":")
      for (k = 1; k <= n_keys; k++)
        if (!(keys[k] in format_pos))
          format_pos[keys[k]] = k
      for (j = 10; j <= NF; j++) {
        split($j, sample_values, ":")
        for (i = 1; i <= n_format; i++) {
          cell = ""
          if (format_id[i] in format_pos)
            cell = sample_values[format_pos[format_id[i]]]
          printf("\t%s", cell) > outfile
        }
      }
      printf "\n" > outfile
    }

    ### Summary of the declared fields on stdout, in declaration order.
    END {
      for (i = 1; i <= n_info; i++)
        print "infofield: ", info_id[i], info_descr[i]
      for (j = 1; j <= n_format; j++)
        print "formatfield: ", format_id[j], format_descr[j]
    }
  ' < "$inputfile"
}

# Validate the command line and run the conversion.
main() {
  if (( $# < 3 )); then
    usage
    return 2
  fi
  vcf_to_tsv "$1" "$2" "$3"
}

main "$@"