diff VCF-2-TSV.sh @ 0:8204b0000c8f draft default tip

Uploaded
author niels
date Mon, 19 Aug 2019 06:20:43 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/VCF-2-TSV.sh	Mon Aug 19 06:20:43 2019 -0400
@@ -0,0 +1,151 @@
+#!/bin/bash
+
+# Convert a VCF file into a tab-separated table.
+#
+# The fixed columns CHROM to FILTER are copied as they are. Every INFO key
+# declared in the VCF header becomes its own column, and every declared FORMAT
+# key becomes one column per sample.
+#
+# Usage: VCF-2-TSV.sh INPUT_VCF OUTPUT_TSV [FIELD_LIST]
+#   INPUT_VCF   VCF file to read
+#   OUTPUT_TSV  table to write
+#   FIELD_LIST  optional file that receives the declared INFO and FORMAT
+#               fields with their descriptions, discarded when omitted
+#
+# The same field list is also printed to stdout once the input has been read.
+#
+# Exits with status 2 when fewer than two arguments are given. Otherwise the
+# exit status is the one returned by awk.
+
+[ "$#" -ge 2 ] || { printf 'Usage: %s INPUT_VCF OUTPUT_TSV [FIELD_LIST]\n' "$0" >&2; exit 2; }
+
+inputfile=$1
+outfile=$2
+outfile2=${3:-/dev/null}
+
+# The output paths are passed with -v rather than spliced into the program
+# text, so a path containing spaces or quotes cannot break the awk program.
+awk -v outfile="$outfile" -v outfile2="$outfile2" '
+# Return the ID of a ##INFO or ##FORMAT meta line, or "" when it has none.
+function meta_id(line,    id) {
+	if (!match(line, /[<,]ID=[^,>]*/))
+		return ""
+	# Skip the four characters of the matched <ID= or ,ID= prefix.
+	id = substr(line, RSTART + 4, RLENGTH - 4)
+	# The first space in an ID is replaced by an underscore.
+	sub(/ /, "_", id)
+	return id
+}
+
+# Return the Description value of a meta line. The whole quoted string is
+# taken, so commas and "=" inside the text do not truncate it. The
+# surrounding quotes and a directly following ">" are kept in the result.
+function meta_description(line,    pos, rest) {
+	pos = index(line, "Description=")
+	if (pos == 0)
+		return ""
+	# 12 is the length of the Description= key itself.
+	rest = substr(line, pos + 12)
+	# Quoted value, backslash escapes allowed, optional closing bracket.
+	if (match(rest, /^"([^"\\]|\\.)*">?/))
+		return substr(rest, 1, RLENGTH)
+	sub(/,.*$/, "", rest)	# unquoted value: ends at the next comma
+	return rest
+}
+
+BEGIN {
+	# VCF columns are tab separated, and so is everything print writes.
+	FS = "\t"
+	OFS = "\t"
+	# Number of INFO and FORMAT fields declared so far.
+	n_info = 0
+	n_format = 0
+}
+
+### Meta lines: collect the declared INFO and FORMAT fields
+# Only lines that start with ##INFO= or ##FORMAT= count. Other meta lines
+# that merely mention these words (e.g. a logged command line) are ignored.
+/^##/ {
+	if ($0 ~ /^##INFO=/ && (id = meta_id($0)) != "") {
+		infofields[++n_info] = id
+		infodescr[n_info] = meta_description($0)
+		print "infofield: ", infofields[n_info], infodescr[n_info] > outfile2
+	} else if ($0 ~ /^##FORMAT=/ && (id = meta_id($0)) != "") {
+		formatfields[++n_format] = id
+		formatdescr[n_format] = meta_description($0)
+		print "Formatfield: ", formatfields[n_format], formatdescr[n_format] > outfile2
+	}
+
+	next
+}
+
+### Column header line (#CHROM ...): write the TSV header
+/^#/ {
+	# General columns; the INFO and FORMAT columns themselves are replaced
+	# by the per-field columns below.
+	printf("%s\t%s\t%s\t%s\t%s\t%s\t%s", substr($1, 2), $2, $3, $4, $5, $6, $7) > outfile
+
+	# INFO columns
+	for (i = 1; i <= n_info; i++)
+		printf("\t%s", "INFO_" infofields[i]) > outfile
+
+	# FORMAT columns per sample; the sample name is only used as a prefix
+	# when the file has more than one sample column.
+	for (j = 10; j <= NF; j++)
+		for (k = 1; k <= n_format; k++)
+			printf("\t%s", (NF == 10 ? "" : $j "_") "FORMAT_" formatfields[k]) > outfile
+
+	printf("\n") > outfile
+	next
+}
+
+### Data lines
+{
+	### General fields
+	printf("%s\t%s\t%s\t%s\t%s\t%s\t%s", $1, $2, $3, $4, $5, $6, $7) > outfile
+
+	### INFO fields
+	# Index the INFO column once per line. A key without a value (a flag)
+	# is reported by its own name. The first occurrence of a key wins, so a
+	# repeated key cannot add an extra cell and shift the row.
+	split("", infomap)
+	n = split($8, infovalues, ";")
+	for (j = 1; j <= n; j++) {
+		eq = index(infovalues[j], "=")
+		key = (eq ? substr(infovalues[j], 1, eq - 1) : infovalues[j])
+		val = (eq ? substr(infovalues[j], eq + 1) : "")
+		if (!(key in infomap))
+			infomap[key] = (val == "" ? key : val)
+	}
+
+	for (i = 1; i <= n_info; i++)
+		printf("\t%s", ((infofields[i] in infomap) ? infomap[infofields[i]] : "")) > outfile
+
+	### FORMAT fields
+	# Position of each key within the FORMAT column of this line.
+	split("", formatpos)
+	n = split($9, formatcols, ":")
+	for (k = 1; k <= n; k++)
+		if (!(formatcols[k] in formatpos))
+			formatpos[formatcols[k]] = k
+
+	# One cell per declared FORMAT field for every sample; empty when the
+	# field is not used on this line.
+	for (j = 10; j <= NF; j++) {
+		split($j, formatvalues, ":")
+		for (i = 1; i <= n_format; i++) {
+			name = formatfields[i]
+			printf("\t%s", ((name in formatpos) ? formatvalues[formatpos[name]] : "")) > outfile
+		}
+	}
+	printf("\n") > outfile
+}
+
+### Summary on stdout, in header order
+END {
+	for (i = 1; i <= n_info; i++)
+		print "infofield: ", infofields[i], infodescr[i]
+
+	for (j = 1; j <= n_format; j++)
+		print "formatfield: ", formatfields[j], formatdescr[j]
+}' "$inputfile"