view VCF-2-TSV.sh @ 0:8204b0000c8f draft default tip

Uploaded
author niels
date Mon, 19 Aug 2019 06:20:43 -0400
parents
children
line wrap: on
line source

#!/bin/bash
#
# VCF-2-TSV.sh - flatten a VCF file into a tab-separated table by splitting
# the INFO column and the per-sample FORMAT columns into one column per field.
#
# Usage: VCF-2-TSV.sh <input.vcf> <output.tsv> <fields.txt>
#
#   input.vcf   plain-text VCF file to read
#   output.tsv  table holding the first seven VCF columns (CHROM..FILTER),
#               one INFO_<id> column per declared INFO field and, for every
#               sample, one FORMAT_<id> column per declared FORMAT field
#               (prefixed with the sample name when there are several samples)
#   fields.txt  list of the declared INFO/FORMAT fields and their descriptions
#
# The field list is additionally echoed to stdout, in declaration order.

set -u

# Print a short usage message to stderr.
usage() {
  printf 'Usage: %s <input.vcf> <output.tsv> <fields.txt>\n' "${0##*/}" >&2
}

#######################################
# Convert one VCF file to TSV.
# Arguments: $1 - input VCF path
#            $2 - output TSV path
#            $3 - output path for the INFO/FORMAT field list
# Outputs:   writes $2 and $3; echoes the field list to stdout
# Returns:   0 on success, 2 on wrong argument count, 1 on I/O problems,
#            otherwise the exit status of awk
#######################################
main() {
  if (( $# != 3 )); then
    usage
    return 2
  fi

  local inputfile=$1
  local outfile=$2
  local outfile2=$3

  if [[ ! -r "$inputfile" ]]; then
    printf '%s: cannot read input file: %s\n' "${0##*/}" "$inputfile" >&2
    return 1
  fi

  # Create both outputs up front: awk only creates a file on its first write,
  # so a VCF without INFO/FORMAT declarations would otherwise leave the field
  # list missing altogether.
  : > "$outfile" || return 1
  : > "$outfile2" || return 1

  # The output paths travel through the environment instead of being spliced
  # into the awk source, so quotes, backslashes or spaces in a path cannot
  # alter the program. The input is fed on stdin for the same reason.
  VCF2TSV_TABLE="$outfile" VCF2TSV_FIELDS="$outfile2" awk '
    # Value of the ID key of a "##INFO=<...>" or "##FORMAT=<...>" line.
    function meta_id(line,    p, rest, q) {
      p = index(line, "ID=")
      if (p == 0) return ""
      rest = substr(line, p + 3)
      q = match(rest, /[,>]/)
      return q ? substr(rest, 1, q - 1) : rest
    }

    # Description text of a meta line, without the surrounding quotes.
    # Commas and "=" inside the quoted text are kept.
    function meta_description(line,    p, rest, q) {
      p = index(line, "Description=")
      if (p == 0) return ""
      rest = substr(line, p + 12)
      if (substr(rest, 1, 1) == "\"") {
        rest = substr(rest, 2)
        # Longest prefix made of non-quote characters or backslash escapes,
        # i.e. everything up to the first unescaped closing quote.
        match(rest, /^([^"\\]|\\.)*/)
        return substr(rest, 1, RLENGTH)
      }
      q = match(rest, /[,>]/)
      return q ? substr(rest, 1, q - 1) : rest
    }

    BEGIN {
      FS = "\t"
      OFS = "\t"
      table = ENVIRON["VCF2TSV_TABLE"]
      fields = ENVIRON["VCF2TSV_FIELDS"]
      fc = 1     # next free slot in infofields
      fc2 = 1    # next free slot in formatfields
    }

    ### Meta lines: collect the declared INFO and FORMAT fields.
    ### Anchored patterns, so a line merely mentioning INFO or FORMAT in its
    ### text is not mistaken for a declaration.

    /^##INFO=</ {
      id = meta_id($0)
      gsub(/ /, "_", id)
      infofields[fc] = id
      infodescr[fc] = meta_description($0)
      print "infofield: ", infofields[fc], infodescr[fc] > fields
      fc++
      next
    }

    /^##FORMAT=</ {
      id = meta_id($0)
      gsub(/ /, "_", id)
      formatfields[fc2] = id
      formatdescr[fc2] = meta_description($0)
      print "Formatfield: ", formatfields[fc2], formatdescr[fc2] > fields
      fc2++
      next
    }

    /^##/ { next }    # any other meta line

    ### Column header line (#CHROM ...): emit the new header.

    /^#/ {
      # General columns; the leading "#" is dropped from the first name.
      printf "%s\t%s\t%s\t%s\t%s\t%s\t%s", substr($1, 2), $2, $3, $4, $5, $6, $7 > table

      # INFO columns
      for (i = 1; i < fc; i++)
        printf "\t%s", "INFO_" infofields[i] > table

      # FORMAT columns per sample; the sample name is only used as a prefix
      # when the file holds more than one sample.
      for (j = 10; j <= NF; j++)
        for (k = 1; k < fc2; k++) {
          if (NF == 10)
            printf "\t%s", "FORMAT_" formatfields[k] > table
          else
            printf "\t%s", $j "_FORMAT_" formatfields[k] > table
        }

      printf "\n" > table
      next
    }

    $0 == "" { next }    # ignore blank lines

    ### Data lines

    {
      # General fields
      printf "%s\t%s\t%s\t%s\t%s\t%s\t%s", $1, $2, $3, $4, $5, $6, $7 > table

      # INFO fields: index the entries by key. The entry is cut at the FIRST
      # "=" only, so values that contain "=" stay intact. A flag (no value)
      # is reported by its own name. The first occurrence of a key wins, so a
      # repeated key can never add an extra cell to the row.
      split("", infoval)
      n = split($8, entries, ";")
      for (e = 1; e <= n; e++) {
        eq = index(entries[e], "=")
        if (eq) {
          key = substr(entries[e], 1, eq - 1)
          val = substr(entries[e], eq + 1)
        } else {
          key = entries[e]
          val = ""
        }
        if (key in infoval) continue
        infoval[key] = (val == "" ? key : val)
      }
      for (i = 1; i < fc; i++)
        printf "\t%s", ((infofields[i] in infoval) ? infoval[infofields[i]] : "") > table

      # FORMAT fields: remember at which position each key sits on this line,
      # then pick the values of every sample in declaration order.
      split("", formatpos)
      nkeys = split($9, formatcols, ":")
      for (k = 1; k <= nkeys; k++)
        if (!(formatcols[k] in formatpos))
          formatpos[formatcols[k]] = k

      for (j = 10; j <= NF; j++) {
        split($j, formatvalues, ":")
        for (i = 1; i < fc2; i++) {
          if (formatfields[i] in formatpos)
            printf "\t%s", formatvalues[formatpos[formatfields[i]]] > table
          else
            printf "\t" > table
        }
      }

      printf "\n" > table
    }

    ### Summary on stdout, in declaration order.

    END {
      for (i = 1; i < fc; i++)
        print "infofield: ", infofields[i], infodescr[i]
      for (j = 1; j < fc2; j++)
        print "formatfield: ", formatfields[j], formatdescr[j]
    }
  ' < "$inputfile"
}

main "$@"