#!/bin/bash
#
# Take a VCF file and split the INFO and FORMAT columns into one column per
# declared field, producing a tab-separated (TSV) table.
#
# Usage:
#   <script> <input.vcf|-> <out.tsv> <fields.txt>
#
# Arguments:
#   $1  input VCF (uncompressed text); "-" reads standard input
#   $2  output TSV: the first seven VCF columns, then one INFO_<ID> column
#       per ##INFO header line, then one [<sample>_]FORMAT_<ID> column per
#       sample and ##FORMAT header line (the sample prefix is omitted when
#       the file has exactly one sample)
#   $3  output field list: one "infofield: " / "Formatfield: " line per
#       declared field, with its ID and quoted description
#
# The same field list is also printed to standard output when the input has
# been fully read.
#
# Exit status: 0 on success, 2 on a usage error, non-zero on I/O failure.

#######################################
# Print the command-line synopsis to stderr.
#######################################
_vcf2tsv_usage() {
  printf 'Usage: %s <input.vcf|-> <out.tsv> <fields.txt>\n' "${0##*/}" >&2
}

#######################################
# Run the awk conversion on the VCF text arriving on stdin.
# Arguments: $1 - TSV output path, $2 - field-list output path
# Outputs:   writes both files; prints the field list to stdout
# Returns:   exit status of awk
#######################################
_vcf2tsv_convert() {
  # The output paths travel through the environment instead of being pasted
  # into the program text (quotes in a path would corrupt the program) or
  # passed with -v (awk expands backslash escapes in -v values).
  VCF2TSV_OUT="$1" VCF2TSV_FIELDS="$2" awk '
    # Return the ID of a meta line that starts with prefix ("##INFO=<ID=").
    function meta_id(line, prefix,    id) {
      id = substr(line, length(prefix) + 1)
      sub(/[,>].*$/, "", id)   # the ID ends at the next key or at ">"
      gsub(/ /, "_", id)       # spaces would be awkward in a column name
      return id
    }

    # Return the Description value of a meta line, quotes included.
    # Matching the quoted string keeps commas and "=" inside the text.
    function meta_description(line) {
      if (match(line, /Description="[^"]*"/))
        return substr(line, RSTART + 12, RLENGTH - 12)
      if (match(line, /Description=[^,>]*/))   # tolerate a missing quote
        return substr(line, RSTART + 12, RLENGTH - 12)
      return ""
    }

    BEGIN {
      FS = OFS = "\t"
      out = ENVIRON["VCF2TSV_OUT"]
      fields_out = ENVIRON["VCF2TSV_FIELDS"]
      n_info = 0
      n_format = 0
    }

    # Tolerate CRLF line endings and skip blank lines.
    { sub(/\r$/, "") }
    $0 == "" { next }

    ### Meta-information lines: collect the declared INFO / FORMAT fields.
    /^##/ {
      if (index($0, "##INFO=<ID=") == 1) {
        n_info++
        info_id[n_info] = meta_id($0, "##INFO=<ID=")
        info_desc[n_info] = meta_description($0)
        print "infofield: ", info_id[n_info], info_desc[n_info] > fields_out
      } else if (index($0, "##FORMAT=<ID=") == 1) {
        n_format++
        format_id[n_format] = meta_id($0, "##FORMAT=<ID=")
        format_desc[n_format] = meta_description($0)
        print "Formatfield: ", format_id[n_format], format_desc[n_format] > fields_out
      }
      next
    }

    ### Column header line (#CHROM ...): emit the new TSV header.
    /^#/ {
      # General columns; the leading "#" is dropped from the first name.
      printf "%s\t%s\t%s\t%s\t%s\t%s\t%s", substr($1, 2), $2, $3, $4, $5, $6, $7 > out

      for (i = 1; i <= n_info; i++)
        printf "\t%s", "INFO_" info_id[i] > out

      # FORMAT columns per sample; a single-sample file gets no prefix.
      for (s = 10; s <= NF; s++) {
        prefix = (NF == 10) ? "" : $s "_"
        for (k = 1; k <= n_format; k++)
          printf "\t%s", prefix "FORMAT_" format_id[k] > out
      }

      printf "\n" > out
      next
    }

    ### Data lines.
    {
      printf "%s\t%s\t%s\t%s\t%s\t%s\t%s", $1, $2, $3, $4, $5, $6, $7 > out

      # INFO: build a key -> value table once per line. Only the first "="
      # separates key from value, so values may contain "=". A key without
      # a value (a flag) is reported by its own name. If a key is repeated,
      # the first occurrence wins so the row cannot grow extra columns.
      split("", info_val)
      n_items = split($8, items, ";")
      for (i = 1; i <= n_items; i++) {
        eq = index(items[i], "=")
        key = eq ? substr(items[i], 1, eq - 1) : items[i]
        val = eq ? substr(items[i], eq + 1) : ""
        if (key != "" && !(key in info_val))
          info_val[key] = (val == "") ? key : val
      }
      for (i = 1; i <= n_info; i++) {
        cell = (info_id[i] in info_val) ? info_val[info_id[i]] : ""
        printf "\t%s", cell > out
      }

      # FORMAT: column 9 gives the order of the values in each sample column.
      split("", format_pos)
      n_keys = split($9, format_keys, ":")
      for (k = 1; k <= n_keys; k++)
        if (!(format_keys[k] in format_pos))
          format_pos[format_keys[k]] = k

      for (s = 10; s <= NF; s++) {
        split($s, sample, ":")
        for (i = 1; i <= n_format; i++) {
          # Trailing values omitted by the sample read back as empty.
          cell = (format_id[i] in format_pos) ? sample[format_pos[format_id[i]]] : ""
          printf "\t%s", cell > out
        }
      }
      printf "\n" > out
    }

    ### Summary of the declared fields on stdout, in declaration order.
    END {
      for (i = 1; i <= n_info; i++)
        print "infofield: ", info_id[i], info_desc[i]
      for (k = 1; k <= n_format; k++)
        print "formatfield: ", format_id[k], format_desc[k]
    }
  '
}

#######################################
# Validate the arguments and convert a VCF file to TSV.
# Arguments: $1 - input VCF or "-", $2 - TSV output, $3 - field-list output
# Outputs:   field list on stdout, diagnostics on stderr
# Returns:   0 on success, 2 on usage error, non-zero on I/O failure
#######################################
vcf2tsv() {
  if (( $# != 3 )); then
    _vcf2tsv_usage
    return 2
  fi

  local inputfile=$1
  local outfile=$2
  local outfile2=$3

  if [[ -z "$inputfile" || -z "$outfile" || -z "$outfile2" ]]; then
    _vcf2tsv_usage
    return 2
  fi
  # Both outputs are truncated on open, so colliding paths would lose data.
  if [[ "$outfile" == "$outfile2" || "$outfile" == "$inputfile" \
    || "$outfile2" == "$inputfile" ]]; then
    printf '%s: input and output paths must all differ\n' "${0##*/}" >&2
    return 2
  fi
  if [[ "$inputfile" != "-" ]] \
    && [[ ! -r "$inputfile" || -d "$inputfile" ]]; then
    printf '%s: cannot read input file: %s\n' "${0##*/}" "$inputfile" >&2
    return 1
  fi

  # Create/truncate both outputs up front so that a VCF without any header
  # lines never leaves stale content from a previous run behind.
  if ! : > "$outfile" || ! : > "$outfile2"; then
    printf '%s: cannot write output files\n' "${0##*/}" >&2
    return 1
  fi

  # The input is fed on stdin so that awk never interprets the file name
  # (a name containing "=" would be taken as a variable assignment).
  if [[ "$inputfile" == "-" ]]; then
    _vcf2tsv_convert "$outfile" "$outfile2"
  else
    _vcf2tsv_convert "$outfile" "$outfile2" < "$inputfile"
  fi
}

vcf2tsv "$@"