annotate VCF-2-TSV.sh @ 0:8204b0000c8f draft default tip

Uploaded
author niels
date Mon, 19 Aug 2019 06:20:43 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
8204b0000c8f Uploaded
niels
parents:
diff changeset
#!/bin/bash
#
# VCF-2-TSV.sh -- flatten a VCF file into a tab-separated table.
#
# Usage: VCF-2-TSV.sh <input.vcf> <output.tsv> <field-descriptions.txt>
#
#   $1  input VCF file
#   $2  output TSV: the first seven VCF columns, then one column per INFO
#       field declared in the header ("INFO_<ID>"), then one column per
#       declared FORMAT field for every sample ("FORMAT_<ID>" when there is a
#       single sample, "<sample>_FORMAT_<ID>" otherwise)
#   $3  output listing of the declared INFO / FORMAT fields and their
#       descriptions
#
# A summary of the declared fields is also printed to standard output.
# The exit status is 1 on a usage / unreadable-input error, otherwise the
# exit status of awk.

if [ "$#" -lt 3 ]; then
    echo "Usage: $0 <input.vcf> <output.tsv> <field-descriptions.txt>" >&2
    exit 1
fi

inputfile=$1
outfile=$2
outfile2=$3

if [ ! -r "$inputfile" ]; then
    echo "$0: cannot read input file: $inputfile" >&2
    exit 1
fi

# The output paths are handed to awk through the environment instead of being
# pasted into the program text, so paths containing spaces, quotes or
# backslashes are used verbatim. The input is supplied on stdin so that a file
# name containing "=" is never mistaken for an awk variable assignment.
VCF2TSV_OUT="$outfile" VCF2TSV_FIELDS="$outfile2" awk '
# Return the ID attribute of a "##INFO=<ID=...>" or "##FORMAT=<ID=...>" line.
# The first blank in the ID is replaced by "_" (historical behaviour).
function meta_id(line,    id) {
    id = ""
    if (match(line, /<ID=[^,>]*/))
        id = substr(line, RSTART + 4, RLENGTH - 4)
    sub(/ /, "_", id)
    return id
}

# Return the Description attribute of a meta-information line. The whole
# quoted string is returned, so commas and "=" inside it no longer truncate
# the text. The surrounding quotes, and a directly following ">", are kept
# because earlier versions of this script emitted them.
function meta_descr(line,    rest) {
    if (!match(line, /Description=/))
        return ""
    rest = substr(line, RSTART + RLENGTH)
    if (match(rest, /^"[^"]*">?/))
        return substr(rest, 1, RLENGTH)
    # Unquoted or unterminated description: keep the text up to the next comma.
    sub(/,.*$/, "", rest)
    return rest
}

BEGIN {
    FS  = "\t"
    OFS = "\t"
    out       = ENVIRON["VCF2TSV_OUT"]
    fieldsout = ENVIRON["VCF2TSV_FIELDS"]
    n_info = 0      # number of declared INFO fields
    n_fmt  = 0      # number of declared FORMAT fields

    # Open both outputs now so they exist (and are emptied) even when the
    # input declares no fields or contains no records.
    printf "" > out
    printf "" > fieldsout
}

# Blank lines carry no record.
NF == 0 { next }

### Parse header: INFO declarations
/^##INFO=</ {
    n_info++
    info_id[n_info]    = meta_id($0)
    info_descr[n_info] = meta_descr($0)
    print "infofield: ", info_id[n_info], info_descr[n_info] > fieldsout
    next
}

### Parse header: FORMAT declarations
/^##FORMAT=</ {
    n_fmt++
    fmt_id[n_fmt]    = meta_id($0)
    fmt_descr[n_fmt] = meta_descr($0)
    print "Formatfield: ", fmt_id[n_fmt], fmt_descr[n_fmt] > fieldsout
    next
}

# Every other meta-information line is ignored, even if its text happens to
# mention INFO or FORMAT.
/^##/ { next }

### Make new header line from the "#CHROM ..." line
/^#/ {
    # General columns (leading "#" stripped from the first one)
    printf "%s\t%s\t%s\t%s\t%s\t%s\t%s", substr($1, 2), $2, $3, $4, $5, $6, $7 > out

    # INFO columns
    for (i = 1; i <= n_info; i++)
        printf "\t%s", "INFO_" info_id[i] > out

    # FORMAT columns per sample; the sample name is only used as a prefix
    # when the line does not have exactly ten columns (one sample).
    for (s = 10; s <= NF; s++)
        for (k = 1; k <= n_fmt; k++) {
            if (NF == 10)
                printf "\t%s", "FORMAT_" fmt_id[k] > out
            else
                printf "\t%s", $s "_FORMAT_" fmt_id[k] > out
        }

    printf "\n" > out
    next
}

### Parse data records
{
    ### GENERAL FIELDS
    printf "%s\t%s\t%s\t%s\t%s\t%s\t%s", $1, $2, $3, $4, $5, $6, $7 > out

    ### INFO FIELDS
    # Build key -> value once per record. Only the first "=" separates key
    # and value; a flag (no value) is reported by its own name. When a key is
    # repeated the first occurrence wins, so each declared field yields
    # exactly one cell and the row stays aligned with the header.
    split("", info_val)
    n_pairs = split($8, pairs, ";")
    for (p = 1; p <= n_pairs; p++) {
        eq = index(pairs[p], "=")
        if (eq > 0) {
            key = substr(pairs[p], 1, eq - 1)
            val = substr(pairs[p], eq + 1)
        } else {
            key = pairs[p]
            val = ""
        }
        if (!(key in info_val))
            info_val[key] = (val == "" ? key : val)
    }
    for (i = 1; i <= n_info; i++)
        printf "\t%s", ((info_id[i] in info_val) ? info_val[info_id[i]] : "") > out

    ### FORMAT FIELDS
    # Column 9 gives the order of the values inside every sample column.
    split("", fmt_pos)
    n_keys = split($9, fmt_keys, ":")
    for (k = 1; k <= n_keys; k++)
        if (!(fmt_keys[k] in fmt_pos))
            fmt_pos[fmt_keys[k]] = k

    for (s = 10; s <= NF; s++) {
        split($s, sample_val, ":")
        # Emit the values in header-declaration order; a field that is absent
        # from this record (or dropped from the sample) gives an empty cell.
        for (i = 1; i <= n_fmt; i++) {
            name = fmt_id[i]
            printf "\t%s", ((name in fmt_pos) ? sample_val[fmt_pos[name]] : "") > out
        }
    }

    printf "\n" > out
}

### Summary of the declared fields on standard output, in declaration order
END {
    for (i = 1; i <= n_info; i++)
        print "infofield: ", info_id[i], info_descr[i]

    for (k = 1; k <= n_fmt; k++)
        print "formatfield: ", fmt_id[k], fmt_descr[k]
}' < "$inputfile"