Mercurial > repos > niels > vcfflatten

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/VCF-2-TSV.sh	Mon Aug 19 06:20:43 2019 -0400
@@ -0,0 +1,151 @@
+#!/bin/bash
+
+#take a VCF file and split the columns in INFO and FORMAT fields to make a tsv file
+
+inputfile=$1
+outfile=$2
+outfile2=$3
+
+
+
+awk 'BEGIN{
+		FS="\t";
+		OFS="\t";
+		fc=1;
+		fc2=1;
+	}{
+
+
+		### Parse Header
+
+		if(index($1,"##")!=0){ #header
+
+			#get INFO fields
+			if(index($1,"INFO")!=0){
+				split($1,arr,",")
+
+				split(arr[1],arr2,"=")
+				sub(/ /,"_",arr2[3])
+				infofields[fc] = arr2[3]
+				sub(/ /,"_",arr2[3])
+
+
+				split(arr[4],arr3,"=")
+				infodescr[fc] = arr3[2]
+				print "infofield: ",infofields[fc], infodescr[fc]  > "'"$outfile2"'"
+				fc++
+			}
+
+			#get FORMAT fields
+			if(index($1,"FORMAT")!=0){
+				split($1,f_arr,",")
+
+				split(f_arr[1],f_arr2,"=")
+				sub(/ /,"_",f_arr2[3])
+				formatfields[fc2] = f_arr2[3]
+
+				split(f_arr[4],f_arr3,"=")
+				formatdescr[fc2] = f_arr3[2]
+				print "Formatfield: ",formatfields[fc2], formatdescr[fc2]  > "'"$outfile2"'"
+				fc2++
+			}
+
+
+		}
+
+
+		### Make new Header line
+
+		else if(index($1,"#")!=0){ #headerline
+
+			# General Columns
+			printf "%s\t%s\t%s\t%s\t%s\t%s\t%s",substr($1,2),$2,$3,$4,$5,$6,$7  > "'"$outfile"'"
+
+			# INFO columns
+			for (i=1;i<fc;i++)
+				printf "\t%s", "INFO_"infofields[i]  > "'"$outfile"'"
+				#printf "infofield: ",infofields[i]  > "'"$outfile2"'"
+
+			# FORMAT columns per sample
+			for (j=10;j<=NF;j++)
+				for (k=1;k<fc2;k++){
+					if (NF==10)
+						printf "\t%s", "FORMAT_"formatfields[k]  > "'"$outfile"'"
+						#printf "formatfield: ",formatfields[j] > "'"$outfile2"'"
+					else
+						printf "\t%s", $j"_FORMAT_"formatfields[k]  > "'"$outfile"'"
+						#printf "formatfield: ",formatfields[j] > "'"$outfile2"'"
+
+
+			}
+
+			printf "\n"  > "'"$outfile"'"
+			#printf "\n"  > "'"$outfile"'"
+
+
+		}
+
+
+		### Parse rest of file
+
+		else{
+
+			### GENERAL FIELDS
+			printf "%s\t%s\t%s\t%s\t%s\t%s\t%s",$1,$2,$3,$4,$5,$6,$7  > "'"$outfile"'"
+
+
+			### INFO FIELDS
+			split($8,infovalues,";")  # split values in INFO column
+			for (i=1;i<fc;i++){
+				present=0
+				for (j in infovalues){
+					split(infovalues[j],arr5,"=")
+
+					if(arr5[1]==infofields[i]) {
+						if(arr5[2]=="")
+							printf "\t%s", arr5[1]  > "'"$outfile"'"
+						else
+							printf "\t%s", arr5[2]  > "'"$outfile"'"
+						present=1
+					}
+				}
+				if (!present) printf "\t"  > "'"$outfile"'"
+
+			}
+
+			### FORMAT FIELDS
+			split($9,formatcols,":")  # what is order of format fields in next columns?
+
+			for (j=10;j<=NF;j++){
+				split($j,formatvalues,":")  # get format fields
+
+				col=1
+				for (i=1;i<fc2;i++){		#put them in the right column
+					present=0
+					for (k in formatcols){
+						if(formatcols[k]==formatfields[i]){
+							printf "\t%s", formatvalues[k]  > "'"$outfile"'"
+							present=1
+						}
+					}
+					if (!present) printf "\t"  > "'"$outfile"'"
+					col++
+				}
+
+
+			}
+			printf "\n"  > "'"$outfile"'"
+		}
+
+	}END{
+		for(i in infofields)
+			print "infofield: ",infofields[i], infodescr[i]
+			#print "infofield: ",infofields[i], infodescr[i]  > "'"$outfile2"'"
+
+		for(j in formatfields)
+			print "formatfield: ",formatfields[j], formatdescr[j]
+			#print "formatfield: ",formatfields[j], formatdescr[j] > "'"$outfile2"'"
+
+
+
+	}' $inputfile
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vcf_flatten.xml	Mon Aug 19 06:20:43 2019 -0400
@@ -0,0 +1,69 @@
+<tool id="vcfflatten" name="VCF-flatten" version="1">
+
+  <description> Converts a VCF file to a tab-delimited file with one value per column </description>
+
+  <!-- Runs the bundled script. Arguments, in order: the input VCF, the flattened table, the field-description listing. Each path is single-quoted so the shell passes it as one word. -->
+  <command interpreter="bash">
+  	VCF-2-TSV.sh '$infile' '$outfile' '$outfile2'   </command>
+
+
+  <!-- Single input: the dataset that is passed to the script as its first argument ($infile). -->
+  <inputs>
+   <param name="infile" type="data" label="Select VCF file " />
+  </inputs>
+
+  <!-- outfile receives the flattened table (second script argument); outfile2 receives the INFO/FORMAT field listing (third script argument). -->
+  <outputs>
+    <data format="tabular" name="outfile" label="VCF-flatten on ${on_string}"/>
+    <data format="txt" name="outfile2" label="VCF-details on ${on_string}"/>
+  </outputs>
+
+  <help>
+
+
+**What it does**
+
+This tool will take a VCF file, and convert it to a tab-delimited file with a 1-line header and separate columns for all INFO and FORMAT fields.
+It also creates a second output that lists every INFO and FORMAT field together with its description.
+
+An example input::
+
+    	# #fileformat=VCFv4.1
+	# #fileDate=20090805
+	# #source=myImputationProgramV3.1
+	# #reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta
+	# #contig=&lt;ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d618ff66beb2da,species="Homo sapiens",taxonomy=x&gt;
+	# #phasing=partial
+	# #INFO=&lt;ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data"&gt;
+	# #INFO=&lt;ID=DP,Number=1,Type=Integer,Description="Total Depth"&gt;
+	# #INFO=&lt;ID=AF,Number=A,Type=Float,Description="Allele Frequency"&gt;
+	# #INFO=&lt;ID=AA,Number=1,Type=String,Description="Ancestral Allele"&gt;
+	# #INFO=&lt;ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129"&gt;
+	# #INFO=&lt;ID=H2,Number=0,Type=Flag,Description="HapMap2 membership"&gt;
+	# #FILTER=&lt;ID=q10,Description="Quality below 10"&gt;
+	# #FILTER=&lt;ID=s50,Description="Less than 50% of samples have data"&gt;
+	# #FORMAT=&lt;ID=GT,Number=1,Type=String,Description="Genotype"&gt;
+	# #FORMAT=&lt;ID=GQ,Number=1,Type=Integer,Description="Genotype Quality"&gt;
+	# #FORMAT=&lt;ID=DP,Number=1,Type=Integer,Description="Read Depth"&gt;
+	# #FORMAT=&lt;ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality"&gt;
+	#CHROM POS     ID        REF    ALT     QUAL FILTER INFO                              FORMAT      NA00001        NA00002        NA00003
+	20     14370   rs6054257 G      A       29   PASS   NS=3;DP=14;AF=0.5;DB;H2           GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,.
+	20     17330   .         T      A       3    q10    NS=3;DP=11;AF=0.017               GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3   0/0:41:3
+	20     1110696 rs6040355 A      G,T     67   PASS   NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2   2/2:35:4
+	20     1230237 .         T      .       47   PASS   NS=3;DP=13;AA=T                   GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2
+	20     1234567 microsat1 GTC    G,GTCT  50   PASS   NS=3;DP=9;AA=G                    GT:GQ:DP    0/1:35:4       0/2:17:2       1/1:40:3
+
+
+will become::
+
+
+	CHROM	POS	ID		REF	ALT	QUAL	FILTER	NS	DP	AF		AA	DB	H2	NA00001_GT	NA00001_GQ	NA00001_DP	NA00001_HQ	NA00002_GT	NA00002_GQ	NA00002_DP	NA00002_HQ	NA00003_GT	NA00003_GQ	NA00003_DP	NA00003_HQ
+	20	14370	rs6054257	G	A	29	PASS	3	14	0.5			DB	H2	0|0		48		1		51,51		1|0		48		8		51,51		1/1		43		5		.,.
+	20	17330	.		T	A	3	q10	3	11	0.017					0|0		49		3		58,50		0|1		3		5		65,3		0/0		41		3
+	20	1110696	rs6040355	A	G,T	67	PASS	2	10	0.333,0.667	T	DB		1|2		21		6		23,27		2|1		2		0		18,2		2/2		35		4
+	20	1230237	.		T	.	47	PASS	3	13			T			0|0		54		7		56,60		0|0		48		4		51,51		0/0		61		2
+	20	1234567	microsat1	GTC	G,GTCT	50	PASS	3	9			G			0/1		35		4				0/2		17		2				1/1		40		3
+
+  </help>
+</tool>