diff scripts/variations2circos.sh @ 0:46f7f689b929 draft default tip

Uploaded
author saskia-hiltemann
date Tue, 17 Sep 2013 11:29:11 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/variations2circos.sh	Tue Sep 17 11:29:11 2013 -0400
@@ -0,0 +1,129 @@
+#! /bin/bash
+
+#convert LV output to circos input
+# LV out: variantId	chromosome	begin	end	varType	reference	alleleSeq	xRef
+
+# bin results and get snp density (value 0-1, numvars/binsize)
+# determine highest value seen and write "max = <maxval>" to max.txt
+
+infile=$1
+outfile_wg=$2
+outfile_chr=$3
+
+
+echo "converting variations file"
+
+#whole-chromosome file (bigger bins)
+awk 'BEGIN{
+		FS="\t";
+		OFS=" ";
+		maxval=0;
+		binsize=5120000
+	}{
+		if (FNR>1 && index($1,"#")==0 && $0!="" && index($1,">")==0){
+			density[$2,int($3/binsize)]++;			 
+		}
+
+
+	}END{
+		for (i in density){
+			numbins++;
+			avg+=(density[i]/binsize);
+  			split(i, separate, "\034") # separate[1] contains chr, separate[2] contains bin			
+			print separate[1], separate[2]*binsize, (separate[2]+1)*binsize, density[i]/binsize
+			if(density[i]/binsize > maxval)
+				maxval=density[i]/binsize
+		
+		}
+		avg/=numbins
+		print "max = " maxval > "maxval_orig.txt"
+		if(avg*5 < maxval)
+			maxval = avg*5
+		print "avg = " avg > "avg.txt"
+		print "max = " maxval > "maxval.txt"
+		print maxval > "maxtmp"
+
+
+	}' $infile > $outfile_wg.tmp2
+
+maxval=`cat maxtmp`
+echo "maxval: $maxval"
+
+#second pass, anything greater than maxval is set to maxval
+awk 'BEGIN{
+		FS="\t";
+		OFS=" ";
+		maxval="'"$maxval"'"
+		
+	}{
+		if (FNR>1 && $4 > maxval){
+			print $1,$2,$3,maxval		 	
+		}
+		else print $0
+
+
+	}END{		
+	}' $outfile_wg.tmp2 > $outfile_wg.tmp
+
+
+sed -i 's/chr/hs/g' $outfile_wg.tmp
+sed -i '/hsM/d' $outfile_wg.tmp
+sort -d -k1,2 $outfile_wg.tmp > $outfile_wg
+
+
+
+
+#per-chromosome file (smaller bins)
+awk 'BEGIN{
+		FS="\t";
+		OFS=" ";	
+		binsize=512000
+	}{
+		if (FNR>1){
+			density[$2,int($3/binsize)]++;			 
+		}
+
+
+	}END{
+		for (i in density){
+			numbins["all"]++;
+			avg["all"]+=(density[i]/binsize);
+			
+  			split(i, separate, "\034") # separate[1] contains chr, separate[2] contains bin	
+			chr=separate[1]
+			avg[chr]+= (density[i]/binsize);		
+			numbins[chr]++
+
+			print separate[1], separate[2]*binsize, (separate[2]+1)*binsize, density[i]/binsize
+
+			if(density[i]/binsize > maxval["all"])
+				maxval["all"]=density[i]/binsize
+			if(density[i]/binsize > maxval[chr])
+				maxval[chr]=density[i]/binsize				
+		
+		}
+	
+		
+
+		for (i in avg){
+			avg[i]/=numbins[i]
+			if(avg[i]*3 < maxval[i])
+				maxval[i] = avg[i]*3
+			print "avg = " avg[i] > "snp_avg_"i".txt"
+			print "max = " maxval[i] > "snp_maxval_"i".txt"
+			
+		}	
+		
+
+
+	}' $infile > $outfile_chr.tmp
+
+
+
+sed -i 's/chr/hs/g' $outfile_chr.tmp
+sed -i '/hsM/d' $outfile_chr.tmp
+sort -d -k1,2 $outfile_chr.tmp > $outfile_chr
+
+
+
+