view scripts/variations2circos.sh @ 0:46f7f689b929 draft default tip

Uploaded
author saskia-hiltemann
date Tue, 17 Sep 2013 11:29:11 -0400
parents
children
line wrap: on
line source

#! /bin/bash

#convert LV output to circos input
# LV out: variantId	chromosome	begin	end	varType	reference	alleleSeq	xRef

# bin results and get snp density (value 0-1, numvars/binsize)
# determine highest value seen and write "max = <maxval>" to max.txt

infile=$1
outfile_wg=$2
outfile_chr=$3


echo "converting variations file"

#whole-chromosome file (bigger bins)
awk 'BEGIN{
		FS="\t";
		OFS=" ";
		maxval=0;
		binsize=5120000
	}{
		if (FNR>1 && index($1,"#")==0 && $0!="" && index($1,">")==0){
			density[$2,int($3/binsize)]++;			 
		}


	}END{
		for (i in density){
			numbins++;
			avg+=(density[i]/binsize);
  			split(i, separate, "\034") # separate[1] contains chr, separate[2] contains bin			
			print separate[1], separate[2]*binsize, (separate[2]+1)*binsize, density[i]/binsize
			if(density[i]/binsize > maxval)
				maxval=density[i]/binsize
		
		}
		avg/=numbins
		print "max = " maxval > "maxval_orig.txt"
		if(avg*5 < maxval)
			maxval = avg*5
		print "avg = " avg > "avg.txt"
		print "max = " maxval > "maxval.txt"
		print maxval > "maxtmp"


	}' $infile > $outfile_wg.tmp2

maxval=`cat maxtmp`
echo "maxval: $maxval"

#second pass, anything greater than maxval is set to maxval
awk 'BEGIN{
		FS="\t";
		OFS=" ";
		maxval="'"$maxval"'"
		
	}{
		if (FNR>1 && $4 > maxval){
			print $1,$2,$3,maxval		 	
		}
		else print $0


	}END{		
	}' $outfile_wg.tmp2 > $outfile_wg.tmp


sed -i 's/chr/hs/g' $outfile_wg.tmp
sed -i '/hsM/d' $outfile_wg.tmp
sort -d -k1,2 $outfile_wg.tmp > $outfile_wg




#per-chromosome file (smaller bins)
awk 'BEGIN{
		FS="\t";
		OFS=" ";	
		binsize=512000
	}{
		if (FNR>1){
			density[$2,int($3/binsize)]++;			 
		}


	}END{
		for (i in density){
			numbins["all"]++;
			avg["all"]+=(density[i]/binsize);
			
  			split(i, separate, "\034") # separate[1] contains chr, separate[2] contains bin	
			chr=separate[1]
			avg[chr]+= (density[i]/binsize);		
			numbins[chr]++

			print separate[1], separate[2]*binsize, (separate[2]+1)*binsize, density[i]/binsize

			if(density[i]/binsize > maxval["all"])
				maxval["all"]=density[i]/binsize
			if(density[i]/binsize > maxval[chr])
				maxval[chr]=density[i]/binsize				
		
		}
	
		

		for (i in avg){
			avg[i]/=numbins[i]
			if(avg[i]*3 < maxval[i])
				maxval[i] = avg[i]*3
			print "avg = " avg[i] > "snp_avg_"i".txt"
			print "max = " maxval[i] > "snp_maxval_"i".txt"
			
		}	
		


	}' $infile > $outfile_chr.tmp



sed -i 's/chr/hs/g' $outfile_chr.tmp
sed -i '/hsM/d' $outfile_chr.tmp
sort -d -k1,2 $outfile_chr.tmp > $outfile_chr