Mercurial > repos > saskia-hiltemann > cgtag_circos
diff scripts/variations2circos.sh @ 0:46f7f689b929 draft default tip
Uploaded
author | saskia-hiltemann |
---|---|
date | Tue, 17 Sep 2013 11:29:11 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/variations2circos.sh Tue Sep 17 11:29:11 2013 -0400 @@ -0,0 +1,129 @@ +#! /bin/bash + +#convert LV output to circos input +# LV out: variantId chromosome begin end varType reference alleleSeq xRef + +# bin results and get snp density (value 0-1, numvars/binsize) +# determine highest value seen and write "max = <maxval>" to max.txt + +infile=$1 +outfile_wg=$2 +outfile_chr=$3 + + +echo "converting variations file" + +#whole-chromosome file (bigger bins) +awk 'BEGIN{ + FS="\t"; + OFS=" "; + maxval=0; + binsize=5120000 + }{ + if (FNR>1 && index($1,"#")==0 && $0!="" && index($1,">")==0){ + density[$2,int($3/binsize)]++; + } + + + }END{ + for (i in density){ + numbins++; + avg+=(density[i]/binsize); + split(i, separate, "\034") # separate[1] contains chr, separate[2] contains bin + print separate[1], separate[2]*binsize, (separate[2]+1)*binsize, density[i]/binsize + if(density[i]/binsize > maxval) + maxval=density[i]/binsize + + } + avg/=numbins + print "max = " maxval > "maxval_orig.txt" + if(avg*5 < maxval) + maxval = avg*5 + print "avg = " avg > "avg.txt" + print "max = " maxval > "maxval.txt" + print maxval > "maxtmp" + + + }' $infile > $outfile_wg.tmp2 + +maxval=`cat maxtmp` +echo "maxval: $maxval" + +#second pass, anything greater than maxval is set to maxval +awk 'BEGIN{ + FS="\t"; + OFS=" "; + maxval="'"$maxval"'" + + }{ + if (FNR>1 && $4 > maxval){ + print $1,$2,$3,maxval + } + else print $0 + + + }END{ + }' $outfile_wg.tmp2 > $outfile_wg.tmp + + +sed -i 's/chr/hs/g' $outfile_wg.tmp +sed -i '/hsM/d' $outfile_wg.tmp +sort -d -k1,2 $outfile_wg.tmp > $outfile_wg + + + + +#per-chromosome file (smaller bins) +awk 'BEGIN{ + FS="\t"; + OFS=" "; + binsize=512000 + }{ + if (FNR>1){ + density[$2,int($3/binsize)]++; + } + + + }END{ + for (i in density){ + numbins["all"]++; + avg["all"]+=(density[i]/binsize); + + split(i, separate, "\034") # separate[1] contains chr, separate[2] contains bin + chr=separate[1] + avg[chr]+= (density[i]/binsize); + numbins[chr]++ + + print separate[1], separate[2]*binsize, (separate[2]+1)*binsize, density[i]/binsize + + if(density[i]/binsize > maxval["all"]) + maxval["all"]=density[i]/binsize + if(density[i]/binsize > maxval[chr]) + maxval[chr]=density[i]/binsize + + } + + + + for (i in avg){ + avg[i]/=numbins[i] + if(avg[i]*3 < maxval[i]) + maxval[i] = avg[i]*3 + print "avg = " avg[i] > "snp_avg_"i".txt" + print "max = " maxval[i] > "snp_maxval_"i".txt" + + } + + + + }' $infile > $outfile_chr.tmp + + + +sed -i 's/chr/hs/g' $outfile_chr.tmp +sed -i '/hsM/d' $outfile_chr.tmp +sort -d -k1,2 $outfile_chr.tmp > $outfile_chr + + + +