comparison scripts/variations2circos.sh @ 0:46f7f689b929 draft default tip

Uploaded
author saskia-hiltemann
date Tue, 17 Sep 2013 11:29:11 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:46f7f689b929
1 #! /bin/bash
2
3 #convert LV output to circos input
4 # LV out: variantId chromosome begin end varType reference alleleSeq xRef
5
6 # bin results and get snp density (value 0-1, numvars/binsize)
7 # determine highest value seen and write "max = <maxval>" to max.txt
8
9 infile=$1
10 outfile_wg=$2
11 outfile_chr=$3
12
13
14 echo "converting variations file"
15
16 #whole-chromosome file (bigger bins)
17 awk 'BEGIN{
18 FS="\t";
19 OFS=" ";
20 maxval=0;
21 binsize=5120000
22 }{
23 if (FNR>1 && index($1,"#")==0 && $0!="" && index($1,">")==0){
24 density[$2,int($3/binsize)]++;
25 }
26
27
28 }END{
29 for (i in density){
30 numbins++;
31 avg+=(density[i]/binsize);
32 split(i, separate, "\034") # separate[1] contains chr, separate[2] contains bin
33 print separate[1], separate[2]*binsize, (separate[2]+1)*binsize, density[i]/binsize
34 if(density[i]/binsize > maxval)
35 maxval=density[i]/binsize
36
37 }
38 avg/=numbins
39 print "max = " maxval > "maxval_orig.txt"
40 if(avg*5 < maxval)
41 maxval = avg*5
42 print "avg = " avg > "avg.txt"
43 print "max = " maxval > "maxval.txt"
44 print maxval > "maxtmp"
45
46
47 }' $infile > $outfile_wg.tmp2
48
49 maxval=`cat maxtmp`
50 echo "maxval: $maxval"
51
52 #second pass, anything greater than maxval is set to maxval
53 awk 'BEGIN{
54 FS="\t";
55 OFS=" ";
56 maxval="'"$maxval"'"
57
58 }{
59 if (FNR>1 && $4 > maxval){
60 print $1,$2,$3,maxval
61 }
62 else print $0
63
64
65 }END{
66 }' $outfile_wg.tmp2 > $outfile_wg.tmp
67
68
69 sed -i 's/chr/hs/g' $outfile_wg.tmp
70 sed -i '/hsM/d' $outfile_wg.tmp
71 sort -d -k1,2 $outfile_wg.tmp > $outfile_wg
72
73
74
75
76 #per-chromosome file (smaller bins)
77 awk 'BEGIN{
78 FS="\t";
79 OFS=" ";
80 binsize=512000
81 }{
82 if (FNR>1){
83 density[$2,int($3/binsize)]++;
84 }
85
86
87 }END{
88 for (i in density){
89 numbins["all"]++;
90 avg["all"]+=(density[i]/binsize);
91
92 split(i, separate, "\034") # separate[1] contains chr, separate[2] contains bin
93 chr=separate[1]
94 avg[chr]+= (density[i]/binsize);
95 numbins[chr]++
96
97 print separate[1], separate[2]*binsize, (separate[2]+1)*binsize, density[i]/binsize
98
99 if(density[i]/binsize > maxval["all"])
100 maxval["all"]=density[i]/binsize
101 if(density[i]/binsize > maxval[chr])
102 maxval[chr]=density[i]/binsize
103
104 }
105
106
107
108 for (i in avg){
109 avg[i]/=numbins[i]
110 if(avg[i]*3 < maxval[i])
111 maxval[i] = avg[i]*3
112 print "avg = " avg[i] > "snp_avg_"i".txt"
113 print "max = " maxval[i] > "snp_maxval_"i".txt"
114
115 }
116
117
118
119 }' $infile > $outfile_chr.tmp
120
121
122
123 sed -i 's/chr/hs/g' $outfile_chr.tmp
124 sed -i '/hsM/d' $outfile_chr.tmp
125 sort -d -k1,2 $outfile_chr.tmp > $outfile_chr
126
127
128
129