0
|
1 #! /bin/bash
|
|
2
|
|
3 #convert LV output to circos input
|
|
4 # LV out: variantId chromosome begin end varType reference alleleSeq xRef
|
|
5
|
|
6 # bin results and get snp density (value 0-1, numvars/binsize)
|
|
7 # determine highest value seen and write "max = <maxval>" to max.txt
|
|
8
|
|
9 infile=$1
|
|
10 outfile_wg=$2
|
|
11 outfile_chr=$3
|
|
12
|
|
13
|
|
14 echo "converting variations file"
|
|
15
|
|
16 #whole-chromosome file (bigger bins)
|
|
17 awk 'BEGIN{
|
|
18 FS="\t";
|
|
19 OFS=" ";
|
|
20 maxval=0;
|
|
21 binsize=5120000
|
|
22 }{
|
|
23 if (FNR>1 && index($1,"#")==0 && $0!="" && index($1,">")==0){
|
|
24 density[$2,int($3/binsize)]++;
|
|
25 }
|
|
26
|
|
27
|
|
28 }END{
|
|
29 for (i in density){
|
|
30 numbins++;
|
|
31 avg+=(density[i]/binsize);
|
|
32 split(i, separate, "\034") # separate[1] contains chr, separate[2] contains bin
|
|
33 print separate[1], separate[2]*binsize, (separate[2]+1)*binsize, density[i]/binsize
|
|
34 if(density[i]/binsize > maxval)
|
|
35 maxval=density[i]/binsize
|
|
36
|
|
37 }
|
|
38 avg/=numbins
|
|
39 print "max = " maxval > "maxval_orig.txt"
|
|
40 if(avg*5 < maxval)
|
|
41 maxval = avg*5
|
|
42 print "avg = " avg > "avg.txt"
|
|
43 print "max = " maxval > "maxval.txt"
|
|
44 print maxval > "maxtmp"
|
|
45
|
|
46
|
|
47 }' $infile > $outfile_wg.tmp2
|
|
48
|
|
49 maxval=`cat maxtmp`
|
|
50 echo "maxval: $maxval"
|
|
51
|
|
52 #second pass, anything greater than maxval is set to maxval
|
|
53 awk 'BEGIN{
|
|
54 FS="\t";
|
|
55 OFS=" ";
|
|
56 maxval="'"$maxval"'"
|
|
57
|
|
58 }{
|
|
59 if (FNR>1 && $4 > maxval){
|
|
60 print $1,$2,$3,maxval
|
|
61 }
|
|
62 else print $0
|
|
63
|
|
64
|
|
65 }END{
|
|
66 }' $outfile_wg.tmp2 > $outfile_wg.tmp
|
|
67
|
|
68
|
|
69 sed -i 's/chr/hs/g' $outfile_wg.tmp
|
|
70 sed -i '/hsM/d' $outfile_wg.tmp
|
|
71 sort -d -k1,2 $outfile_wg.tmp > $outfile_wg
|
|
72
|
|
73
|
|
74
|
|
75
|
|
76 #per-chromosome file (smaller bins)
|
|
77 awk 'BEGIN{
|
|
78 FS="\t";
|
|
79 OFS=" ";
|
|
80 binsize=512000
|
|
81 }{
|
|
82 if (FNR>1){
|
|
83 density[$2,int($3/binsize)]++;
|
|
84 }
|
|
85
|
|
86
|
|
87 }END{
|
|
88 for (i in density){
|
|
89 numbins["all"]++;
|
|
90 avg["all"]+=(density[i]/binsize);
|
|
91
|
|
92 split(i, separate, "\034") # separate[1] contains chr, separate[2] contains bin
|
|
93 chr=separate[1]
|
|
94 avg[chr]+= (density[i]/binsize);
|
|
95 numbins[chr]++
|
|
96
|
|
97 print separate[1], separate[2]*binsize, (separate[2]+1)*binsize, density[i]/binsize
|
|
98
|
|
99 if(density[i]/binsize > maxval["all"])
|
|
100 maxval["all"]=density[i]/binsize
|
|
101 if(density[i]/binsize > maxval[chr])
|
|
102 maxval[chr]=density[i]/binsize
|
|
103
|
|
104 }
|
|
105
|
|
106
|
|
107
|
|
108 for (i in avg){
|
|
109 avg[i]/=numbins[i]
|
|
110 if(avg[i]*3 < maxval[i])
|
|
111 maxval[i] = avg[i]*3
|
|
112 print "avg = " avg[i] > "snp_avg_"i".txt"
|
|
113 print "max = " maxval[i] > "snp_maxval_"i".txt"
|
|
114
|
|
115 }
|
|
116
|
|
117
|
|
118
|
|
119 }' $infile > $outfile_chr.tmp
|
|
120
|
|
121
|
|
122
|
|
123 sed -i 's/chr/hs/g' $outfile_chr.tmp
|
|
124 sed -i '/hsM/d' $outfile_chr.tmp
|
|
125 sort -d -k1,2 $outfile_chr.tmp > $outfile_chr
|
|
126
|
|
127
|
|
128
|
|
129
|