annotate dnp-correlation-between-profiles.sh @ 0:448204d12325 draft default tip

"planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
author erinija
date Fri, 01 May 2020 12:13:00 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
1 #!/bin/sh
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
2
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
3 if test "$#" -ne 4; then
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
4
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
5 echo ""
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
6 echo " CALL "
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
7 echo " sh dnp-correlation-between-profiles.sh difreq.tabular winsize dinucleotides correlations.tabular"
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
8 echo ""
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
9 echo " INPUT"
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
10 echo " difreq.tabular - tabular file of dinucleotide frequencies on forward and complementary fasta sequences "
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
11 echo " winsize - size of sliding window within which correlation is computed (default 146)"
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
12 echo " dinucleotides - 'AA AC AG AT ...'"
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
13 echo ""
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
14 echo " OUTPUT"
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
15 echo " correlations.tabular - tabular file of Pearson correlations at each position for given dinucleotides"
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
16 echo ""
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
17 echo " DESCRIPTION"
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
18 echo " To find a position of highest symmetry between the"
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
19 echo " dinucleotides positional frequency profiles on"
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
20 echo " forward and complementary sequences a Pearson"
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
21 echo " correlation coefficient is computed at each position"
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
22 echo " between forward and complementary dinucleotide profiles"
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
23 echo " within a sliding window."
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
24 echo " Input file contains dinucleotide frequencies arranged in columns"
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
25 echo " for input dinucleotides named by *.f and *.r corresponding to"
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
26 echo " forward and complementary profiles as:"
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
27 echo ""
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
28 echo " AA.f AA.r AC.f AC.r AG.f AG.r"
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
29 echo " 0.076320 0.067920 0.057800 0.078120 0.081600 0.061960"
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
30 echo " 0.072540 0.069520 0.041800 0.079820 0.076300 0.055190"
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
31 echo " ... ... ... ... ... ..."
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
32 echo ""
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
33 echo " The input.tabular file is output of a dnp-subset-dinuc-profile.sh script."
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
34 echo " A tabular output file contains columns of correlation coefficients "
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
35 echo " along the positions of the frequency profile. A first column labelled 0 contains "
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
36 echo " average of all correlations at each position:"
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
37 echo ""
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
38 echo " 0 AA AC AG"
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
39 echo " 0.042205 0.0882716 0.030175 0.126967 "
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
40 echo " ... ... ... ..."
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
41 echo ""
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
42 echo " Position and value of maximum correlations"
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
43 echo " of each dinucleotide are printed to a standard output."
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
44 echo ""
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
45 echo " REQUIREMENT"
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
46 echo " dnp-corrprofile installed"
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
47 echo " conda install -c bioconda dnp-corrprofile"
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
48 echo ""
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
49 exit 1
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
50
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
51 fi
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
52
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
53 name=$1
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
54 window=$2
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
55 dinucleotides=$3
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
56 out=$4
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
57
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
58 call=dnp-corrprofile
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
59
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
60 # compute length of the file
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
61 len=`wc ${name} |awk '{print $1}'`
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
62 len=$((len-1))
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
63
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
64 for di in ${dinucleotides}
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
65 do
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
66 #extract column numbers of required dinucleotide
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
67 i=`awk -v name=$di'.f' '{ for (i=1; i<=NF; i++) if($i==name) print i; exit}' $name`
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
68
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
69 # create a file of two columns in temp
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
70 awk -v k1=${i} -v k2=$((i+1)) '{print $k1 "\t" $k2}' $name | sed -n "2,${len}p" > temp.${di}
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
71
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
72 # put the header of the column
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
73 echo ${di} > cp${i}
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
74
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
75 # correlation dinucleotide profile along the sequence
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
76 # ./corrprofile aa -n 219 -w 146
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
77 # output in the temp column
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
78 ${call} temp.${di} -n $((len-1)) -w ${window} >> cp${i}
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
79 # print the maximum
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
80 posval=`cat cp${i} | grep -v ${di} | cat -n | sort -k2,2n | tail -n1`
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
81 echo ${di} ${posval} >> maxpos
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
82
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
83 done
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
84
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
85 paste cp[1-9]* > temp
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
86
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
87 # compute average correlation profile
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
88 awk 'NR==2{next}
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
89 {T=0;
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
90 for(N=1; N<=NF; N++) T+=$N;
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
91 T/=NF
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
92 print T "\t" $0}' temp > ${out}
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
93
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
94 # find a position of the maximum and output it
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
95 posval=`cat ${out}| awk '{print $1} ' | cat -n | sort -k2,2n | tail -n1`
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
96 echo "avg " ${posval} >> maxpos
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
97 cat maxpos | tr "\n" ,
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
98
448204d12325 "planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
erinija
parents:
diff changeset
99 rm temp* cp[1-9]* maxpos