comparison dnp-correlation-between-profiles.sh @ 0:b45de206654d draft default tip

"planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
author erinija
date Fri, 01 May 2020 12:08:23 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:b45de206654d
1 #!/bin/sh
2
3 if test "$#" -ne 4; then
4
5 echo ""
6 echo " CALL "
7 echo " sh dnp-correlation-between-profiles.sh difreq.tabular winsize dinucleotides correlations.tabular"
8 echo ""
9 echo " INPUT"
10 echo " difreq.tabular - tabular file of dinucleotide frequencies on forward and complementary fasta sequences "
11 echo " winsize - size of sliding window within which correlation is computed (default 146)"
12 echo " dinucleotides - 'AA AC AG AT ...'"
13 echo ""
14 echo " OUTPUT"
15 echo " correlations.tabular - tabular file of Pearson correlations at each position for given dinucleotides"
16 echo ""
17 echo " DESCRIPTION"
18 echo " To find a position of highest symmetry between the"
19 echo " dinucleotides positional frequency profiles on"
20 echo " forward and complementary sequences a Pearson"
21 echo " correlation coefficient is computed at each position"
22 echo " between forward and complementary dinucleotide profiles"
23 echo " within a sliding window."
24 echo " Input file contains dinucleotide frequencies arranged in columns"
25 echo " for input dinucleotides named by *.f and *.r corresponding to"
26 echo " forward and complementary profiles as:"
27 echo ""
28 echo " AA.f AA.r AC.f AC.r AG.f AG.r"
29 echo " 0.076320 0.067920 0.057800 0.078120 0.081600 0.061960"
30 echo " 0.072540 0.069520 0.041800 0.079820 0.076300 0.055190"
31 echo " ... ... ... ... ... ..."
32 echo ""
33 echo " The input.tabular file is output of a dnp-subset-dinuc-profile.sh script."
34 echo " A tabular output file contains columns of correlation coefficients "
35 echo " along the positions of the frequency profile. A first column labelled 0 contains "
36 echo " average of all correlations at each position:"
37 echo ""
38 echo " 0 AA AC AG"
39 echo " 0.042205 0.0882716 0.030175 0.126967 "
40 echo " ... ... ... ..."
41 echo ""
42 echo " Position and value of maximum correlations"
43 echo " of each dinucleotide are printed to a standard output."
44 echo ""
45 echo " REQUIREMENT"
46 echo " dnp-corrprofile installed"
47 echo " conda install -c bioconda dnp-corrprofile"
48 echo ""
49 exit 1
50
51 fi
52
53 name=$1
54 window=$2
55 dinucleotides=$3
56 out=$4
57
58 call=dnp-corrprofile
59
60 # compute length of the file
61 len=`wc ${name} |awk '{print $1}'`
62 len=$((len-1))
63
64 for di in ${dinucleotides}
65 do
66 #extract column numbers of required dinucleotide
67 i=`awk -v name=$di'.f' '{ for (i=1; i<=NF; i++) if($i==name) print i; exit}' $name`
68
69 # create a file of two columns in temp
70 awk -v k1=${i} -v k2=$((i+1)) '{print $k1 "\t" $k2}' $name | sed -n "2,${len}p" > temp.${di}
71
72 # put the header of the column
73 echo ${di} > cp${i}
74
75 # correlation dinucleotide profile along the sequence
76 # ./corrprofile aa -n 219 -w 146
77 # output in the temp column
78 ${call} temp.${di} -n $((len-1)) -w ${window} >> cp${i}
79 # print the maximum
80 posval=`cat cp${i} | grep -v ${di} | cat -n | sort -k2,2n | tail -n1`
81 echo ${di} ${posval} >> maxpos
82
83 done
84
85 paste cp[1-9]* > temp
86
87 # compute average correlation profile
88 awk 'NR==2{next}
89 {T=0;
90 for(N=1; N<=NF; N++) T+=$N;
91 T/=NF
92 print T "\t" $0}' temp > ${out}
93
94 # find a position of the maximum and output it
95 posval=`cat ${out}| awk '{print $1} ' | cat -n | sort -k2,2n | tail -n1`
96 echo "avg " ${posval} >> maxpos
97 cat maxpos | tr "\n" ,
98
99 rm temp* cp[1-9]* maxpos