Mercurial > repos > erinija > dnp_correlation_between_profiles
comparison dnp-correlation-between-profiles.sh @ 0:b45de206654d draft default tip
"planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
author | erinija |
---|---|
date | Fri, 01 May 2020 12:08:23 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:b45de206654d |
---|---|
1 #!/bin/sh | |
2 | |
3 if test "$#" -ne 4; then | |
4 | |
5 echo "" | |
6 echo " CALL " | |
7 echo " sh dnp-correlation-between-profiles.sh difreq.tabular winsize dinucleotides correlations.tabular" | |
8 echo "" | |
9 echo " INPUT" | |
10 echo " difreq.tabular - tabular file of dinucleotide frequencies on forward and complementary fasta sequences " | |
11 echo " winsize - size of sliding window within which correlation is computed (default 146)" | |
12 echo " dinucleotides - 'AA AC AG AT ...'" | |
13 echo "" | |
14 echo " OUTPUT" | |
15 echo " correlations.tabular - tabular file of Pearson correlations at each position for given dinucleotides" | |
16 echo "" | |
17 echo " DESCRIPTION" | |
18 echo " To find a position of highest symmetry between the" | |
19 echo " dinucleotides positional frequency profiles on" | |
20 echo " forward and complementary sequences a Pearson" | |
21 echo " correlation coefficient is computed at each position" | |
22 echo " between forward and complementary dinucleotide profiles" | |
23 echo " within a sliding window." | |
24 echo " Input file contains dinucleotide frequencies arranged in columns" | |
25 echo " for input dinucleotides named by *.f and *.r corresponding to" | |
26 echo " forward and complementary profiles as:" | |
27 echo "" | |
28 echo " AA.f AA.r AC.f AC.r AG.f AG.r" | |
29 echo " 0.076320 0.067920 0.057800 0.078120 0.081600 0.061960" | |
30 echo " 0.072540 0.069520 0.041800 0.079820 0.076300 0.055190" | |
31 echo " ... ... ... ... ... ..." | |
32 echo "" | |
33 echo " The input.tabular file is output of a dnp-subset-dinuc-profile.sh script." | |
34 echo " A tabular output file contains columns of correlation coefficients " | |
35 echo " along the positions of the frequency profile. A first column labelled 0 contains " | |
36 echo " average of all correlations at each position:" | |
37 echo "" | |
38 echo " 0 AA AC AG" | |
39 echo " 0.042205 0.0882716 0.030175 0.126967 " | |
40 echo " ... ... ... ..." | |
41 echo "" | |
42 echo " Position and value of maximum correlations" | |
43 echo " of each dinucleotide are printed to a standard output." | |
44 echo "" | |
45 echo " REQUIREMENT" | |
46 echo " dnp-corrprofile installed" | |
47 echo " conda install -c bioconda dnp-corrprofile" | |
48 echo "" | |
49 exit 1 | |
50 | |
51 fi | |
52 | |
53 name=$1 | |
54 window=$2 | |
55 dinucleotides=$3 | |
56 out=$4 | |
57 | |
58 call=dnp-corrprofile | |
59 | |
60 # compute length of the file | |
61 len=`wc ${name} |awk '{print $1}'` | |
62 len=$((len-1)) | |
63 | |
64 for di in ${dinucleotides} | |
65 do | |
66 #extract column numbers of required dinucleotide | |
67 i=`awk -v name=$di'.f' '{ for (i=1; i<=NF; i++) if($i==name) print i; exit}' $name` | |
68 | |
69 # create a file of two columns in temp | |
70 awk -v k1=${i} -v k2=$((i+1)) '{print $k1 "\t" $k2}' $name | sed -n "2,${len}p" > temp.${di} | |
71 | |
72 # put the header of the column | |
73 echo ${di} > cp${i} | |
74 | |
75 # correlation dinucleotide profile along the sequence | |
76 # ./corrprofile aa -n 219 -w 146 | |
77 # output in the temp column | |
78 ${call} temp.${di} -n $((len-1)) -w ${window} >> cp${i} | |
79 # print the maximum | |
80 posval=`cat cp${i} | grep -v ${di} | cat -n | sort -k2,2n | tail -n1` | |
81 echo ${di} ${posval} >> maxpos | |
82 | |
83 done | |
84 | |
85 paste cp[1-9]* > temp | |
86 | |
87 # compute average correlation profile | |
88 awk 'NR==2{next} | |
89 {T=0; | |
90 for(N=1; N<=NF; N++) T+=$N; | |
91 T/=NF | |
92 print T "\t" $0}' temp > ${out} | |
93 | |
94 # find a position of the maximum and output it | |
95 posval=`cat ${out}| awk '{print $1} ' | cat -n | sort -k2,2n | tail -n1` | |
96 echo "avg " ${posval} >> maxpos | |
97 cat maxpos | tr "\n" , | |
98 | |
99 rm temp* cp[1-9]* maxpos |