Mercurial > repos > erinija > dnp_binary_strings
comparison dnp-correlation-between-profiles.sh @ 0:611156829647 draft default tip
"planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
| author | erinija |
|---|---|
| date | Fri, 01 May 2020 12:07:46 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:611156829647 |
|---|---|
| 1 #!/bin/sh | |
| 2 | |
| 3 if test "$#" -ne 4; then | |
| 4 | |
| 5 echo "" | |
| 6 echo " CALL " | |
| 7 echo " sh dnp-correlation-between-profiles.sh difreq.tabular winsize dinucleotides correlations.tabular" | |
| 8 echo "" | |
| 9 echo " INPUT" | |
| 10 echo " difreq.tabular - tabular file of dinucleotide frequencies on forward and complementary fasta sequences " | |
| 11 echo " winsize - size of sliding window within which correlation is computed (default 146)" | |
| 12 echo " dinucleotides - 'AA AC AG AT ...'" | |
| 13 echo "" | |
| 14 echo " OUTPUT" | |
| 15 echo " correlations.tabular - tabular file of Pearson correlations at each position for given dinucleotides" | |
| 16 echo "" | |
| 17 echo " DESCRIPTION" | |
| 18 echo " To find a position of highest symmetry between the" | |
| 19 echo " dinucleotides positional frequency profiles on" | |
| 20 echo " forward and complementary sequences a Pearson" | |
| 21 echo " correlation coefficient is computed at each position" | |
| 22 echo " between forward and complementary dinucleotide profiles" | |
| 23 echo " within a sliding window." | |
| 24 echo " Input file contains dinucleotide frequencies arranged in columns" | |
| 25 echo " for input dinucleotides named by *.f and *.r corresponding to" | |
| 26 echo " forward and complementary profiles as:" | |
| 27 echo "" | |
| 28 echo " AA.f AA.r AC.f AC.r AG.f AG.r" | |
| 29 echo " 0.076320 0.067920 0.057800 0.078120 0.081600 0.061960" | |
| 30 echo " 0.072540 0.069520 0.041800 0.079820 0.076300 0.055190" | |
| 31 echo " ... ... ... ... ... ..." | |
| 32 echo "" | |
| 33 echo " The input.tabular file is output of a dnp-subset-dinuc-profile.sh script." | |
| 34 echo " A tabular output file contains columns of correlation coefficients " | |
| 35 echo " along the positions of the frequency profile. A first column labelled 0 contains " | |
| 36 echo " average of all correlations at each position:" | |
| 37 echo "" | |
| 38 echo " 0 AA AC AG" | |
| 39 echo " 0.042205 0.0882716 0.030175 0.126967 " | |
| 40 echo " ... ... ... ..." | |
| 41 echo "" | |
| 42 echo " Position and value of maximum correlations" | |
| 43 echo " of each dinucleotide are printed to a standard output." | |
| 44 echo "" | |
| 45 echo " REQUIREMENT" | |
| 46 echo " dnp-corrprofile installed" | |
| 47 echo " conda install -c bioconda dnp-corrprofile" | |
| 48 echo "" | |
| 49 exit 1 | |
| 50 | |
| 51 fi | |
| 52 | |
| 53 name=$1 | |
| 54 window=$2 | |
| 55 dinucleotides=$3 | |
| 56 out=$4 | |
| 57 | |
| 58 call=dnp-corrprofile | |
| 59 | |
| 60 # compute length of the file | |
| 61 len=`wc ${name} |awk '{print $1}'` | |
| 62 len=$((len-1)) | |
| 63 | |
| 64 for di in ${dinucleotides} | |
| 65 do | |
| 66 #extract column numbers of required dinucleotide | |
| 67 i=`awk -v name=$di'.f' '{ for (i=1; i<=NF; i++) if($i==name) print i; exit}' $name` | |
| 68 | |
| 69 # create a file of two columns in temp | |
| 70 awk -v k1=${i} -v k2=$((i+1)) '{print $k1 "\t" $k2}' $name | sed -n "2,${len}p" > temp.${di} | |
| 71 | |
| 72 # put the header of the column | |
| 73 echo ${di} > cp${i} | |
| 74 | |
| 75 # correlation dinucleotide profile along the sequence | |
| 76 # ./corrprofile aa -n 219 -w 146 | |
| 77 # output in the temp column | |
| 78 ${call} temp.${di} -n $((len-1)) -w ${window} >> cp${i} | |
| 79 # print the maximum | |
| 80 posval=`cat cp${i} | grep -v ${di} | cat -n | sort -k2,2n | tail -n1` | |
| 81 echo ${di} ${posval} >> maxpos | |
| 82 | |
| 83 done | |
| 84 | |
| 85 paste cp[1-9]* > temp | |
| 86 | |
| 87 # compute average correlation profile | |
| 88 awk 'NR==2{next} | |
| 89 {T=0; | |
| 90 for(N=1; N<=NF; N++) T+=$N; | |
| 91 T/=NF | |
| 92 print T "\t" $0}' temp > ${out} | |
| 93 | |
| 94 # find a position of the maximum and output it | |
| 95 posval=`cat ${out}| awk '{print $1} ' | cat -n | sort -k2,2n | tail -n1` | |
| 96 echo "avg " ${posval} >> maxpos | |
| 97 cat maxpos | tr "\n" , | |
| 98 | |
| 99 rm temp* cp[1-9]* maxpos |
