view dnp-correlation-between-profiles.sh @ 0:7e69f62cb516 draft default tip

"planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
author erinija
date Fri, 01 May 2020 12:10:21 +0000
parents
children
line wrap: on
line source

#!/bin/sh

if test "$#" -ne 4; then

echo ""
echo " CALL  "
echo "   sh dnp-correlation-between-profiles.sh difreq.tabular winsize  dinucleotides correlations.tabular"
echo ""
echo " INPUT"
echo "   difreq.tabular - tabular file of dinucleotide frequencies on forward and complementary fasta sequences "
echo "   winsize        - size of sliding window within which correlation is computed (default 146)"
echo "   dinucleotides  - 'AA AC AG AT ...'"
echo ""
echo " OUTPUT"
echo "   correlations.tabular - tabular file of Pearson correlations at each position for given dinucleotides"
echo ""
echo " DESCRIPTION"
echo "   To find a position of highest symmetry between the"
echo "   dinucleotides positional frequency profiles on"
echo "   forward and complementary sequences  a Pearson" 
echo "   correlation coefficient is computed at each position"
echo "   between forward and complementary dinucleotide profiles"
echo "   within a sliding window."
echo "   Input file contains dinucleotide frequencies arranged in columns"
echo "   for input dinucleotides named by *.f and *.r corresponding to"
echo "   forward and complementary profiles as:"
echo ""
echo " AA.f      AA.r      AC.f      AC.r      AG.f      AG.r"
echo " 0.076320  0.067920  0.057800  0.078120  0.081600   0.061960"
echo " 0.072540  0.069520  0.041800  0.079820  0.076300   0.055190"
echo " ...       ...       ...       ...       ...        ..."
echo ""
echo "   The input.tabular file is output of a dnp-subset-dinuc-profile.sh script." 
echo "   A tabular output file contains columns of correlation coefficients "
echo "   along the positions of the frequency profile. A first column labelled 0 contains "
echo "   average of all correlations at each position:"
echo ""
echo " 0        AA        AC       AG"
echo " 0.042205 0.0882716 0.030175 0.126967 "
echo " ...      ...       ...      ..."
echo ""
echo "   Position and value of maximum correlations"
echo "   of each dinucleotide are printed to a standard output."
echo ""
echo " REQUIREMENT"
echo "   dnp-corrprofile installed"
echo "   conda install -c bioconda dnp-corrprofile"
echo ""
  exit 1

fi

name=$1
window=$2
dinucleotides=$3
out=$4

call=dnp-corrprofile

# compute length of the file
len=`wc ${name} |awk '{print $1}'` 
len=$((len-1))

for di in ${dinucleotides}
do
    #extract column numbers of required dinucleotide
    i=`awk -v name=$di'.f' '{ for (i=1; i<=NF; i++) if($i==name) print i; exit}' $name`

    # create a file of two columns in temp
    awk -v k1=${i} -v k2=$((i+1)) '{print $k1 "\t" $k2}' $name | sed -n "2,${len}p" > temp.${di} 

    # put the header of the column
    echo ${di} > cp${i}
    
    # correlation dinucleotide profile along the sequence
    # ./corrprofile aa -n 219 -w 146
    # output in the temp column
    ${call} temp.${di} -n $((len-1)) -w ${window} >> cp${i}
    # print the maximum    
    posval=`cat cp${i} | grep -v ${di} | cat -n | sort -k2,2n | tail -n1`
    echo ${di} ${posval} >> maxpos
    
done

paste cp[1-9]* > temp

# compute average correlation profile
awk 'NR==2{next} 
       {T=0;
         for(N=1; N<=NF; N++) T+=$N;
         T/=NF
         print T "\t" $0}' temp > ${out}

# find a position of the maximum and output it
posval=`cat ${out}| awk '{print $1} ' | cat -n | sort -k2,2n | tail -n1` 
echo "avg " ${posval} >> maxpos
cat maxpos | tr "\n" , 

rm temp*  cp[1-9]* maxpos