Mercurial > repos > erinija > dnp_correlation_between_profiles
diff dnp-correlation-between-profiles.sh @ 0:b45de206654d draft default tip
"planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
author | erinija |
---|---|
date | Fri, 01 May 2020 12:08:23 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/dnp-correlation-between-profiles.sh Fri May 01 12:08:23 2020 +0000 @@ -0,0 +1,99 @@ +#!/bin/sh + +if test "$#" -ne 4; then + +echo "" +echo " CALL " +echo " sh dnp-correlation-between-profiles.sh difreq.tabular winsize dinucleotides correlations.tabular" +echo "" +echo " INPUT" +echo " difreq.tabular - tabular file of dinucleotide frequencies on forward and complementary fasta sequences " +echo " winsize - size of sliding window within which correlation is computed (default 146)" +echo " dinucleotides - 'AA AC AG AT ...'" +echo "" +echo " OUTPUT" +echo " correlations.tabular - tabular file of Pearson correlations at each position for given dinucleotides" +echo "" +echo " DESCRIPTION" +echo " To find a position of highest symmetry between the" +echo " dinucleotides positional frequency profiles on" +echo " forward and complementary sequences a Pearson" +echo " correlation coefficient is computed at each position" +echo " between forward and complementary dinucleotide profiles" +echo " within a sliding window." +echo " Input file contains dinucleotide frequencies arranged in columns" +echo " for input dinucleotides named by *.f and *.r corresponding to" +echo " forward and complementary profiles as:" +echo "" +echo " AA.f AA.r AC.f AC.r AG.f AG.r" +echo " 0.076320 0.067920 0.057800 0.078120 0.081600 0.061960" +echo " 0.072540 0.069520 0.041800 0.079820 0.076300 0.055190" +echo " ... ... ... ... ... ..." +echo "" +echo " The input.tabular file is output of a dnp-subset-dinuc-profile.sh script." +echo " A tabular output file contains columns of correlation coefficients " +echo " along the positions of the frequency profile. A first column labelled 0 contains " +echo " average of all correlations at each position:" +echo "" +echo " 0 AA AC AG" +echo " 0.042205 0.0882716 0.030175 0.126967 " +echo " ... ... ... ..." +echo "" +echo " Position and value of maximum correlations" +echo " of each dinucleotide are printed to a standard output." +echo "" +echo " REQUIREMENT" +echo " dnp-corrprofile installed" +echo " conda install -c bioconda dnp-corrprofile" +echo "" + exit 1 + +fi + +name=$1 +window=$2 +dinucleotides=$3 +out=$4 + +call=dnp-corrprofile + +# compute length of the file +len=`wc ${name} |awk '{print $1}'` +len=$((len-1)) + +for di in ${dinucleotides} +do + #extract column numbers of required dinucleotide + i=`awk -v name=$di'.f' '{ for (i=1; i<=NF; i++) if($i==name) print i; exit}' $name` + + # create a file of two columns in temp + awk -v k1=${i} -v k2=$((i+1)) '{print $k1 "\t" $k2}' $name | sed -n "2,${len}p" > temp.${di} + + # put the header of the column + echo ${di} > cp${i} + + # correlation dinucleotide profile along the sequence + # ./corrprofile aa -n 219 -w 146 + # output in the temp column + ${call} temp.${di} -n $((len-1)) -w ${window} >> cp${i} + # print the maximum + posval=`cat cp${i} | grep -v ${di} | cat -n | sort -k2,2n | tail -n1` + echo ${di} ${posval} >> maxpos + +done + +paste cp[1-9]* > temp + +# compute average correlation profile +awk 'NR==2{next} + {T=0; + for(N=1; N<=NF; N++) T+=$N; + T/=NF + print T "\t" $0}' temp > ${out} + +# find a position of the maximum and output it +posval=`cat ${out}| awk '{print $1} ' | cat -n | sort -k2,2n | tail -n1` +echo "avg " ${posval} >> maxpos +cat maxpos | tr "\n" , + +rm temp* cp[1-9]* maxpos