Mercurial > repos > erinija > dnp_correlation_between_profiles
diff dnp-compute-composite.sh @ 0:b45de206654d draft default tip
"planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
author | erinija |
---|---|
date | Fri, 01 May 2020 12:08:23 +0000 |
parents | |
children |
line wrap: on
line diff
#!/bin/sh
#
# dnp-compute-composite.sh
#
# Append the composite dinucleotide columns WW, SS, RR and YY to a
# tab-separated dinucleotide frequency profile.
#
# Usage:
#   sh dnp-compute-composite.sh INPUT.tabular OUTPUT.tabular
#
# INPUT  - tab-separated table whose first line is a header naming the
#          columns. The twelve dinucleotide columns used below must be
#          present; they are located by name, so their order is free.
# OUTPUT - every input line unchanged, followed by four extra columns:
#            WW = AA + AT + TA + TT
#            SS = CC + CG + GC + GG
#            RR = AA + AG + GA + GG
#            YY = CC + CT + TC + TT
#
# Exit status: 0 on success; 1 on wrong argument count, unreadable or
# empty input, or a header that lacks a required column.
#
# The table is processed in a single awk pass that writes straight to
# OUTPUT, so no scratch files are created in the working directory.
# INPUT and OUTPUT must be different files.

# Print the help text to stdout.
usage() {
  printf '%s\n' \
    "" \
    " CALL " \
    " sh dnp-compute-composite.sh compute-composite-input.tabular compute-composite-output.tabular" \
    "" \
    " INPUT " \
    " compute-composite-input.tabular - dinucleotide frequency profiles containing all 16 dinucleotides" \
    "" \
    " OUTPUT " \
    " compute-composite-output.tabular - original input with WW SS RR YY columns added to the output" \
    "" \
    " DESCRIPTION" \
    " Composite dinucleotides weak/weak WW (A or T) , strong/strong SS (G or C), purine/purine RR (A or G), " \
    " and pyrimidine/pyrimidine YY (C or T) are generalized dinucleotide frequency patterns in nucleosome sequences." \
    " Given a tabular innput file with all 16 dinucleotides the composite patterns are computed as follows" \
    " WW=AA+AT+TA+TT, SS=CC+CG+GC+GG, RR=AG+GA+AA+GG, YY=CC+TT+CT+TC and their columns added to the original table." \
    "" \
    " Example columns of an input table" \
    " pos AA AC AG AT ..." \
    " -73 0.08616 0.08034 0.07146 0.05934 ..." \
    " -72 0.11976 0.04966 0.03412 0.07274 ..." \
    " -71 0.07202 0.08882 0.18912 0.0462 ..." \
    " ... " \
    " Example columns of computed composites " \
    " ... WW SS RR" \
    " ... 0.27644 0.1614 0.29494" \
    " ... 0.36788 0.1091 0.29428" \
    " ... 0.21406 0.12566 0.34432" \
    " ..."
}

# main INPUT OUTPUT
# Validate the arguments, then write INPUT plus the WW/SS/RR/YY columns
# to OUTPUT. Returns non-zero (without writing any data rows) when the
# header does not contain every dinucleotide needed for the sums.
main() {
  if [ "$#" -ne 2 ]; then
    usage
    return 1
  fi

  name=$1
  out=$2

  if [ ! -r "$name" ]; then
    printf '%s\n' "dnp-compute-composite.sh: cannot read input file: $name" >&2
    return 1
  fi

  # The input is fed on stdin so that a path containing '=' or starting
  # with '-' can never be mistaken for an awk assignment or option.
  awk '
    BEGIN { FS = "\t"; OFS = "\t" }

    # Header line: map column names to positions (first occurrence wins)
    # and make sure every dinucleotide used in the sums is present.
    NR == 1 {
      for (i = 1; i <= NF; i++)
        if (!($i in col))
          col[$i] = i

      n = split("AA AT TA TT CC CG GC GG AG GA CT TC", need, " ")
      missing = ""
      for (k = 1; k <= n; k++)
        if (!(need[k] in col))
          missing = missing " " need[k]

      if (missing != "") {
        print "dnp-compute-composite.sh: missing required column(s):" missing > "/dev/stderr"
        exit 1
      }

      aa = col["AA"]; at = col["AT"]; ta = col["TA"]; tt = col["TT"]
      cc = col["CC"]; cg = col["CG"]; gc = col["GC"]; gg = col["GG"]
      ag = col["AG"]; ga = col["GA"]; ct = col["CT"]; tc = col["TC"]

      print $0, "WW", "SS", "RR", "YY"
      next
    }

    # Data lines: the sums are numeric, so print formats them with the
    # default OFMT, exactly as a bare "print a+b+c+d" would.
    {
      ww = $aa + $at + $ta + $tt
      ss = $cc + $cg + $gc + $gg
      rr = $aa + $ag + $ga + $gg
      yy = $cc + $ct + $tc + $tt
      print $0, ww, ss, rr, yy
    }

    END {
      if (NR == 0) {
        print "dnp-compute-composite.sh: input file is empty" > "/dev/stderr"
        exit 1
      }
    }
  ' < "$name" > "$out"
}

# The status of main is the status of the script (1 for usage errors).
main "$@"