diff dnp-select-range.sh @ 0:448204d12325 draft default tip

"planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
author erinija
date Fri, 01 May 2020 12:13:00 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/dnp-select-range.sh	Fri May 01 12:13:00 2020 +0000
@@ -0,0 +1,89 @@
+#!/bin/sh
+
+if test "$#" -ne 5; then
+echo ""
+echo " CALL "  
+echo "   sh dnp-select-range.sh select-range-input.tabular start length  dinucleotides select-range-output.tabular"
+echo ""
+echo " INPUT "
+echo "   select-range-input.tabular  - full length dinucleotide frequency profiles"
+echo "   start                       - start position of selection - identified start position of nucleosome's sequence"
+echo "   length                      - length of selection; default =146 nucleosome's length in base pairs"
+echo "   dinucleotides               - any subset of dinucleotides enclosed by quotes 'AA AC AG AT CA CC ...'"
+echo ""
+echo " OUTPUT "
+echo "   select-range-output.tabular - rows of input file selected at a given position spanning a length"
+echo ""
+echo " DESCRIPTION"
+echo "   Selects rows from input table within a given range and adds a column with a position number."
+echo ""
+echo "   Example of an input table"
+echo "   AA.f	        AA.r	        AC.f	        AC.r	        AG.f	        AG.r	        AT.f	        AT.r       ..."
+echo "   0.0763         0.067920	0.057800	0.078120	0.081600	0.061960	0.055600	0.044080   ..."
+echo "   0.077160	0.073760	0.056000	0.072160	0.079400	0.060720	0.055960	0.047040   ..."
+echo "   0.083320	0.071200	0.053840	0.080760	0.084560	0.064880	0.050440	0.048720   ..."
+echo "   0.077960	0.068200	0.056040	0.075520	0.080120	0.061680	0.053160	0.047400   ..."
+echo "   0.078200	0.069120	0.056880	0.074000	0.084360	0.060840	0.053520	0.046280   ..."
+echo "   ... "
+echo ""
+echo "   Example of an output table where start=20"
+echo "   pos	AA.f	        AC.f	        AG.f	        AT.f	        CA.f	        CC.f	        CG.f	        CT.f     ..."
+echo "   20	0.100200	0.084720	0.077200	0.072480	0.066160	0.044160	0.004560	0.060720 ..."
+echo "   21	0.172440	0.024800	0.002080	0.101240	0.131840	0.007200	0.000320	0.095920 ..."
+echo "   22	0.077160	0.096240	0.314320	0.047360	0.012040	0.028560	0.011840	0.013680 ..."
+echo "   ..."
+
+    exit 1
+fi
+
+
+# input file name
+name=$1
+
+# start position
+startpos=$2
+
+# length of selection (146bp)
+length=$3
+
+# list of dinucleotides
+dinucleotides=$4
+
+# output file
+out=$5
+
+# compute length of the file
+len=`wc ${name} |awk '{print $1}'` 
+#echo $len
+
+len=$((len-1))
+
+# endpos 
+endpos=$((startpos+length))
+
+cnum=1
+# TO DO:should scheck if endpos is within the range
+for di in ${dinucleotides}
+do
+    # column number for forward and complementary profile
+    i1=`awk -v name=$di'.f' '{ for (i=1; i<=NF; i++) if($i==name) print i; exit}' ${name}`
+    i2=`awk -v name=$di'.r' '{ for (i=1; i<=NF; i++) if($i==name) print i; exit}' ${name}`
+
+    echo "$di.f" > forward.${di}
+    awk -v k=${i1} '{print $k}' $name | sed -n "${startpos},${endpos}p" >> forward.${di}
+    echo "$di.r"> complement.${di}
+    awk -v k=${i2} '{print $k}' $name | sed -n "${startpos},${endpos}p" >> complement.${di}
+cnum=$((cnum+1))
+done
+
+# sequence positions within range
+echo "pos" > seq.pos
+seq ${startpos} 1 ${endpos} >> seq.pos
+
+# paste forward
+paste seq.pos forward.* > forward
+paste complement.* > complement
+
+# create output
+paste forward complement > ${out}
+rm seq.pos forward* complement*