Mercurial > repos > erinija > dnp_correlation_between_profiles

diff dnp-subset-dinuc-profile.sh @ 0:b45de206654d draft default tip
"planemo upload commit 1a32efb8343938e8d49190003f251c78b5a58225-dirty"
author: erinija
date: Fri, 01 May 2020 12:08:23 +0000
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/dnp-subset-dinuc-profile.sh	Fri May 01 12:08:23 2020 +0000
@@ -0,0 +1,120 @@
+#!/bin/sh
+#
+# dnp-subset-dinuc-profile.sh
+#
+# Compute forward and complementary dinucleotide frequency profiles for a
+# chosen subset of dinucleotides over a batch of fasta sequences, and write
+# them side by side as the columns of one tab-separated table.
+#
+# Usage: sh dnp-subset-dinuc-profile.sh input.fasta 'AA AC ...' output
+# Needs: dnp-diprofile on PATH (conda install -c bioconda dnp-diprofile)
+# Exit:  0 on success or when called with no arguments (help request),
+#        1 on a usage or runtime error.
+
+# Print the help text on stdout.
+usage() {
+    echo  " CALL "
+    echo  "   sh dnp-subset-dinuc-profile.sh input.fasta  dinucleotides output"
+    echo  ""
+    echo  " INPUT"
+    echo  "   input.fasta - a batch of nucleosome (or any DNA) DNA sequences "
+    echo  "   dinucleotides - any subset of dinucleotides enclosed by quotes as 'AA AC AG AT CA CC' "
+    echo  ""
+    echo  " OUTPUT"
+    echo  "   output - file name to write the output  in tabular format, columns have names as AA.f AA.r ..."
+    echo  ""
+    echo  " DESCRIPTION"
+    echo  "   Compute dinucleotide frequency profiles on forward and its complementary "
+    echo  "   sequences from a batch of fasta sequences. Output columns are labelled by AA.f, AA.r ... "
+    echo  ""
+    echo  " Example of input fasta lines"
+    echo  "  >chr9:42475963-42476182"
+    echo  "  CCAGGCAGACCCCATATTCAAGCTGCTGCCCCAGGGTGGTGTACAGATCTGGGGAGAAGAAGGATGA"
+    echo  "  >chr9:42476175-42476394"
+    echo  "  TCTGCACTCCAGCATGCCTGAGGAGAGGAGGGAATGCAGGATCCTAGTGGAAAGAGTACCAAGCTGG"
+    echo ""
+    echo  " Example of output  table"
+    echo  "  AA.f            AA.r            AC.f            AC.r   ..."
+    echo  "  0.076000        0.059000        0.065000        0.078000 ..."
+    echo  "  0.082000        0.060000        0.057000        0.076000 ..."
+    echo  "  0.067000        0.075000        0.049000        0.071000 ..."
+    echo  ""
+    echo ""
+    echo " REQUIREMENT"
+    echo "   dnp-diprofile installed"
+    echo "   conda install -c bioconda dnp-diprofile"
+}
+
+# Report an error on stderr and stop with status 1.
+die() {
+    printf '%s\n' "$*" >&2
+    exit 1
+}
+
+if test "$#" -ne 3; then
+    # Calling with no arguments is treated as a request for help; any other
+    # wrong argument count is a usage error and must not look like success.
+    if test "$#" -eq 0; then
+        usage
+        exit 0
+    fi
+    usage >&2
+    exit 1
+fi
+
+name=$1
+diset=$2
+out=$3
+
+call=dnp-diprofile
+
+## the dinucleotide profiles are computed for the subset of dinucleotides listed in $diset
+## the profiles are output as columns of a table
+
+test -r "$name" || die "cannot read input fasta: $name"
+command -v "$call" >/dev/null 2>&1 ||
+    die "$call not found (conda install -c bioconda dnp-diprofile)"
+
+# Keep every intermediate file in a private directory so that nothing is
+# written next to the input, stale files in the current directory cannot
+# leak into the result, and cleanup never touches unrelated *.f / *.r files.
+workdir=$(mktemp -d) || die "cannot create a temporary directory"
+trap 'rm -rf -- "$workdir"' EXIT
+trap 'exit 1' HUP INT TERM
+
+# prepare fasta, we copy here because in galaxy the input has no .fa
+# ending, which dnp-diprofile requires
+fasta=$workdir/input.fa
+cp -- "$name" "$fasta" || die "cannot copy $name to $fasta"
+
+# The sequence length is the length of the second line of the input, with
+# carriage returns (CRLF files) removed.
+# NOTE(review): assumes single-line records of equal length - confirm.
+seq=$(sed -n '2{p;q;}' < "$name" | tr -d '\r')
+len=${#seq}
+test "$len" -gt 0 || die "cannot determine the sequence length from $name"
+
+# for each dinucleotide compute the forward and complementary profile and
+# save them as separate columns that are merged in the end. The column files
+# are collected, in order, in the positional parameters.
+# NOTE(review): the exit status of dnp-diprofile is deliberately not
+# checked; its exit code convention is not visible here.
+set -f    # $diset is split on whitespace on purpose, but must not glob
+set --
+n=0
+for di in $diset; do
+    n=$((n + 1))
+    fwd=$workdir/${n}.f
+    rev=$workdir/${n}.r
+    printf '%s\n' "${di}.f" > "$fwd"
+    "$call" "$fasta" -di "$di" -sl "$len" >> "$fwd"
+    printf '%s\n' "${di}.r" > "$rev"
+    "$call" "$fasta" -di "$di" -sl "$len" -c >> "$rev"
+    set -- "$@" "$fwd" "$rev"
+done
+set +f
+
+test "$#" -gt 0 || die "no dinucleotides given"
+
+paste "$@" > "$out" || die "cannot write the output table: $out"
+exit 0
author	erinija
date	Fri, 01 May 2020 12:08:23 +0000
parents
children