diff dnp-mapping.sh @ 0:b4bef5178d86 draft default tip

"planemo upload commit 4846fbc43d4c7437de1ce996392fd13a71abd9c7"
author erinija
date Tue, 07 Sep 2021 15:03:57 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/dnp-mapping.sh	Tue Sep 07 15:03:57 2021 +0000
@@ -0,0 +1,142 @@
+#!/bin/sh
+if test "$#" -ne 6; then
+
+echo ""
+echo " CALL  "
+echo "   sh dnp-mapping.sh input.fasta input.pattern input.trimstart input.length output.file1 output.file2"
+echo ""
+echo " INPUT" 
+echo "   input.fasta     - input fasta file "
+echo "   input.pattern   - 'one or more columns with dinucleotide frequency pattern'"
+echo "   input.trimstart - 'number of positions to trim from the start of the sequence'"
+echo "   input.length    - 'sequence length to retain past trimming from start'"
+echo ""
+echo " OUTPUT"
+echo "   output.file1   - tabular file with correlations  "
+echo "   output.file2   - file to store the max correlation position  "
+
+echo ""
+echo " DESCRIPTION"
+echo "   Each sequence in the fasta file is reduced by trimming  "
+echo "   and retaining a given number of positions, but no less than 147." 
+echo "   Correlation of the nucleosome's sequence with the patterns" 
+echo "   is computed within the sliding window. Correlation coefficients "
+echo "   of the patterns with the sequence starting at a position 73 - dyad " 
+echo "   are computed and saved in output.file1. The maximum correlation position"
+echo "   is saved in output.file2."
+echo ""
+echo " REQUIREMENT"
+echo "   dnp-mapping installed"
+echo "   conda install -c bioconda dnp-mapping"
+echo ""
+  exit 1
+fi
+
+faseqfile=$1
+patternfile=$2
+seqstart=$3
+seqlength=$4
+
+outfile1=$5
+outfile2=$6
+
+call=dnp-mapping
+
+
+awk_program=$( cat << 'EOF' 
+###################################################################
+# position of maximum
+# parameters: window=W (minimal distance between two peaks)
+#             buffer=N (size of buffer)
+###################################################################
+function max_pos_funct(min_pos, max_pos)
+{
+  sum=0;
+    start_position=min_pos;
+    for(i=min_pos+window;i<=max_pos&&sum<1000;)
+    {
+      sum++;
+      max=arr[i];
+      pos=i;
+      for(j=i-window;j<=i+window&&j<=max_pos;j++)
+      {
+        if(arr[j]>max)
+        {
+          max=arr[j]
+          pos=j
+        }
+      }
+      if(arr[pos]>=arr[pos-1]&&arr[pos]>=arr[pos+1]&&arr[pos]>0)
+      {
+        if(pos==i)
+        {
+          start_position=pos+window+1
+          printf("%d %f\n", pos+buffer*num_buf, arr[pos]);
+          i=pos+window*2+1;
+        }
+        else
+        {
+          if(pos>=start_position&&pos>min_pos)
+          {
+            i=pos;
+          }
+          else
+          {
+            i+=window*2+1;
+          }
+        }
+      }
+      else
+      {
+        i+=window+1;
+      }
+      if(sum==999)
+      {
+        i+=window*3;
+        sum=1;
+      }
+    }
+#  printf("\n");
+}
+{
+  if(FNR==1)
+    num_buf=0;
+  pos_buf=int($1/buffer);
+  if(pos_buf>num_buf)
+  {
+    max_pos_funct(1,buffer);
+    num_buf=pos_buf;
+  }
+  arr[$1-num_buf*buffer]=$2;
+}
+END{
+  max_pos_funct(1,$1-num_buf*buffer);
+}
+
+EOF
+)
+
+> ${outfile2}
+> ${outfile1}
+
+for seq in `cat ${faseqfile} | tr "\t" "="`; 
+  do 
+    echo $seq | sed 's/=.*$//' > id; 
+    echo $seq | sed 's/^.*=//' > dseq; 
+    dseq=`cat dseq`;
+    echo ${dseq:${seqstart}:${seqlength}} > dseq 
+    id=`cat id`;
+    echo ${id}
+    cat dseq
+    ${call} -m ${patternfile} -s dseq | awk -v id=${id} '{print $0 "\t" id}'  >>  ${outfile1}
+
+    #compute average correlation    
+    cat ${outfile1} | gawk '{sum=0; for(i=2;i<=NF;i++) sum+=$i; print $1, sum/(NF-1);}' > avgc
+    
+    # compute most likely position of the nucleosome 
+    cat avgc | awk "$awk_program" window=73 buffer=10000 | awk -v num=$patternfile -v id=$id '{print id "\t" num "\t" $0}' >> ${outfile2} ; 
+ done 
+
+rm id dseq avgc 
+exit 0
+