view dnp-mapping.sh @ 0:b4bef5178d86 draft default tip

"planemo upload commit 4846fbc43d4c7437de1ce996392fd13a71abd9c7"
author erinija
date Tue, 07 Sep 2021 15:03:57 +0000
parents
children
line wrap: on
line source

#!/bin/sh
if test "$#" -ne 6; then

echo ""
echo " CALL  "
echo "   sh dnp-mapping.sh input.fasta input.pattern input.trimstart input.length output.file1 output.file2"
echo ""
echo " INPUT" 
echo "   input.fasta     - input fasta file "
echo "   input.pattern   - 'one or more columns with dinucleotide frequency pattern'"
echo "   input.trimstart - 'number of positions to trim from the start of the sequence'"
echo "   input.length    - 'sequence length to retain past trimming from start'"
echo ""
echo " OUTPUT"
echo "   output.file1   - tabular file with correlations  "
echo "   output.file2   - file to store the max correlation position  "

echo ""
echo " DESCRIPTION"
echo "   Each sequence in the fasta file is reduced by trimming  "
echo "   and retaining a given number of positions, but no less than 147." 
echo "   Correlation of the nucleosome's sequence with the patterns" 
echo "   is computed within the sliding window. Correlation coefficients "
echo "   of the patterns with the sequence starting at a position 73 - dyad " 
echo "   are computed and saved in output.file1. The maximum correlation position"
echo "   is saved in output.file2."
echo ""
echo " REQUIREMENT"
echo "   dnp-mapping installed"
echo "   conda install -c bioconda dnp-mapping"
echo ""
  exit 1
fi

faseqfile=$1
patternfile=$2
seqstart=$3
seqlength=$4

outfile1=$5
outfile2=$6

call=dnp-mapping


awk_program=$( cat << 'EOF' 
###################################################################
# position of maximum
# parameters: window=W (minimal distance between two peaks)
#             buffer=N (size of buffer)
###################################################################
function max_pos_funct(min_pos, max_pos)
{
  sum=0;
    start_position=min_pos;
    for(i=min_pos+window;i<=max_pos&&sum<1000;)
    {
      sum++;
      max=arr[i];
      pos=i;
      for(j=i-window;j<=i+window&&j<=max_pos;j++)
      {
        if(arr[j]>max)
        {
          max=arr[j]
          pos=j
        }
      }
      if(arr[pos]>=arr[pos-1]&&arr[pos]>=arr[pos+1]&&arr[pos]>0)
      {
        if(pos==i)
        {
          start_position=pos+window+1
          printf("%d %f\n", pos+buffer*num_buf, arr[pos]);
          i=pos+window*2+1;
        }
        else
        {
          if(pos>=start_position&&pos>min_pos)
          {
            i=pos;
          }
          else
          {
            i+=window*2+1;
          }
        }
      }
      else
      {
        i+=window+1;
      }
      if(sum==999)
      {
        i+=window*3;
        sum=1;
      }
    }
#  printf("\n");
}
{
  if(FNR==1)
    num_buf=0;
  pos_buf=int($1/buffer);
  if(pos_buf>num_buf)
  {
    max_pos_funct(1,buffer);
    num_buf=pos_buf;
  }
  arr[$1-num_buf*buffer]=$2;
}
END{
  max_pos_funct(1,$1-num_buf*buffer);
}

EOF
)

> ${outfile2}
> ${outfile1}

for seq in `cat ${faseqfile} | tr "\t" "="`; 
  do 
    echo $seq | sed 's/=.*$//' > id; 
    echo $seq | sed 's/^.*=//' > dseq; 
    dseq=`cat dseq`;
    echo ${dseq:${seqstart}:${seqlength}} > dseq 
    id=`cat id`;
    echo ${id}
    cat dseq
    ${call} -m ${patternfile} -s dseq | awk -v id=${id} '{print $0 "\t" id}'  >>  ${outfile1}

    #compute average correlation    
    cat ${outfile1} | gawk '{sum=0; for(i=2;i<=NF;i++) sum+=$i; print $1, sum/(NF-1);}' > avgc
    
    # compute most likely position of the nucleosome 
    cat avgc | awk "$awk_program" window=73 buffer=10000 | awk -v num=$patternfile -v id=$id '{print id "\t" num "\t" $0}' >> ${outfile2} ; 
 done 

rm id dseq avgc 
exit 0