comparison dnp-mapping.sh @ 0:b4bef5178d86 draft default tip

"planemo upload commit 4846fbc43d4c7437de1ce996392fd13a71abd9c7"
author erinija
date Tue, 07 Sep 2021 15:03:57 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:b4bef5178d86
1 #!/bin/sh
2 if test "$#" -ne 6; then
3
4 echo ""
5 echo " CALL "
6 echo " sh dnp-mapping.sh input.fasta input.pattern input.trimstart input.length output.file1 output.file2"
7 echo ""
8 echo " INPUT"
9 echo " input.fasta - input fasta file "
10 echo " input.pattern - 'one or more columns with dinucleotide frequency pattern'"
11 echo " input.trimstart - 'number of positions to trim from the start of the sequence'"
12 echo " input.length - 'sequence length to retain past trimming from start'"
13 echo ""
14 echo " OUTPUT"
15 echo " output.file1 - tabular file with correlations "
16 echo " output.file2 - file to store the max correlation position "
17
18 echo ""
19 echo " DESCRIPTION"
20 echo " Each sequence in the fasta file is reduced by trimming "
21 echo " and retaining a given number of positions, but no less than 147."
22 echo " Correlation of the nucleosome's sequence with the patterns"
23 echo " is computed within the sliding window. Correlation coefficients "
24 echo " of the patterns with the sequence starting at a position 73 - dyad "
25 echo " are computed and saved in output.file1. The maximum correlation position"
26 echo " is saved in output.file2."
27 echo ""
28 echo " REQUIREMENT"
29 echo " dnp-mapping installed"
30 echo " conda install -c bioconda dnp-mapping"
31 echo ""
32 exit 1
33 fi
34
35 faseqfile=$1
36 patternfile=$2
37 seqstart=$3
38 seqlength=$4
39
40 outfile1=$5
41 outfile2=$6
42
43 call=dnp-mapping
44
45
46 awk_program=$( cat << 'EOF'
47 ###################################################################
48 # position of maximum
49 # parameters: window=W (minimal distance between two peaks)
50 # buffer=N (size of buffer)
51 ###################################################################
52 function max_pos_funct(min_pos, max_pos)
53 {
54 sum=0;
55 start_position=min_pos;
56 for(i=min_pos+window;i<=max_pos&&sum<1000;)
57 {
58 sum++;
59 max=arr[i];
60 pos=i;
61 for(j=i-window;j<=i+window&&j<=max_pos;j++)
62 {
63 if(arr[j]>max)
64 {
65 max=arr[j]
66 pos=j
67 }
68 }
69 if(arr[pos]>=arr[pos-1]&&arr[pos]>=arr[pos+1]&&arr[pos]>0)
70 {
71 if(pos==i)
72 {
73 start_position=pos+window+1
74 printf("%d %f\n", pos+buffer*num_buf, arr[pos]);
75 i=pos+window*2+1;
76 }
77 else
78 {
79 if(pos>=start_position&&pos>min_pos)
80 {
81 i=pos;
82 }
83 else
84 {
85 i+=window*2+1;
86 }
87 }
88 }
89 else
90 {
91 i+=window+1;
92 }
93 if(sum==999)
94 {
95 i+=window*3;
96 sum=1;
97 }
98 }
99 # printf("\n");
100 }
101 {
102 if(FNR==1)
103 num_buf=0;
104 pos_buf=int($1/buffer);
105 if(pos_buf>num_buf)
106 {
107 max_pos_funct(1,buffer);
108 num_buf=pos_buf;
109 }
110 arr[$1-num_buf*buffer]=$2;
111 }
112 END{
113 max_pos_funct(1,$1-num_buf*buffer);
114 }
115
116 EOF
117 )
118
119 > ${outfile2}
120 > ${outfile1}
121
122 for seq in `cat ${faseqfile} | tr "\t" "="`;
123 do
124 echo $seq | sed 's/=.*$//' > id;
125 echo $seq | sed 's/^.*=//' > dseq;
126 dseq=`cat dseq`;
127 echo ${dseq:${seqstart}:${seqlength}} > dseq
128 id=`cat id`;
129 echo ${id}
130 cat dseq
131 ${call} -m ${patternfile} -s dseq | awk -v id=${id} '{print $0 "\t" id}' >> ${outfile1}
132
133 #compute average correlation
134 cat ${outfile1} | gawk '{sum=0; for(i=2;i<=NF;i++) sum+=$i; print $1, sum/(NF-1);}' > avgc
135
136 # compute most likely position of the nucleosome
137 cat avgc | awk "$awk_program" window=73 buffer=10000 | awk -v num=$patternfile -v id=$id '{print id "\t" num "\t" $0}' >> ${outfile2} ;
138 done
139
140 rm id dseq avgc
141 exit 0
142