Mercurial > repos > erinija > dnp_mapping
comparison dnp-mapping.sh @ 0:b4bef5178d86 draft default tip
"planemo upload commit 4846fbc43d4c7437de1ce996392fd13a71abd9c7"
| author | erinija |
|---|---|
| date | Tue, 07 Sep 2021 15:03:57 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:b4bef5178d86 |
|---|---|
| 1 #!/bin/sh | |
| 2 if test "$#" -ne 6; then | |
| 3 | |
| 4 echo "" | |
| 5 echo " CALL " | |
| 6 echo " sh dnp-mapping.sh input.fasta input.pattern input.trimstart input.length output.file1 output.file2" | |
| 7 echo "" | |
| 8 echo " INPUT" | |
| 9 echo " input.fasta - input fasta file " | |
| 10 echo " input.pattern - 'one or more columns with dinucleotide frequency pattern'" | |
| 11 echo " input.trimstart - 'number of positions to trim from the start of the sequence'" | |
| 12 echo " input.length - 'sequence length to retain past trimming from start'" | |
| 13 echo "" | |
| 14 echo " OUTPUT" | |
| 15 echo " output.file1 - tabular file with correlations " | |
| 16 echo " output.file2 - file to store the max correlation position " | |
| 17 | |
| 18 echo "" | |
| 19 echo " DESCRIPTION" | |
| 20 echo " Each sequence in the fasta file is reduced by trimming " | |
| 21 echo " and retaining a given number of positions, but no less than 147." | |
| 22 echo " Correlation of the nucleosome's sequence with the patterns" | |
| 23 echo " is computed within the sliding window. Correlation coefficients " | |
| 24 echo " of the patterns with the sequence starting at a position 73 - dyad " | |
| 25 echo " are computed and saved in output.file1. The maximum correlation position" | |
| 26 echo " is saved in output.file2." | |
| 27 echo "" | |
| 28 echo " REQUIREMENT" | |
| 29 echo " dnp-mapping installed" | |
| 30 echo " conda install -c bioconda dnp-mapping" | |
| 31 echo "" | |
| 32 exit 1 | |
| 33 fi | |
| 34 | |
| 35 faseqfile=$1 | |
| 36 patternfile=$2 | |
| 37 seqstart=$3 | |
| 38 seqlength=$4 | |
| 39 | |
| 40 outfile1=$5 | |
| 41 outfile2=$6 | |
| 42 | |
| 43 call=dnp-mapping | |
| 44 | |
| 45 | |
| 46 awk_program=$( cat << 'EOF' | |
| 47 ################################################################### | |
| 48 # position of maximum | |
| 49 # parameters: window=W (minimal distance between two peaks) | |
| 50 # buffer=N (size of buffer) | |
| 51 ################################################################### | |
| 52 function max_pos_funct(min_pos, max_pos) | |
| 53 { | |
| 54 sum=0; | |
| 55 start_position=min_pos; | |
| 56 for(i=min_pos+window;i<=max_pos&&sum<1000;) | |
| 57 { | |
| 58 sum++; | |
| 59 max=arr[i]; | |
| 60 pos=i; | |
| 61 for(j=i-window;j<=i+window&&j<=max_pos;j++) | |
| 62 { | |
| 63 if(arr[j]>max) | |
| 64 { | |
| 65 max=arr[j] | |
| 66 pos=j | |
| 67 } | |
| 68 } | |
| 69 if(arr[pos]>=arr[pos-1]&&arr[pos]>=arr[pos+1]&&arr[pos]>0) | |
| 70 { | |
| 71 if(pos==i) | |
| 72 { | |
| 73 start_position=pos+window+1 | |
| 74 printf("%d %f\n", pos+buffer*num_buf, arr[pos]); | |
| 75 i=pos+window*2+1; | |
| 76 } | |
| 77 else | |
| 78 { | |
| 79 if(pos>=start_position&&pos>min_pos) | |
| 80 { | |
| 81 i=pos; | |
| 82 } | |
| 83 else | |
| 84 { | |
| 85 i+=window*2+1; | |
| 86 } | |
| 87 } | |
| 88 } | |
| 89 else | |
| 90 { | |
| 91 i+=window+1; | |
| 92 } | |
| 93 if(sum==999) | |
| 94 { | |
| 95 i+=window*3; | |
| 96 sum=1; | |
| 97 } | |
| 98 } | |
| 99 # printf("\n"); | |
| 100 } | |
| 101 { | |
| 102 if(FNR==1) | |
| 103 num_buf=0; | |
| 104 pos_buf=int($1/buffer); | |
| 105 if(pos_buf>num_buf) | |
| 106 { | |
| 107 max_pos_funct(1,buffer); | |
| 108 num_buf=pos_buf; | |
| 109 } | |
| 110 arr[$1-num_buf*buffer]=$2; | |
| 111 } | |
| 112 END{ | |
| 113 max_pos_funct(1,$1-num_buf*buffer); | |
| 114 } | |
| 115 | |
| 116 EOF | |
| 117 ) | |
| 118 | |
| 119 > ${outfile2} | |
| 120 > ${outfile1} | |
| 121 | |
| 122 for seq in `cat ${faseqfile} | tr "\t" "="`; | |
| 123 do | |
| 124 echo $seq | sed 's/=.*$//' > id; | |
| 125 echo $seq | sed 's/^.*=//' > dseq; | |
| 126 dseq=`cat dseq`; | |
| 127 echo ${dseq:${seqstart}:${seqlength}} > dseq | |
| 128 id=`cat id`; | |
| 129 echo ${id} | |
| 130 cat dseq | |
| 131 ${call} -m ${patternfile} -s dseq | awk -v id=${id} '{print $0 "\t" id}' >> ${outfile1} | |
| 132 | |
| 133 #compute average correlation | |
| 134 cat ${outfile1} | gawk '{sum=0; for(i=2;i<=NF;i++) sum+=$i; print $1, sum/(NF-1);}' > avgc | |
| 135 | |
| 136 # compute most likely position of the nucleosome | |
| 137 cat avgc | awk "$awk_program" window=73 buffer=10000 | awk -v num=$patternfile -v id=$id '{print id "\t" num "\t" $0}' >> ${outfile2} ; | |
| 138 done | |
| 139 | |
| 140 rm id dseq avgc | |
| 141 exit 0 | |
| 142 |
