1
|
1 ########################################################################
|
|
2 # JAMMv1.0.7rev1 is a peak finder for joint analysis of NGS replicates.
|
|
3 # Copyright (C) 2014-2015 Mahmoud Ibrahim
|
|
4 #
|
|
5 # This program is free software: you can redistribute it and/or modify
|
|
6 # it under the terms of the GNU General Public License as published by
|
|
7 # the Free Software Foundation, either version 3 of the License, or
|
|
8 # (at your option) any later version.
|
|
9 #
|
|
10 # This program is distributed in the hope that it will be useful,
|
|
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
13 # GNU General Public License for more details.
|
|
14 #
|
|
15 # You should have received a copy of the GNU General Public License
|
|
16 # along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
17 #
|
|
18 # Contact: mahmoud.ibrahim@mdc-berlin.de
|
|
19 ########################################################################
|
0
|
20
|
|
21 ##Finding out the path
|
|
22 sPath="`dirname \"$0\"`"
|
|
23 sPath="`( cd \"$sPath\" && pwd )`"
|
|
24
|
|
25
|
|
26
|
|
27 usage()
|
|
28 {
|
|
29 cat << EOF
|
1
|
30 Welcome to JAMM v1.0.7rev1 Signal Generator Script (GNU GPLv3). Copyright (C) 2014-2015 Mahmoud Ibrahim.
|
0
|
31
|
|
32 This program comes with ABSOLUTELY NO WARRANTY; for details visit http://www.gnu.org/licenses/gpl.html. This is free software, and you are welcome to redistribute it under certain conditions; visit http://www.gnu.org/licenses/gpl.html for details.
|
|
33
|
|
34 OPTIONS:
|
|
35 -s Directory containing sample files (required)
|
|
36 -g Genome size file (required)
|
|
37 -o Output Directory (required)
|
|
38 -c directory containing input or Control files
|
|
39 -r file with Regions to get signal for (required)
|
|
40 -b Bin size for signal generation (default: 10)
|
1
|
41 -f Fragment lengths (required if -t is "single")
|
0
|
42 -p Number of processors used by R scripts (default: 1)
|
1
|
43 -t Alignment type, paired or single (default: single)
|
0
|
44
|
|
45 EOF
|
|
46 }
|
|
47
|
|
48
|
|
49 # =========================
|
|
50 # Process Input parameters
|
|
51 # =========================
|
|
52
|
|
53
|
|
54 sdir=""
|
|
55 gsize=""
|
|
56 out=""
|
|
57 signal="10"
|
|
58 regs=""
|
|
59 fraglen=""
|
|
60 wdir=$(mktemp -d)
|
|
61 ran=$RANDOM
|
|
62 cores="1"
|
1
|
63 type="single"
|
0
|
64
|
1
|
65 while getopts "s:o:c:r:b:f:g:p:t:" OPTION
|
0
|
66 do
|
|
67 case $OPTION in
|
|
68 s) sdir=$OPTARG
|
|
69 ;;
|
|
70 g) gsize=$OPTARG
|
|
71 ;;
|
|
72 o) out=$OPTARG
|
|
73 ;;
|
|
74 c) bdir=$OPTARG
|
|
75 ;;
|
|
76 r) regs=$OPTARG
|
|
77 ;;
|
|
78 b) signal=$OPTARG
|
|
79 ;;
|
|
80 f) fraglen=$OPTARG
|
|
81 ;;
|
|
82 p) cores=$OPTARG
|
|
83 ;;
|
1
|
84 t) type=$OPTARG
|
|
85 ;;
|
0
|
86 ?)
|
|
87 usage
|
|
88 exit
|
|
89 ;;
|
|
90 esac
|
|
91 done
|
|
92
|
|
93
|
1
|
94 if [[ -z $sdir ]] || [[ -z $regs ]] || [[ -z $out ]]
|
0
|
95 then
|
|
96 usage
|
|
97 exit 1
|
|
98 fi
|
1
|
99
|
|
100 if [[ -z $fraglen ]]
|
|
101 then
|
|
102 if [ $type == "single" ]
|
|
103 then
|
|
104 usage
|
|
105 exit 1
|
|
106 fi
|
|
107 fi
|
|
108 if [[ -d "$out/signal" ]]; then
|
|
109 printf "\n\nOutput directory $out/signal already exists. I can't override existing results!\n\n"
|
|
110 exit 0
|
|
111 fi
|
0
|
112 #=======================> DONE!
|
|
113
|
|
114
|
|
115
|
|
116 # =============================
|
|
117 # Step One: Initial Processing
|
|
118 # =============================
|
1
|
119 printf "\n\n=======================================================\nStarted JAMM1.0.7rev1 Signal Generator Pipeline...Hang on!\n=======================================================\n\n"
|
0
|
120
|
|
121 if [ ! -d "$wdir" ]; then
|
|
122 mkdir $wdir #make working directory
|
|
123 fi
|
|
124 if [ ! -d "$out" ]; then
|
|
125 mkdir $out #make working directory
|
|
126 fi
|
|
127 mkdir $wdir/bkgd.$ran/ #directory to store background files
|
|
128 mkdir $wdir/sizes.$ran/ #chromosomes and sizes
|
|
129 mkdir $wdir/samples.$ran/ #store sample files
|
|
130
|
|
131 dupnum=$(ls -1 $sdir | wc -l) #count how many sample files
|
|
132
|
|
133
|
|
134 #separate chromosome sizes
|
|
135 printf "Loading genome size file..."
|
|
136 ext="$wdir/sizes.$ran/"
|
|
137 awk -v ext="$ext" '{ print >> ext"/size." $1 ".bed" }' $gsize
|
|
138 printf "Done!\n"
|
|
139
|
|
140
|
|
141 printf "Processing sample files..."
|
|
142 #load each chromosome from each sample file
|
|
143 for i in $sdir/*.bed; do
|
|
144 samplefile=$(basename $i)
|
|
145 for f in $wdir/sizes.$ran/*; do
|
|
146 sizefile=$(basename $f)
|
|
147 chr=$(echo $sizefile | awk -F"." '{print $2}' | awk -F"." '{print $1}');
|
|
148 awk -v chr="$chr" -v ext="$wdir/samples.$ran/" -v samplefile="$samplefile" -F"\t" '$1 == chr { print $2"\t"$6 >> ext"sample."chr"."samplefile }' "$i"
|
|
149 done
|
|
150 done
|
|
151 printf "Done!\n"
|
|
152
|
|
153
|
|
154 if [ ! -z $bdir ]; then
|
|
155 #concatenate all background files into one file
|
|
156 printf "Processing control files..."
|
|
157 cat $bdir/*.bed > $wdir/bkgd.$ran/ctrl.bed
|
|
158
|
|
159 for f in $wdir/sizes.$ran/*; do
|
|
160 sizefile=$(basename $f)
|
|
161 chr=$(echo $sizefile | awk -F"." '{print $2}' | awk -F"." '{print $1}');
|
|
162 awk -v chr="$chr" -v ext="$wdir/bkgd.$ran/" -F"\t" '$1 == chr { print $2"\t"$6 >> ext"bkgd."chr".ctrl.bed" }' "$wdir/bkgd.$ran/ctrl.bed"
|
|
163 done
|
|
164
|
|
165
|
|
166 printf "Done!\n"
|
|
167 fi
|
|
168
|
|
169
|
|
170 #determine average read lengths
|
1
|
171 if [ $type == "single" ]; then
|
0
|
172 printf "Getting average read lengths..."
|
|
173 if [ ! -z $bdir ]; then
|
|
174 readC=$(awk '{a=$3-$2;print a;}' "$wdir/bkgd.$ran/ctrl.bed" | perl -lane '$a+=$_;END{print $a/$.}' | awk '{print int($1)}')
|
|
175 fi
|
|
176 readL=""
|
|
177 for s in $sdir/*.bed; do #and for each sample file
|
|
178 read=$(awk '{a=$3-$2;print a;}' "$s" | perl -lane '$a+=$_;END{print $a/$.}' | awk '{print int($1)}')
|
|
179 readL="$readL,$read"
|
|
180 done
|
|
181 readL=${readL#","}
|
|
182 printf "Done!\n"
|
1
|
183 fi
|
0
|
184 #=======================> DONE!
|
|
185
|
|
186
|
|
187
|
|
188
|
|
189
|
|
190
|
|
191 # ===========================
|
|
192 # Step Three: Getting Signal
|
|
193 # ===========================
|
|
194 mkdir $wdir/signal.$ran/
|
|
195 mkdir $out/signal #store signal
|
|
196
|
|
197
|
|
198 printf "Generating Signal...(bin size: $signal)\n"
|
|
199
|
1
|
200 if [ $type == "single" ]; then
|
0
|
201 counting=1;
|
|
202 for f in $wdir/sizes.$ran/*; do #for each chromosome
|
|
203 samplelist=""
|
|
204 frag=""
|
|
205 frag=$fraglen
|
|
206 k=1
|
|
207
|
|
208 sizefile=$(basename $f)
|
|
209 chr=$(echo $sizefile | awk -F"." '{print $2}' | awk -F"." '{print $1}');
|
|
210 chrSize=$(cat $f | cut -f2);
|
|
211 printf "Chromosome $chr: "
|
|
212
|
|
213 #list of sample bed files and fragment lengths
|
|
214 for s in $wdir/samples.$ran/*.bed; do #and for each sample file
|
|
215 samplefile=$(basename $s)
|
|
216 chr2=$(echo $samplefile | awk -F"." '{print $2}');
|
|
217 if [ $chr == $chr2 ] #belonging to this chromosome
|
|
218 then
|
|
219 samplelist="$samplelist,$wdir/samples.$ran/ext.$samplefile"
|
|
220 samplename=$(echo $samplefile | awk -F"." '{ print $3 }')
|
|
221 samplefilename=$(echo $samplefile | cut -d'.' -f 3-)
|
|
222 shift=$(echo "$frag" | cut -f "$k" -d ",")
|
|
223 read=$(echo "$readL" | cut -f "$k" -d ",")
|
|
224 k=$(($k+1))
|
|
225 perl "$sPath/readshifter.pl" "$wdir/samples.$ran/$samplefile" $shift $read > "$wdir/samples.$ran/ext.$samplefile"
|
|
226 fi
|
|
227 done
|
|
228
|
|
229 #control file
|
|
230 bkgdfile="None"
|
|
231 if [ ! -z $bdir ]; then
|
|
232 l=$(($dupnum+1))
|
|
233 bshift=$(echo $frag | cut -f "$l" -d ",")
|
|
234 perl "$sPath/readshifter.pl" "$wdir/bkgd.$ran/bkgd.$chr.ctrl.bed" $bshift $readC > "$wdir/bkgd.$ran/ext.bkgd.$chr.ctrl.bed"
|
|
235 bkgdfile="$wdir/bkgd.$ran/ext.bkgd.$chr.ctrl.bed"
|
|
236 fi
|
|
237
|
|
238 #remove leading comma
|
|
239 samplelist=${samplelist#","}
|
|
240 frag=${frag#","}
|
|
241
|
|
242 #call the peak calling R script
|
1
|
243 Rscript "$sPath/signalmaker.r" -chromoS="$chrSize" -chromo="$chr" -bednames=$samplelist -frag=$frag -bkgd=$bkgdfile -out="$wdir/signal.$ran/" -sig="$signal" -regions="$regs" -p="$cores" -chrcount="$counting" -t="$type"
|
0
|
244 cat $wdir/signal.$ran/*.bedSignal | awk -F"\t" -v j=0 '$4 > j' > "$out/signal/$chr.bedGraph"
|
|
245 rm $wdir/signal.$ran/*.bedSignal
|
|
246 counting=$(($counting+1));
|
|
247 done
|
|
248 counting=1;
|
1
|
249 fi
|
|
250
|
|
251
|
|
252
|
|
253 if [ $type == "paired" ]; then
|
|
254 counting=1;
|
|
255 for f in $wdir/sizes.$ran/*; do #for each chromosome
|
|
256 samplelist=""
|
|
257
|
|
258 sizefile=$(basename $f)
|
|
259 chr=$(echo $sizefile | awk -F"." '{print $2}' | awk -F"." '{print $1}');
|
|
260 chrSize=$(cat $f | cut -f2);
|
|
261 printf "Chromosome $chr: "
|
|
262
|
|
263 #list of sample bed files and fragment lengths
|
|
264 for s in $wdir/samples.$ran/*.bed; do #and for each sample file
|
|
265 samplefile=$(basename $s)
|
|
266 chr2=$(echo $samplefile | awk -F"." '{print $2}');
|
|
267 if [ $chr == $chr2 ] #belonging to this chromosome
|
|
268 then
|
|
269 samplelist="$samplelist,$wdir/samples.$ran/$samplefile"
|
|
270 samplename=$(echo $samplefile | awk -F"." '{ print $3 }')
|
|
271 samplefilename=$(echo $samplefile | cut -d'.' -f 3-)
|
|
272 x="$sdir/$samplefilename"
|
|
273 fi
|
|
274 done
|
|
275
|
|
276 #control file
|
|
277 bkgdfile="None"
|
|
278 if [ ! -z $bdir ]; then
|
|
279 bkgdfile="$wdir/bkgd.$ran/bkgd.$chr.ctrl.bed"
|
|
280 fi
|
|
281
|
|
282 #remove leading comma
|
|
283 samplelist=${samplelist#","}
|
|
284 frag=${frag#","}
|
|
285
|
|
286 #call the peak calling R script
|
|
287 Rscript "$sPath/signalmaker.r" -chromoS="$chrSize" -chromo="$chr" -bednames=$samplelist -frag=$frag -bkgd=$bkgdfile -out="$wdir/signal.$ran/" -sig="$signal" -regions="$regs" -p="$cores" -chrcount="$counting" -t="$type"
|
|
288 cat $wdir/signal.$ran/*.bedSignal | awk -F"\t" -v j=0 '$4 > j' > "$out/signal/$chr.bedGraph"
|
|
289 rm $wdir/signal.$ran/*.bedSignal
|
|
290 counting=$(($counting+1));
|
|
291 done
|
|
292 counting=1;
|
|
293 fi
|
0
|
294 #=======================> DONE!
|
|
295
|
|
296 rm -rf $wdir
|
|
297
|
|
298 printf "\n\n========================================\nWe're done...Congratulations!\n========================================\n\n"
|