diff JAMM.sh @ 1:243f75d0ed6e draft default tip

Uploaded. Includes new release 1.0.7 with fixed optional controls.
author messersc
date Thu, 19 Feb 2015 05:39:45 -0500
parents d42f4d78c85e
children
line wrap: on
line diff
--- a/JAMM.sh	Wed Dec 17 10:40:23 2014 -0500
+++ b/JAMM.sh	Thu Feb 19 05:39:45 2015 -0500
@@ -1,10 +1,23 @@
-#!/bin/bash
-########################################
-#### Peak finding Pipeline for NGS Data
-#### Bash script		
-########################################
+########################################################################
+# JAMMv1.0.7rev1 is a peak finder for joint analysis of NGS replicates.
+# Copyright (C) 2014-2015  Mahmoud Ibrahim
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+# Contact: mahmoud.ibrahim@mdc-berlin.de
+########################################################################
 
-export LC_ALL=en_US.UTF-8
 
 ##Finding out the path
 sPath="`dirname \"$0\"`"
@@ -15,7 +28,7 @@
 usage()
 {
 cat << EOF
-Welcome to JAMM v1.0.6rev2 (GNU GPLv3). Copyright (C) 2014  Mahmoud Ibrahim.
+Welcome to JAMM v1.0.7rev1 (GNU GPLv3). Copyright (C) 2014-2015  Mahmoud Ibrahim.
 
 This program comes with ABSOLUTELY NO WARRANTY; for details visit http://www.gnu.org/licenses/gpl.html. This is free software, and you are welcome to redistribute it under certain conditions; visit http://www.gnu.org/licenses/gpl.html for details.
 
@@ -24,14 +37,17 @@
    -g      Genome size file (required)
    -o      Output directory (required)
    -c      directory containing input or Control files
+   -f      Fragment length(s) (default: estimated)
+   -r      Resolution, peak or region or window (default: peak)
    -m      Mode, normal or narrow (default: normal)
-   -r      Resolution, peak or region (default: peak)
-   -f      Fragment length(s) (default: estimated)
+   -i      clustering Initialization window selection, deterministic or stochastic (default: deterministic)
+   -b	   Bin Size (default: estimated)
    -w      minimum Window size (default: 2 --- Note: this means minimum_window_size = bin_size x the_value_of_-w)
+   -e	   window Enrichment cutoff, auto or any numeric value (default: 1 --- Set this to "auto" to estimate the window enrichment cutoff)
+   -d	   keep PCR Dupicates in single-end mode, y or n (default: n --- if -t is "paired", this option has no effect) 
+   -t	   Type, single or paired (default: single, requires BED files. paired requires BEDPE files)
    -p	   Number of processors used by R scripts (default: 1)
-   -b	   Bin Size (default: estimated)
-   -t	   Type, single or paired (default: single, requires BED files. paired requires BEDPE files)
-   -e	   window Enrichment cutoff, auto or any numeric value (default: 1 --- Set this to auto to estimate the appropriate window enrichment cutoff)
+
 EOF
 }
 
@@ -48,7 +64,8 @@
 window="2"
 type="single"
 windowe="1"
-
+initModel="deterministic"
+uniq="n"
 
 #Defaults -- Do not change
 sdir=""
@@ -58,8 +75,10 @@
 fraglen="ns"
 ran=$RANDOM
 wdir=$(mktemp -d)
+export LANG=C #locale defaults
+export LC_ALL=C #locale defaults
 
-while getopts "s:g:o:c:m:r:f:p:w:b:t:e:" OPTION
+while getopts "s:g:o:c:m:r:f:p:w:b:t:e:i:d:" OPTION
 do
 	case $OPTION in
 	s) sdir=$OPTARG
@@ -86,6 +105,10 @@
 	;;
 	e) windowe=$OPTARG
 	;;
+	i) initModel=$OPTARG
+	;;
+	d) uniq=$OPTARG
+	;;
 	?)
 	usage
 	exit
@@ -104,14 +127,25 @@
      usage
      exit 1
 fi
+if [[ -d "$out/peaks" ]]; then
+	printf "\n\nOutput directory $out/peaks already exists. I can't override existing results!\n\n"
+	exit 0
+fi
+if [ $fraglen == "ns" ]; then
+	if [[ -d "$out/xcorr" ]]; then
+		printf "\n\nOutput directory $out/xcorr already exists. I can't override existing results!\n\n"
+		exit 0
+	fi
+fi
 #=======================> DONE!
 
 
 
+
 # ============================= 
 # Step One: Initial Processing
 # =============================
-printf "\n\n========================================\nStarted JAMM Pipeline v1.0.6rev2...Hang on!\n========================================\n\n"
+printf "\n\n============================================\nStarted JAMM Pipeline v1.0.7rev1...Hang on!\n============================================\n\n"
 
 if [ ! -d "$wdir" ]; then
 	mkdir $wdir #make working directory
@@ -161,17 +195,21 @@
 fi
 
 #determine average read lengths
-printf "Getting average read lengths..."
-	if [ ! -z $bdir ]; then
-		readC=$(awk '{a=$3-$2;print a;}' "$wdir/bkgd.$ran/ctrl.bed" | perl -lane '$a+=$_;END{print $a/$.}' | awk '{a=$1+0.5;print a;}' | cut -d"." -f1)
-	fi
-	readL=""
-	for s in $sdir/*.bed; do #and for each sample file
-		read=$(awk '{a=$3-$2;print a;}' "$s" | perl -lane '$a+=$_;END{print $a/$.}' | awk '{a=$1+0.5;print a;}' | cut -d"." -f1)
-		readL="$readL,$read"
-	done
-	readL=${readL#","}
-printf "Done!\n"
+printf "Getting average read lengths...\n"
+readL=""
+if [ ! -z $bdir ]; then
+	readC=$(awk '{a=$3-$2;print a;}' "$wdir/bkgd.$ran/ctrl.bed" | perl -lane '$a+=$_;END{print $a/$.}' | awk '{a=$1+0.5;print a;}' | cut -d"." -f1)
+	printf "Control: $readC\n"
+fi
+readL=""
+for s in $sdir/*.bed; do #and for each sample file
+	file=$(basename $s)
+	samplefile=$(echo $file | awk -F"." '{print $1}');	
+	read=$(awk '{a=$3-$2;print a;}' "$s" | perl -lane '$a+=$_;END{print $a/$.}' | awk '{a=$1+0.5;print a;}' | cut -d"." -f1)
+	printf "$samplefile: $read\n"
+	readL="$readL,$read"
+done
+readL=${readL#","}
 #=======================> DONE!
 
 
@@ -299,7 +337,6 @@
 	#remove leading comma
 	samplelist=${samplelist#","}
 	frag=${frag#","}
-	
 	Rscript "$sPath/bincalculator.r" -ibed="$samplelist" -s="$gsize" -rl="$readL" -d="$wdir" -p="$cores"	-f="$frag" -type="$type"
 fi
 if [ $binsize != "ns" ]; then
@@ -360,7 +397,12 @@
 					read=$(echo $readL | cut -f "$k" -d ",")
 					k=$(($k+1))
 				fi
-				perl "$sPath/readshifter.pl" "$wdir/samples.$ran/$samplefile" $shift $read > "$wdir/samples.$ran/ext.$samplefile"
+				if [ $uniq == "y" ]; then
+					perl "$sPath/readshifter.pl" "$wdir/samples.$ran/$samplefile" $shift $read > "$wdir/samples.$ran/ext.$samplefile"
+				fi
+				if [ $uniq == "n" ]; then
+					perl "$sPath/readshifter.pl" "$wdir/samples.$ran/$samplefile" $shift $read | uniq > "$wdir/samples.$ran/ext.$samplefile"
+				fi				
 			fi
 		done
 		
@@ -375,7 +417,13 @@
 				l=$(($dupnum+1))
 				bshift=$(echo $frag | cut -f "$l" -d ",")
 			fi
-			perl "$sPath/readshifter.pl" "$wdir/bkgd.$ran/bkgd.$chr.ctrl.bed" $bshift $readC > "$wdir/bkgd.$ran/ext.bkgd.$chr.ctrl.bed"
+			
+			if [ $uniq == "y" ]; then
+				perl "$sPath/readshifter.pl" "$wdir/bkgd.$ran/bkgd.$chr.ctrl.bed" $bshift $readC > "$wdir/bkgd.$ran/ext.bkgd.$chr.ctrl.bed"
+			fi
+			if [ $uniq == "n" ]; then
+				perl "$sPath/readshifter.pl" "$wdir/bkgd.$ran/bkgd.$chr.ctrl.bed" $bshift $readC | uniq > "$wdir/bkgd.$ran/ext.bkgd.$chr.ctrl.bed"
+			fi
 			bkgdfile="$wdir/bkgd.$ran/ext.bkgd.$chr.ctrl.bed"
 		fi
 		
@@ -384,7 +432,7 @@
 		frag=${frag#","}
 		
 		#call the peak calling R script
-		Rscript "$sPath/peakfinder.r" -sfile="$f" -bednames="$samplelist" -frag="$frag" -bkgd=$bkgdfile -out="$wdir/peaks.$ran/" -clustnummer="$clustno" -resolution="$resol" -window="$window" -p="$cores" -bin="$binsize" -type="$type" -chrcount="$counting" -windowe="$windowe"
+		Rscript "$sPath/peakfinder.r" -sfile="$f" -bednames="$samplelist" -frag="$frag" -bkgd=$bkgdfile -out="$wdir/peaks.$ran/" -clustnummer="$clustno" -resolution="$resol" -window="$window" -p="$cores" -bin="$binsize" -type="$type" -chrcount="$counting" -initModel="$initModel" -windowe="$windowe"
 		counting=$(($counting+1));
 		cp "$wdir/peaks.$ran/$chr.peaks.bed" "$out/peaks/$chr.peaks.bed"
 		rm "$wdir/peaks.$ran/$chr.peaks.bed"
@@ -434,7 +482,7 @@
 	frag=${frag#","}
 		
 	#call the peak calling R script
-	Rscript "$sPath/peakfinder.r" -sfile=$f -bednames=$samplelist -frag="NA" -bkgd=$bkgdfile -out="$wdir/peaks.$ran/" -clustnummer="$clustno" -resolution="$resol" -window="$window" -p="$cores" -bin="$binsize" -type="$type" -chrcount="$counting" -windowe="$windowe"
+	Rscript "$sPath/peakfinder.r" -sfile=$f -bednames=$samplelist -frag="NA" -bkgd=$bkgdfile -out="$wdir/peaks.$ran/" -clustnummer="$clustno" -resolution="$resol" -window="$window" -p="$cores" -bin="$binsize" -type="$type" -chrcount="$counting" -initModel="$initModel" -windowe="$windowe"
 	counting=$(($counting+1));
 	cp "$wdir/peaks.$ran/$chr.peaks.bed" "$out/peaks/$chr.peaks.bed"
 	rm "$wdir/peaks.$ran/$chr.peaks.bed"
@@ -447,10 +495,12 @@
 
 
 #concatenate, sort and filter
-cat $out/peaks/*.bed > $out/peaks/all.narrowPeak
-Rscript "$sPath/peakhelper.r" -filelist="$out/peaks/all.narrowPeak"
-perl "$sPath/peakfilter.pl" $out/peaks/all.narrowPeak | sort -nr -k7 > $out/peaks/filtered.peaks.narrowPeak
-cut -f1-10 $out/peaks/all.narrowPeak | awk -F"\t" -v j=0 '$7 > j' | sort -nr -k7 > $out/peaks/all.peaks.narrowPeak
+cat $out/peaks/*.bed > $out/peaks/all.narrowPeak 
+if [[ -s  $out/peaks/all.narrowPeak ]]; then
+	Rscript "$sPath/peakhelper.r" -filelist="$out/peaks/all.narrowPeak"
+	perl "$sPath/peakfilter.pl" $out/peaks/all.narrowPeak | sort -nr -k7 > $out/peaks/filtered.peaks.narrowPeak
+	cut -f1-10 $out/peaks/all.narrowPeak | awk -F"\t" -v j=0 '$7 > j' | sort -nr -k7 > $out/peaks/all.peaks.narrowPeak
+fi
 rm $out/peaks/all.narrowPeak
 rm $out/peaks/*.bed
 rm $out/peaks/min.peaksize