Mercurial > repos > messersc > jamm
diff JAMM.sh @ 0:d42f4d78c85e draft
Uploaded
author | messersc |
---|---|
date | Wed, 17 Dec 2014 10:40:23 -0500 |
parents | |
children | 243f75d0ed6e |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/JAMM.sh Wed Dec 17 10:40:23 2014 -0500 @@ -0,0 +1,465 @@ +#!/bin/bash +######################################## +#### Peak finding Pipeline for NGS Data +#### Bash script +######################################## + +export LC_ALL=en_US.UTF-8 + +##Finding out the path +sPath="`dirname \"$0\"`" +sPath="`( cd \"$sPath\" && pwd )`" + + + +usage() +{ +cat << EOF +Welcome to JAMM v1.0.6rev2 (GNU GPLv3). Copyright (C) 2014 Mahmoud Ibrahim. + +This program comes with ABSOLUTELY NO WARRANTY; for details visit http://www.gnu.org/licenses/gpl.html. This is free software, and you are welcome to redistribute it under certain conditions; visit http://www.gnu.org/licenses/gpl.html for details. + +OPTIONS: + -s directory containing Sample files (required) + -g Genome size file (required) + -o Output directory (required) + -c directory containing input or Control files + -m Mode, normal or narrow (default: normal) + -r Resolution, peak or region (default: peak) + -f Fragment length(s) (default: estimated) + -w minimum Window size (default: 2 --- Note: this means minimum_window_size = bin_size x the_value_of_-w) + -p Number of processors used by R scripts (default: 1) + -b Bin Size (default: estimated) + -t Type, single or paired (default: single, requires BED files. paired requires BEDPE files) + -e window Enrichment cutoff, auto or any numeric value (default: 1 --- Set this to auto to estimate the appropriate window enrichment cutoff) +EOF +} + + +# ========================= +# Process Input parameters +# ========================= + + +#Defaults -- Change those if you want +mode="normal" +resol="peak" +cores="1" +window="2" +type="single" +windowe="1" + + +#Defaults -- Do not change +sdir="" +gsize="" +out="" +binsize="ns" +fraglen="ns" +ran=$RANDOM +wdir=$(mktemp -d) + +while getopts "s:g:o:c:m:r:f:p:w:b:t:e:" OPTION +do + case $OPTION in + s) sdir=$OPTARG + ;; + g) gsize=$OPTARG + ;; + o) out=$OPTARG + ;; + c) bdir=$OPTARG + ;; + m) mode=$OPTARG + ;; + r) resol=$OPTARG + ;; + f) fraglen=$OPTARG + ;; + p) cores=$OPTARG + ;; + w) window=$OPTARG + ;; + b) binsize=$OPTARG + ;; + t) type=$OPTARG + ;; + e) windowe=$OPTARG + ;; + ?) + usage + exit + ;; + esac +done +if [ "$mode" == "normal" ]; then + clustno="2" +fi +if [ "$mode" == "narrow" ]; then + clustno="3" +fi + +if [[ -z $sdir ]] || [[ -z $gsize ]] || [[ -z $out ]] +then + usage + exit 1 +fi +#=======================> DONE! + + + +# ============================= +# Step One: Initial Processing +# ============================= +printf "\n\n========================================\nStarted JAMM Pipeline v1.0.6rev2...Hang on!\n========================================\n\n" + +if [ ! -d "$wdir" ]; then + mkdir $wdir #make working directory +fi +if [ ! -d "$out" ]; then + mkdir $out #make output directory +fi +mkdir $wdir/bkgd.$ran/ #directory to store background files +mkdir $wdir/sizes.$ran/ #chromosomes and sizes +mkdir $wdir/samples.$ran/ #store sample files + +dupnum=$(ls -1 $sdir | wc -l) #count how many sample files + + +#separate chromosome sizes +printf "Loading genome size file..." +ext="$wdir/sizes.$ran/" +awk -v ext="$ext" '{ print >> ext"/size." $1 ".bed" }' $gsize +printf "Done!\n" + + +printf "Processing sample files..." +#load each chromosome from each sample file +for i in $sdir/*.bed; do +samplefile=$(basename $i) + for f in $wdir/sizes.$ran/*; do + sizefile=$(basename $f) + chr=$(echo $sizefile | awk -F"." '{print $2}' | awk -F"." '{print $1}'); + awk -v chr="$chr" -v ext="$wdir/samples.$ran/" -v samplefile="$samplefile" -F"\t" '$1 == chr { print $2"\t"$6 >> ext"sample."chr"."samplefile }' "$i" + done +done +printf "Done!\n" + + +if [ ! -z $bdir ]; then +#concatenate all background files into one file +printf "Processing control files..." +cat $bdir/*.bed > $wdir/bkgd.$ran/ctrl.bed + +for f in $wdir/sizes.$ran/*; do + sizefile=$(basename $f) + chr=$(echo $sizefile | awk -F"." '{print $2}' | awk -F"." '{print $1}'); + awk -v chr="$chr" -v ext="$wdir/bkgd.$ran/" -F"\t" '$1 == chr { print $2"\t"$6 >> ext"bkgd."chr".ctrl.bed" }' "$wdir/bkgd.$ran/ctrl.bed" +done + +printf "Done!\n" +fi + +#determine average read lengths +printf "Getting average read lengths..." + if [ ! -z $bdir ]; then + readC=$(awk '{a=$3-$2;print a;}' "$wdir/bkgd.$ran/ctrl.bed" | perl -lane '$a+=$_;END{print $a/$.}' | awk '{a=$1+0.5;print a;}' | cut -d"." -f1) + fi + readL="" + for s in $sdir/*.bed; do #and for each sample file + read=$(awk '{a=$3-$2;print a;}' "$s" | perl -lane '$a+=$_;END{print $a/$.}' | awk '{a=$1+0.5;print a;}' | cut -d"." -f1) + readL="$readL,$read" + done + readL=${readL#","} +printf "Done!\n" +#=======================> DONE! + + + +# ============================= +# Step Two: Fragment Length +# ============================= +#single-end +if [ $type == "single" ]; then + +if [ $fraglen == "ns" ]; then + + ##Counting Where Reads Start and Calculating Cross Correlation + mkdir $wdir/stats.$ran/ #store count files + mkdir $out/xcorr #final xcorr results + + + printf "Calculating Fragment Length(s)...\n" + for f in $wdir/sizes.$ran/*; do #for each chromosome + samplelist="" + readlist="" + sizefile=$(basename $f) + chr=$(echo $sizefile | awk -F"." '{print $2}' | awk -F"." '{print $1}'); + + #list of sample bed files and read lengths + for s in $wdir/samples.$ran/*.bed; do #and for each sample file + samplefile=$(basename $s) + chr2=$(echo $samplefile | awk -F"." '{print $2}'); + if [ $chr == $chr2 ] #belonging to this chromosome + then + samplelist="$samplelist,$wdir/samples.$ran/$samplefile" + fi + done + readlist="$readL" + + #list of control bed files and read lengths + if [ ! -z $bdir ]; then + for s in $wdir/bkgd.$ran/*.bed; do #and for each sample file + samplefile=$(basename $s) + chr2=$(echo $samplefile | awk -F"." '{print $2}'); + if [ $chr == $chr2 ] #belonging to this chromosome + then + samplelist="$samplelist,$wdir/bkgd.$ran/$samplefile" + readlist="$readL,$readC" + fi + done + fi + + #remove leading comma + samplelist=${samplelist#","} + + #call R script for xcorr calculation + Rscript "$sPath/xcorr.r" -ibed="$samplelist" -s="$wdir/sizes.$ran/size.$chr.bed" -rl="$readlist" -d="$wdir/stats.$ran" -p="$cores" + done + + #report xcorr results (samples) + for f in $sdir/*.bed; do + file=$(basename $f) + samplefile=$(echo $file | awk -F"." '{print $1}'); + mkdir "$out/xcorr/$samplefile" #final xcorr results + if [ -f "$wdir/stats.$ran/xc.$samplefile.tab" ]; then + cp $wdir/stats.$ran/xc.$samplefile.tab $out/xcorr/$samplefile/shifts.txt + fi + Rscript "$sPath/xcorrhelper.r" -infile="$out/xcorr/$samplefile/shifts.txt" -out="$out/xcorr/$samplefile" + done + #report xcorr results (control) + if [ ! -z $bdir ]; then + mkdir "$out/xcorr/ctrl" #final xcorr results + if [ -f "$wdir/stats.$ran/xc.ctrl.tab" ]; then + cp $wdir/stats.$ran/xc.ctrl.tab $out/xcorr/ctrl/shifts.txt + fi + Rscript "$sPath/xcorrhelper.r" -infile="$out/xcorr/ctrl/shifts.txt" -out="$out/xcorr/ctrl" + fi + +fi +fi + +#paired-end +if [ $type == "paired" ]; then + printf "Getting Average Fragment Length(s)...\n" + mkdir $out/xcorr #final xcorr results + + for f in $sdir/*.bed; do + file=$(basename $f) + samplefile=$(echo $file | awk -F"." '{print $1}'); + mkdir "$out/xcorr/$samplefile" + frag=$(awk '{a=$6-$2;print a;}' $f | perl -lane '$a+=$_;END{print $a/$.}' | awk '{a=$1+0.5;print a;}' | cut -d"." -f1) + echo "Average_from_paired $frag" > $out/xcorr/$samplefile/shifts.txt + Rscript "$sPath/xcorrhelper.r" -infile="$out/xcorr/$samplefile/shifts.txt" -out="$out/xcorr/$samplefile" + done +fi +#=======================> DONE! + + + +# ================================= +# Step Three: Calculating Bin Size +# ================================= +if [ $binsize == "ns" ]; then + printf "Getting Bin Size: " + + chr=$(sort -nr -k2 $gsize | head -n 1 | awk -F"\t" '{print $1}'); + samplelist="" + frag="" + if [ $fraglen != "ns" ]; then + frag=$fraglen + k=1 + fi + + #list of sample bed files and read lengths + for s in $wdir/samples.$ran/*.bed; do + samplefile=$(basename $s) + chr2=$(echo $samplefile | awk -F"." '{print $2}'); + if [ $chr == $chr2 ] + then + samplelist="$samplelist,$wdir/samples.$ran/$samplefile" + samplename=$(echo $samplefile | awk -F"." '{ print $3 }') + samplefilename=$(echo $samplefile | cut -d'.' -f 3-) + if [ $fraglen == "ns" ]; then + shift=$(awk -F":" '$1 == "Fragment Length" { print $2 }' "$out/xcorr/$samplename/xcorrsummary.txt") + frag="$frag,$shift" + fi + fi + done + #remove leading comma + samplelist=${samplelist#","} + frag=${frag#","} + + Rscript "$sPath/bincalculator.r" -ibed="$samplelist" -s="$gsize" -rl="$readL" -d="$wdir" -p="$cores" -f="$frag" -type="$type" +fi +if [ $binsize != "ns" ]; then + printf "You set a Bin Size: $binsize \n" +fi +#=======================> DONE! + + + +# =========================== +# Step Four: Calling Peaks +# =========================== +mkdir $wdir/peaks.$ran/ #store count files +mkdir $out/peaks #store peak files + +printf "Calling Peaks...(mode: $mode, resolution: $resol)\n" + + +#single-end reads +if [ $type == "single" ]; then + +if [ $binsize == "ns" ]; then + binsize=$(cat "$wdir/binsize.txt") +fi + +counting=1; +for f in $wdir/sizes.$ran/*; do #for each chromosome + samplelist="" + frag="" + k=1 + if [ $fraglen != "ns" ]; then + frag=$fraglen + fi + + + sizefile=$(basename $f) + chr=$(echo $sizefile | awk -F"." '{print $2}' | awk -F"." '{print $1}'); + + printf "Chromosome $chr: " + + #list of sample bed files and fragment lengths + for s in $wdir/samples.$ran/*.bed; do #and for each sample file + samplefile=$(basename $s) + chr2=$(echo $samplefile | awk -F"." '{print $2}'); + if [ $chr == $chr2 ] #belonging to this chromosome + then + samplelist="$samplelist,$wdir/samples.$ran/ext.$samplefile" + samplename=$(echo $samplefile | awk -F"." '{ print $3 }') + samplefilename=$(echo $samplefile | cut -d'.' -f 3-) + if [ $fraglen == "ns" ]; then + shift=$(awk -F":" '$1 == "Fragment Length" { print $2 }' "$out/xcorr/$samplename/xcorrsummary.txt") + frag="$frag,$shift" + read=$(echo $readL | cut -f "$k" -d ",") + k=$(($k+1)) + fi + if [ $fraglen != "ns" ]; then + shift=$(echo $frag | cut -f "$k" -d ",") + read=$(echo $readL | cut -f "$k" -d ",") + k=$(($k+1)) + fi + perl "$sPath/readshifter.pl" "$wdir/samples.$ran/$samplefile" $shift $read > "$wdir/samples.$ran/ext.$samplefile" + fi + done + + #control file + bkgdfile="None" + if [ ! -z $bdir ]; then + if [ $fraglen == "ns" ]; then + bshift=$(awk -F":" '$1 == "Fragment Length" { print $2 }' "$out/xcorr/ctrl/xcorrsummary.txt") + frag="$frag,$bshift" + fi + if [ $fraglen != "ns" ]; then + l=$(($dupnum+1)) + bshift=$(echo $frag | cut -f "$l" -d ",") + fi + perl "$sPath/readshifter.pl" "$wdir/bkgd.$ran/bkgd.$chr.ctrl.bed" $bshift $readC > "$wdir/bkgd.$ran/ext.bkgd.$chr.ctrl.bed" + bkgdfile="$wdir/bkgd.$ran/ext.bkgd.$chr.ctrl.bed" + fi + + #remove leading comma + samplelist=${samplelist#","} + frag=${frag#","} + + #call the peak calling R script + Rscript "$sPath/peakfinder.r" -sfile="$f" -bednames="$samplelist" -frag="$frag" -bkgd=$bkgdfile -out="$wdir/peaks.$ran/" -clustnummer="$clustno" -resolution="$resol" -window="$window" -p="$cores" -bin="$binsize" -type="$type" -chrcount="$counting" -windowe="$windowe" + counting=$(($counting+1)); + cp "$wdir/peaks.$ran/$chr.peaks.bed" "$out/peaks/$chr.peaks.bed" + rm "$wdir/peaks.$ran/$chr.peaks.bed" +done +counting=1; +fi + + +#paired-end reads +if [ $type == "paired" ]; then + +if [ $binsize == "ns" ]; then + binsize=$(cat "$wdir/binsize.txt") +fi + + +counting=1; +for f in $wdir/sizes.$ran/*; do #for each chromosome + samplelist="" + + sizefile=$(basename $f) + chr=$(echo $sizefile | awk -F"." '{print $2}' | awk -F"." '{print $1}'); + + printf "Chromosome $chr: " + + #list of sample bed files and fragment lengths + for s in $wdir/samples.$ran/*.bed; do #and for each sample file + samplefile=$(basename $s) + chr2=$(echo $samplefile | awk -F"." '{print $2}'); + if [ $chr == $chr2 ] #belonging to this chromosome + then + samplelist="$samplelist,$wdir/samples.$ran/$samplefile" + samplename=$(echo $samplefile | awk -F"." '{ print $3 }') + samplefilename=$(echo $samplefile | cut -d'.' -f 3-) + x="$sdir/$samplefilename" + fi + done + + #control file + bkgdfile="None" + if [ ! -z $bdir ]; then + bkgdfile="$wdir/bkgd.$ran/bkgd.$chr.ctrl.bed" + fi + + #remove leading comma + samplelist=${samplelist#","} + frag=${frag#","} + + #call the peak calling R script + Rscript "$sPath/peakfinder.r" -sfile=$f -bednames=$samplelist -frag="NA" -bkgd=$bkgdfile -out="$wdir/peaks.$ran/" -clustnummer="$clustno" -resolution="$resol" -window="$window" -p="$cores" -bin="$binsize" -type="$type" -chrcount="$counting" -windowe="$windowe" + counting=$(($counting+1)); + cp "$wdir/peaks.$ran/$chr.peaks.bed" "$out/peaks/$chr.peaks.bed" + rm "$wdir/peaks.$ran/$chr.peaks.bed" +done +counting=1; +fi + + +cp $wdir/peaks.$ran/min.peaksize $out/peaks/min.peaksize + + +#concatenate, sort and filter +cat $out/peaks/*.bed > $out/peaks/all.narrowPeak +Rscript "$sPath/peakhelper.r" -filelist="$out/peaks/all.narrowPeak" +perl "$sPath/peakfilter.pl" $out/peaks/all.narrowPeak | sort -nr -k7 > $out/peaks/filtered.peaks.narrowPeak +cut -f1-10 $out/peaks/all.narrowPeak | awk -F"\t" -v j=0 '$7 > j' | sort -nr -k7 > $out/peaks/all.peaks.narrowPeak +rm $out/peaks/all.narrowPeak +rm $out/peaks/*.bed +rm $out/peaks/min.peaksize +#=======================> DONE! + + + +rm -rf $wdir + + + +printf "\n\n========================================\nWe're done...Congratulations!\n========================================\n\n"