| 8 | 1 #!/bin/bash | 
|  | 2 # $Id: bigwig2summary_stdout 34 2014-02-20 08:31:20Z jens $ | 
|  | 3 | 
|  | 4 #USE IN GALAXY | 
|  | 5 #This script extract summary values (mean, min, max, std or coverage) from a bigwig file for a number of equal sized bins across genomic regions given in bed file. | 
|  | 6 #If the bed file contains 6 columns (or more), column 6 is expected to contain strand information. Summary values from a negative strand will be reversed. | 
|  | 7 | 
|  | 8 ######################################## | 
|  | 9 #bigwigSummary has 3 non-standard outputs: | 
|  | 10 #1) n/a #(no data in bin) | 
|  | 11 #2) no data #(no data in entire region) | 
|  | 12 #3) <number> is not a valid option (typically if negative coordinate) | 
|  | 13 | 
|  | 14 #Default settings & input parsing. "" indicates required user input. | 
|  | 15 nbins=1 | 
|  | 16 rm_header_line=0 | 
|  | 17 summary_type=mean | 
|  | 18 | 
|  | 19 #parse input | 
|  | 20 while getopts hef:b:o:n:t: myarg | 
|  | 21 do	case "$myarg" in | 
|  | 22 	h)	echo "Usage: bigwig2summary_stdout -f <BIGWIG_FILE> -b <BED_FILE> -o <ASSEMBLY> -n <NUMBER OF BINS> -t <SUMMARY TYPE> -e" | 
|  | 23 	        echo "Corrects for strand if bed-file has 6 columns or more. Col 6 assumed to contain strand infomation" | 
|  | 24 		exit ;; | 
|  | 25 	f)	bigwig_file="$OPTARG" ;; | 
|  | 26 	b)	bed_file="$OPTARG" ;; #must be tab separated without header | 
|  | 27 	o)	org_assembly="$OPTARG" ;; | 
|  | 28 	n)	nbins="$OPTARG" ;; | 
|  | 29 	t)	summary_type="$OPTARG" ;; | 
|  | 30 	e)	rm_header_line=2 ;; #flag. if -e then first line is removed | 
|  | 31 	[?])	echo "Usage: bigwig2summary_stdout -f <BIGWIG_FILE> -b <BED_FILE> -o <ASSEMBLY> -n <NUMBER OF BINS> -t <SUMMARY TYPE> -e" | 
|  | 32 	        echo "Corrects for strand if bed-file has 6 columns or more. Col 6 assumed to contain strand infomation" | 
|  | 33 		exit 1 ;; | 
|  | 34 	esac | 
|  | 35 done | 
|  | 36 | 
|  | 37 ################################################### | 
|  | 38 ###VALIDATE INPUT | 
|  | 39 ################################################### | 
|  | 40 | 
|  | 41 #get chromosome sizes from bigwig file. bigwig file does not contain name of genome assembly. | 
|  | 42 org_assembly_file=`mktemp -u` | 
|  | 43 fetchChromSizes $org_assembly 2>/dev/null > $org_assembly_file | 
|  | 44 if [ $? -ne 0 ]; then | 
|  | 45   echo "ERROR: Organism genome assembly does not exist" | 
|  | 46   rm $org_assembly_file | 
|  | 47   exit | 
|  | 48 fi | 
|  | 49 | 
|  | 50 #check input bed_file. bedClip only checks first 3 columns! | 
|  | 51 if [ `bedClip -verbose=2 <(tail -n +${rm_header_line} $bed_file) $org_assembly_file /dev/null 2>&1 | wc -l` -gt 0 ]; then | 
|  | 52   echo -e "ERROR: Input bed file is not in proper format!\nTry 'bedClip' to find lines causing error" | 
|  | 53   echo "Make sure that bigwig and bed files are using the same genome assembly" | 
|  | 54   exit 1 | 
|  | 55 fi | 
|  | 56 | 
|  | 57 #make string of "nbins" 0's to insert in regions, where no reads are found | 
|  | 58 if [ $nbins -gt 1 ]; then | 
|  | 59   seq_string=`seq 1 $nbins` | 
|  | 60   zero_string=`printf "0\t%.s" {$seq_string} | perl -pe "s/\t$//"` | 
|  | 61 fi | 
|  | 62 | 
|  | 63 #make sure the given summary type exists | 
|  | 64 if [ `echo $summary_type | egrep "(mean|max|min|std|coverage)" | wc -l` -ne 1 ]; then | 
|  | 65   echo "ERROR: Summary type must be: mean, max, min, std or coverage. Default is 'mean'" | 
|  | 66   exit 1 | 
|  | 67 fi | 
|  | 68 | 
|  | 69 #determine number of fields in bed_file | 
|  | 70 if [ `tail -n +${rm_header_line} $bed_file | awk '{print NF}' | uniq | wc -l` -eq 1 ]; then | 
|  | 71   nfields_bed=`tail -n +${rm_header_line} $bed_file | awk '{print NF}' | uniq` | 
|  | 72 else | 
|  | 73   echo "ERROR: Bed file does not have constant number of line columns" | 
|  | 74   exit 1 | 
|  | 75 fi | 
|  | 76 | 
|  | 77 if [[ $nbins -gt 1 && $nfields_bed -ge 6 ]]; then | 
|  | 78   strand_uniq_chars=`tail -n +${rm_header_line} $bed_file | cut -f6 | sort -u | perl -pe "s/\n//"` | 
|  | 79   if [[ $strand_uniq_chars != "+-" && $strand_uniq_chars != "-+" ]] ; then | 
|  | 80     echo "ERROR: Column 6 in bed file must only contain '+' or '-' characters" | 
|  | 81     exit 1 | 
|  | 82   fi | 
|  | 83 fi | 
|  | 84 | 
|  | 85 ################################################### | 
|  | 86 ###EXTRACT DENSITIES FROM NORMALIZED BIGWIG FILE | 
|  | 87 ################################################### | 
|  | 88 | 
|  | 89 | 
|  | 90 #if more than 1 bin AND >= 6 fields (i.e. has strand column) | 
|  | 91 if [[ $nbins -gt 1 && $nfields_bed -ge 6 ]]; then | 
|  | 92 | 
|  | 93   #cut columns 1-3+6 | rm header if flag set | 
|  | 94   cut -f1-3,6 $bed_file | tail -n +${rm_header_line} | while read -r line; do | 
|  | 95 | 
|  | 96     #read 4 fields into variables | 
|  | 97     read -r cur_chr cur_start cur_end cur_strand <<<"$line" | 
|  | 98     #run bigWigSummary. Combine stdout and stderr | treat exceptions and errors after 'done' | 
|  | 99     bigWigSummary $bigwig_file $cur_chr $cur_start $cur_end $nbins -type=${summary_type} 2>&1 | perl -pe "s/no data.+$/${zero_string}/" | awk 'BEGIN{OFS="\t"}{ if("'"$cur_strand"'"=="-") { for (i=NF; i>0; i--) { printf("%s\t",$i) } printf("\n") } else { print $0 } }' | 
|  | 100   done | perl -pe "s/n\/a/0/g" | perl -pe "s/^\t?0\t?$/${zero_string}/" | perl -pe "s/ +/\t/g"  | perl -pe "s/\t$//" |  sed '/^$/d' | 
|  | 101 | 
|  | 102 #if more than 1 bin AND less than 6 fields (i.e. no strand column) | 
|  | 103 elif [[ $nbins -gt 1 && $nfields_bed -lt 6 ]]; then | 
|  | 104 | 
|  | 105   #cut columns 1-3 | rm header if flag set | 
|  | 106   cut -f1-3 $bed_file | tail -n +${rm_header_line} | while read -r line; do | 
|  | 107 | 
|  | 108     #read 3 fields into variables | 
|  | 109     read -r cur_chr cur_start cur_end <<<"$line" | 
|  | 110     #run bigWigSummary. Combine stdout and stderr | treat exceptions and errors after 'done' | 
|  | 111     bigWigSummary $bigwig_file $cur_chr $cur_start $cur_end $nbins -type=${summary_type} 2>&1 | 
|  | 112 | 
|  | 113   done | perl -pe "s/n\/a/0/g" | perl -pe "s/no data.+$/${zero_string}/" | perl -pe "s/^\t?0\t?$/${zero_string}/" | perl -pe "s/ +/\t/g" | sed '/^$/d' | 
|  | 114 | 
|  | 115 | 
|  | 116 #if 1 bin. Strand column irrelevant | 
|  | 117 else | 
|  | 118 | 
|  | 119   cut -f1-3 $bed_file | tail -n +${rm_header_line} | while read -r line; do | 
|  | 120 | 
|  | 121     read -r cur_chr cur_start cur_end <<<"$line" | 
|  | 122     bigWigSummary $bigwig_file $cur_chr $cur_start $cur_end 1 -type=${summary_type} 2>&1 | 
|  | 123 | 
|  | 124   done | perl -pe "s/no data.+$/0/" | perl -pe "s/n\/a/0/g" | sed '/^$/d' | 
|  | 125 | 
|  | 126 fi | 
|  | 127 | 
|  | 128 | 
|  | 129 | 
|  | 130 | 
|  | 131 |