8
|
1 #!/bin/bash
|
|
2 # $Id: bigwig2summary_stdout 34 2014-02-20 08:31:20Z jens $
|
|
3
|
|
4 #USE IN GALAXY
|
|
5 #This script extract summary values (mean, min, max, std or coverage) from a bigwig file for a number of equal sized bins across genomic regions given in bed file.
|
|
6 #If the bed file contains 6 columns (or more), column 6 is expected to contain strand information. Summary values from a negative strand will be reversed.
|
|
7
|
|
8 ########################################
|
|
9 #bigwigSummary has 3 non-standard outputs:
|
|
10 #1) n/a #(no data in bin)
|
|
11 #2) no data #(no data in entire region)
|
|
12 #3) <number> is not a valid option (typically if negative coordinate)
|
|
13
|
|
14 #Default settings & input parsing. "" indicates required user input.
|
|
15 nbins=1
|
|
16 rm_header_line=0
|
|
17 summary_type=mean
|
|
18
|
|
19 #parse input
|
|
20 while getopts hef:b:o:n:t: myarg
|
|
21 do case "$myarg" in
|
|
22 h) echo "Usage: bigwig2summary_stdout -f <BIGWIG_FILE> -b <BED_FILE> -o <ASSEMBLY> -n <NUMBER OF BINS> -t <SUMMARY TYPE> -e"
|
|
23 echo "Corrects for strand if bed-file has 6 columns or more. Col 6 assumed to contain strand infomation"
|
|
24 exit ;;
|
|
25 f) bigwig_file="$OPTARG" ;;
|
|
26 b) bed_file="$OPTARG" ;; #must be tab separated without header
|
|
27 o) org_assembly="$OPTARG" ;;
|
|
28 n) nbins="$OPTARG" ;;
|
|
29 t) summary_type="$OPTARG" ;;
|
|
30 e) rm_header_line=2 ;; #flag. if -e then first line is removed
|
|
31 [?]) echo "Usage: bigwig2summary_stdout -f <BIGWIG_FILE> -b <BED_FILE> -o <ASSEMBLY> -n <NUMBER OF BINS> -t <SUMMARY TYPE> -e"
|
|
32 echo "Corrects for strand if bed-file has 6 columns or more. Col 6 assumed to contain strand infomation"
|
|
33 exit 1 ;;
|
|
34 esac
|
|
35 done
|
|
36
|
|
37 ###################################################
|
|
38 ###VALIDATE INPUT
|
|
39 ###################################################
|
|
40
|
|
41 #get chromosome sizes from bigwig file. bigwig file does not contain name of genome assembly.
|
|
42 org_assembly_file=`mktemp -u`
|
|
43 fetchChromSizes $org_assembly 2>/dev/null > $org_assembly_file
|
|
44 if [ $? -ne 0 ]; then
|
|
45 echo "ERROR: Organism genome assembly does not exist"
|
|
46 rm $org_assembly_file
|
|
47 exit
|
|
48 fi
|
|
49
|
|
50 #check input bed_file. bedClip only checks first 3 columns!
|
|
51 if [ `bedClip -verbose=2 <(tail -n +${rm_header_line} $bed_file) $org_assembly_file /dev/null 2>&1 | wc -l` -gt 0 ]; then
|
|
52 echo -e "ERROR: Input bed file is not in proper format!\nTry 'bedClip' to find lines causing error"
|
|
53 echo "Make sure that bigwig and bed files are using the same genome assembly"
|
|
54 exit 1
|
|
55 fi
|
|
56
|
|
57 #make string of "nbins" 0's to insert in regions, where no reads are found
|
|
58 if [ $nbins -gt 1 ]; then
|
|
59 seq_string=`seq 1 $nbins`
|
|
60 zero_string=`printf "0\t%.s" {$seq_string} | perl -pe "s/\t$//"`
|
|
61 fi
|
|
62
|
|
63 #make sure the given summary type exists
|
|
64 if [ `echo $summary_type | egrep "(mean|max|min|std|coverage)" | wc -l` -ne 1 ]; then
|
|
65 echo "ERROR: Summary type must be: mean, max, min, std or coverage. Default is 'mean'"
|
|
66 exit 1
|
|
67 fi
|
|
68
|
|
69 #determine number of fields in bed_file
|
|
70 if [ `tail -n +${rm_header_line} $bed_file | awk '{print NF}' | uniq | wc -l` -eq 1 ]; then
|
|
71 nfields_bed=`tail -n +${rm_header_line} $bed_file | awk '{print NF}' | uniq`
|
|
72 else
|
|
73 echo "ERROR: Bed file does not have constant number of line columns"
|
|
74 exit 1
|
|
75 fi
|
|
76
|
|
77 if [[ $nbins -gt 1 && $nfields_bed -ge 6 ]]; then
|
|
78 strand_uniq_chars=`tail -n +${rm_header_line} $bed_file | cut -f6 | sort -u | perl -pe "s/\n//"`
|
|
79 if [[ $strand_uniq_chars != "+-" && $strand_uniq_chars != "-+" ]] ; then
|
|
80 echo "ERROR: Column 6 in bed file must only contain '+' or '-' characters"
|
|
81 exit 1
|
|
82 fi
|
|
83 fi
|
|
84
|
|
85 ###################################################
|
|
86 ###EXTRACT DENSITIES FROM NORMALIZED BIGWIG FILE
|
|
87 ###################################################
|
|
88
|
|
89
|
|
90 #if more than 1 bin AND >= 6 fields (i.e. has strand column)
|
|
91 if [[ $nbins -gt 1 && $nfields_bed -ge 6 ]]; then
|
|
92
|
|
93 #cut columns 1-3+6 | rm header if flag set
|
|
94 cut -f1-3,6 $bed_file | tail -n +${rm_header_line} | while read -r line; do
|
|
95
|
|
96 #read 4 fields into variables
|
|
97 read -r cur_chr cur_start cur_end cur_strand <<<"$line"
|
|
98 #run bigWigSummary. Combine stdout and stderr | treat exceptions and errors after 'done'
|
|
99 bigWigSummary $bigwig_file $cur_chr $cur_start $cur_end $nbins -type=${summary_type} 2>&1 | perl -pe "s/no data.+$/${zero_string}/" | awk 'BEGIN{OFS="\t"}{ if("'"$cur_strand"'"=="-") { for (i=NF; i>0; i--) { printf("%s\t",$i) } printf("\n") } else { print $0 } }'
|
|
100 done | perl -pe "s/n\/a/0/g" | perl -pe "s/^\t?0\t?$/${zero_string}/" | perl -pe "s/ +/\t/g" | perl -pe "s/\t$//" | sed '/^$/d'
|
|
101
|
|
102 #if more than 1 bin AND less than 6 fields (i.e. no strand column)
|
|
103 elif [[ $nbins -gt 1 && $nfields_bed -lt 6 ]]; then
|
|
104
|
|
105 #cut columns 1-3 | rm header if flag set
|
|
106 cut -f1-3 $bed_file | tail -n +${rm_header_line} | while read -r line; do
|
|
107
|
|
108 #read 3 fields into variables
|
|
109 read -r cur_chr cur_start cur_end <<<"$line"
|
|
110 #run bigWigSummary. Combine stdout and stderr | treat exceptions and errors after 'done'
|
|
111 bigWigSummary $bigwig_file $cur_chr $cur_start $cur_end $nbins -type=${summary_type} 2>&1
|
|
112
|
|
113 done | perl -pe "s/n\/a/0/g" | perl -pe "s/no data.+$/${zero_string}/" | perl -pe "s/^\t?0\t?$/${zero_string}/" | perl -pe "s/ +/\t/g" | sed '/^$/d'
|
|
114
|
|
115
|
|
116 #if 1 bin. Strand column irrelevant
|
|
117 else
|
|
118
|
|
119 cut -f1-3 $bed_file | tail -n +${rm_header_line} | while read -r line; do
|
|
120
|
|
121 read -r cur_chr cur_start cur_end <<<"$line"
|
|
122 bigWigSummary $bigwig_file $cur_chr $cur_start $cur_end 1 -type=${summary_type} 2>&1
|
|
123
|
|
124 done | perl -pe "s/no data.+$/0/" | perl -pe "s/n\/a/0/g" | sed '/^$/d'
|
|
125
|
|
126 fi
|
|
127
|
|
128
|
|
129
|
|
130
|
|
131
|