comparison bigwig2summary.sh @ 8:ad32956798e3 draft

Uploaded
author nikos
date Tue, 02 Sep 2014 12:04:30 -0400
parents
children
comparison
equal deleted inserted replaced
7:eebdd6da9353 8:ad32956798e3
1 #!/bin/bash
2 # $Id: bigwig2summary_stdout 34 2014-02-20 08:31:20Z jens $
3
4 #USE IN GALAXY
5 #This script extract summary values (mean, min, max, std or coverage) from a bigwig file for a number of equal sized bins across genomic regions given in bed file.
6 #If the bed file contains 6 columns (or more), column 6 is expected to contain strand information. Summary values from a negative strand will be reversed.
7
8 ########################################
9 #bigwigSummary has 3 non-standard outputs:
10 #1) n/a #(no data in bin)
11 #2) no data #(no data in entire region)
12 #3) <number> is not a valid option (typically if negative coordinate)
13
14 #Default settings & input parsing. "" indicates required user input.
15 nbins=1
16 rm_header_line=0
17 summary_type=mean
18
19 #parse input
20 while getopts hef:b:o:n:t: myarg
21 do case "$myarg" in
22 h) echo "Usage: bigwig2summary_stdout -f <BIGWIG_FILE> -b <BED_FILE> -o <ASSEMBLY> -n <NUMBER OF BINS> -t <SUMMARY TYPE> -e"
23 echo "Corrects for strand if bed-file has 6 columns or more. Col 6 assumed to contain strand infomation"
24 exit ;;
25 f) bigwig_file="$OPTARG" ;;
26 b) bed_file="$OPTARG" ;; #must be tab separated without header
27 o) org_assembly="$OPTARG" ;;
28 n) nbins="$OPTARG" ;;
29 t) summary_type="$OPTARG" ;;
30 e) rm_header_line=2 ;; #flag. if -e then first line is removed
31 [?]) echo "Usage: bigwig2summary_stdout -f <BIGWIG_FILE> -b <BED_FILE> -o <ASSEMBLY> -n <NUMBER OF BINS> -t <SUMMARY TYPE> -e"
32 echo "Corrects for strand if bed-file has 6 columns or more. Col 6 assumed to contain strand infomation"
33 exit 1 ;;
34 esac
35 done
36
37 ###################################################
38 ###VALIDATE INPUT
39 ###################################################
40
41 #get chromosome sizes from bigwig file. bigwig file does not contain name of genome assembly.
42 org_assembly_file=`mktemp -u`
43 fetchChromSizes $org_assembly 2>/dev/null > $org_assembly_file
44 if [ $? -ne 0 ]; then
45 echo "ERROR: Organism genome assembly does not exist"
46 rm $org_assembly_file
47 exit
48 fi
49
50 #check input bed_file. bedClip only checks first 3 columns!
51 if [ `bedClip -verbose=2 <(tail -n +${rm_header_line} $bed_file) $org_assembly_file /dev/null 2>&1 | wc -l` -gt 0 ]; then
52 echo -e "ERROR: Input bed file is not in proper format!\nTry 'bedClip' to find lines causing error"
53 echo "Make sure that bigwig and bed files are using the same genome assembly"
54 exit 1
55 fi
56
57 #make string of "nbins" 0's to insert in regions, where no reads are found
58 if [ $nbins -gt 1 ]; then
59 seq_string=`seq 1 $nbins`
60 zero_string=`printf "0\t%.s" {$seq_string} | perl -pe "s/\t$//"`
61 fi
62
63 #make sure the given summary type exists
64 if [ `echo $summary_type | egrep "(mean|max|min|std|coverage)" | wc -l` -ne 1 ]; then
65 echo "ERROR: Summary type must be: mean, max, min, std or coverage. Default is 'mean'"
66 exit 1
67 fi
68
69 #determine number of fields in bed_file
70 if [ `tail -n +${rm_header_line} $bed_file | awk '{print NF}' | uniq | wc -l` -eq 1 ]; then
71 nfields_bed=`tail -n +${rm_header_line} $bed_file | awk '{print NF}' | uniq`
72 else
73 echo "ERROR: Bed file does not have constant number of line columns"
74 exit 1
75 fi
76
77 if [[ $nbins -gt 1 && $nfields_bed -ge 6 ]]; then
78 strand_uniq_chars=`tail -n +${rm_header_line} $bed_file | cut -f6 | sort -u | perl -pe "s/\n//"`
79 if [[ $strand_uniq_chars != "+-" && $strand_uniq_chars != "-+" ]] ; then
80 echo "ERROR: Column 6 in bed file must only contain '+' or '-' characters"
81 exit 1
82 fi
83 fi
84
85 ###################################################
86 ###EXTRACT DENSITIES FROM NORMALIZED BIGWIG FILE
87 ###################################################
88
89
90 #if more than 1 bin AND >= 6 fields (i.e. has strand column)
91 if [[ $nbins -gt 1 && $nfields_bed -ge 6 ]]; then
92
93 #cut columns 1-3+6 | rm header if flag set
94 cut -f1-3,6 $bed_file | tail -n +${rm_header_line} | while read -r line; do
95
96 #read 4 fields into variables
97 read -r cur_chr cur_start cur_end cur_strand <<<"$line"
98 #run bigWigSummary. Combine stdout and stderr | treat exceptions and errors after 'done'
99 bigWigSummary $bigwig_file $cur_chr $cur_start $cur_end $nbins -type=${summary_type} 2>&1 | perl -pe "s/no data.+$/${zero_string}/" | awk 'BEGIN{OFS="\t"}{ if("'"$cur_strand"'"=="-") { for (i=NF; i>0; i--) { printf("%s\t",$i) } printf("\n") } else { print $0 } }'
100 done | perl -pe "s/n\/a/0/g" | perl -pe "s/^\t?0\t?$/${zero_string}/" | perl -pe "s/ +/\t/g" | perl -pe "s/\t$//" | sed '/^$/d'
101
102 #if more than 1 bin AND less than 6 fields (i.e. no strand column)
103 elif [[ $nbins -gt 1 && $nfields_bed -lt 6 ]]; then
104
105 #cut columns 1-3 | rm header if flag set
106 cut -f1-3 $bed_file | tail -n +${rm_header_line} | while read -r line; do
107
108 #read 3 fields into variables
109 read -r cur_chr cur_start cur_end <<<"$line"
110 #run bigWigSummary. Combine stdout and stderr | treat exceptions and errors after 'done'
111 bigWigSummary $bigwig_file $cur_chr $cur_start $cur_end $nbins -type=${summary_type} 2>&1
112
113 done | perl -pe "s/n\/a/0/g" | perl -pe "s/no data.+$/${zero_string}/" | perl -pe "s/^\t?0\t?$/${zero_string}/" | perl -pe "s/ +/\t/g" | sed '/^$/d'
114
115
116 #if 1 bin. Strand column irrelevant
117 else
118
119 cut -f1-3 $bed_file | tail -n +${rm_header_line} | while read -r line; do
120
121 read -r cur_chr cur_start cur_end <<<"$line"
122 bigWigSummary $bigwig_file $cur_chr $cur_start $cur_end 1 -type=${summary_type} 2>&1
123
124 done | perl -pe "s/no data.+$/0/" | perl -pe "s/n\/a/0/g" | sed '/^$/d'
125
126 fi
127
128
129
130
131