Mercurial > repos > nikos > ucsc_tools
comparison bigwig2summary.sh @ 8:ad32956798e3 draft
Uploaded
author | nikos |
---|---|
date | Tue, 02 Sep 2014 12:04:30 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
7:eebdd6da9353 | 8:ad32956798e3 |
---|---|
1 #!/bin/bash | |
2 # $Id: bigwig2summary_stdout 34 2014-02-20 08:31:20Z jens $ | |
3 | |
4 #USE IN GALAXY | |
5 #This script extract summary values (mean, min, max, std or coverage) from a bigwig file for a number of equal sized bins across genomic regions given in bed file. | |
6 #If the bed file contains 6 columns (or more), column 6 is expected to contain strand information. Summary values from a negative strand will be reversed. | |
7 | |
8 ######################################## | |
9 #bigwigSummary has 3 non-standard outputs: | |
10 #1) n/a #(no data in bin) | |
11 #2) no data #(no data in entire region) | |
12 #3) <number> is not a valid option (typically if negative coordinate) | |
13 | |
14 #Default settings & input parsing. "" indicates required user input. | |
15 nbins=1 | |
16 rm_header_line=0 | |
17 summary_type=mean | |
18 | |
19 #parse input | |
20 while getopts hef:b:o:n:t: myarg | |
21 do case "$myarg" in | |
22 h) echo "Usage: bigwig2summary_stdout -f <BIGWIG_FILE> -b <BED_FILE> -o <ASSEMBLY> -n <NUMBER OF BINS> -t <SUMMARY TYPE> -e" | |
23 echo "Corrects for strand if bed-file has 6 columns or more. Col 6 assumed to contain strand infomation" | |
24 exit ;; | |
25 f) bigwig_file="$OPTARG" ;; | |
26 b) bed_file="$OPTARG" ;; #must be tab separated without header | |
27 o) org_assembly="$OPTARG" ;; | |
28 n) nbins="$OPTARG" ;; | |
29 t) summary_type="$OPTARG" ;; | |
30 e) rm_header_line=2 ;; #flag. if -e then first line is removed | |
31 [?]) echo "Usage: bigwig2summary_stdout -f <BIGWIG_FILE> -b <BED_FILE> -o <ASSEMBLY> -n <NUMBER OF BINS> -t <SUMMARY TYPE> -e" | |
32 echo "Corrects for strand if bed-file has 6 columns or more. Col 6 assumed to contain strand infomation" | |
33 exit 1 ;; | |
34 esac | |
35 done | |
36 | |
37 ################################################### | |
38 ###VALIDATE INPUT | |
39 ################################################### | |
40 | |
41 #get chromosome sizes from bigwig file. bigwig file does not contain name of genome assembly. | |
42 org_assembly_file=`mktemp -u` | |
43 fetchChromSizes $org_assembly 2>/dev/null > $org_assembly_file | |
44 if [ $? -ne 0 ]; then | |
45 echo "ERROR: Organism genome assembly does not exist" | |
46 rm $org_assembly_file | |
47 exit | |
48 fi | |
49 | |
50 #check input bed_file. bedClip only checks first 3 columns! | |
51 if [ `bedClip -verbose=2 <(tail -n +${rm_header_line} $bed_file) $org_assembly_file /dev/null 2>&1 | wc -l` -gt 0 ]; then | |
52 echo -e "ERROR: Input bed file is not in proper format!\nTry 'bedClip' to find lines causing error" | |
53 echo "Make sure that bigwig and bed files are using the same genome assembly" | |
54 exit 1 | |
55 fi | |
56 | |
57 #make string of "nbins" 0's to insert in regions, where no reads are found | |
58 if [ $nbins -gt 1 ]; then | |
59 seq_string=`seq 1 $nbins` | |
60 zero_string=`printf "0\t%.s" {$seq_string} | perl -pe "s/\t$//"` | |
61 fi | |
62 | |
63 #make sure the given summary type exists | |
64 if [ `echo $summary_type | egrep "(mean|max|min|std|coverage)" | wc -l` -ne 1 ]; then | |
65 echo "ERROR: Summary type must be: mean, max, min, std or coverage. Default is 'mean'" | |
66 exit 1 | |
67 fi | |
68 | |
69 #determine number of fields in bed_file | |
70 if [ `tail -n +${rm_header_line} $bed_file | awk '{print NF}' | uniq | wc -l` -eq 1 ]; then | |
71 nfields_bed=`tail -n +${rm_header_line} $bed_file | awk '{print NF}' | uniq` | |
72 else | |
73 echo "ERROR: Bed file does not have constant number of line columns" | |
74 exit 1 | |
75 fi | |
76 | |
77 if [[ $nbins -gt 1 && $nfields_bed -ge 6 ]]; then | |
78 strand_uniq_chars=`tail -n +${rm_header_line} $bed_file | cut -f6 | sort -u | perl -pe "s/\n//"` | |
79 if [[ $strand_uniq_chars != "+-" && $strand_uniq_chars != "-+" ]] ; then | |
80 echo "ERROR: Column 6 in bed file must only contain '+' or '-' characters" | |
81 exit 1 | |
82 fi | |
83 fi | |
84 | |
85 ################################################### | |
86 ###EXTRACT DENSITIES FROM NORMALIZED BIGWIG FILE | |
87 ################################################### | |
88 | |
89 | |
90 #if more than 1 bin AND >= 6 fields (i.e. has strand column) | |
91 if [[ $nbins -gt 1 && $nfields_bed -ge 6 ]]; then | |
92 | |
93 #cut columns 1-3+6 | rm header if flag set | |
94 cut -f1-3,6 $bed_file | tail -n +${rm_header_line} | while read -r line; do | |
95 | |
96 #read 4 fields into variables | |
97 read -r cur_chr cur_start cur_end cur_strand <<<"$line" | |
98 #run bigWigSummary. Combine stdout and stderr | treat exceptions and errors after 'done' | |
99 bigWigSummary $bigwig_file $cur_chr $cur_start $cur_end $nbins -type=${summary_type} 2>&1 | perl -pe "s/no data.+$/${zero_string}/" | awk 'BEGIN{OFS="\t"}{ if("'"$cur_strand"'"=="-") { for (i=NF; i>0; i--) { printf("%s\t",$i) } printf("\n") } else { print $0 } }' | |
100 done | perl -pe "s/n\/a/0/g" | perl -pe "s/^\t?0\t?$/${zero_string}/" | perl -pe "s/ +/\t/g" | perl -pe "s/\t$//" | sed '/^$/d' | |
101 | |
102 #if more than 1 bin AND less than 6 fields (i.e. no strand column) | |
103 elif [[ $nbins -gt 1 && $nfields_bed -lt 6 ]]; then | |
104 | |
105 #cut columns 1-3 | rm header if flag set | |
106 cut -f1-3 $bed_file | tail -n +${rm_header_line} | while read -r line; do | |
107 | |
108 #read 3 fields into variables | |
109 read -r cur_chr cur_start cur_end <<<"$line" | |
110 #run bigWigSummary. Combine stdout and stderr | treat exceptions and errors after 'done' | |
111 bigWigSummary $bigwig_file $cur_chr $cur_start $cur_end $nbins -type=${summary_type} 2>&1 | |
112 | |
113 done | perl -pe "s/n\/a/0/g" | perl -pe "s/no data.+$/${zero_string}/" | perl -pe "s/^\t?0\t?$/${zero_string}/" | perl -pe "s/ +/\t/g" | sed '/^$/d' | |
114 | |
115 | |
116 #if 1 bin. Strand column irrelevant | |
117 else | |
118 | |
119 cut -f1-3 $bed_file | tail -n +${rm_header_line} | while read -r line; do | |
120 | |
121 read -r cur_chr cur_start cur_end <<<"$line" | |
122 bigWigSummary $bigwig_file $cur_chr $cur_start $cur_end 1 -type=${summary_type} 2>&1 | |
123 | |
124 done | perl -pe "s/no data.+$/0/" | perl -pe "s/n\/a/0/g" | sed '/^$/d' | |
125 | |
126 fi | |
127 | |
128 | |
129 | |
130 | |
131 |