# HG changeset patch # User nikos # Date 1409673870 14400 # Node ID ad32956798e3f49ae2321b127b5c85bd9457f2db # Parent eebdd6da935398810a14516a3fdf0c35b2efcc0b Uploaded diff -r eebdd6da9353 -r ad32956798e3 bedClip.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bedClip.sh Tue Sep 02 12:04:30 2014 -0400 @@ -0,0 +1,3 @@ +#!/bin/bash + +bedClip $1 <(fetchChromSizes $2 2> /dev/null) $3 diff -r eebdd6da9353 -r ad32956798e3 bedClip.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bedClip.xml Tue Sep 02 12:04:30 2014 -0400 @@ -0,0 +1,65 @@ + + Remove lines from bed file that refer to off-chromosome places. + + fetchChromSizes + bedClip + + + ## Set genome assembly + + #set $Genome = str( $genome_cond.genome ) + #if str( $genome_cond ) == 'OTHER': + #set $Genome = str( $genome_cond.genome_other ) + #end if + + bedClip.sh $input $Genome $output + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** +bedClip - Remove lines from bed file that refer to off-chromosome places. + +**Usage** + + bedClip input.bed chrom.sizes output.bed + + + diff -r eebdd6da9353 -r ad32956798e3 bedExtendRanges.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bedExtendRanges.xml Tue Sep 02 12:04:30 2014 -0400 @@ -0,0 +1,82 @@ + + Extend length of entries in bed 6+. + + bedExtendRanges + + + ## Set genome assembly + + #set $Genome = str( $genome_cond.genome ) + #if str( $genome_cond ) == 'OTHER': + #set $Genome = str( $genome_cond.genome_other ) + #end if + + bedExtendRanges -user=genome -host=genome-mysql.cse.ucsc.edu $Genome $length $input 2> /dev/null > $output + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +bedExtendRanges_ - extend length of entries in bed 6+ data to be at least the given length, +taking strand directionality into account. + +.. _bedExtendRanges: http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/bedExtendRanges + +**Usage** + + bedExtendRanges database length files(s) + +**Example** + + * bedExtendRanges -user=genome -host=genome-mysql.cse.ucsc.edu hg18 250 stdin + + will transform: + chr1 500 525 . 100 + + chr1 1000 1025 . 100 - + to: + chr1 500 750 . 100 + + chr1 775 1025 . 100 - + + + diff -r eebdd6da9353 -r ad32956798e3 bigWigSummary.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bigWigSummary.xml Tue Sep 02 12:04:30 2014 -0400 @@ -0,0 +1,43 @@ + + Extract summary information from a bigWig file. + + bigWigSummary $input $chrom $start $end $dataPoints -type=$type> $output + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**Usage** + * bigWigSummary file.bigWig chrom start end dataPoints + * Get summary data from bigWig for indicated region, broken into dataPoints equal parts. (Use dataPoints=1 for simple summary.) + + + diff -r eebdd6da9353 -r ad32956798e3 bigwig2summary.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bigwig2summary.sh Tue Sep 02 12:04:30 2014 -0400 @@ -0,0 +1,131 @@ +#!/bin/bash +# $Id: bigwig2summary_stdout 34 2014-02-20 08:31:20Z jens $ + +#USE IN GALAXY +#This script extract summary values (mean, min, max, std or coverage) from a bigwig file for a number of equal sized bins across genomic regions given in bed file. +#If the bed file contains 6 columns (or more), column 6 is expected to contain strand information. Summary values from a negative strand will be reversed. + +######################################## +#bigwigSummary has 3 non-standard outputs: +#1) n/a #(no data in bin) +#2) no data #(no data in entire region) +#3) is not a valid option (typically if negative coordinate) + +#Default settings & input parsing. "" indicates required user input. +nbins=1 +rm_header_line=0 +summary_type=mean + +#parse input +while getopts hef:b:o:n:t: myarg +do case "$myarg" in + h) echo "Usage: bigwig2summary_stdout -f -b -o -n -t -e" + echo "Corrects for strand if bed-file has 6 columns or more. Col 6 assumed to contain strand infomation" + exit ;; + f) bigwig_file="$OPTARG" ;; + b) bed_file="$OPTARG" ;; #must be tab separated without header + o) org_assembly="$OPTARG" ;; + n) nbins="$OPTARG" ;; + t) summary_type="$OPTARG" ;; + e) rm_header_line=2 ;; #flag. if -e then first line is removed + [?]) echo "Usage: bigwig2summary_stdout -f -b -o -n -t -e" + echo "Corrects for strand if bed-file has 6 columns or more. Col 6 assumed to contain strand infomation" + exit 1 ;; + esac +done + +################################################### +###VALIDATE INPUT +################################################### + +#get chromosome sizes from bigwig file. bigwig file does not contain name of genome assembly. +org_assembly_file=`mktemp -u` +fetchChromSizes $org_assembly 2>/dev/null > $org_assembly_file +if [ $? -ne 0 ]; then + echo "ERROR: Organism genome assembly does not exist" + rm $org_assembly_file + exit +fi + +#check input bed_file. bedClip only checks first 3 columns! +if [ `bedClip -verbose=2 <(tail -n +${rm_header_line} $bed_file) $org_assembly_file /dev/null 2>&1 | wc -l` -gt 0 ]; then + echo -e "ERROR: Input bed file is not in proper format!\nTry 'bedClip' to find lines causing error" + echo "Make sure that bigwig and bed files are using the same genome assembly" + exit 1 +fi + +#make string of "nbins" 0's to insert in regions, where no reads are found +if [ $nbins -gt 1 ]; then + seq_string=`seq 1 $nbins` + zero_string=`printf "0\t%.s" {$seq_string} | perl -pe "s/\t$//"` +fi + +#make sure the given summary type exists +if [ `echo $summary_type | egrep "(mean|max|min|std|coverage)" | wc -l` -ne 1 ]; then + echo "ERROR: Summary type must be: mean, max, min, std or coverage. Default is 'mean'" + exit 1 +fi + +#determine number of fields in bed_file +if [ `tail -n +${rm_header_line} $bed_file | awk '{print NF}' | uniq | wc -l` -eq 1 ]; then + nfields_bed=`tail -n +${rm_header_line} $bed_file | awk '{print NF}' | uniq` +else + echo "ERROR: Bed file does not have constant number of line columns" + exit 1 +fi + +if [[ $nbins -gt 1 && $nfields_bed -ge 6 ]]; then + strand_uniq_chars=`tail -n +${rm_header_line} $bed_file | cut -f6 | sort -u | perl -pe "s/\n//"` + if [[ $strand_uniq_chars != "+-" && $strand_uniq_chars != "-+" ]] ; then + echo "ERROR: Column 6 in bed file must only contain '+' or '-' characters" + exit 1 + fi +fi + +################################################### +###EXTRACT DENSITIES FROM NORMALIZED BIGWIG FILE +################################################### + + +#if more than 1 bin AND >= 6 fields (i.e. has strand column) +if [[ $nbins -gt 1 && $nfields_bed -ge 6 ]]; then + + #cut columns 1-3+6 | rm header if flag set + cut -f1-3,6 $bed_file | tail -n +${rm_header_line} | while read -r line; do + + #read 4 fields into variables + read -r cur_chr cur_start cur_end cur_strand <<<"$line" + #run bigWigSummary. Combine stdout and stderr | treat exceptions and errors after 'done' + bigWigSummary $bigwig_file $cur_chr $cur_start $cur_end $nbins -type=${summary_type} 2>&1 | perl -pe "s/no data.+$/${zero_string}/" | awk 'BEGIN{OFS="\t"}{ if("'"$cur_strand"'"=="-") { for (i=NF; i>0; i--) { printf("%s\t",$i) } printf("\n") } else { print $0 } }' + done | perl -pe "s/n\/a/0/g" | perl -pe "s/^\t?0\t?$/${zero_string}/" | perl -pe "s/ +/\t/g" | perl -pe "s/\t$//" | sed '/^$/d' + +#if more than 1 bin AND less than 6 fields (i.e. no strand column) +elif [[ $nbins -gt 1 && $nfields_bed -lt 6 ]]; then + + #cut columns 1-3 | rm header if flag set + cut -f1-3 $bed_file | tail -n +${rm_header_line} | while read -r line; do + + #read 3 fields into variables + read -r cur_chr cur_start cur_end <<<"$line" + #run bigWigSummary. Combine stdout and stderr | treat exceptions and errors after 'done' + bigWigSummary $bigwig_file $cur_chr $cur_start $cur_end $nbins -type=${summary_type} 2>&1 + + done | perl -pe "s/n\/a/0/g" | perl -pe "s/no data.+$/${zero_string}/" | perl -pe "s/^\t?0\t?$/${zero_string}/" | perl -pe "s/ +/\t/g" | sed '/^$/d' + + +#if 1 bin. Strand column irrelevant +else + + cut -f1-3 $bed_file | tail -n +${rm_header_line} | while read -r line; do + + read -r cur_chr cur_start cur_end <<<"$line" + bigWigSummary $bigwig_file $cur_chr $cur_start $cur_end 1 -type=${summary_type} 2>&1 + + done | perl -pe "s/no data.+$/0/" | perl -pe "s/n\/a/0/g" | sed '/^$/d' + +fi + + + + + diff -r eebdd6da9353 -r ad32956798e3 bigwig2summary.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bigwig2summary.xml Tue Sep 02 12:04:30 2014 -0400 @@ -0,0 +1,104 @@ + + Extract summary information from a bigWig file across multiple genomic regions specified by the user. + + bigWigSummary + fetchChromSizes + bedClip + + + bigwig2summary.sh -f $input_bw -b $input_bed -n $bins -o $assembly -t $type $header > $output + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +This tool extracts summary values (mean, min, max, std or coverage) from a **bigWig** file for a number of equal sized bins across genomic regions given in an a "BED-like" file. + +The script this tool is based on is written by Jens Vilstrup Johansen and uses bigWigSummary_, bedClip_ and fetchChromSizes_. + +.. _bigWigSummary: http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/bigWigSummary + +.. _bedClip: http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/bedClip + +-- _fetchChromSizes: http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/fetchChromSizes + +----- + +.. class:: infomark + +The file contaning the genomic region must be TAB-delimited with at list 3 columns representing Chromosome, ChrStart and ChrEnd. If the file contains 6 columns (or more), column 6 is expected to contain strand information. Summary values from a negative strand will be reversed. + +----- + +**Example 1** + +Input BED file:: + + chr19 50178708 50180708 + chr6 90348174 90350174 + chr16 58495848 58497848 + chr5 180580242 180582242 + chr9 120177017 120179017 + +Extract summary (*#* of bins = 3):: + + 0 0 0 + 0.144886 0 0 + 0.507327 1.14649 1.38456 + 0.221471 0.144886 0.309857 + 0.348944 0.426638 0.244495 + +**Example 2** + +Input BED file (with strand information):: + + chr19 50178708 50180708 NM_198318 0 + PRMT1 + chr6 90348174 90350174 NM_020466 0 - LYRM2 + chr16 58495848 58497848 NM_020465 0 + NDRG4 + chr5 180580242 180582242 NM_206880 0 + OR2V2 + chr9 120177017 120179017 NM_014010 0 - ASTN2 + +Extract summary (*#* of bins = 3):: + + 0 0 0 + 0 0 0.144886 + 0.507327 1.14649 1.38456 + 0.221471 0.144886 0.309857 + 0.244495 0.426638 0.348944 + + + diff -r eebdd6da9353 -r ad32956798e3 test-data/1.bed --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/1.bed Tue Sep 02 12:04:30 2014 -0400 @@ -0,0 +1,5 @@ +chr19 50178708 50180708 +chr6 90348174 90350174 +chr16 58495848 58497848 +chr5 180580242 180582242 +chr9 120177017 120179017 diff -r eebdd6da9353 -r ad32956798e3 test-data/1.bigwig Binary file test-data/1.bigwig has changed diff -r eebdd6da9353 -r ad32956798e3 test-data/1.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/1.tabular Tue Sep 02 12:04:30 2014 -0400 @@ -0,0 +1,4 @@ +0.144886 0 0 +0.507327 1.14649 1.38456 +0.221471 0.144886 0.309857 +0.348944 0.426638 0.244495 diff -r eebdd6da9353 -r ad32956798e3 test-data/1.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/1.txt Tue Sep 02 12:04:30 2014 -0400 @@ -0,0 +1,1 @@ +0.796113 0.273952 0.437672 diff -r eebdd6da9353 -r ad32956798e3 test-data/2.bed --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/2.bed Tue Sep 02 12:04:30 2014 -0400 @@ -0,0 +1,5 @@ +chr19 50178708 50180708 NM_198318 0 + PRMT1 +chr6 90348174 90350174 NM_020466 0 - LYRM2 +chr16 58495848 58497848 NM_020465 0 + NDRG4 +chr5 180580242 180582242 NM_206880 0 + OR2V2 +chr9 120177017 120179017 NM_014010 0 - ASTN2 diff -r eebdd6da9353 -r ad32956798e3 test-data/2.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/2.tabular Tue Sep 02 12:04:30 2014 -0400 @@ -0,0 +1,4 @@ +0 0 0 0.144886 0 +0.396878 0.970009 1.1091 1.17538 1.40612 +0.203829 0.21262 0.144886 0.409996 0.274558 +0.180424 0.430989 0.458898 0.35171 0.307157 diff -r eebdd6da9353 -r ad32956798e3 test-data/3.bed --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/3.bed Tue Sep 02 12:04:30 2014 -0400 @@ -0,0 +1,7 @@ +chr19 50178708 50180708 +chr21 48119895 48139895 +chr6 90348174 90350174 +chr16 58495848 58497848 +chr5 180580242 180582242 +chr17 81194210 81196210 +chr9 120177017 120179017 diff -r eebdd6da9353 -r ad32956798e3 test-data/4.bed --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/4.bed Tue Sep 02 12:04:30 2014 -0400 @@ -0,0 +1,5 @@ +chr19 50178708 50183708 NM_198318 0 + PRMT1 +chr6 90345174 90350174 NM_020466 0 - LYRM2 +chr16 58495848 58500848 NM_020465 0 + NDRG4 +chr5 180580242 180585242 NM_206880 0 + OR2V2 +chr9 120174017 120179017 NM_014010 0 - ASTN2 diff -r eebdd6da9353 -r ad32956798e3 tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Tue Sep 02 12:04:30 2014 -0400 @@ -0,0 +1,71 @@ + + + + + + http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/bigWigSummary + + . + $INSTALL_DIR + + $INSTALL_DIR/bigWigSummary + + $INSTALL_DIR + + + + + + + + + + http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/bedClip + + . + $INSTALL_DIR + + $INSTALL_DIR/bedClip + + $INSTALL_DIR + + + + + + + + + + http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/bedExtendRanges + + . + $INSTALL_DIR + + $INSTALL_DIR/bedExtendRanges + + $INSTALL_DIR + + + + + + + + + + http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/fetchChromSizes + + . + $INSTALL_DIR + + $INSTALL_DIR/fetchChromSizes + + $INSTALL_DIR + + + + + + +