Mercurial > repos > morinlab > oncocircos
changeset 4:f60d66a2a6b0 draft
Uploaded
| author | morinlab |
|---|---|
| date | Wed, 30 Nov 2016 13:34:35 -0500 |
| parents | 0f30d19bc378 |
| children | e6fa60c7e2f0 |
| files | parse |
| diffstat | 1 files changed, 559 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/parse Wed Nov 30 13:34:35 2016 -0500 @@ -0,0 +1,559 @@ +#!/bin/env perl + +=pod + +=head1 NAME + +parse - parse Ryan's MAF and CNV files and generate a summary table of all genes and their mutations and CNV status + +=head1 SYNOPSIS + + # automatically load etc/parse.conf + bin/parse + + # if config file is elsewhere + bin/parse -conf elsewhere/my.conf + +=head1 DESCRIPTION + +See etc/parse.conf for all settings. + +=head1 OPTIONS + +=cut + +use strict; +use warnings FATAL=>"all"; + +use Carp; +use Config::General; +use Cwd qw(getcwd abs_path); +use Data::Dumper; +use File::Basename; +use FindBin; +use Getopt::Long; +use Math::Round qw(round nearest); +use Math::VecStat qw(sum min max average); +use Pod::Usage; +use Time::HiRes qw(gettimeofday tv_interval); +use Statistics::Basic qw(median); +use Storable; +use lib "$FindBin::RealBin"; +use lib "$FindBin::RealBin/../lib"; +use lib "$FindBin::RealBin/lib"; + +our (%OPT,%CONF,$conf); +our @COMMAND_LINE = ("file=s", + "configfile=s", + "help", + "cdump", + "man", + "debug"); +our $VERSION = 0.02; + +# common and custom module imports below +#use Regexp::Common; +#use IO::File; +#use List::Util; +#use List::MoreUtils; +use Set::IntSpan; +#use Statistics::Descriptive; + +# read and parse configuration file +parse_config(); + +sub validateconfiguration { +} + +#map2interval(5,$CONF{cnv}{log2}); +#exit; + +################################################################ +# get files +my $sv = read_file($CONF{files}{sv} ,"sv" ); +my $genes = read_file($CONF{files}{mart},"genes"); +my $cnv = read_file($CONF{files}{cnv} ,"cnv" ); +my $cnvlg = read_file($CONF{files}{cnvlg},"cnvlg"); +my %affected_genes; +open(GENES, $CONF{files}{genes}) or die "#! $CONF{files}{genes}\n"; +while(<GENES>){ + chomp; + $affected_genes{$_}++; +} + + + +################################################################ +# traverse all genes from biomart and determine number +# of SV and CNV events across samples +for my $chr (keys %$genes) { + next if $CONF{filter}{chr} && $chr ne $CONF{filter}{chr}; + printdebug("processing",$chr); + for my $gene (@{$genes->{$chr}}) { + my $id = $gene->{id}; + my $name = $gene->{name}; + #print "GENE: $name, ID: $id\n"; + # filter out by presence and number of SV events + + # number of samples that have SV event + my @samples_sv = keys %{$sv->{$id}}; + if(defined $affected_genes{$name}){ + #override normal criteria for this gene + + } + else{ + next if $CONF{filter}{sv} && ! $sv->{$id}; + next if $CONF{filter}{sv_num} && @samples_sv < $CONF{filter}{sv_num}; + } + $gene->{affected} = 1; + + # register SV events + my $pos; + for my $sample (@samples_sv) { + for my $sv (sort {$b->{weight} <=> $a->{weight}} @{$sv->{$id}{$sample}}) { + $gene->{sv}{ $sv->{type} }++; + $gene->{sv}{ "*" }++; + $pos->{ $sv->{aa} }++; # register the protein position of the SV + next if $CONF{sv}{top_damage_only}; + } + } + # top SV event + if($gene->{sv}) { + my ($sv_top) = sort {$gene->{sv}{$b} <=> $gene->{sv}{$a}} grep($_ ne "*",keys %{$gene->{sv}}); + $gene->{sv_top}{$sv_top} = $gene->{sv}{$sv_top}; + } + for my $aa (sort {$pos->{$b} <=> $pos->{$a}} keys %$pos) { + #next unless $pos->{$aa} > 1; + my $n = $pos->{$aa}; + $gene->{svaa_top}{$aa} = $n if ! defined $gene->{svaa_top}; + $gene->{svaa}{"*"} += $n; + $gene->{svaa}{$aa} = $n; + } + # register CNV events + my @samples_cnv = keys %$cnv; + # lookup any CNV events -- this can take a bit of time + # we can bin the CNV hash later if needed + for my $sample (@samples_cnv) { + my $chr = $gene->{chr}; + next unless $cnv->{$sample}{$chr}; + for my $cnv (@{$cnv->{$sample}{$chr}}) { + my $int = $cnv->{set}->intersect($gene->{set})->cardinality; + next unless $int; + push @{$gene->{cnv}{$cnv->{category}}{$sample}}, $cnv->{avg}; + } + } + for my $sample (keys %$cnvlg) { + my $chr = $gene->{chr}; + next unless $cnvlg->{$chr}; + for my $cnv (@{$cnvlg->{$chr}}) { + my $int = $cnv->{set}->intersect($gene->{set})->cardinality; + next unless $int; + $gene->{cnvlg}{$cnv->{idx}} = $cnv->{type}; + } + } + } +} + +################################################################ +# report +my $i = 0; +for my $chr (1..22,"X","Y") { + next unless $genes->{$chr}; + for my $gene (sort {$a->{start} <=> $b->{start}} @{$genes->{$chr}}) { + next unless $gene->{affected}; + my @report = ($i++,@{$gene}{qw(id name chr start end size)}); + if($gene->{sv}) { + push @report, sprintf("sv_top:%s:%d",keys %{$gene->{sv_top}},values %{$gene->{sv_top}}); + for my $type (sort keys %{$gene->{sv}}) { + push @report, sprintf("sv:%s:%d",$type,$gene->{sv}{$type}); + } + } + if($gene->{svaa}) { + push @report, sprintf("svaa_top:%s:%d",keys %{$gene->{svaa_top}},values %{$gene->{svaa_top}}); + for my $aa (sort {$gene->{svaa}{$b} <=> $gene->{svaa}{$a}} keys %{$gene->{svaa}}) { + push @report, sprintf("svaa:%s:%d",$aa,$gene->{svaa}{$aa}); + } + } + if($gene->{cnv}) { + my $type_count; + my $delins_count; + my $values_by_type; + for my $type (sort keys %{$gene->{cnv}}) { + my @sample_avg; + for my $sample (keys %{$gene->{cnv}{$type}}) { + # number of samples with this kind of CNV event + $type_count->{$type}++; + my @values = @{$gene->{cnv}{$type}{$sample}}; + push @sample_avg, average(@values); + push @{$values_by_type->{$type}}, @values; + } + push @report, sprintf("cnv:%s:%d:%f:%f:%f:%f", + $type, + int(@sample_avg), + scalar(min(@sample_avg)), + average(@sample_avg), + median(@sample_avg)->query, + scalar(max(@sample_avg))); + } + my ($top_type) = sort {$type_count->{$b} <=> $type_count->{$a}} keys %$type_count; + push @report, sprintf("cnv_top:%s:%d:%f:%f:%f:%f", + $top_type, + $type_count->{$top_type}, + scalar(min(@{$values_by_type->{$top_type}})), + average(@{$values_by_type->{$top_type}}), + median(@{$values_by_type->{$top_type}})->query, + scalar(max(@{$values_by_type->{$top_type}}))); + } + if($gene->{cnvlg}) { + for my $cnv_idx (sort {$a <=> $b} keys %{$gene->{cnvlg}}) { + push @report, sprintf("cnvlg:%d:%s", + $cnv_idx, + $gene->{cnvlg}{$cnv_idx}); + } + } + printinfo(@report); + } +} + +exit; + +################################################################ + +sub map2value { + my ($field_name,$x) = @_; + if($CONF{map}{$field_name}) { + my ($var,$fn) = split(":",$CONF{map}{$field_name}); + $fn =~ s/x/$x/g; + my $value = eval $fn; + die "Could not evaluate remapping [$fn]" if $@; + return ($var,$value); + } else { + return ($field_name,$x); + } +} + +sub str2leaf { + my $str = shift; + my $delim = "}{"; + $str =~ s/\//$delim/g; + $str = sprintf("\$CONF{$str}"); + my $leaf = eval $str; + if($@) { + die "Could not parse [$str] as a data structure."; + } + return $leaf; +} + +sub read_file { + my ($file,$type) = @_; + + open(F,$file) || die "Could not open file [$file] for reading"; + my $data; + + my @fields = grep(/\d/,keys %{$CONF{fields}{$type}}); + my @keys = split(",",$CONF{fields}{$type}{key}); + + my $i = 0; + while(<F>) { + chomp; + next if /^\#/; + my @tok = split "\t"; + my $entry = {class=>$type,idx=>$i}; + for my $col (@fields) { + my ($field_name,$field_transform) = split(":",$CONF{fields}{$type}{$col}); + my ($new_field_name,$value) = map2value($field_name,$tok[$col]); + if($new_field_name ne $field_name) { + $entry->{$field_name} = $tok[$col]; + $field_name = $new_field_name; + } + if($field_transform) { + if($field_transform eq "lc") { + $value = lc $value; + } elsif ($field_transform =~ /map\((.+),(.+)\)/) { + $entry->{$field_name} = $value; + $field_name = $1; + $value = map2interval($value,str2leaf($2)); + } + } + $entry->{ $field_name } = $value; + } + # skip mutation types that are not important + next if $CONF{sv}{filter} && $type eq "sv" && exists $entry->{type} && ! $CONF{sv}{types}{$entry->{type}}; + next if $CONF{cnv}{filter} && $type eq "cnv" && exists $entry->{category} && ! $CONF{cnv}{types}{$entry->{category}}; + if($type eq "sv") { + $entry->{weight} = $CONF{sv}{types}{$entry->{type}}; + } + $entry->{chr} = "X" if $entry->{chr} eq 23; + $entry->{chr} = "Y" if $entry->{chr} eq 24; + next unless grep($entry->{chr} eq $_, (1..22,"X","Y")); + $entry->{set} = span(@{$entry}{qw(start end)}) if $entry->{start}; + $entry->{size} = $entry->{set}->cardinality; + #printdumper($entry); + $i++; + if(@keys == 1) { + push @{$data->{$entry->{$keys[0]}}}, $entry; + } elsif (@keys == 2) { + push @{$data->{$entry->{$keys[0]}}{$entry->{$keys[1]}}}, $entry; + } + } + printdebug("got",$i,$type); + return $data; +} + +sub map2interval { + my ($x,$conf) = @_; + for my $int (keys %$conf) { + my $cat = $conf->{$int}; + return $cat if in_interval($x,$int); + } + printdumper($conf); + die "Could not map $x into any of the intervals seen"; +} + +sub in_interval { + my ($x,$int) = @_; + my $open_L = "\\("; + my $closed_L = "\\["; + my $open_R = "\\)"; + my $closed_R = "\\]"; + my $num = "[0-9]*"; #[0-9]+(?:.[0-9]+)?"; + my $rx = "($open_L|$closed_L)($num),($num)($open_R|$closed_R)"; + my $bool = 1; + if($int =~ /$rx/) { + if($2 ne "") { + $bool &&= $1 eq "(" ? $x > $2 : $x >= $2; + } + if($3 ne "") { + $bool &&= $4 eq ")" ? $x < $3 : $x <= $3; + } + } else { + $bool = $x == $int; + } + return $bool; +} + +sub list2hash { + my %h; + map {$h{$_}=1} @_; + return %h; +} + +sub span { + my ($x,$y) = @_; + if($x==$y) { + return Set::IntSpan->new($x); + } else { + return Set::IntSpan->new("$x-$y"); + } +} + +sub get_handle { + my $h; + if(my $file = $CONF{file}) { + die "No such file [$file]" unless -e $file; + open(FILE,$file); + $h = \*FILE; + } else { + $h = \*STDIN; + } + return $h; +} + +# HOUSEKEEPING ############################################################### + +sub dump_config { + printdumper(\%OPT,\%CONF); +} + +sub parse_config { + my $dump_debug_level = 3; + GetOptions(\%OPT,@COMMAND_LINE); + pod2usage() if $OPT{help}; + pod2usage(-verbose=>2) if $OPT{man}; + loadconfiguration($OPT{configfile}); + populateconfiguration(); # copy command line options to config hash + validateconfiguration(); + if ($CONF{cdump}) { + $Data::Dumper::Indent = 2; + $Data::Dumper::Quotekeys = 0; + $Data::Dumper::Terse = 0; + $Data::Dumper::Sortkeys = 1; + $Data::Dumper::Varname = "OPT"; + printdumper(\%OPT); + $Data::Dumper::Varname = "CONF"; + printdumper(\%CONF); + exit; + } +} + +sub populateconfiguration { + for my $var (keys %OPT) { + $CONF{$var} = $OPT{$var}; + } + repopulateconfiguration(\%CONF); +} + +sub repopulateconfiguration { + my ($node,$parent_node_name) = shift; + return unless ref($node) eq "HASH"; + for my $key (keys %$node) { + my $value = $node->{$key}; + if (ref($value) eq "HASH") { + repopulateconfiguration($value,$key); + } elsif (ref($value) eq "ARRAY") { + for my $item (@$value) { + repopulateconfiguration($item,$key); + } + } elsif (defined $value) { + my $new_value = parse_field($value,$key,$parent_node_name,$node); + $node->{$key} = $new_value; + } + } +} + +sub parse_field { + my ($str,$key,$parent_node_name,$node) = @_; + # replace configuration field + # conf(LEAF,LEAF,...) + while ( $str =~ /(conf\(\s*(.+?)\s*\))/g ) { + my ($template,$leaf) = ($1,$2); + if (defined $template && defined $leaf) { + my @leaf = split(/\s*,\s*/,$leaf); + my $new_template; + if (@leaf == 2 && $leaf[0] eq ".") { + $new_template = $node->{$leaf[1]}; + } else { + $new_template = fetch_conf(@leaf); + } + $str =~ s/\Q$template\E/$new_template/g; + } + } + if ($str =~ /\s*eval\s*\(\s*(.+)\s*\)/) { + my $fn = $1; + $str = eval $fn; + if ($@) { + die "could not parse configuration parameter [$@]"; + } + } + return $str; +} + +sub fetch_configuration { + my @config_path = @_; + my $node = \%CONF; + if(! @config_path) { + return \%CONF; + } + for my $path_element (@config_path) { + if (! exists $node->{$path_element}) { + return undef; + } else { + $node = $node->{$path_element}; + } + } + return $node; +} + +sub fetch_conf { + return fetch_configuration(@_); +} + +sub loadconfiguration { + my $file = shift; + if (defined $file) { + if (-e $file && -r _) { + # provided configuration file exists and can be read + $file = abs_path($file); + } else { + confess "The configuration file [$file] passed with -configfile does not exist or cannot be read."; + } + } else { + # otherwise, try to automatically find a configuration file + my ($scriptname,$path,$suffix) = fileparse($0); + my $cwd = getcwd(); + my $bindir = $FindBin::RealBin; + my $userdir = $ENV{HOME}; + my @candidate_files = ( + "$cwd/$scriptname.conf", + "$cwd/etc/$scriptname.conf", + "$cwd/../etc/$scriptname.conf", + "$bindir/$scriptname.conf", + "$bindir/etc/$scriptname.conf", + "$bindir/../etc/$scriptname.conf", + "$userdir/.$scriptname.conf", + ); + my @additional_files = (); + for my $candidate_file (@additional_files,@candidate_files) { + #printinfo("configsearch",$candidate_file); + if (-e $candidate_file && -r _) { + $file = $candidate_file; + #printinfo("configfound",$candidate_file); + last; + } + } + } + if (defined $file) { + $OPT{configfile} = $file; + $conf = new Config::General( + -ConfigFile=>$file, + -IncludeRelative=>1, + -IncludeAgain=>1, + -ExtendedAccess=>1, + -AllowMultiOptions=>"yes", + #-LowerCaseNames=>1, + -AutoTrue=>1 + ); + %CONF = $conf->getall; + } +} + +sub printdebug { + printinfo("debug",@_) if defined $CONF{debug}; +} + +sub printinfo { + print join(" ",map { defined $_ ? $_ : "_undef_" } @_),"\n"; +} + +sub printfinfo { + my ($fmt,@args) = @_; + @args = map { defined $_ ? $_ : "_undef_" } @args; + printf("$fmt\n",@args); +} + +sub printerr { + print STDERR join(" ",map { defined $_ ? $_ : "_undef_" } @_),"\n"; +} + +sub printdumper { + print Dumper(@_); +} + +=pod + +=head1 HISTORY + +=over + +=item * 30 Nov 2015 + +Started. + +=back + +=head1 AUTHOR + +Martin Krzywinski + +=head1 CONTACT + +Martin Krzywinski +Genome Sciences Center +BC Cancer Research Center +100-570 W 7th Ave +Vancouver BC V5Z 4S6 + +mkweb.bcgsc.ca +martink@bcgsc.ca + +=cut
