Mercurial > repos > morinlab > oncocircos
changeset 5:e6fa60c7e2f0 draft
Deleted selected files
| author | morinlab |
|---|---|
| date | Wed, 30 Nov 2016 13:34:41 -0500 |
| parents | f60d66a2a6b0 |
| children | d248caf924d3 |
| files | parse |
| diffstat | 1 files changed, 0 insertions(+), 559 deletions(-) [+] |
line wrap: on
line diff
--- a/parse Wed Nov 30 13:34:35 2016 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,559 +0,0 @@ -#!/bin/env perl - -=pod - -=head1 NAME - -parse - parse Ryan's MAF and CNV files and generate a summary table of all genes and their mutations and CNV status - -=head1 SYNOPSIS - - # automatically load etc/parse.conf - bin/parse - - # if config file is elsewhere - bin/parse -conf elsewhere/my.conf - -=head1 DESCRIPTION - -See etc/parse.conf for all settings. - -=head1 OPTIONS - -=cut - -use strict; -use warnings FATAL=>"all"; - -use Carp; -use Config::General; -use Cwd qw(getcwd abs_path); -use Data::Dumper; -use File::Basename; -use FindBin; -use Getopt::Long; -use Math::Round qw(round nearest); -use Math::VecStat qw(sum min max average); -use Pod::Usage; -use Time::HiRes qw(gettimeofday tv_interval); -use Statistics::Basic qw(median); -use Storable; -use lib "$FindBin::RealBin"; -use lib "$FindBin::RealBin/../lib"; -use lib "$FindBin::RealBin/lib"; - -our (%OPT,%CONF,$conf); -our @COMMAND_LINE = ("file=s", - "configfile=s", - "help", - "cdump", - "man", - "debug"); -our $VERSION = 0.02; - -# common and custom module imports below -#use Regexp::Common; -#use IO::File; -#use List::Util; -#use List::MoreUtils; -use Set::IntSpan; -#use Statistics::Descriptive; - -# read and parse configuration file -parse_config(); - -sub validateconfiguration { -} - -#map2interval(5,$CONF{cnv}{log2}); -#exit; - -################################################################ -# get files -my $sv = read_file($CONF{files}{sv} ,"sv" ); -my $genes = read_file($CONF{files}{mart},"genes"); -my $cnv = read_file($CONF{files}{cnv} ,"cnv" ); -my $cnvlg = read_file($CONF{files}{cnvlg},"cnvlg"); -my %affected_genes; -open(GENES, $CONF{files}{genes}) or die "#! $CONF{files}{genes}\n"; -while(<GENES>){ - chomp; - $affected_genes{$_}++; -} - - - -################################################################ -# traverse all genes from biomart and determine number -# of SV and CNV events across samples -for my $chr (keys %$genes) { - next if $CONF{filter}{chr} && $chr ne $CONF{filter}{chr}; - printdebug("processing",$chr); - for my $gene (@{$genes->{$chr}}) { - my $id = $gene->{id}; - my $name = $gene->{name}; - #print "GENE: $name, ID: $id\n"; - # filter out by presence and number of SV events - - # number of samples that have SV event - my @samples_sv = keys %{$sv->{$id}}; - if(defined $affected_genes{$name}){ - #override normal criteria for this gene - - } - else{ - next if $CONF{filter}{sv} && ! $sv->{$id}; - next if $CONF{filter}{sv_num} && @samples_sv < $CONF{filter}{sv_num}; - } - $gene->{affected} = 1; - - # register SV events - my $pos; - for my $sample (@samples_sv) { - for my $sv (sort {$b->{weight} <=> $a->{weight}} @{$sv->{$id}{$sample}}) { - $gene->{sv}{ $sv->{type} }++; - $gene->{sv}{ "*" }++; - $pos->{ $sv->{aa} }++; # register the protein position of the SV - next if $CONF{sv}{top_damage_only}; - } - } - # top SV event - if($gene->{sv}) { - my ($sv_top) = sort {$gene->{sv}{$b} <=> $gene->{sv}{$a}} grep($_ ne "*",keys %{$gene->{sv}}); - $gene->{sv_top}{$sv_top} = $gene->{sv}{$sv_top}; - } - for my $aa (sort {$pos->{$b} <=> $pos->{$a}} keys %$pos) { - #next unless $pos->{$aa} > 1; - my $n = $pos->{$aa}; - $gene->{svaa_top}{$aa} = $n if ! defined $gene->{svaa_top}; - $gene->{svaa}{"*"} += $n; - $gene->{svaa}{$aa} = $n; - } - # register CNV events - my @samples_cnv = keys %$cnv; - # lookup any CNV events -- this can take a bit of time - # we can bin the CNV hash later if needed - for my $sample (@samples_cnv) { - my $chr = $gene->{chr}; - next unless $cnv->{$sample}{$chr}; - for my $cnv (@{$cnv->{$sample}{$chr}}) { - my $int = $cnv->{set}->intersect($gene->{set})->cardinality; - next unless $int; - push @{$gene->{cnv}{$cnv->{category}}{$sample}}, $cnv->{avg}; - } - } - for my $sample (keys %$cnvlg) { - my $chr = $gene->{chr}; - next unless $cnvlg->{$chr}; - for my $cnv (@{$cnvlg->{$chr}}) { - my $int = $cnv->{set}->intersect($gene->{set})->cardinality; - next unless $int; - $gene->{cnvlg}{$cnv->{idx}} = $cnv->{type}; - } - } - } -} - -################################################################ -# report -my $i = 0; -for my $chr (1..22,"X","Y") { - next unless $genes->{$chr}; - for my $gene (sort {$a->{start} <=> $b->{start}} @{$genes->{$chr}}) { - next unless $gene->{affected}; - my @report = ($i++,@{$gene}{qw(id name chr start end size)}); - if($gene->{sv}) { - push @report, sprintf("sv_top:%s:%d",keys %{$gene->{sv_top}},values %{$gene->{sv_top}}); - for my $type (sort keys %{$gene->{sv}}) { - push @report, sprintf("sv:%s:%d",$type,$gene->{sv}{$type}); - } - } - if($gene->{svaa}) { - push @report, sprintf("svaa_top:%s:%d",keys %{$gene->{svaa_top}},values %{$gene->{svaa_top}}); - for my $aa (sort {$gene->{svaa}{$b} <=> $gene->{svaa}{$a}} keys %{$gene->{svaa}}) { - push @report, sprintf("svaa:%s:%d",$aa,$gene->{svaa}{$aa}); - } - } - if($gene->{cnv}) { - my $type_count; - my $delins_count; - my $values_by_type; - for my $type (sort keys %{$gene->{cnv}}) { - my @sample_avg; - for my $sample (keys %{$gene->{cnv}{$type}}) { - # number of samples with this kind of CNV event - $type_count->{$type}++; - my @values = @{$gene->{cnv}{$type}{$sample}}; - push @sample_avg, average(@values); - push @{$values_by_type->{$type}}, @values; - } - push @report, sprintf("cnv:%s:%d:%f:%f:%f:%f", - $type, - int(@sample_avg), - scalar(min(@sample_avg)), - average(@sample_avg), - median(@sample_avg)->query, - scalar(max(@sample_avg))); - } - my ($top_type) = sort {$type_count->{$b} <=> $type_count->{$a}} keys %$type_count; - push @report, sprintf("cnv_top:%s:%d:%f:%f:%f:%f", - $top_type, - $type_count->{$top_type}, - scalar(min(@{$values_by_type->{$top_type}})), - average(@{$values_by_type->{$top_type}}), - median(@{$values_by_type->{$top_type}})->query, - scalar(max(@{$values_by_type->{$top_type}}))); - } - if($gene->{cnvlg}) { - for my $cnv_idx (sort {$a <=> $b} keys %{$gene->{cnvlg}}) { - push @report, sprintf("cnvlg:%d:%s", - $cnv_idx, - $gene->{cnvlg}{$cnv_idx}); - } - } - printinfo(@report); - } -} - -exit; - -################################################################ - -sub map2value { - my ($field_name,$x) = @_; - if($CONF{map}{$field_name}) { - my ($var,$fn) = split(":",$CONF{map}{$field_name}); - $fn =~ s/x/$x/g; - my $value = eval $fn; - die "Could not evaluate remapping [$fn]" if $@; - return ($var,$value); - } else { - return ($field_name,$x); - } -} - -sub str2leaf { - my $str = shift; - my $delim = "}{"; - $str =~ s/\//$delim/g; - $str = sprintf("\$CONF{$str}"); - my $leaf = eval $str; - if($@) { - die "Could not parse [$str] as a data structure."; - } - return $leaf; -} - -sub read_file { - my ($file,$type) = @_; - - open(F,$file) || die "Could not open file [$file] for reading"; - my $data; - - my @fields = grep(/\d/,keys %{$CONF{fields}{$type}}); - my @keys = split(",",$CONF{fields}{$type}{key}); - - my $i = 0; - while(<F>) { - chomp; - next if /^\#/; - my @tok = split "\t"; - my $entry = {class=>$type,idx=>$i}; - for my $col (@fields) { - my ($field_name,$field_transform) = split(":",$CONF{fields}{$type}{$col}); - my ($new_field_name,$value) = map2value($field_name,$tok[$col]); - if($new_field_name ne $field_name) { - $entry->{$field_name} = $tok[$col]; - $field_name = $new_field_name; - } - if($field_transform) { - if($field_transform eq "lc") { - $value = lc $value; - } elsif ($field_transform =~ /map\((.+),(.+)\)/) { - $entry->{$field_name} = $value; - $field_name = $1; - $value = map2interval($value,str2leaf($2)); - } - } - $entry->{ $field_name } = $value; - } - # skip mutation types that are not important - next if $CONF{sv}{filter} && $type eq "sv" && exists $entry->{type} && ! $CONF{sv}{types}{$entry->{type}}; - next if $CONF{cnv}{filter} && $type eq "cnv" && exists $entry->{category} && ! $CONF{cnv}{types}{$entry->{category}}; - if($type eq "sv") { - $entry->{weight} = $CONF{sv}{types}{$entry->{type}}; - } - $entry->{chr} = "X" if $entry->{chr} eq 23; - $entry->{chr} = "Y" if $entry->{chr} eq 24; - next unless grep($entry->{chr} eq $_, (1..22,"X","Y")); - $entry->{set} = span(@{$entry}{qw(start end)}) if $entry->{start}; - $entry->{size} = $entry->{set}->cardinality; - #printdumper($entry); - $i++; - if(@keys == 1) { - push @{$data->{$entry->{$keys[0]}}}, $entry; - } elsif (@keys == 2) { - push @{$data->{$entry->{$keys[0]}}{$entry->{$keys[1]}}}, $entry; - } - } - printdebug("got",$i,$type); - return $data; -} - -sub map2interval { - my ($x,$conf) = @_; - for my $int (keys %$conf) { - my $cat = $conf->{$int}; - return $cat if in_interval($x,$int); - } - printdumper($conf); - die "Could not map $x into any of the intervals seen"; -} - -sub in_interval { - my ($x,$int) = @_; - my $open_L = "\\("; - my $closed_L = "\\["; - my $open_R = "\\)"; - my $closed_R = "\\]"; - my $num = "[0-9]*"; #[0-9]+(?:.[0-9]+)?"; - my $rx = "($open_L|$closed_L)($num),($num)($open_R|$closed_R)"; - my $bool = 1; - if($int =~ /$rx/) { - if($2 ne "") { - $bool &&= $1 eq "(" ? $x > $2 : $x >= $2; - } - if($3 ne "") { - $bool &&= $4 eq ")" ? $x < $3 : $x <= $3; - } - } else { - $bool = $x == $int; - } - return $bool; -} - -sub list2hash { - my %h; - map {$h{$_}=1} @_; - return %h; -} - -sub span { - my ($x,$y) = @_; - if($x==$y) { - return Set::IntSpan->new($x); - } else { - return Set::IntSpan->new("$x-$y"); - } -} - -sub get_handle { - my $h; - if(my $file = $CONF{file}) { - die "No such file [$file]" unless -e $file; - open(FILE,$file); - $h = \*FILE; - } else { - $h = \*STDIN; - } - return $h; -} - -# HOUSEKEEPING ############################################################### - -sub dump_config { - printdumper(\%OPT,\%CONF); -} - -sub parse_config { - my $dump_debug_level = 3; - GetOptions(\%OPT,@COMMAND_LINE); - pod2usage() if $OPT{help}; - pod2usage(-verbose=>2) if $OPT{man}; - loadconfiguration($OPT{configfile}); - populateconfiguration(); # copy command line options to config hash - validateconfiguration(); - if ($CONF{cdump}) { - $Data::Dumper::Indent = 2; - $Data::Dumper::Quotekeys = 0; - $Data::Dumper::Terse = 0; - $Data::Dumper::Sortkeys = 1; - $Data::Dumper::Varname = "OPT"; - printdumper(\%OPT); - $Data::Dumper::Varname = "CONF"; - printdumper(\%CONF); - exit; - } -} - -sub populateconfiguration { - for my $var (keys %OPT) { - $CONF{$var} = $OPT{$var}; - } - repopulateconfiguration(\%CONF); -} - -sub repopulateconfiguration { - my ($node,$parent_node_name) = shift; - return unless ref($node) eq "HASH"; - for my $key (keys %$node) { - my $value = $node->{$key}; - if (ref($value) eq "HASH") { - repopulateconfiguration($value,$key); - } elsif (ref($value) eq "ARRAY") { - for my $item (@$value) { - repopulateconfiguration($item,$key); - } - } elsif (defined $value) { - my $new_value = parse_field($value,$key,$parent_node_name,$node); - $node->{$key} = $new_value; - } - } -} - -sub parse_field { - my ($str,$key,$parent_node_name,$node) = @_; - # replace configuration field - # conf(LEAF,LEAF,...) - while ( $str =~ /(conf\(\s*(.+?)\s*\))/g ) { - my ($template,$leaf) = ($1,$2); - if (defined $template && defined $leaf) { - my @leaf = split(/\s*,\s*/,$leaf); - my $new_template; - if (@leaf == 2 && $leaf[0] eq ".") { - $new_template = $node->{$leaf[1]}; - } else { - $new_template = fetch_conf(@leaf); - } - $str =~ s/\Q$template\E/$new_template/g; - } - } - if ($str =~ /\s*eval\s*\(\s*(.+)\s*\)/) { - my $fn = $1; - $str = eval $fn; - if ($@) { - die "could not parse configuration parameter [$@]"; - } - } - return $str; -} - -sub fetch_configuration { - my @config_path = @_; - my $node = \%CONF; - if(! @config_path) { - return \%CONF; - } - for my $path_element (@config_path) { - if (! exists $node->{$path_element}) { - return undef; - } else { - $node = $node->{$path_element}; - } - } - return $node; -} - -sub fetch_conf { - return fetch_configuration(@_); -} - -sub loadconfiguration { - my $file = shift; - if (defined $file) { - if (-e $file && -r _) { - # provided configuration file exists and can be read - $file = abs_path($file); - } else { - confess "The configuration file [$file] passed with -configfile does not exist or cannot be read."; - } - } else { - # otherwise, try to automatically find a configuration file - my ($scriptname,$path,$suffix) = fileparse($0); - my $cwd = getcwd(); - my $bindir = $FindBin::RealBin; - my $userdir = $ENV{HOME}; - my @candidate_files = ( - "$cwd/$scriptname.conf", - "$cwd/etc/$scriptname.conf", - "$cwd/../etc/$scriptname.conf", - "$bindir/$scriptname.conf", - "$bindir/etc/$scriptname.conf", - "$bindir/../etc/$scriptname.conf", - "$userdir/.$scriptname.conf", - ); - my @additional_files = (); - for my $candidate_file (@additional_files,@candidate_files) { - #printinfo("configsearch",$candidate_file); - if (-e $candidate_file && -r _) { - $file = $candidate_file; - #printinfo("configfound",$candidate_file); - last; - } - } - } - if (defined $file) { - $OPT{configfile} = $file; - $conf = new Config::General( - -ConfigFile=>$file, - -IncludeRelative=>1, - -IncludeAgain=>1, - -ExtendedAccess=>1, - -AllowMultiOptions=>"yes", - #-LowerCaseNames=>1, - -AutoTrue=>1 - ); - %CONF = $conf->getall; - } -} - -sub printdebug { - printinfo("debug",@_) if defined $CONF{debug}; -} - -sub printinfo { - print join(" ",map { defined $_ ? $_ : "_undef_" } @_),"\n"; -} - -sub printfinfo { - my ($fmt,@args) = @_; - @args = map { defined $_ ? $_ : "_undef_" } @args; - printf("$fmt\n",@args); -} - -sub printerr { - print STDERR join(" ",map { defined $_ ? $_ : "_undef_" } @_),"\n"; -} - -sub printdumper { - print Dumper(@_); -} - -=pod - -=head1 HISTORY - -=over - -=item * 30 Nov 2015 - -Started. - -=back - -=head1 AUTHOR - -Martin Krzywinski - -=head1 CONTACT - -Martin Krzywinski -Genome Sciences Center -BC Cancer Research Center -100-570 W 7th Ave -Vancouver BC V5Z 4S6 - -mkweb.bcgsc.ca -martink@bcgsc.ca - -=cut
