Mercurial > repos > morinlab > oncocircos
view bin/parse @ 6:d248caf924d3 draft
Uploaded
| author | morinlab |
|---|---|
| date | Wed, 30 Nov 2016 13:36:59 -0500 |
| parents | b77ab858eac1 |
| children |
line wrap: on
line source
#!/bin/env perl =pod =head1 NAME parse - parse Ryan's MAF and CNV files and generate a summary table of all genes and their mutations and CNV status =head1 SYNOPSIS # automatically load etc/parse.conf bin/parse # if config file is elsewhere bin/parse -conf elsewhere/my.conf =head1 DESCRIPTION See etc/parse.conf for all settings. =head1 OPTIONS =cut use strict; use warnings FATAL=>"all"; use Carp; use Config::General; use Cwd qw(getcwd abs_path); use Data::Dumper; use File::Basename; use FindBin; use Getopt::Long; use Math::Round qw(round nearest); use Math::VecStat qw(sum min max average); use Pod::Usage; use Time::HiRes qw(gettimeofday tv_interval); use Statistics::Basic qw(median); use Storable; use lib "$FindBin::RealBin"; use lib "$FindBin::RealBin/../lib"; use lib "$FindBin::RealBin/lib"; our (%OPT,%CONF,$conf); our @COMMAND_LINE = ("file=s", "configfile=s", "help", "cdump", "man", "debug"); our $VERSION = 0.02; # common and custom module imports below #use Regexp::Common; #use IO::File; #use List::Util; #use List::MoreUtils; use Set::IntSpan; #use Statistics::Descriptive; # read and parse configuration file parse_config(); sub validateconfiguration { } #map2interval(5,$CONF{cnv}{log2}); #exit; ################################################################ # get files my $sv = read_file($CONF{files}{sv} ,"sv" ); my $genes = read_file($CONF{files}{mart},"genes"); my $cnv = read_file($CONF{files}{cnv} ,"cnv" ); my $cnvlg = read_file($CONF{files}{cnvlg},"cnvlg"); my %affected_genes; open(GENES, $CONF{files}{genes}) or die "#! $CONF{files}{genes}\n"; while(<GENES>){ chomp; $affected_genes{$_}++; } ################################################################ # traverse all genes from biomart and determine number # of SV and CNV events across samples for my $chr (keys %$genes) { next if $CONF{filter}{chr} && $chr ne $CONF{filter}{chr}; printdebug("processing",$chr); for my $gene (@{$genes->{$chr}}) { my $id = $gene->{id}; my $name = $gene->{name}; #print "GENE: $name, ID: $id\n"; # filter out by presence and number of SV events # number of samples that have SV event my @samples_sv = keys %{$sv->{$id}}; if(defined $affected_genes{$name}){ #override normal criteria for this gene } else{ next if $CONF{filter}{sv} && ! $sv->{$id}; next if $CONF{filter}{sv_num} && @samples_sv < $CONF{filter}{sv_num}; } $gene->{affected} = 1; # register SV events my $pos; for my $sample (@samples_sv) { for my $sv (sort {$b->{weight} <=> $a->{weight}} @{$sv->{$id}{$sample}}) { $gene->{sv}{ $sv->{type} }++; $gene->{sv}{ "*" }++; $pos->{ $sv->{aa} }++; # register the protein position of the SV next if $CONF{sv}{top_damage_only}; } } # top SV event if($gene->{sv}) { my ($sv_top) = sort {$gene->{sv}{$b} <=> $gene->{sv}{$a}} grep($_ ne "*",keys %{$gene->{sv}}); $gene->{sv_top}{$sv_top} = $gene->{sv}{$sv_top}; } for my $aa (sort {$pos->{$b} <=> $pos->{$a}} keys %$pos) { #next unless $pos->{$aa} > 1; my $n = $pos->{$aa}; $gene->{svaa_top}{$aa} = $n if ! defined $gene->{svaa_top}; $gene->{svaa}{"*"} += $n; $gene->{svaa}{$aa} = $n; } # register CNV events my @samples_cnv = keys %$cnv; # lookup any CNV events -- this can take a bit of time # we can bin the CNV hash later if needed for my $sample (@samples_cnv) { my $chr = $gene->{chr}; next unless $cnv->{$sample}{$chr}; for my $cnv (@{$cnv->{$sample}{$chr}}) { my $int = $cnv->{set}->intersect($gene->{set})->cardinality; next unless $int; push @{$gene->{cnv}{$cnv->{category}}{$sample}}, $cnv->{avg}; } } for my $sample (keys %$cnvlg) { my $chr = $gene->{chr}; next unless $cnvlg->{$chr}; for my $cnv (@{$cnvlg->{$chr}}) { my $int = $cnv->{set}->intersect($gene->{set})->cardinality; next unless $int; $gene->{cnvlg}{$cnv->{idx}} = $cnv->{type}; } } } } ################################################################ # report my $i = 0; for my $chr (1..22,"X","Y") { next unless $genes->{$chr}; for my $gene (sort {$a->{start} <=> $b->{start}} @{$genes->{$chr}}) { next unless $gene->{affected}; my @report = ($i++,@{$gene}{qw(id name chr start end size)}); if($gene->{sv}) { push @report, sprintf("sv_top:%s:%d",keys %{$gene->{sv_top}},values %{$gene->{sv_top}}); for my $type (sort keys %{$gene->{sv}}) { push @report, sprintf("sv:%s:%d",$type,$gene->{sv}{$type}); } } if($gene->{svaa}) { push @report, sprintf("svaa_top:%s:%d",keys %{$gene->{svaa_top}},values %{$gene->{svaa_top}}); for my $aa (sort {$gene->{svaa}{$b} <=> $gene->{svaa}{$a}} keys %{$gene->{svaa}}) { push @report, sprintf("svaa:%s:%d",$aa,$gene->{svaa}{$aa}); } } if($gene->{cnv}) { my $type_count; my $delins_count; my $values_by_type; for my $type (sort keys %{$gene->{cnv}}) { my @sample_avg; for my $sample (keys %{$gene->{cnv}{$type}}) { # number of samples with this kind of CNV event $type_count->{$type}++; my @values = @{$gene->{cnv}{$type}{$sample}}; push @sample_avg, average(@values); push @{$values_by_type->{$type}}, @values; } push @report, sprintf("cnv:%s:%d:%f:%f:%f:%f", $type, int(@sample_avg), scalar(min(@sample_avg)), average(@sample_avg), median(@sample_avg)->query, scalar(max(@sample_avg))); } my ($top_type) = sort {$type_count->{$b} <=> $type_count->{$a}} keys %$type_count; push @report, sprintf("cnv_top:%s:%d:%f:%f:%f:%f", $top_type, $type_count->{$top_type}, scalar(min(@{$values_by_type->{$top_type}})), average(@{$values_by_type->{$top_type}}), median(@{$values_by_type->{$top_type}})->query, scalar(max(@{$values_by_type->{$top_type}}))); } if($gene->{cnvlg}) { for my $cnv_idx (sort {$a <=> $b} keys %{$gene->{cnvlg}}) { push @report, sprintf("cnvlg:%d:%s", $cnv_idx, $gene->{cnvlg}{$cnv_idx}); } } printinfo(@report); } } exit; ################################################################ sub map2value { my ($field_name,$x) = @_; if($CONF{map}{$field_name}) { my ($var,$fn) = split(":",$CONF{map}{$field_name}); $fn =~ s/x/$x/g; my $value = eval $fn; die "Could not evaluate remapping [$fn]" if $@; return ($var,$value); } else { return ($field_name,$x); } } sub str2leaf { my $str = shift; my $delim = "}{"; $str =~ s/\//$delim/g; $str = sprintf("\$CONF{$str}"); my $leaf = eval $str; if($@) { die "Could not parse [$str] as a data structure."; } return $leaf; } sub read_file { my ($file,$type) = @_; open(F,$file) || die "Could not open file [$file] for reading"; my $data; my @fields = grep(/\d/,keys %{$CONF{fields}{$type}}); my @keys = split(",",$CONF{fields}{$type}{key}); my $i = 0; while(<F>) { chomp; next if /^\#/; my @tok = split "\t"; my $entry = {class=>$type,idx=>$i}; for my $col (@fields) { my ($field_name,$field_transform) = split(":",$CONF{fields}{$type}{$col}); my ($new_field_name,$value) = map2value($field_name,$tok[$col]); if($new_field_name ne $field_name) { $entry->{$field_name} = $tok[$col]; $field_name = $new_field_name; } if($field_transform) { if($field_transform eq "lc") { $value = lc $value; } elsif ($field_transform =~ /map\((.+),(.+)\)/) { $entry->{$field_name} = $value; $field_name = $1; $value = map2interval($value,str2leaf($2)); } } $entry->{ $field_name } = $value; } # skip mutation types that are not important next if $CONF{sv}{filter} && $type eq "sv" && exists $entry->{type} && ! $CONF{sv}{types}{$entry->{type}}; next if $CONF{cnv}{filter} && $type eq "cnv" && exists $entry->{category} && ! $CONF{cnv}{types}{$entry->{category}}; if($type eq "sv") { $entry->{weight} = $CONF{sv}{types}{$entry->{type}}; } $entry->{chr} = "X" if $entry->{chr} eq 23; $entry->{chr} = "Y" if $entry->{chr} eq 24; next unless grep($entry->{chr} eq $_, (1..22,"X","Y")); $entry->{set} = span(@{$entry}{qw(start end)}) if $entry->{start}; $entry->{size} = $entry->{set}->cardinality; #printdumper($entry); $i++; if(@keys == 1) { push @{$data->{$entry->{$keys[0]}}}, $entry; } elsif (@keys == 2) { push @{$data->{$entry->{$keys[0]}}{$entry->{$keys[1]}}}, $entry; } } printdebug("got",$i,$type); return $data; } sub map2interval { my ($x,$conf) = @_; for my $int (keys %$conf) { my $cat = $conf->{$int}; return $cat if in_interval($x,$int); } printdumper($conf); die "Could not map $x into any of the intervals seen"; } sub in_interval { my ($x,$int) = @_; my $open_L = "\\("; my $closed_L = "\\["; my $open_R = "\\)"; my $closed_R = "\\]"; my $num = "[0-9]*"; #[0-9]+(?:.[0-9]+)?"; my $rx = "($open_L|$closed_L)($num),($num)($open_R|$closed_R)"; my $bool = 1; if($int =~ /$rx/) { if($2 ne "") { $bool &&= $1 eq "(" ? $x > $2 : $x >= $2; } if($3 ne "") { $bool &&= $4 eq ")" ? $x < $3 : $x <= $3; } } else { $bool = $x == $int; } return $bool; } sub list2hash { my %h; map {$h{$_}=1} @_; return %h; } sub span { my ($x,$y) = @_; if($x==$y) { return Set::IntSpan->new($x); } else { return Set::IntSpan->new("$x-$y"); } } sub get_handle { my $h; if(my $file = $CONF{file}) { die "No such file [$file]" unless -e $file; open(FILE,$file); $h = \*FILE; } else { $h = \*STDIN; } return $h; } # HOUSEKEEPING ############################################################### sub dump_config { printdumper(\%OPT,\%CONF); } sub parse_config { my $dump_debug_level = 3; GetOptions(\%OPT,@COMMAND_LINE); pod2usage() if $OPT{help}; pod2usage(-verbose=>2) if $OPT{man}; loadconfiguration($OPT{configfile}); populateconfiguration(); # copy command line options to config hash validateconfiguration(); if ($CONF{cdump}) { $Data::Dumper::Indent = 2; $Data::Dumper::Quotekeys = 0; $Data::Dumper::Terse = 0; $Data::Dumper::Sortkeys = 1; $Data::Dumper::Varname = "OPT"; printdumper(\%OPT); $Data::Dumper::Varname = "CONF"; printdumper(\%CONF); exit; } } sub populateconfiguration { for my $var (keys %OPT) { $CONF{$var} = $OPT{$var}; } repopulateconfiguration(\%CONF); } sub repopulateconfiguration { my ($node,$parent_node_name) = shift; return unless ref($node) eq "HASH"; for my $key (keys %$node) { my $value = $node->{$key}; if (ref($value) eq "HASH") { repopulateconfiguration($value,$key); } elsif (ref($value) eq "ARRAY") { for my $item (@$value) { repopulateconfiguration($item,$key); } } elsif (defined $value) { my $new_value = parse_field($value,$key,$parent_node_name,$node); $node->{$key} = $new_value; } } } sub parse_field { my ($str,$key,$parent_node_name,$node) = @_; # replace configuration field # conf(LEAF,LEAF,...) while ( $str =~ /(conf\(\s*(.+?)\s*\))/g ) { my ($template,$leaf) = ($1,$2); if (defined $template && defined $leaf) { my @leaf = split(/\s*,\s*/,$leaf); my $new_template; if (@leaf == 2 && $leaf[0] eq ".") { $new_template = $node->{$leaf[1]}; } else { $new_template = fetch_conf(@leaf); } $str =~ s/\Q$template\E/$new_template/g; } } if ($str =~ /\s*eval\s*\(\s*(.+)\s*\)/) { my $fn = $1; $str = eval $fn; if ($@) { die "could not parse configuration parameter [$@]"; } } return $str; } sub fetch_configuration { my @config_path = @_; my $node = \%CONF; if(! @config_path) { return \%CONF; } for my $path_element (@config_path) { if (! exists $node->{$path_element}) { return undef; } else { $node = $node->{$path_element}; } } return $node; } sub fetch_conf { return fetch_configuration(@_); } sub loadconfiguration { my $file = shift; if (defined $file) { if (-e $file && -r _) { # provided configuration file exists and can be read $file = abs_path($file); } else { confess "The configuration file [$file] passed with -configfile does not exist or cannot be read."; } } else { # otherwise, try to automatically find a configuration file my ($scriptname,$path,$suffix) = fileparse($0); my $cwd = getcwd(); my $bindir = $FindBin::RealBin; my $userdir = $ENV{HOME}; my @candidate_files = ( "$cwd/$scriptname.conf", "$cwd/etc/$scriptname.conf", "$cwd/../etc/$scriptname.conf", "$bindir/$scriptname.conf", "$bindir/etc/$scriptname.conf", "$bindir/../etc/$scriptname.conf", "$userdir/.$scriptname.conf", ); my @additional_files = (); for my $candidate_file (@additional_files,@candidate_files) { #printinfo("configsearch",$candidate_file); if (-e $candidate_file && -r _) { $file = $candidate_file; #printinfo("configfound",$candidate_file); last; } } } if (defined $file) { $OPT{configfile} = $file; $conf = new Config::General( -ConfigFile=>$file, -IncludeRelative=>1, -IncludeAgain=>1, -ExtendedAccess=>1, -AllowMultiOptions=>"yes", #-LowerCaseNames=>1, -AutoTrue=>1 ); %CONF = $conf->getall; } } sub printdebug { printinfo("debug",@_) if defined $CONF{debug}; } sub printinfo { print join(" ",map { defined $_ ? $_ : "_undef_" } @_),"\n"; } sub printfinfo { my ($fmt,@args) = @_; @args = map { defined $_ ? $_ : "_undef_" } @args; printf("$fmt\n",@args); } sub printerr { print STDERR join(" ",map { defined $_ ? $_ : "_undef_" } @_),"\n"; } sub printdumper { print Dumper(@_); } =pod =head1 HISTORY =over =item * 30 Nov 2015 Started. =back =head1 AUTHOR Martin Krzywinski =head1 CONTACT Martin Krzywinski Genome Sciences Center BC Cancer Research Center 100-570 W 7th Ave Vancouver BC V5Z 4S6 mkweb.bcgsc.ca martink@bcgsc.ca =cut
