view bin/parse @ 6:d248caf924d3 draft

Uploaded
author morinlab
date Wed, 30 Nov 2016 13:36:59 -0500
parents b77ab858eac1
children
line wrap: on
line source

#!/bin/env perl

=pod

=head1 NAME

parse - parse Ryan's MAF and CNV files and generate a summary table of all genes and their mutations and CNV status

=head1 SYNOPSIS

  # automatically load etc/parse.conf
  bin/parse 

  # if config file is elsewhere
  bin/parse -conf elsewhere/my.conf

=head1 DESCRIPTION

See etc/parse.conf for all settings.

=head1 OPTIONS

=cut

use strict;
use warnings FATAL=>"all";

use Carp;
use Config::General;
use Cwd qw(getcwd abs_path);
use Data::Dumper;
use File::Basename;
use FindBin;
use Getopt::Long;
use Math::Round qw(round nearest);
use Math::VecStat qw(sum min max average);
use Pod::Usage;
use Time::HiRes qw(gettimeofday tv_interval);
use Statistics::Basic qw(median);
use Storable;
use lib "$FindBin::RealBin";
use lib "$FindBin::RealBin/../lib";
use lib "$FindBin::RealBin/lib";

our (%OPT,%CONF,$conf);
our @COMMAND_LINE = ("file=s",
										 "configfile=s",
										 "help",
										 "cdump",
										 "man",
										 "debug");
our $VERSION = 0.02;

# common and custom module imports below
#use Regexp::Common;
#use IO::File;
#use List::Util;
#use List::MoreUtils;
use Set::IntSpan;
#use Statistics::Descriptive;

# read and parse configuration file
parse_config();

sub validateconfiguration {
}

#map2interval(5,$CONF{cnv}{log2});
#exit;

################################################################
# get files
my $sv    = read_file($CONF{files}{sv}  ,"sv"   );
my $genes = read_file($CONF{files}{mart},"genes");
my $cnv   = read_file($CONF{files}{cnv} ,"cnv"  );
my $cnvlg = read_file($CONF{files}{cnvlg},"cnvlg");
my %affected_genes;
open(GENES, $CONF{files}{genes}) or die "#! $CONF{files}{genes}\n";
while(<GENES>){
    chomp;
    $affected_genes{$_}++;
}



################################################################
# traverse all genes from biomart and determine number
# of SV and CNV events across samples
for my $chr (keys %$genes) {
	next if $CONF{filter}{chr} && $chr ne $CONF{filter}{chr};
	printdebug("processing",$chr);
	for my $gene (@{$genes->{$chr}}) {
	        my $id = $gene->{id};
		my $name = $gene->{name};
		#print "GENE: $name, ID: $id\n";
		# filter out by presence and number of SV events
		
		# number of samples that have SV event
		my @samples_sv  = keys %{$sv->{$id}};
		if(defined $affected_genes{$name}){
		    #override normal criteria for this gene
		    
		}
		else{
		    next if $CONF{filter}{sv} && ! $sv->{$id};
		    next if $CONF{filter}{sv_num} && @samples_sv < $CONF{filter}{sv_num};
		}
		$gene->{affected} = 1;

		# register SV events
		my $pos;
		for my $sample (@samples_sv) {
			for my $sv (sort {$b->{weight} <=> $a->{weight}} @{$sv->{$id}{$sample}}) {
				$gene->{sv}{ $sv->{type} }++;
				$gene->{sv}{ "*" }++;
				$pos->{ $sv->{aa} }++; # register the protein position of the SV
				next if $CONF{sv}{top_damage_only};
			}
		}
		# top SV event
		if($gene->{sv}) {
			my ($sv_top) = sort {$gene->{sv}{$b} <=> $gene->{sv}{$a}} grep($_ ne "*",keys %{$gene->{sv}});
			$gene->{sv_top}{$sv_top} = $gene->{sv}{$sv_top};
		}
		for my $aa (sort {$pos->{$b} <=> $pos->{$a}} keys %$pos) {
			#next unless $pos->{$aa} > 1;
			my $n = $pos->{$aa};
			$gene->{svaa_top}{$aa} = $n if ! defined $gene->{svaa_top};
			$gene->{svaa}{"*"}      += $n;
			$gene->{svaa}{$aa}       = $n;
		}
		# register CNV events
		my @samples_cnv = keys %$cnv;
		# lookup any CNV events -- this can take a bit of time
		# we can bin the CNV hash later if needed
		for my $sample (@samples_cnv) {
			my $chr = $gene->{chr};
			next unless $cnv->{$sample}{$chr};
			for my $cnv (@{$cnv->{$sample}{$chr}}) {
				my $int = $cnv->{set}->intersect($gene->{set})->cardinality;
				next unless $int;
				push @{$gene->{cnv}{$cnv->{category}}{$sample}}, $cnv->{avg};
			}
		}
		for my $sample (keys %$cnvlg) {
			my $chr = $gene->{chr};
			next unless $cnvlg->{$chr};
			for my $cnv (@{$cnvlg->{$chr}}) {
				my $int = $cnv->{set}->intersect($gene->{set})->cardinality;
				next unless $int;
				$gene->{cnvlg}{$cnv->{idx}} = $cnv->{type};
			}
		}
	}
}

################################################################
# report
my $i = 0;
for my $chr (1..22,"X","Y") {
	next unless $genes->{$chr};
	for my $gene (sort {$a->{start} <=> $b->{start}} @{$genes->{$chr}}) {
		next unless $gene->{affected};
		my @report = ($i++,@{$gene}{qw(id name chr start end size)});
		if($gene->{sv}) {
			push @report, sprintf("sv_top:%s:%d",keys %{$gene->{sv_top}},values %{$gene->{sv_top}});
			for my $type (sort keys %{$gene->{sv}}) {
				push @report, sprintf("sv:%s:%d",$type,$gene->{sv}{$type});
			}
		}
		if($gene->{svaa}) {
			push @report, sprintf("svaa_top:%s:%d",keys %{$gene->{svaa_top}},values %{$gene->{svaa_top}});
			for my $aa (sort {$gene->{svaa}{$b} <=> $gene->{svaa}{$a}} keys %{$gene->{svaa}}) {
				push @report, sprintf("svaa:%s:%d",$aa,$gene->{svaa}{$aa});
			}
		}
		if($gene->{cnv}) {
			my $type_count;
			my $delins_count;
			my $values_by_type;
			for my $type (sort keys %{$gene->{cnv}}) {
				my @sample_avg;
				for my $sample (keys %{$gene->{cnv}{$type}}) {
					# number of samples with this kind of CNV event
					$type_count->{$type}++;
					my @values = @{$gene->{cnv}{$type}{$sample}};
					push @sample_avg, average(@values);
					push @{$values_by_type->{$type}}, @values;
				}
				push @report, sprintf("cnv:%s:%d:%f:%f:%f:%f",
															$type,
															int(@sample_avg),
															scalar(min(@sample_avg)),
															average(@sample_avg),
															median(@sample_avg)->query,
															scalar(max(@sample_avg)));
			}
			my ($top_type) = sort {$type_count->{$b} <=> $type_count->{$a}} keys %$type_count;
			push @report, sprintf("cnv_top:%s:%d:%f:%f:%f:%f",
														$top_type,
														$type_count->{$top_type},
														scalar(min(@{$values_by_type->{$top_type}})),
														average(@{$values_by_type->{$top_type}}),
														median(@{$values_by_type->{$top_type}})->query,
														scalar(max(@{$values_by_type->{$top_type}})));
		}
		if($gene->{cnvlg}) {
			for my $cnv_idx (sort {$a <=> $b} keys %{$gene->{cnvlg}}) {
				push @report, sprintf("cnvlg:%d:%s",
															$cnv_idx,
															$gene->{cnvlg}{$cnv_idx});
			}
		}
		printinfo(@report);
	}
}

exit;

################################################################

sub map2value {
	my ($field_name,$x) = @_;
	if($CONF{map}{$field_name}) {
		my ($var,$fn) = split(":",$CONF{map}{$field_name});
		$fn =~ s/x/$x/g;
		my $value = eval $fn;
		die "Could not evaluate remapping [$fn]" if $@;
		return ($var,$value);
	} else {
		return ($field_name,$x);
	}
}

sub str2leaf {
	my $str   = shift;
	my $delim = "}{";
	$str =~ s/\//$delim/g;
	$str = sprintf("\$CONF{$str}");
	my $leaf = eval $str;
	if($@) {
		die "Could not parse [$str] as a data structure.";
	}
	return $leaf;
}

sub read_file {
	my ($file,$type) = @_;
	
	open(F,$file) || die "Could not open file [$file] for reading";
	my $data;

	my @fields  = grep(/\d/,keys %{$CONF{fields}{$type}});
	my @keys    = split(",",$CONF{fields}{$type}{key});

	my $i = 0;
	while(<F>) {
		chomp;
		next if /^\#/;
		my @tok = split "\t";
		my $entry = {class=>$type,idx=>$i};
		for my $col (@fields) {
			my ($field_name,$field_transform) = split(":",$CONF{fields}{$type}{$col});
			my ($new_field_name,$value) = map2value($field_name,$tok[$col]);
			if($new_field_name ne $field_name) {
				$entry->{$field_name} = $tok[$col];
				$field_name = $new_field_name;
			}
			if($field_transform) {
				if($field_transform eq "lc") {
					$value = lc $value;
				} elsif ($field_transform =~ /map\((.+),(.+)\)/) {
					$entry->{$field_name} = $value;
					$field_name   = $1;
					$value        = map2interval($value,str2leaf($2));
				}
			}
			$entry->{ $field_name } = $value;
		}
		# skip mutation types that are not important
		next if $CONF{sv}{filter}  && $type eq "sv"  && exists $entry->{type} && ! $CONF{sv}{types}{$entry->{type}};
		next if $CONF{cnv}{filter} && $type eq "cnv" && exists $entry->{category} && ! $CONF{cnv}{types}{$entry->{category}};
		if($type eq "sv") {
			$entry->{weight} = $CONF{sv}{types}{$entry->{type}};
		}
		$entry->{chr} = "X" if $entry->{chr} eq 23;
		$entry->{chr} = "Y" if $entry->{chr} eq 24;
		next unless grep($entry->{chr} eq $_, (1..22,"X","Y"));
		$entry->{set} = span(@{$entry}{qw(start end)}) if $entry->{start};
		$entry->{size} = $entry->{set}->cardinality;
		#printdumper($entry);
		$i++;
		if(@keys == 1) {
			push @{$data->{$entry->{$keys[0]}}}, $entry;
		} elsif (@keys == 2) {
			push @{$data->{$entry->{$keys[0]}}{$entry->{$keys[1]}}}, $entry;
		}
	}
	printdebug("got",$i,$type);
	return $data;
}

sub map2interval {
	my ($x,$conf) = @_;
	for my $int (keys %$conf) {
		my $cat = $conf->{$int};
		return $cat if in_interval($x,$int);
	}
	printdumper($conf);
	die "Could not map $x into any of the intervals seen";
}

sub in_interval {
	my ($x,$int) = @_;
	my $open_L   = "\\(";
	my $closed_L = "\\[";
	my $open_R   = "\\)";
	my $closed_R = "\\]";
	my $num      = "[0-9]*"; #[0-9]+(?:.[0-9]+)?";
	my $rx       = "($open_L|$closed_L)($num),($num)($open_R|$closed_R)";
	my $bool     = 1;
	if($int =~ /$rx/) {
		if($2 ne "") {
			$bool &&= $1 eq "(" ? $x > $2 : $x >= $2;
		}
		if($3 ne "") {
			$bool &&= $4 eq ")" ? $x < $3 : $x <= $3;
		}
	} else {
		$bool = $x == $int;
	}
	return $bool;
}

sub list2hash {
	my %h;
	map {$h{$_}=1} @_;
	return %h;
}

sub span {
	my ($x,$y) = @_;
	if($x==$y) {
		return Set::IntSpan->new($x);
	} else {
		return Set::IntSpan->new("$x-$y");
	}
}

sub get_handle {
	my $h;
	if(my $file = $CONF{file}) {
		die "No such file [$file]" unless -e $file;
		open(FILE,$file);
		$h = \*FILE;
	} else {
		$h = \*STDIN;
	}
	return $h;
}

# HOUSEKEEPING ###############################################################

sub dump_config {
	printdumper(\%OPT,\%CONF);
}

sub parse_config {
  my $dump_debug_level = 3;
  GetOptions(\%OPT,@COMMAND_LINE);
  pod2usage() if $OPT{help};
  pod2usage(-verbose=>2) if $OPT{man};
  loadconfiguration($OPT{configfile});
  populateconfiguration();	# copy command line options to config hash
  validateconfiguration(); 
  if ($CONF{cdump}) {
    $Data::Dumper::Indent    = 2;
    $Data::Dumper::Quotekeys = 0;
    $Data::Dumper::Terse     = 0;
    $Data::Dumper::Sortkeys  = 1;
    $Data::Dumper::Varname = "OPT";
    printdumper(\%OPT);
    $Data::Dumper::Varname = "CONF";
    printdumper(\%CONF);
    exit;
  }
}

sub populateconfiguration {
  for my $var (keys %OPT) {
    $CONF{$var} = $OPT{$var};
  }
  repopulateconfiguration(\%CONF);
}

sub repopulateconfiguration {
  my ($node,$parent_node_name) = shift;
  return unless ref($node) eq "HASH";
  for my $key (keys %$node) {
		my $value = $node->{$key};
		if (ref($value) eq "HASH") {
			repopulateconfiguration($value,$key);
		} elsif (ref($value) eq "ARRAY") {
			for my $item (@$value) {
				repopulateconfiguration($item,$key);
			}
		} elsif (defined $value) {
			my $new_value = parse_field($value,$key,$parent_node_name,$node);
			$node->{$key} = $new_value;
		}
	}
}

sub parse_field {
	my ($str,$key,$parent_node_name,$node) = @_;
	# replace configuration field
	# conf(LEAF,LEAF,...)
	while ( $str =~ /(conf\(\s*(.+?)\s*\))/g ) {
		my ($template,$leaf) = ($1,$2);
		if (defined $template && defined $leaf) {
			my @leaf         = split(/\s*,\s*/,$leaf);
			my $new_template;
			if (@leaf == 2 && $leaf[0] eq ".") {
				$new_template = $node->{$leaf[1]};
			} else {
				$new_template = fetch_conf(@leaf);
			}
			$str =~ s/\Q$template\E/$new_template/g;
		}
	}
	if ($str =~ /\s*eval\s*\(\s*(.+)\s*\)/) {
		my $fn = $1;
		$str = eval $fn;
		if ($@) {
			die "could not parse configuration parameter [$@]";
		}
	}
	return $str;
}

sub fetch_configuration {
	my @config_path = @_;
	my $node        = \%CONF;
	if(! @config_path) {
		return \%CONF;
	}
	for my $path_element (@config_path) {
		if (! exists $node->{$path_element}) {
	    return undef;
		} else {
	    $node = $node->{$path_element};
		}
	}
	return $node;
}

sub fetch_conf {
	return fetch_configuration(@_);
}

sub loadconfiguration {
  my $file = shift;
  if (defined $file) {
    if (-e $file && -r _) {
      # provided configuration file exists and can be read
      $file = abs_path($file);
    } else {
      confess "The configuration file [$file] passed with -configfile does not exist or cannot be read.";
    }
  } else {
    # otherwise, try to automatically find a configuration file
    my ($scriptname,$path,$suffix) = fileparse($0);
    my $cwd     = getcwd();
    my $bindir  = $FindBin::RealBin;
    my $userdir = $ENV{HOME};
    my @candidate_files = (
													 "$cwd/$scriptname.conf",
													 "$cwd/etc/$scriptname.conf",
													 "$cwd/../etc/$scriptname.conf",
													 "$bindir/$scriptname.conf",
													 "$bindir/etc/$scriptname.conf",
													 "$bindir/../etc/$scriptname.conf",
													 "$userdir/.$scriptname.conf",
													);
    my @additional_files = ();
    for my $candidate_file (@additional_files,@candidate_files) {
			#printinfo("configsearch",$candidate_file);
			if (-e $candidate_file && -r _) {
				$file = $candidate_file;
				#printinfo("configfound",$candidate_file);
				last;
			}
    }
  }
  if (defined $file) {
    $OPT{configfile} = $file;
    $conf = new Config::General(
																-ConfigFile=>$file,
																-IncludeRelative=>1,
																-IncludeAgain=>1,
																-ExtendedAccess=>1,
																-AllowMultiOptions=>"yes",
																#-LowerCaseNames=>1,
																-AutoTrue=>1
															 );
    %CONF = $conf->getall;
  }
}

sub printdebug {
	printinfo("debug",@_) if defined $CONF{debug};
}

sub printinfo {
	print join(" ",map { defined $_ ? $_ : "_undef_" } @_),"\n";
}

sub printfinfo {
	my ($fmt,@args) = @_;
	@args = map { defined $_ ? $_ : "_undef_" } @args;
	printf("$fmt\n",@args);
}

sub printerr {
	print STDERR join(" ",map { defined $_ ? $_ : "_undef_" } @_),"\n";
}

sub printdumper {
	print Dumper(@_);
}

=pod 

=head1 HISTORY

=over

=item * 30 Nov 2015

Started.

=back 

=head1 AUTHOR

Martin Krzywinski

=head1 CONTACT

Martin Krzywinski
Genome Sciences Center
BC Cancer Research Center
100-570 W 7th Ave
Vancouver BC V5Z 4S6

mkweb.bcgsc.ca
martink@bcgsc.ca

=cut