changeset 4:f60d66a2a6b0 draft

Uploaded
author morinlab
date Wed, 30 Nov 2016 13:34:35 -0500
parents 0f30d19bc378
children e6fa60c7e2f0
files parse
diffstat 1 files changed, 559 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/parse	Wed Nov 30 13:34:35 2016 -0500
@@ -0,0 +1,559 @@
+#!/bin/env perl
+
+=pod
+
+=head1 NAME
+
+parse - parse Ryan's MAF and CNV files and generate a summary table of all genes and their mutations and CNV status
+
+=head1 SYNOPSIS
+
+  # automatically load etc/parse.conf
+  bin/parse 
+
+  # if config file is elsewhere
+  bin/parse -conf elsewhere/my.conf
+
+=head1 DESCRIPTION
+
+See etc/parse.conf for all settings.
+
+=head1 OPTIONS
+
+=cut
+
+use strict;
+use warnings FATAL=>"all";
+
+use Carp;
+use Config::General;
+use Cwd qw(getcwd abs_path);
+use Data::Dumper;
+use File::Basename;
+use FindBin;
+use Getopt::Long;
+use Math::Round qw(round nearest);
+use Math::VecStat qw(sum min max average);
+use Pod::Usage;
+use Time::HiRes qw(gettimeofday tv_interval);
+use Statistics::Basic qw(median);
+use Storable;
+use lib "$FindBin::RealBin";
+use lib "$FindBin::RealBin/../lib";
+use lib "$FindBin::RealBin/lib";
+
+our (%OPT,%CONF,$conf);
+our @COMMAND_LINE = ("file=s",
+										 "configfile=s",
+										 "help",
+										 "cdump",
+										 "man",
+										 "debug");
+our $VERSION = 0.02;
+
+# common and custom module imports below
+#use Regexp::Common;
+#use IO::File;
+#use List::Util;
+#use List::MoreUtils;
+use Set::IntSpan;
+#use Statistics::Descriptive;
+
+# read and parse configuration file
+parse_config();
+
+sub validateconfiguration {
+}
+
+#map2interval(5,$CONF{cnv}{log2});
+#exit;
+
+################################################################
+# get files
+my $sv    = read_file($CONF{files}{sv}  ,"sv"   );
+my $genes = read_file($CONF{files}{mart},"genes");
+my $cnv   = read_file($CONF{files}{cnv} ,"cnv"  );
+my $cnvlg = read_file($CONF{files}{cnvlg},"cnvlg");
+my %affected_genes;
+open(GENES, $CONF{files}{genes}) or die "#! $CONF{files}{genes}\n";
+while(<GENES>){
+    chomp;
+    $affected_genes{$_}++;
+}
+
+
+
+################################################################
+# traverse all genes from biomart and determine number
+# of SV and CNV events across samples
+for my $chr (keys %$genes) {
+	next if $CONF{filter}{chr} && $chr ne $CONF{filter}{chr};
+	printdebug("processing",$chr);
+	for my $gene (@{$genes->{$chr}}) {
+	        my $id = $gene->{id};
+		my $name = $gene->{name};
+		#print "GENE: $name, ID: $id\n";
+		# filter out by presence and number of SV events
+		
+		# number of samples that have SV event
+		my @samples_sv  = keys %{$sv->{$id}};
+		if(defined $affected_genes{$name}){
+		    #override normal criteria for this gene
+		    
+		}
+		else{
+		    next if $CONF{filter}{sv} && ! $sv->{$id};
+		    next if $CONF{filter}{sv_num} && @samples_sv < $CONF{filter}{sv_num};
+		}
+		$gene->{affected} = 1;
+
+		# register SV events
+		my $pos;
+		for my $sample (@samples_sv) {
+			for my $sv (sort {$b->{weight} <=> $a->{weight}} @{$sv->{$id}{$sample}}) {
+				$gene->{sv}{ $sv->{type} }++;
+				$gene->{sv}{ "*" }++;
+				$pos->{ $sv->{aa} }++; # register the protein position of the SV
+				next if $CONF{sv}{top_damage_only};
+			}
+		}
+		# top SV event
+		if($gene->{sv}) {
+			my ($sv_top) = sort {$gene->{sv}{$b} <=> $gene->{sv}{$a}} grep($_ ne "*",keys %{$gene->{sv}});
+			$gene->{sv_top}{$sv_top} = $gene->{sv}{$sv_top};
+		}
+		for my $aa (sort {$pos->{$b} <=> $pos->{$a}} keys %$pos) {
+			#next unless $pos->{$aa} > 1;
+			my $n = $pos->{$aa};
+			$gene->{svaa_top}{$aa} = $n if ! defined $gene->{svaa_top};
+			$gene->{svaa}{"*"}      += $n;
+			$gene->{svaa}{$aa}       = $n;
+		}
+		# register CNV events
+		my @samples_cnv = keys %$cnv;
+		# lookup any CNV events -- this can take a bit of time
+		# we can bin the CNV hash later if needed
+		for my $sample (@samples_cnv) {
+			my $chr = $gene->{chr};
+			next unless $cnv->{$sample}{$chr};
+			for my $cnv (@{$cnv->{$sample}{$chr}}) {
+				my $int = $cnv->{set}->intersect($gene->{set})->cardinality;
+				next unless $int;
+				push @{$gene->{cnv}{$cnv->{category}}{$sample}}, $cnv->{avg};
+			}
+		}
+		for my $sample (keys %$cnvlg) {
+			my $chr = $gene->{chr};
+			next unless $cnvlg->{$chr};
+			for my $cnv (@{$cnvlg->{$chr}}) {
+				my $int = $cnv->{set}->intersect($gene->{set})->cardinality;
+				next unless $int;
+				$gene->{cnvlg}{$cnv->{idx}} = $cnv->{type};
+			}
+		}
+	}
+}
+
+################################################################
+# report
+my $i = 0;
+for my $chr (1..22,"X","Y") {
+	next unless $genes->{$chr};
+	for my $gene (sort {$a->{start} <=> $b->{start}} @{$genes->{$chr}}) {
+		next unless $gene->{affected};
+		my @report = ($i++,@{$gene}{qw(id name chr start end size)});
+		if($gene->{sv}) {
+			push @report, sprintf("sv_top:%s:%d",keys %{$gene->{sv_top}},values %{$gene->{sv_top}});
+			for my $type (sort keys %{$gene->{sv}}) {
+				push @report, sprintf("sv:%s:%d",$type,$gene->{sv}{$type});
+			}
+		}
+		if($gene->{svaa}) {
+			push @report, sprintf("svaa_top:%s:%d",keys %{$gene->{svaa_top}},values %{$gene->{svaa_top}});
+			for my $aa (sort {$gene->{svaa}{$b} <=> $gene->{svaa}{$a}} keys %{$gene->{svaa}}) {
+				push @report, sprintf("svaa:%s:%d",$aa,$gene->{svaa}{$aa});
+			}
+		}
+		if($gene->{cnv}) {
+			my $type_count;
+			my $delins_count;
+			my $values_by_type;
+			for my $type (sort keys %{$gene->{cnv}}) {
+				my @sample_avg;
+				for my $sample (keys %{$gene->{cnv}{$type}}) {
+					# number of samples with this kind of CNV event
+					$type_count->{$type}++;
+					my @values = @{$gene->{cnv}{$type}{$sample}};
+					push @sample_avg, average(@values);
+					push @{$values_by_type->{$type}}, @values;
+				}
+				push @report, sprintf("cnv:%s:%d:%f:%f:%f:%f",
+															$type,
+															int(@sample_avg),
+															scalar(min(@sample_avg)),
+															average(@sample_avg),
+															median(@sample_avg)->query,
+															scalar(max(@sample_avg)));
+			}
+			my ($top_type) = sort {$type_count->{$b} <=> $type_count->{$a}} keys %$type_count;
+			push @report, sprintf("cnv_top:%s:%d:%f:%f:%f:%f",
+														$top_type,
+														$type_count->{$top_type},
+														scalar(min(@{$values_by_type->{$top_type}})),
+														average(@{$values_by_type->{$top_type}}),
+														median(@{$values_by_type->{$top_type}})->query,
+														scalar(max(@{$values_by_type->{$top_type}})));
+		}
+		if($gene->{cnvlg}) {
+			for my $cnv_idx (sort {$a <=> $b} keys %{$gene->{cnvlg}}) {
+				push @report, sprintf("cnvlg:%d:%s",
+															$cnv_idx,
+															$gene->{cnvlg}{$cnv_idx});
+			}
+		}
+		printinfo(@report);
+	}
+}
+
+exit;
+
+################################################################
+
+sub map2value {
+	my ($field_name,$x) = @_;
+	if($CONF{map}{$field_name}) {
+		my ($var,$fn) = split(":",$CONF{map}{$field_name});
+		$fn =~ s/x/$x/g;
+		my $value = eval $fn;
+		die "Could not evaluate remapping [$fn]" if $@;
+		return ($var,$value);
+	} else {
+		return ($field_name,$x);
+	}
+}
+
+sub str2leaf {
+	my $str   = shift;
+	my $delim = "}{";
+	$str =~ s/\//$delim/g;
+	$str = sprintf("\$CONF{$str}");
+	my $leaf = eval $str;
+	if($@) {
+		die "Could not parse [$str] as a data structure.";
+	}
+	return $leaf;
+}
+
+sub read_file {
+	my ($file,$type) = @_;
+	
+	open(F,$file) || die "Could not open file [$file] for reading";
+	my $data;
+
+	my @fields  = grep(/\d/,keys %{$CONF{fields}{$type}});
+	my @keys    = split(",",$CONF{fields}{$type}{key});
+
+	my $i = 0;
+	while(<F>) {
+		chomp;
+		next if /^\#/;
+		my @tok = split "\t";
+		my $entry = {class=>$type,idx=>$i};
+		for my $col (@fields) {
+			my ($field_name,$field_transform) = split(":",$CONF{fields}{$type}{$col});
+			my ($new_field_name,$value) = map2value($field_name,$tok[$col]);
+			if($new_field_name ne $field_name) {
+				$entry->{$field_name} = $tok[$col];
+				$field_name = $new_field_name;
+			}
+			if($field_transform) {
+				if($field_transform eq "lc") {
+					$value = lc $value;
+				} elsif ($field_transform =~ /map\((.+),(.+)\)/) {
+					$entry->{$field_name} = $value;
+					$field_name   = $1;
+					$value        = map2interval($value,str2leaf($2));
+				}
+			}
+			$entry->{ $field_name } = $value;
+		}
+		# skip mutation types that are not important
+		next if $CONF{sv}{filter}  && $type eq "sv"  && exists $entry->{type} && ! $CONF{sv}{types}{$entry->{type}};
+		next if $CONF{cnv}{filter} && $type eq "cnv" && exists $entry->{category} && ! $CONF{cnv}{types}{$entry->{category}};
+		if($type eq "sv") {
+			$entry->{weight} = $CONF{sv}{types}{$entry->{type}};
+		}
+		$entry->{chr} = "X" if $entry->{chr} eq 23;
+		$entry->{chr} = "Y" if $entry->{chr} eq 24;
+		next unless grep($entry->{chr} eq $_, (1..22,"X","Y"));
+		$entry->{set} = span(@{$entry}{qw(start end)}) if $entry->{start};
+		$entry->{size} = $entry->{set}->cardinality;
+		#printdumper($entry);
+		$i++;
+		if(@keys == 1) {
+			push @{$data->{$entry->{$keys[0]}}}, $entry;
+		} elsif (@keys == 2) {
+			push @{$data->{$entry->{$keys[0]}}{$entry->{$keys[1]}}}, $entry;
+		}
+	}
+	printdebug("got",$i,$type);
+	return $data;
+}
+
+sub map2interval {
+	my ($x,$conf) = @_;
+	for my $int (keys %$conf) {
+		my $cat = $conf->{$int};
+		return $cat if in_interval($x,$int);
+	}
+	printdumper($conf);
+	die "Could not map $x into any of the intervals seen";
+}
+
+sub in_interval {
+	my ($x,$int) = @_;
+	my $open_L   = "\\(";
+	my $closed_L = "\\[";
+	my $open_R   = "\\)";
+	my $closed_R = "\\]";
+	my $num      = "[0-9]*"; #[0-9]+(?:.[0-9]+)?";
+	my $rx       = "($open_L|$closed_L)($num),($num)($open_R|$closed_R)";
+	my $bool     = 1;
+	if($int =~ /$rx/) {
+		if($2 ne "") {
+			$bool &&= $1 eq "(" ? $x > $2 : $x >= $2;
+		}
+		if($3 ne "") {
+			$bool &&= $4 eq ")" ? $x < $3 : $x <= $3;
+		}
+	} else {
+		$bool = $x == $int;
+	}
+	return $bool;
+}
+
+sub list2hash {
+	my %h;
+	map {$h{$_}=1} @_;
+	return %h;
+}
+
+sub span {
+	my ($x,$y) = @_;
+	if($x==$y) {
+		return Set::IntSpan->new($x);
+	} else {
+		return Set::IntSpan->new("$x-$y");
+	}
+}
+
+sub get_handle {
+	my $h;
+	if(my $file = $CONF{file}) {
+		die "No such file [$file]" unless -e $file;
+		open(FILE,$file);
+		$h = \*FILE;
+	} else {
+		$h = \*STDIN;
+	}
+	return $h;
+}
+
+# HOUSEKEEPING ###############################################################
+
+sub dump_config {
+	printdumper(\%OPT,\%CONF);
+}
+
+sub parse_config {
+  my $dump_debug_level = 3;
+  GetOptions(\%OPT,@COMMAND_LINE);
+  pod2usage() if $OPT{help};
+  pod2usage(-verbose=>2) if $OPT{man};
+  loadconfiguration($OPT{configfile});
+  populateconfiguration();	# copy command line options to config hash
+  validateconfiguration(); 
+  if ($CONF{cdump}) {
+    $Data::Dumper::Indent    = 2;
+    $Data::Dumper::Quotekeys = 0;
+    $Data::Dumper::Terse     = 0;
+    $Data::Dumper::Sortkeys  = 1;
+    $Data::Dumper::Varname = "OPT";
+    printdumper(\%OPT);
+    $Data::Dumper::Varname = "CONF";
+    printdumper(\%CONF);
+    exit;
+  }
+}
+
+sub populateconfiguration {
+  for my $var (keys %OPT) {
+    $CONF{$var} = $OPT{$var};
+  }
+  repopulateconfiguration(\%CONF);
+}
+
+sub repopulateconfiguration {
+  my ($node,$parent_node_name) = shift;
+  return unless ref($node) eq "HASH";
+  for my $key (keys %$node) {
+		my $value = $node->{$key};
+		if (ref($value) eq "HASH") {
+			repopulateconfiguration($value,$key);
+		} elsif (ref($value) eq "ARRAY") {
+			for my $item (@$value) {
+				repopulateconfiguration($item,$key);
+			}
+		} elsif (defined $value) {
+			my $new_value = parse_field($value,$key,$parent_node_name,$node);
+			$node->{$key} = $new_value;
+		}
+	}
+}
+
+sub parse_field {
+	my ($str,$key,$parent_node_name,$node) = @_;
+	# replace configuration field
+	# conf(LEAF,LEAF,...)
+	while ( $str =~ /(conf\(\s*(.+?)\s*\))/g ) {
+		my ($template,$leaf) = ($1,$2);
+		if (defined $template && defined $leaf) {
+			my @leaf         = split(/\s*,\s*/,$leaf);
+			my $new_template;
+			if (@leaf == 2 && $leaf[0] eq ".") {
+				$new_template = $node->{$leaf[1]};
+			} else {
+				$new_template = fetch_conf(@leaf);
+			}
+			$str =~ s/\Q$template\E/$new_template/g;
+		}
+	}
+	if ($str =~ /\s*eval\s*\(\s*(.+)\s*\)/) {
+		my $fn = $1;
+		$str = eval $fn;
+		if ($@) {
+			die "could not parse configuration parameter [$@]";
+		}
+	}
+	return $str;
+}
+
+sub fetch_configuration {
+	my @config_path = @_;
+	my $node        = \%CONF;
+	if(! @config_path) {
+		return \%CONF;
+	}
+	for my $path_element (@config_path) {
+		if (! exists $node->{$path_element}) {
+	    return undef;
+		} else {
+	    $node = $node->{$path_element};
+		}
+	}
+	return $node;
+}
+
+sub fetch_conf {
+	return fetch_configuration(@_);
+}
+
+sub loadconfiguration {
+  my $file = shift;
+  if (defined $file) {
+    if (-e $file && -r _) {
+      # provided configuration file exists and can be read
+      $file = abs_path($file);
+    } else {
+      confess "The configuration file [$file] passed with -configfile does not exist or cannot be read.";
+    }
+  } else {
+    # otherwise, try to automatically find a configuration file
+    my ($scriptname,$path,$suffix) = fileparse($0);
+    my $cwd     = getcwd();
+    my $bindir  = $FindBin::RealBin;
+    my $userdir = $ENV{HOME};
+    my @candidate_files = (
+													 "$cwd/$scriptname.conf",
+													 "$cwd/etc/$scriptname.conf",
+													 "$cwd/../etc/$scriptname.conf",
+													 "$bindir/$scriptname.conf",
+													 "$bindir/etc/$scriptname.conf",
+													 "$bindir/../etc/$scriptname.conf",
+													 "$userdir/.$scriptname.conf",
+													);
+    my @additional_files = ();
+    for my $candidate_file (@additional_files,@candidate_files) {
+			#printinfo("configsearch",$candidate_file);
+			if (-e $candidate_file && -r _) {
+				$file = $candidate_file;
+				#printinfo("configfound",$candidate_file);
+				last;
+			}
+    }
+  }
+  if (defined $file) {
+    $OPT{configfile} = $file;
+    $conf = new Config::General(
+																-ConfigFile=>$file,
+																-IncludeRelative=>1,
+																-IncludeAgain=>1,
+																-ExtendedAccess=>1,
+																-AllowMultiOptions=>"yes",
+																#-LowerCaseNames=>1,
+																-AutoTrue=>1
+															 );
+    %CONF = $conf->getall;
+  }
+}
+
+sub printdebug {
+	printinfo("debug",@_) if defined $CONF{debug};
+}
+
+sub printinfo {
+	print join(" ",map { defined $_ ? $_ : "_undef_" } @_),"\n";
+}
+
+sub printfinfo {
+	my ($fmt,@args) = @_;
+	@args = map { defined $_ ? $_ : "_undef_" } @args;
+	printf("$fmt\n",@args);
+}
+
+sub printerr {
+	print STDERR join(" ",map { defined $_ ? $_ : "_undef_" } @_),"\n";
+}
+
+sub printdumper {
+	print Dumper(@_);
+}
+
+=pod 
+
+=head1 HISTORY
+
+=over
+
+=item * 30 Nov 2015
+
+Started.
+
+=back 
+
+=head1 AUTHOR
+
+Martin Krzywinski
+
+=head1 CONTACT
+
+Martin Krzywinski
+Genome Sciences Center
+BC Cancer Research Center
+100-570 W 7th Ave
+Vancouver BC V5Z 4S6
+
+mkweb.bcgsc.ca
+martink@bcgsc.ca
+
+=cut