view bin/make.circos.data @ 10:417f984cd2e2 draft default tip

Uploaded
author morinlab
date Wed, 30 Nov 2016 13:41:12 -0500
parents d1917662231c
children
line wrap: on
line source

#!/bin/env perl

=pod

=head1 NAME

make.circos.data - create Circos data files from summary tables of SV/CNV mutations

=head1 SYNOPSIS

  bin/parse > table.txt

  # uses same config file as parse
  cat table.txt | bin/make.circos.data

=head1 DESCRIPTION

=head1 OPTIONS

=cut

use strict;
use warnings FATAL=>"all";

use Carp;
use Config::General;
use Cwd qw(getcwd abs_path);
use Data::Dumper;
use File::Basename;
use FindBin;
use Getopt::Long;
use Math::Round qw(round nearest);
use Math::VecStat qw(sum min max average);
use Pod::Usage;
use Time::HiRes qw(gettimeofday tv_interval);
use Storable;
use lib "$FindBin::RealBin";
use lib "$FindBin::RealBin/../lib";
use lib "$FindBin::RealBin/lib";

our (%OPT,%CONF,$conf);
our @COMMAND_LINE = ("file=s",
										 "configfile=s",
										 "help",
										 "cdump",
										 "man",
										 "debug");
our $VERSION = 0.02;

# common and custom module imports below
#use Regexp::Common;
#use IO::File;
#use List::Util;
#use List::MoreUtils;
use Set::IntSpan;
#use Statistics::Descriptive;

# read and parse configuration file
parse_config();



my %affected_genes;
open(GENES, $CONF{files}{genes}) or die "#! $CONF{files}{genes}\n";
while(<GENES>){
    chomp;
    $affected_genes{$_}++;
}
close GENES;
my %mask_genes;
open(GENES, $CONF{files}{mask}) or die "#! $CONF{files}{mask}\n";
while(<GENES>){
    chomp;
    $mask_genes{$_}++;
}
close GENES;

sub validateconfiguration {

}

################################################################
# get files
my $table = read_file();

my $path = "$CONF{files}{root}/$CONF{files}{circos}";
# karyotype
open(F,">$path/karyotype.txt");
for my $chr (1..22,"X","Y") {
	my $n = grep($_->{chr} eq $chr,@$table);
	next unless $n;
	printf F ("chr - hs%s %s 0 %d chr%s\n",$chr,$chr,$n,lc $chr);
}
close(F);

# large scale CNV
open(FCNVLG,">$path/cnv.tiles.txt");
# collect CNV region idx
my $cnvlg;
for my $gene (@$table) {
	next unless $gene->{cnvlg};
	for my $idx (keys %{$gene->{cnvlg}}) {
		$cnvlg->{$idx} ||= { chr  => $gene->{chr},
												 type => $gene->{cnvlg}{$idx}{type} };
		push @{$cnvlg->{$idx}{pos}}, $gene->{pos};
	}
}
my $cnvlg_seen;
for my $idx (sort {$a <=> $b} keys %{$cnvlg}) {
	my $cnv = $cnvlg->{$idx};
	my $key = join(",",
								 scalar(min @{$cnv->{pos}}),
								 scalar(max @{$cnv->{pos}}),
								 $cnv->{type});
	next if $cnvlg_seen->{$key}++;
	my $start = scalar(min @{$cnv->{pos}});
	my $end   = 1 + scalar(max @{$cnv->{pos}});
	printf FCNVLG ("hs%s %d %d %s idx=%d\n",
								 $cnv->{chr},
								 $start,
								 $end,
								 $cnv->{type},
								 $idx);
}
close(FCNVLG);

# number of CNV
open(F,     ">$path/mutations.txt");
open(FSV,   ">$path/mutations.stacked.sv.txt");
open(FCNV,  ">$path/mutations.stacked.cnv.txt");
for my $gene (@$table) {
    my $name = $gene->{name};
    if(defined $mask_genes{$name}){
	next;
    }
	my @sv;
	my @sv_vals;
	my @cnv;
	my @cnv_vals;
	# number of samples for each SV type
	for my $type (sort { $CONF{sv}{types}{$b} <=> $CONF{sv}{types}{$a}} keys %{$CONF{sv}{types}}) {
		push @sv, sprintf("sv_%s=%d",lc $type,$gene->{sv}{$type}||0);
		push @sv_vals, $gene->{sv}{$type}||0;
	}
	# number of samples for each CNV type
	for my $type (sort { $CONF{cnv}{types}{$b} <=> $CONF{cnv}{types}{$a}} keys %{$CONF{cnv}{types}}) {
		next unless $CONF{cnv}{types}{$type};
		push @cnv, sprintf("cnv_%s=%d",lc $type,$gene->{cnv}{$type}{n}||0);
		push @cnv_vals, $gene->{cnv}{$type}{n}||0;
	}
	my $cnv_plus  = ($gene->{cnv}{amp}{n} ||0) + ($gene->{cnv}{gain}{n} ||0);
	my $cnv_minus = ($gene->{cnv}{homd}{n}||0) + ($gene->{cnv}{hetd}{n} ||0);
	my $label_flag = "label_gene=0";
	

	if(defined $affected_genes{$name}){
	    $label_flag = "label_gene=1";
	    print "will label $name\n";
	}
	unless(defined $gene->{sv_top}){
	    $gene->{sv_top} = {"missense_mutation"=>0};
	    $gene->{sv} = {"missense_mutation"=>0};
	    $gene->{svaa_top} = {"*"=>0};
	}
	printf F ("hs%s %d %d %s size=%d,sv_top_type=%s,sv_top_n=%d,sv_tot=%d,svaa_max_pos=%s,svaa_max_n=%d,cnv_top_type=%s,cnv_top_n=%d,cnv_top_avg=%f,cnv_top_med=%f,cnv_plus=%d,cnv_minus=%d,%s,%s",
						$gene->{chr},
						$gene->{pos},
						$gene->{pos}+1,
						$gene->{name},
						$gene->{size},
						keys %{$gene->{sv_top}},
						values %{$gene->{sv_top}},
						$gene->{sv}{"*"} || 0,
						(keys %{$gene->{svaa_top}})||0,
						(values %{$gene->{svaa_top}})||0,
						$gene->{cnv_top}{class} || "-",
						$gene->{cnv_top}{n}     || 0,
						$gene->{cnv_top}{avg}   || 0,
						$gene->{cnv_top}{med}   || 0,
						$cnv_plus||0,
						$cnv_minus||0,
						join(",",@sv),
		  join(",",@cnv));
	print F ",$label_flag\n";
	# stacked histograms of number of samples with each SV type
	printf FSV ("hs%s %d %d %s name=%s,sv_top_type=%s,sv_top_n=%d\n",
						$gene->{chr},
						$gene->{pos},
						$gene->{pos}+1,
						join(",",@sv_vals),
						$gene->{name},
						keys %{$gene->{sv_top}},
						values %{$gene->{sv_top}},
						 );
	printf FCNV ("hs%s %d %d %s name=%s,cnv_top_type=%s,cnv_top_n=%d,cnv_top_avg=%f,cnv_top_med=%f\n",
						$gene->{chr},
						$gene->{pos},
						$gene->{pos}+1,
						join(",",@cnv_vals),
						$gene->{name},
						$gene->{cnv_top}{class} || "-",
						$gene->{cnv_top}{n}     || 0,
						$gene->{cnv_top}{avg}   || 0,
						$gene->{cnv_top}{med}   || 0,
							 
							);
}
close(F);
close(FSV);
close(FCNV);

sub read_file {
	my $fh = get_handle();
	my @data;
	my $chrpos;
	while(<$fh>) {
	    
		chomp;
		next if /^\#/;
		
		my @tok = split;
		my $gene = list2hash([qw(i id name chr start end size)],
												 [splice(@tok,0,7)]);
		$gene->{pos} = $chrpos->{ $gene->{chr} }++;
		
		# remaining tokens
		for my $tok (@tok) {
			my @subtok = split(":",$tok);
			my $event  = lc shift @subtok;
			my $type   = lc shift @subtok;
			if($event =~ /sv/) {
				$gene->{$event}{$type} = shift @subtok;
			} elsif($event =~ /cnvlg/) {
				my $h = { idx => $type,
									type => shift @subtok };
				$gene->{$event}{$type} = $h;
			} elsif($event =~ /cnv/) {
				my $h = { class=> $type,
									n=>     shift @subtok,
									min=>   shift @subtok,
									avg=>   shift @subtok,
									med=>   shift @subtok,
									max=>   shift @subtok};
				if($event =~ /top/) {
					$gene->{$event} = $h;
				} else {
					$gene->{$event}{$type} = $h;
				}
			}
		}
		#printdumper($gene);
		push @data, $gene;
	}
	
	return \@data;
}

sub list2hash {
	my ($names,$list) = @_;
	my $h;
	my $i = 0;
	for my $name (@$names) {
		$h->{$name} = $list->[$i++];
	}
	return $h;
}

sub get_handle {
	my $h;
	if(my $file = $CONF{file}) {
		die "No such file [$file]" unless -e $file;
		open(FILE,$file);
		$h = \*FILE;
	} else {
		$h = \*STDIN;
	}
	return $h;
}

# HOUSEKEEPING ###############################################################

sub dump_config {
	printdumper(\%OPT,\%CONF);
}

sub parse_config {
  my $dump_debug_level = 3;
  GetOptions(\%OPT,@COMMAND_LINE);
  pod2usage() if $OPT{help};
  pod2usage(-verbose=>2) if $OPT{man};
  loadconfiguration($OPT{configfile});
  populateconfiguration();	# copy command line options to config hash
  validateconfiguration(); 
  if ($CONF{cdump}) {
    $Data::Dumper::Indent    = 2;
    $Data::Dumper::Quotekeys = 0;
    $Data::Dumper::Terse     = 0;
    $Data::Dumper::Sortkeys  = 1;
    $Data::Dumper::Varname = "OPT";
    printdumper(\%OPT);
    $Data::Dumper::Varname = "CONF";
    printdumper(\%CONF);
    exit;
  }
}

sub populateconfiguration {
  for my $var (keys %OPT) {
    $CONF{$var} = $OPT{$var};
  }
  repopulateconfiguration(\%CONF);
}

sub repopulateconfiguration {
  my ($node,$parent_node_name) = shift;
  return unless ref($node) eq "HASH";
  for my $key (keys %$node) {
		my $value = $node->{$key};
		if (ref($value) eq "HASH") {
			repopulateconfiguration($value,$key);
		} elsif (ref($value) eq "ARRAY") {
			for my $item (@$value) {
				repopulateconfiguration($item,$key);
			}
		} elsif (defined $value) {
			my $new_value = parse_field($value,$key,$parent_node_name,$node);
			$node->{$key} = $new_value;
		}
	}
}

sub parse_field {
	my ($str,$key,$parent_node_name,$node) = @_;
	# replace configuration field
	# conf(LEAF,LEAF,...)
	while ( $str =~ /(conf\(\s*(.+?)\s*\))/g ) {
		my ($template,$leaf) = ($1,$2);
		if (defined $template && defined $leaf) {
			my @leaf         = split(/\s*,\s*/,$leaf);
			my $new_template;
			if (@leaf == 2 && $leaf[0] eq ".") {
				$new_template = $node->{$leaf[1]};
			} else {
				$new_template = fetch_conf(@leaf);
			}
			$str =~ s/\Q$template\E/$new_template/g;
		}
	}
	if ($str =~ /\s*eval\s*\(\s*(.+)\s*\)/) {
		my $fn = $1;
		$str = eval $fn;
		if ($@) {
			die "could not parse configuration parameter [$@]";
		}
	}
	return $str;
}

sub fetch_configuration {
	my @config_path = @_;
	my $node        = \%CONF;
	if(! @config_path) {
		return \%CONF;
	}
	for my $path_element (@config_path) {
		if (! exists $node->{$path_element}) {
	    return undef;
		} else {
	    $node = $node->{$path_element};
		}
	}
	return $node;
}

sub fetch_conf {
	return fetch_configuration(@_);
}

sub loadconfiguration {
  my $file = shift;
  if (defined $file) {
    if (-e $file && -r _) {
      # provided configuration file exists and can be read
      $file = abs_path($file);
    } else {
      confess "The configuration file [$file] passed with -configfile does not exist or cannot be read.";
    }
  } else {
    # otherwise, try to automatically find a configuration file
    my ($scriptname,$path,$suffix) = fileparse($0);
    my $cwd     = getcwd();
    my $bindir  = $FindBin::RealBin;
    my $userdir = $ENV{HOME};
    my @candidate_files = (
													 "$cwd/$scriptname.conf",
													 "$cwd/etc/$scriptname.conf",
													 "$cwd/../etc/$scriptname.conf",
													 "$bindir/$scriptname.conf",
													 "$bindir/etc/$scriptname.conf",
													 "$bindir/../etc/$scriptname.conf",
													 "$userdir/.$scriptname.conf",
													);
    my @additional_files = ();
    for my $candidate_file (@additional_files,@candidate_files) {
			#printinfo("configsearch",$candidate_file);
			if (-e $candidate_file && -r _) {
				$file = $candidate_file;
				#printinfo("configfound",$candidate_file);
				last;
			}
    }
  }
  if (defined $file) {
    $OPT{configfile} = $file;
    $conf = new Config::General(
																-ConfigFile=>$file,
																-IncludeRelative=>1,
																-IncludeAgain=>1,
																-ExtendedAccess=>1,
																-AllowMultiOptions=>"yes",
																#-LowerCaseNames=>1,
																-AutoTrue=>1
															 );
    %CONF = $conf->getall;
  }
}

sub printdebug {
	printinfo("debug",@_) if defined $CONF{debug};
}

sub printinfo {
	print join(" ",map { defined $_ ? $_ : "_undef_" } @_),"\n";
}

sub printfinfo {
	my ($fmt,@args) = @_;
	@args = map { defined $_ ? $_ : "_undef_" } @args;
	printf("$fmt\n",@args);
}

sub printerr {
	print STDERR join(" ",map { defined $_ ? $_ : "_undef_" } @_),"\n";
}

sub printdumper {
	print Dumper(@_);
}

=pod 

=head1 HISTORY

=over

=item * 30 Nov 2015

Started.

=back 

=head1 AUTHOR

Martin Krzywinski

=head1 CONTACT

Martin Krzywinski
Genome Sciences Center
BC Cancer Research Center
100-570 W 7th Ave
Vancouver BC V5Z 4S6

mkweb.bcgsc.ca
martink@bcgsc.ca

=cut