Mercurial > repos > morinlab > oncocircos
comparison bin/parse @ 0:b77ab858eac1 draft
Uploaded
| author | morinlab |
|---|---|
| date | Mon, 12 Sep 2016 16:23:26 -0400 |
| parents | |
| children | d248caf924d3 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:b77ab858eac1 |
|---|---|
| 1 #!/bin/env perl | |
| 2 | |
| 3 =pod | |
| 4 | |
| 5 =head1 NAME | |
| 6 | |
| 7 parse - parse Ryan's MAF and CNV files and generate a summary table of all genes and their mutations and CNV status | |
| 8 | |
| 9 =head1 SYNOPSIS | |
| 10 | |
| 11 # automatically load etc/parse.conf | |
| 12 bin/parse | |
| 13 | |
| 14 # if config file is elsewhere | |
| 15 bin/parse -conf elsewhere/my.conf | |
| 16 | |
| 17 =head1 DESCRIPTION | |
| 18 | |
| 19 See etc/parse.conf for all settings. | |
| 20 | |
| 21 =head1 OPTIONS | |
| 22 | |
| 23 =cut | |
| 24 | |
| 25 use strict; | |
| 26 use warnings FATAL=>"all"; | |
| 27 | |
| 28 use Carp; | |
| 29 use Config::General; | |
| 30 use Cwd qw(getcwd abs_path); | |
| 31 use Data::Dumper; | |
| 32 use File::Basename; | |
| 33 use FindBin; | |
| 34 use Getopt::Long; | |
| 35 use Math::Round qw(round nearest); | |
| 36 use Math::VecStat qw(sum min max average); | |
| 37 use Pod::Usage; | |
| 38 use Time::HiRes qw(gettimeofday tv_interval); | |
| 39 use Statistics::Basic qw(median); | |
| 40 use Storable; | |
| 41 use lib "$FindBin::RealBin"; | |
| 42 use lib "$FindBin::RealBin/../lib"; | |
| 43 use lib "$FindBin::RealBin/lib"; | |
| 44 | |
| 45 our (%OPT,%CONF,$conf); | |
| 46 our @COMMAND_LINE = ("file=s", | |
| 47 "configfile=s", | |
| 48 "help", | |
| 49 "cdump", | |
| 50 "man", | |
| 51 "debug"); | |
| 52 our $VERSION = 0.02; | |
| 53 | |
| 54 # common and custom module imports below | |
| 55 #use Regexp::Common; | |
| 56 #use IO::File; | |
| 57 #use List::Util; | |
| 58 #use List::MoreUtils; | |
| 59 use Set::IntSpan; | |
| 60 #use Statistics::Descriptive; | |
| 61 | |
| 62 # read and parse configuration file | |
| 63 parse_config(); | |
| 64 | |
| 65 sub validateconfiguration { | |
| 66 } | |
| 67 | |
| 68 ################################################################ | |
| 69 # get files | |
| 70 my $sv = read_file($CONF{files}{sv} ,"sv" ); | |
| 71 my $genes = read_file($CONF{files}{mart},"genes"); | |
| 72 my $cnv = read_file($CONF{files}{cnv} ,"cnv" ); | |
| 73 | |
| 74 ################################################################ | |
| 75 # traverse all genes from biomart and determine number | |
| 76 # of SV and CNV events across samples | |
| 77 for my $chr (keys %$genes) { | |
| 78 next if $CONF{filter}{chr} && $chr ne $CONF{filter}{chr}; | |
| 79 printdebug("processing",$chr); | |
| 80 for my $gene (@{$genes->{$chr}}) { | |
| 81 my $id = $gene->{id}; | |
| 82 # filter out by presence and number of SV events | |
| 83 next if $CONF{filter}{sv} && ! $sv->{$id}; | |
| 84 # number of samples that have SV event | |
| 85 my @samples_sv = keys %{$sv->{$id}}; | |
| 86 next if $CONF{filter}{sv_num} && @samples_sv < $CONF{filter}{sv_num}; | |
| 87 | |
| 88 $gene->{affected} = 1; | |
| 89 | |
| 90 # register SV events | |
| 91 my $pos; | |
| 92 for my $sample (@samples_sv) { | |
| 93 for my $sv (sort {$b->{weight} <=> $a->{weight}} @{$sv->{$id}{$sample}}) { | |
| 94 $gene->{sv}{ $sv->{type} }++; | |
| 95 $gene->{sv}{ "*" }++; | |
| 96 $pos->{ $sv->{aa} }++; # register the protein position of the SV | |
| 97 next if $CONF{sv}{top_damage_only}; | |
| 98 } | |
| 99 } | |
| 100 # top SV event | |
| 101 if($gene->{sv}) { | |
| 102 my ($sv_top) = sort {$gene->{sv}{$b} <=> $gene->{sv}{$a}} grep($_ ne "*",keys %{$gene->{sv}}); | |
| 103 $gene->{sv_top}{$sv_top} = $gene->{sv}{$sv_top}; | |
| 104 } | |
| 105 for my $aa (sort {$pos->{$b} <=> $pos->{$a}} keys %$pos) { | |
| 106 #next unless $pos->{$aa} > 1; | |
| 107 my $n = $pos->{$aa}; | |
| 108 $gene->{svaa_top}{$aa} = $n if ! defined $gene->{svaa_top}; | |
| 109 $gene->{svaa}{"*"} += $n; | |
| 110 $gene->{svaa}{$aa} = $n; | |
| 111 } | |
| 112 # register CNV events | |
| 113 my @samples_cnv = keys %$cnv; | |
| 114 # lookup any CNV events -- this can take a bit of time | |
| 115 # we can bin the CNV hash later if needed | |
| 116 for my $sample (@samples_cnv) { | |
| 117 my $chr = $gene->{chr}; | |
| 118 next unless $cnv->{$sample}{$chr}; | |
| 119 for my $cnv (@{$cnv->{$sample}{$chr}}) { | |
| 120 my $int = $cnv->{set}->intersect($gene->{set})->cardinality; | |
| 121 next unless $int; | |
| 122 push @{$gene->{cnv}{$cnv->{category}}{$sample}}, $cnv->{avg}; | |
| 123 } | |
| 124 } | |
| 125 } | |
| 126 } | |
| 127 | |
| 128 ################################################################ | |
| 129 # report | |
| 130 my $i = 0; | |
| 131 for my $chr (1..22,"X","Y") { | |
| 132 next unless $genes->{$chr}; | |
| 133 for my $gene (sort {$a->{start} <=> $b->{start}} @{$genes->{$chr}}) { | |
| 134 next unless $gene->{affected}; | |
| 135 my @report = ($i++,@{$gene}{qw(id name chr start end size)}); | |
| 136 if($gene->{sv}) { | |
| 137 push @report, sprintf("sv_top:%s:%d",keys %{$gene->{sv_top}},values %{$gene->{sv_top}}); | |
| 138 for my $type (sort keys %{$gene->{sv}}) { | |
| 139 push @report, sprintf("sv:%s:%d",$type,$gene->{sv}{$type}); | |
| 140 } | |
| 141 } | |
| 142 if($gene->{svaa}) { | |
| 143 push @report, sprintf("svaa_top:%s:%d",keys %{$gene->{svaa_top}},values %{$gene->{svaa_top}}); | |
| 144 for my $aa (sort {$gene->{svaa}{$b} <=> $gene->{svaa}{$a}} keys %{$gene->{svaa}}) { | |
| 145 push @report, sprintf("svaa:%s:%d",$aa,$gene->{svaa}{$aa}); | |
| 146 } | |
| 147 } | |
| 148 if($gene->{cnv}) { | |
| 149 my $type_count; | |
| 150 my $delins_count; | |
| 151 my $values_by_type; | |
| 152 for my $type (sort keys %{$gene->{cnv}}) { | |
| 153 my @sample_avg; | |
| 154 for my $sample (keys %{$gene->{cnv}{$type}}) { | |
| 155 # number of samples with this kind of CNV event | |
| 156 $type_count->{$type}++; | |
| 157 my @values = @{$gene->{cnv}{$type}{$sample}}; | |
| 158 push @sample_avg, average(@values); | |
| 159 push @{$values_by_type->{$type}}, @values; | |
| 160 } | |
| 161 push @report, sprintf("cnv:%s:%d:%f:%f:%f:%f", | |
| 162 $type, | |
| 163 int(@sample_avg), | |
| 164 scalar(min(@sample_avg)), | |
| 165 average(@sample_avg), | |
| 166 median(@sample_avg)->query, | |
| 167 scalar(max(@sample_avg))); | |
| 168 } | |
| 169 my ($top_type) = sort {$type_count->{$b} <=> $type_count->{$a}} keys %$type_count; | |
| 170 push @report, sprintf("cnv_top:%s:%d:%f:%f:%f:%f", | |
| 171 $top_type, | |
| 172 $type_count->{$top_type}, | |
| 173 scalar(min(@{$values_by_type->{$top_type}})), | |
| 174 average(@{$values_by_type->{$top_type}}), | |
| 175 median(@{$values_by_type->{$top_type}})->query, | |
| 176 scalar(max(@{$values_by_type->{$top_type}}))); | |
| 177 } | |
| 178 printinfo(@report); | |
| 179 } | |
| 180 } | |
| 181 | |
| 182 exit; | |
| 183 | |
| 184 sub read_file { | |
| 185 my ($file,$type) = @_; | |
| 186 open(F,$file) || die "Could not open file [$file] for reading"; | |
| 187 my $data; | |
| 188 | |
| 189 my @fields = grep(/\d/,keys %{$CONF{fields}{$type}}); | |
| 190 my @keys = split(",",$CONF{fields}{$type}{key}); | |
| 191 | |
| 192 my $i; | |
| 193 while(<F>) { | |
| 194 chomp; | |
| 195 next if /^\#/; | |
| 196 my @tok = split "\t"; | |
| 197 my $entry = {class=>$type}; | |
| 198 for my $col (@fields) { | |
| 199 my ($field_name,$field_transform) = split(":",$CONF{fields}{$type}{$col}); | |
| 200 my $value = $tok[$col]; | |
| 201 if($field_transform) { | |
| 202 $value = lc $value if $field_transform =~ /lc/; | |
| 203 } | |
| 204 $entry->{ $field_name } = $value; | |
| 205 } | |
| 206 # skip mutation types that are not important | |
| 207 next if $CONF{sv}{filter} && $type eq "sv" && exists $entry->{type} && ! $CONF{sv}{types}{$entry->{type}}; | |
| 208 next if $CONF{cnv}{filter} && $type eq "cnv" && exists $entry->{category} && ! $CONF{cnv}{types}{$entry->{category}}; | |
| 209 if($type eq "sv") { | |
| 210 $entry->{weight} = $CONF{sv}{types}{$entry->{type}}; | |
| 211 } | |
| 212 $entry->{chr} = "X" if $entry->{chr} eq 23; | |
| 213 $entry->{chr} = "Y" if $entry->{chr} eq 24; | |
| 214 next unless grep($entry->{chr} eq $_, (1..22,"X","Y")); | |
| 215 $entry->{set} = span(@{$entry}{qw(start end)}) if $entry->{start}; | |
| 216 $entry->{size} = $entry->{set}->cardinality; | |
| 217 #printdumper($entry); | |
| 218 $i++; | |
| 219 if(@keys == 1) { | |
| 220 push @{$data->{$entry->{$keys[0]}}}, $entry; | |
| 221 } elsif (@keys == 2) { | |
| 222 push @{$data->{$entry->{$keys[0]}}{$entry->{$keys[1]}}}, $entry; | |
| 223 } | |
| 224 } | |
| 225 printdebug("got",$i,$type); | |
| 226 return $data; | |
| 227 } | |
| 228 | |
| 229 sub list2hash { | |
| 230 my %h; | |
| 231 map {$h{$_}=1} @_; | |
| 232 return %h; | |
| 233 } | |
| 234 | |
| 235 sub span { | |
| 236 my ($x,$y) = @_; | |
| 237 if($x==$y) { | |
| 238 return Set::IntSpan->new($x); | |
| 239 } else { | |
| 240 return Set::IntSpan->new("$x-$y"); | |
| 241 } | |
| 242 } | |
| 243 | |
| 244 sub get_handle { | |
| 245 my $h; | |
| 246 if(my $file = $CONF{file}) { | |
| 247 die "No such file [$file]" unless -e $file; | |
| 248 open(FILE,$file); | |
| 249 $h = \*FILE; | |
| 250 } else { | |
| 251 $h = \*STDIN; | |
| 252 } | |
| 253 return $h; | |
| 254 } | |
| 255 | |
| 256 # HOUSEKEEPING ############################################################### | |
| 257 | |
| 258 sub dump_config { | |
| 259 printdumper(\%OPT,\%CONF); | |
| 260 } | |
| 261 | |
| 262 sub parse_config { | |
| 263 my $dump_debug_level = 3; | |
| 264 GetOptions(\%OPT,@COMMAND_LINE); | |
| 265 pod2usage() if $OPT{help}; | |
| 266 pod2usage(-verbose=>2) if $OPT{man}; | |
| 267 loadconfiguration($OPT{configfile}); | |
| 268 populateconfiguration(); # copy command line options to config hash | |
| 269 validateconfiguration(); | |
| 270 if ($CONF{cdump}) { | |
| 271 $Data::Dumper::Indent = 2; | |
| 272 $Data::Dumper::Quotekeys = 0; | |
| 273 $Data::Dumper::Terse = 0; | |
| 274 $Data::Dumper::Sortkeys = 1; | |
| 275 $Data::Dumper::Varname = "OPT"; | |
| 276 printdumper(\%OPT); | |
| 277 $Data::Dumper::Varname = "CONF"; | |
| 278 printdumper(\%CONF); | |
| 279 exit; | |
| 280 } | |
| 281 } | |
| 282 | |
| 283 sub populateconfiguration { | |
| 284 for my $var (keys %OPT) { | |
| 285 $CONF{$var} = $OPT{$var}; | |
| 286 } | |
| 287 repopulateconfiguration(\%CONF); | |
| 288 } | |
| 289 | |
| 290 sub repopulateconfiguration { | |
| 291 my ($node,$parent_node_name) = shift; | |
| 292 return unless ref($node) eq "HASH"; | |
| 293 for my $key (keys %$node) { | |
| 294 my $value = $node->{$key}; | |
| 295 if (ref($value) eq "HASH") { | |
| 296 repopulateconfiguration($value,$key); | |
| 297 } elsif (ref($value) eq "ARRAY") { | |
| 298 for my $item (@$value) { | |
| 299 repopulateconfiguration($item,$key); | |
| 300 } | |
| 301 } elsif (defined $value) { | |
| 302 my $new_value = parse_field($value,$key,$parent_node_name,$node); | |
| 303 $node->{$key} = $new_value; | |
| 304 } | |
| 305 } | |
| 306 } | |
| 307 | |
| 308 sub parse_field { | |
| 309 my ($str,$key,$parent_node_name,$node) = @_; | |
| 310 # replace configuration field | |
| 311 # conf(LEAF,LEAF,...) | |
| 312 while ( $str =~ /(conf\(\s*(.+?)\s*\))/g ) { | |
| 313 my ($template,$leaf) = ($1,$2); | |
| 314 if (defined $template && defined $leaf) { | |
| 315 my @leaf = split(/\s*,\s*/,$leaf); | |
| 316 my $new_template; | |
| 317 if (@leaf == 2 && $leaf[0] eq ".") { | |
| 318 $new_template = $node->{$leaf[1]}; | |
| 319 } else { | |
| 320 $new_template = fetch_conf(@leaf); | |
| 321 } | |
| 322 $str =~ s/\Q$template\E/$new_template/g; | |
| 323 } | |
| 324 } | |
| 325 if ($str =~ /\s*eval\s*\(\s*(.+)\s*\)/) { | |
| 326 my $fn = $1; | |
| 327 $str = eval $fn; | |
| 328 if ($@) { | |
| 329 die "could not parse configuration parameter [$@]"; | |
| 330 } | |
| 331 } | |
| 332 return $str; | |
| 333 } | |
| 334 | |
| 335 sub fetch_configuration { | |
| 336 my @config_path = @_; | |
| 337 my $node = \%CONF; | |
| 338 if(! @config_path) { | |
| 339 return \%CONF; | |
| 340 } | |
| 341 for my $path_element (@config_path) { | |
| 342 if (! exists $node->{$path_element}) { | |
| 343 return undef; | |
| 344 } else { | |
| 345 $node = $node->{$path_element}; | |
| 346 } | |
| 347 } | |
| 348 return $node; | |
| 349 } | |
| 350 | |
| 351 sub fetch_conf { | |
| 352 return fetch_configuration(@_); | |
| 353 } | |
| 354 | |
| 355 sub loadconfiguration { | |
| 356 my $file = shift; | |
| 357 if (defined $file) { | |
| 358 if (-e $file && -r _) { | |
| 359 # provided configuration file exists and can be read | |
| 360 $file = abs_path($file); | |
| 361 } else { | |
| 362 confess "The configuration file [$file] passed with -configfile does not exist or cannot be read."; | |
| 363 } | |
| 364 } else { | |
| 365 # otherwise, try to automatically find a configuration file | |
| 366 my ($scriptname,$path,$suffix) = fileparse($0); | |
| 367 my $cwd = getcwd(); | |
| 368 my $bindir = $FindBin::RealBin; | |
| 369 my $userdir = $ENV{HOME}; | |
| 370 my @candidate_files = ( | |
| 371 "$cwd/$scriptname.conf", | |
| 372 "$cwd/etc/$scriptname.conf", | |
| 373 "$cwd/../etc/$scriptname.conf", | |
| 374 "$bindir/$scriptname.conf", | |
| 375 "$bindir/etc/$scriptname.conf", | |
| 376 "$bindir/../etc/$scriptname.conf", | |
| 377 "$userdir/.$scriptname.conf", | |
| 378 ); | |
| 379 my @additional_files = (); | |
| 380 for my $candidate_file (@additional_files,@candidate_files) { | |
| 381 #printinfo("configsearch",$candidate_file); | |
| 382 if (-e $candidate_file && -r _) { | |
| 383 $file = $candidate_file; | |
| 384 #printinfo("configfound",$candidate_file); | |
| 385 last; | |
| 386 } | |
| 387 } | |
| 388 } | |
| 389 if (defined $file) { | |
| 390 $OPT{configfile} = $file; | |
| 391 $conf = new Config::General( | |
| 392 -ConfigFile=>$file, | |
| 393 -IncludeRelative=>1, | |
| 394 -IncludeAgain=>1, | |
| 395 -ExtendedAccess=>1, | |
| 396 -AllowMultiOptions=>"yes", | |
| 397 #-LowerCaseNames=>1, | |
| 398 -AutoTrue=>1 | |
| 399 ); | |
| 400 %CONF = $conf->getall; | |
| 401 } | |
| 402 } | |
| 403 | |
| 404 sub printdebug { | |
| 405 printinfo("debug",@_) if defined $CONF{debug}; | |
| 406 } | |
| 407 | |
| 408 sub printinfo { | |
| 409 print join(" ",map { defined $_ ? $_ : "_undef_" } @_),"\n"; | |
| 410 } | |
| 411 | |
| 412 sub printfinfo { | |
| 413 my ($fmt,@args) = @_; | |
| 414 @args = map { defined $_ ? $_ : "_undef_" } @args; | |
| 415 printf("$fmt\n",@args); | |
| 416 } | |
| 417 | |
| 418 sub printerr { | |
| 419 print STDERR join(" ",map { defined $_ ? $_ : "_undef_" } @_),"\n"; | |
| 420 } | |
| 421 | |
| 422 sub printdumper { | |
| 423 print Dumper(@_); | |
| 424 } | |
| 425 | |
| 426 =pod | |
| 427 | |
| 428 =head1 HISTORY | |
| 429 | |
| 430 =over | |
| 431 | |
| 432 =item * 30 Nov 2015 | |
| 433 | |
| 434 Started. | |
| 435 | |
| 436 =back | |
| 437 | |
| 438 =head1 AUTHOR | |
| 439 | |
| 440 Martin Krzywinski | |
| 441 | |
| 442 =head1 CONTACT | |
| 443 | |
| 444 Martin Krzywinski | |
| 445 Genome Sciences Center | |
| 446 BC Cancer Research Center | |
| 447 100-570 W 7th Ave | |
| 448 Vancouver BC V5Z 4S6 | |
| 449 | |
| 450 mkweb.bcgsc.ca | |
| 451 martink@bcgsc.ca | |
| 452 | |
| 453 =cut |
