|
0
|
1 #!/bin/env perl
|
|
|
2
|
|
|
3 =pod
|
|
|
4
|
|
|
5 =head1 NAME
|
|
|
6
|
|
|
7 parse - parse Ryan's MAF and CNV files and generate a summary table of all genes and their mutations and CNV status
|
|
|
8
|
|
|
9 =head1 SYNOPSIS
|
|
|
10
|
|
|
11 # automatically load etc/parse.conf
|
|
|
12 bin/parse
|
|
|
13
|
|
|
14 # if config file is elsewhere
|
|
|
15 bin/parse -conf elsewhere/my.conf
|
|
|
16
|
|
|
17 =head1 DESCRIPTION
|
|
|
18
|
|
|
19 See etc/parse.conf for all settings.
|
|
|
20
|
|
|
21 =head1 OPTIONS
|
|
|
22
|
|
|
23 =cut
|
|
|
24
|
|
|
25 use strict;
|
|
|
26 use warnings FATAL=>"all";
|
|
|
27
|
|
|
28 use Carp;
|
|
|
29 use Config::General;
|
|
|
30 use Cwd qw(getcwd abs_path);
|
|
|
31 use Data::Dumper;
|
|
|
32 use File::Basename;
|
|
|
33 use FindBin;
|
|
|
34 use Getopt::Long;
|
|
|
35 use Math::Round qw(round nearest);
|
|
|
36 use Math::VecStat qw(sum min max average);
|
|
|
37 use Pod::Usage;
|
|
|
38 use Time::HiRes qw(gettimeofday tv_interval);
|
|
|
39 use Statistics::Basic qw(median);
|
|
|
40 use Storable;
|
|
|
41 use lib "$FindBin::RealBin";
|
|
|
42 use lib "$FindBin::RealBin/../lib";
|
|
|
43 use lib "$FindBin::RealBin/lib";
|
|
|
44
|
|
|
45 our (%OPT,%CONF,$conf);
|
|
|
46 our @COMMAND_LINE = ("file=s",
|
|
|
47 "configfile=s",
|
|
|
48 "help",
|
|
|
49 "cdump",
|
|
|
50 "man",
|
|
|
51 "debug");
|
|
|
52 our $VERSION = 0.02;
|
|
|
53
|
|
|
54 # common and custom module imports below
|
|
|
55 #use Regexp::Common;
|
|
|
56 #use IO::File;
|
|
|
57 #use List::Util;
|
|
|
58 #use List::MoreUtils;
|
|
|
59 use Set::IntSpan;
|
|
|
60 #use Statistics::Descriptive;
|
|
|
61
|
|
|
62 # read and parse configuration file
|
|
|
63 parse_config();
|
|
|
64
|
|
|
65 sub validateconfiguration {
|
|
|
66 }
|
|
|
67
|
|
|
68 ################################################################
|
|
|
69 # get files
|
|
|
70 my $sv = read_file($CONF{files}{sv} ,"sv" );
|
|
|
71 my $genes = read_file($CONF{files}{mart},"genes");
|
|
|
72 my $cnv = read_file($CONF{files}{cnv} ,"cnv" );
|
|
|
73
|
|
|
74 ################################################################
|
|
|
75 # traverse all genes from biomart and determine number
|
|
|
76 # of SV and CNV events across samples
|
|
|
77 for my $chr (keys %$genes) {
|
|
|
78 next if $CONF{filter}{chr} && $chr ne $CONF{filter}{chr};
|
|
|
79 printdebug("processing",$chr);
|
|
|
80 for my $gene (@{$genes->{$chr}}) {
|
|
|
81 my $id = $gene->{id};
|
|
|
82 # filter out by presence and number of SV events
|
|
|
83 next if $CONF{filter}{sv} && ! $sv->{$id};
|
|
|
84 # number of samples that have SV event
|
|
|
85 my @samples_sv = keys %{$sv->{$id}};
|
|
|
86 next if $CONF{filter}{sv_num} && @samples_sv < $CONF{filter}{sv_num};
|
|
|
87
|
|
|
88 $gene->{affected} = 1;
|
|
|
89
|
|
|
90 # register SV events
|
|
|
91 my $pos;
|
|
|
92 for my $sample (@samples_sv) {
|
|
|
93 for my $sv (sort {$b->{weight} <=> $a->{weight}} @{$sv->{$id}{$sample}}) {
|
|
|
94 $gene->{sv}{ $sv->{type} }++;
|
|
|
95 $gene->{sv}{ "*" }++;
|
|
|
96 $pos->{ $sv->{aa} }++; # register the protein position of the SV
|
|
|
97 next if $CONF{sv}{top_damage_only};
|
|
|
98 }
|
|
|
99 }
|
|
|
100 # top SV event
|
|
|
101 if($gene->{sv}) {
|
|
|
102 my ($sv_top) = sort {$gene->{sv}{$b} <=> $gene->{sv}{$a}} grep($_ ne "*",keys %{$gene->{sv}});
|
|
|
103 $gene->{sv_top}{$sv_top} = $gene->{sv}{$sv_top};
|
|
|
104 }
|
|
|
105 for my $aa (sort {$pos->{$b} <=> $pos->{$a}} keys %$pos) {
|
|
|
106 #next unless $pos->{$aa} > 1;
|
|
|
107 my $n = $pos->{$aa};
|
|
|
108 $gene->{svaa_top}{$aa} = $n if ! defined $gene->{svaa_top};
|
|
|
109 $gene->{svaa}{"*"} += $n;
|
|
|
110 $gene->{svaa}{$aa} = $n;
|
|
|
111 }
|
|
|
112 # register CNV events
|
|
|
113 my @samples_cnv = keys %$cnv;
|
|
|
114 # lookup any CNV events -- this can take a bit of time
|
|
|
115 # we can bin the CNV hash later if needed
|
|
|
116 for my $sample (@samples_cnv) {
|
|
|
117 my $chr = $gene->{chr};
|
|
|
118 next unless $cnv->{$sample}{$chr};
|
|
|
119 for my $cnv (@{$cnv->{$sample}{$chr}}) {
|
|
|
120 my $int = $cnv->{set}->intersect($gene->{set})->cardinality;
|
|
|
121 next unless $int;
|
|
|
122 push @{$gene->{cnv}{$cnv->{category}}{$sample}}, $cnv->{avg};
|
|
|
123 }
|
|
|
124 }
|
|
|
125 }
|
|
|
126 }
|
|
|
127
|
|
|
128 ################################################################
|
|
|
129 # report
|
|
|
130 my $i = 0;
|
|
|
131 for my $chr (1..22,"X","Y") {
|
|
|
132 next unless $genes->{$chr};
|
|
|
133 for my $gene (sort {$a->{start} <=> $b->{start}} @{$genes->{$chr}}) {
|
|
|
134 next unless $gene->{affected};
|
|
|
135 my @report = ($i++,@{$gene}{qw(id name chr start end size)});
|
|
|
136 if($gene->{sv}) {
|
|
|
137 push @report, sprintf("sv_top:%s:%d",keys %{$gene->{sv_top}},values %{$gene->{sv_top}});
|
|
|
138 for my $type (sort keys %{$gene->{sv}}) {
|
|
|
139 push @report, sprintf("sv:%s:%d",$type,$gene->{sv}{$type});
|
|
|
140 }
|
|
|
141 }
|
|
|
142 if($gene->{svaa}) {
|
|
|
143 push @report, sprintf("svaa_top:%s:%d",keys %{$gene->{svaa_top}},values %{$gene->{svaa_top}});
|
|
|
144 for my $aa (sort {$gene->{svaa}{$b} <=> $gene->{svaa}{$a}} keys %{$gene->{svaa}}) {
|
|
|
145 push @report, sprintf("svaa:%s:%d",$aa,$gene->{svaa}{$aa});
|
|
|
146 }
|
|
|
147 }
|
|
|
148 if($gene->{cnv}) {
|
|
|
149 my $type_count;
|
|
|
150 my $delins_count;
|
|
|
151 my $values_by_type;
|
|
|
152 for my $type (sort keys %{$gene->{cnv}}) {
|
|
|
153 my @sample_avg;
|
|
|
154 for my $sample (keys %{$gene->{cnv}{$type}}) {
|
|
|
155 # number of samples with this kind of CNV event
|
|
|
156 $type_count->{$type}++;
|
|
|
157 my @values = @{$gene->{cnv}{$type}{$sample}};
|
|
|
158 push @sample_avg, average(@values);
|
|
|
159 push @{$values_by_type->{$type}}, @values;
|
|
|
160 }
|
|
|
161 push @report, sprintf("cnv:%s:%d:%f:%f:%f:%f",
|
|
|
162 $type,
|
|
|
163 int(@sample_avg),
|
|
|
164 scalar(min(@sample_avg)),
|
|
|
165 average(@sample_avg),
|
|
|
166 median(@sample_avg)->query,
|
|
|
167 scalar(max(@sample_avg)));
|
|
|
168 }
|
|
|
169 my ($top_type) = sort {$type_count->{$b} <=> $type_count->{$a}} keys %$type_count;
|
|
|
170 push @report, sprintf("cnv_top:%s:%d:%f:%f:%f:%f",
|
|
|
171 $top_type,
|
|
|
172 $type_count->{$top_type},
|
|
|
173 scalar(min(@{$values_by_type->{$top_type}})),
|
|
|
174 average(@{$values_by_type->{$top_type}}),
|
|
|
175 median(@{$values_by_type->{$top_type}})->query,
|
|
|
176 scalar(max(@{$values_by_type->{$top_type}})));
|
|
|
177 }
|
|
|
178 printinfo(@report);
|
|
|
179 }
|
|
|
180 }
|
|
|
181
|
|
|
182 exit;
|
|
|
183
|
|
|
184 sub read_file {
|
|
|
185 my ($file,$type) = @_;
|
|
|
186 open(F,$file) || die "Could not open file [$file] for reading";
|
|
|
187 my $data;
|
|
|
188
|
|
|
189 my @fields = grep(/\d/,keys %{$CONF{fields}{$type}});
|
|
|
190 my @keys = split(",",$CONF{fields}{$type}{key});
|
|
|
191
|
|
|
192 my $i;
|
|
|
193 while(<F>) {
|
|
|
194 chomp;
|
|
|
195 next if /^\#/;
|
|
|
196 my @tok = split "\t";
|
|
|
197 my $entry = {class=>$type};
|
|
|
198 for my $col (@fields) {
|
|
|
199 my ($field_name,$field_transform) = split(":",$CONF{fields}{$type}{$col});
|
|
|
200 my $value = $tok[$col];
|
|
|
201 if($field_transform) {
|
|
|
202 $value = lc $value if $field_transform =~ /lc/;
|
|
|
203 }
|
|
|
204 $entry->{ $field_name } = $value;
|
|
|
205 }
|
|
|
206 # skip mutation types that are not important
|
|
|
207 next if $CONF{sv}{filter} && $type eq "sv" && exists $entry->{type} && ! $CONF{sv}{types}{$entry->{type}};
|
|
|
208 next if $CONF{cnv}{filter} && $type eq "cnv" && exists $entry->{category} && ! $CONF{cnv}{types}{$entry->{category}};
|
|
|
209 if($type eq "sv") {
|
|
|
210 $entry->{weight} = $CONF{sv}{types}{$entry->{type}};
|
|
|
211 }
|
|
|
212 $entry->{chr} = "X" if $entry->{chr} eq 23;
|
|
|
213 $entry->{chr} = "Y" if $entry->{chr} eq 24;
|
|
|
214 next unless grep($entry->{chr} eq $_, (1..22,"X","Y"));
|
|
|
215 $entry->{set} = span(@{$entry}{qw(start end)}) if $entry->{start};
|
|
|
216 $entry->{size} = $entry->{set}->cardinality;
|
|
|
217 #printdumper($entry);
|
|
|
218 $i++;
|
|
|
219 if(@keys == 1) {
|
|
|
220 push @{$data->{$entry->{$keys[0]}}}, $entry;
|
|
|
221 } elsif (@keys == 2) {
|
|
|
222 push @{$data->{$entry->{$keys[0]}}{$entry->{$keys[1]}}}, $entry;
|
|
|
223 }
|
|
|
224 }
|
|
|
225 printdebug("got",$i,$type);
|
|
|
226 return $data;
|
|
|
227 }
|
|
|
228
|
|
|
229 sub list2hash {
|
|
|
230 my %h;
|
|
|
231 map {$h{$_}=1} @_;
|
|
|
232 return %h;
|
|
|
233 }
|
|
|
234
|
|
|
235 sub span {
|
|
|
236 my ($x,$y) = @_;
|
|
|
237 if($x==$y) {
|
|
|
238 return Set::IntSpan->new($x);
|
|
|
239 } else {
|
|
|
240 return Set::IntSpan->new("$x-$y");
|
|
|
241 }
|
|
|
242 }
|
|
|
243
|
|
|
244 sub get_handle {
|
|
|
245 my $h;
|
|
|
246 if(my $file = $CONF{file}) {
|
|
|
247 die "No such file [$file]" unless -e $file;
|
|
|
248 open(FILE,$file);
|
|
|
249 $h = \*FILE;
|
|
|
250 } else {
|
|
|
251 $h = \*STDIN;
|
|
|
252 }
|
|
|
253 return $h;
|
|
|
254 }
|
|
|
255
|
|
|
256 # HOUSEKEEPING ###############################################################
|
|
|
257
|
|
|
258 sub dump_config {
|
|
|
259 printdumper(\%OPT,\%CONF);
|
|
|
260 }
|
|
|
261
|
|
|
262 sub parse_config {
|
|
|
263 my $dump_debug_level = 3;
|
|
|
264 GetOptions(\%OPT,@COMMAND_LINE);
|
|
|
265 pod2usage() if $OPT{help};
|
|
|
266 pod2usage(-verbose=>2) if $OPT{man};
|
|
|
267 loadconfiguration($OPT{configfile});
|
|
|
268 populateconfiguration(); # copy command line options to config hash
|
|
|
269 validateconfiguration();
|
|
|
270 if ($CONF{cdump}) {
|
|
|
271 $Data::Dumper::Indent = 2;
|
|
|
272 $Data::Dumper::Quotekeys = 0;
|
|
|
273 $Data::Dumper::Terse = 0;
|
|
|
274 $Data::Dumper::Sortkeys = 1;
|
|
|
275 $Data::Dumper::Varname = "OPT";
|
|
|
276 printdumper(\%OPT);
|
|
|
277 $Data::Dumper::Varname = "CONF";
|
|
|
278 printdumper(\%CONF);
|
|
|
279 exit;
|
|
|
280 }
|
|
|
281 }
|
|
|
282
|
|
|
283 sub populateconfiguration {
|
|
|
284 for my $var (keys %OPT) {
|
|
|
285 $CONF{$var} = $OPT{$var};
|
|
|
286 }
|
|
|
287 repopulateconfiguration(\%CONF);
|
|
|
288 }
|
|
|
289
|
|
|
290 sub repopulateconfiguration {
|
|
|
291 my ($node,$parent_node_name) = shift;
|
|
|
292 return unless ref($node) eq "HASH";
|
|
|
293 for my $key (keys %$node) {
|
|
|
294 my $value = $node->{$key};
|
|
|
295 if (ref($value) eq "HASH") {
|
|
|
296 repopulateconfiguration($value,$key);
|
|
|
297 } elsif (ref($value) eq "ARRAY") {
|
|
|
298 for my $item (@$value) {
|
|
|
299 repopulateconfiguration($item,$key);
|
|
|
300 }
|
|
|
301 } elsif (defined $value) {
|
|
|
302 my $new_value = parse_field($value,$key,$parent_node_name,$node);
|
|
|
303 $node->{$key} = $new_value;
|
|
|
304 }
|
|
|
305 }
|
|
|
306 }
|
|
|
307
|
|
|
308 sub parse_field {
|
|
|
309 my ($str,$key,$parent_node_name,$node) = @_;
|
|
|
310 # replace configuration field
|
|
|
311 # conf(LEAF,LEAF,...)
|
|
|
312 while ( $str =~ /(conf\(\s*(.+?)\s*\))/g ) {
|
|
|
313 my ($template,$leaf) = ($1,$2);
|
|
|
314 if (defined $template && defined $leaf) {
|
|
|
315 my @leaf = split(/\s*,\s*/,$leaf);
|
|
|
316 my $new_template;
|
|
|
317 if (@leaf == 2 && $leaf[0] eq ".") {
|
|
|
318 $new_template = $node->{$leaf[1]};
|
|
|
319 } else {
|
|
|
320 $new_template = fetch_conf(@leaf);
|
|
|
321 }
|
|
|
322 $str =~ s/\Q$template\E/$new_template/g;
|
|
|
323 }
|
|
|
324 }
|
|
|
325 if ($str =~ /\s*eval\s*\(\s*(.+)\s*\)/) {
|
|
|
326 my $fn = $1;
|
|
|
327 $str = eval $fn;
|
|
|
328 if ($@) {
|
|
|
329 die "could not parse configuration parameter [$@]";
|
|
|
330 }
|
|
|
331 }
|
|
|
332 return $str;
|
|
|
333 }
|
|
|
334
|
|
|
335 sub fetch_configuration {
|
|
|
336 my @config_path = @_;
|
|
|
337 my $node = \%CONF;
|
|
|
338 if(! @config_path) {
|
|
|
339 return \%CONF;
|
|
|
340 }
|
|
|
341 for my $path_element (@config_path) {
|
|
|
342 if (! exists $node->{$path_element}) {
|
|
|
343 return undef;
|
|
|
344 } else {
|
|
|
345 $node = $node->{$path_element};
|
|
|
346 }
|
|
|
347 }
|
|
|
348 return $node;
|
|
|
349 }
|
|
|
350
|
|
|
351 sub fetch_conf {
|
|
|
352 return fetch_configuration(@_);
|
|
|
353 }
|
|
|
354
|
|
|
355 sub loadconfiguration {
|
|
|
356 my $file = shift;
|
|
|
357 if (defined $file) {
|
|
|
358 if (-e $file && -r _) {
|
|
|
359 # provided configuration file exists and can be read
|
|
|
360 $file = abs_path($file);
|
|
|
361 } else {
|
|
|
362 confess "The configuration file [$file] passed with -configfile does not exist or cannot be read.";
|
|
|
363 }
|
|
|
364 } else {
|
|
|
365 # otherwise, try to automatically find a configuration file
|
|
|
366 my ($scriptname,$path,$suffix) = fileparse($0);
|
|
|
367 my $cwd = getcwd();
|
|
|
368 my $bindir = $FindBin::RealBin;
|
|
|
369 my $userdir = $ENV{HOME};
|
|
|
370 my @candidate_files = (
|
|
|
371 "$cwd/$scriptname.conf",
|
|
|
372 "$cwd/etc/$scriptname.conf",
|
|
|
373 "$cwd/../etc/$scriptname.conf",
|
|
|
374 "$bindir/$scriptname.conf",
|
|
|
375 "$bindir/etc/$scriptname.conf",
|
|
|
376 "$bindir/../etc/$scriptname.conf",
|
|
|
377 "$userdir/.$scriptname.conf",
|
|
|
378 );
|
|
|
379 my @additional_files = ();
|
|
|
380 for my $candidate_file (@additional_files,@candidate_files) {
|
|
|
381 #printinfo("configsearch",$candidate_file);
|
|
|
382 if (-e $candidate_file && -r _) {
|
|
|
383 $file = $candidate_file;
|
|
|
384 #printinfo("configfound",$candidate_file);
|
|
|
385 last;
|
|
|
386 }
|
|
|
387 }
|
|
|
388 }
|
|
|
389 if (defined $file) {
|
|
|
390 $OPT{configfile} = $file;
|
|
|
391 $conf = new Config::General(
|
|
|
392 -ConfigFile=>$file,
|
|
|
393 -IncludeRelative=>1,
|
|
|
394 -IncludeAgain=>1,
|
|
|
395 -ExtendedAccess=>1,
|
|
|
396 -AllowMultiOptions=>"yes",
|
|
|
397 #-LowerCaseNames=>1,
|
|
|
398 -AutoTrue=>1
|
|
|
399 );
|
|
|
400 %CONF = $conf->getall;
|
|
|
401 }
|
|
|
402 }
|
|
|
403
|
|
|
404 sub printdebug {
|
|
|
405 printinfo("debug",@_) if defined $CONF{debug};
|
|
|
406 }
|
|
|
407
|
|
|
408 sub printinfo {
|
|
|
409 print join(" ",map { defined $_ ? $_ : "_undef_" } @_),"\n";
|
|
|
410 }
|
|
|
411
|
|
|
412 sub printfinfo {
|
|
|
413 my ($fmt,@args) = @_;
|
|
|
414 @args = map { defined $_ ? $_ : "_undef_" } @args;
|
|
|
415 printf("$fmt\n",@args);
|
|
|
416 }
|
|
|
417
|
|
|
418 sub printerr {
|
|
|
419 print STDERR join(" ",map { defined $_ ? $_ : "_undef_" } @_),"\n";
|
|
|
420 }
|
|
|
421
|
|
|
422 sub printdumper {
|
|
|
423 print Dumper(@_);
|
|
|
424 }
|
|
|
425
|
|
|
426 =pod
|
|
|
427
|
|
|
428 =head1 HISTORY
|
|
|
429
|
|
|
430 =over
|
|
|
431
|
|
|
432 =item * 30 Nov 2015
|
|
|
433
|
|
|
434 Started.
|
|
|
435
|
|
|
436 =back
|
|
|
437
|
|
|
438 =head1 AUTHOR
|
|
|
439
|
|
|
440 Martin Krzywinski
|
|
|
441
|
|
|
442 =head1 CONTACT
|
|
|
443
|
|
|
444 Martin Krzywinski
|
|
|
445 Genome Sciences Center
|
|
|
446 BC Cancer Research Center
|
|
|
447 100-570 W 7th Ave
|
|
|
448 Vancouver BC V5Z 4S6
|
|
|
449
|
|
|
450 mkweb.bcgsc.ca
|
|
|
451 martink@bcgsc.ca
|
|
|
452
|
|
|
453 =cut
|