|
0
|
1 #!/bin/env perl
|
|
|
2
|
|
|
3 =pod
|
|
|
4
|
|
|
5 =head1 NAME
|
|
|
6
|
|
|
7 make.circos.data - create Circos data files from summary tables of SV/CNV mutations
|
|
|
8
|
|
|
9 =head1 SYNOPSIS
|
|
|
10
|
|
|
11 bin/parse > table.txt
|
|
|
12
|
|
|
13 # uses same config file as parse
|
|
|
14 cat table.txt | bin/make.circos.data
|
|
|
15
|
|
|
16 =head1 DESCRIPTION
|
|
|
17
|
|
|
18 =head1 OPTIONS
|
|
|
19
|
|
|
20 =cut
|
|
|
21
|
|
|
22 use strict;
|
|
|
23 use warnings FATAL=>"all";
|
|
|
24
|
|
|
25 use Carp;
|
|
|
26 use Config::General;
|
|
|
27 use Cwd qw(getcwd abs_path);
|
|
|
28 use Data::Dumper;
|
|
|
29 use File::Basename;
|
|
|
30 use FindBin;
|
|
|
31 use Getopt::Long;
|
|
|
32 use Math::Round qw(round nearest);
|
|
|
33 use Math::VecStat qw(sum min max average);
|
|
|
34 use Pod::Usage;
|
|
|
35 use Time::HiRes qw(gettimeofday tv_interval);
|
|
|
36 use Storable;
|
|
|
37 use lib "$FindBin::RealBin";
|
|
|
38 use lib "$FindBin::RealBin/../lib";
|
|
|
39 use lib "$FindBin::RealBin/lib";
|
|
|
40
|
|
|
41 our (%OPT,%CONF,$conf);
|
|
|
42 our @COMMAND_LINE = ("file=s",
|
|
|
43 "configfile=s",
|
|
|
44 "help",
|
|
|
45 "cdump",
|
|
|
46 "man",
|
|
|
47 "debug");
|
|
|
48 our $VERSION = 0.02;
|
|
|
49
|
|
|
50 # common and custom module imports below
|
|
|
51 #use Regexp::Common;
|
|
|
52 #use IO::File;
|
|
|
53 #use List::Util;
|
|
|
54 #use List::MoreUtils;
|
|
|
55 use Set::IntSpan;
|
|
|
56 #use Statistics::Descriptive;
|
|
|
57
|
|
|
58 # read and parse configuration file
|
|
|
59 parse_config();
|
|
|
60
|
|
7
|
61
|
|
|
62
|
|
|
63 my %affected_genes;
|
|
|
64 open(GENES, $CONF{files}{genes}) or die "#! $CONF{files}{genes}\n";
|
|
|
65 while(<GENES>){
|
|
|
66 chomp;
|
|
|
67 $affected_genes{$_}++;
|
|
|
68 }
|
|
|
69 close GENES;
|
|
|
70 my %mask_genes;
|
|
|
71 open(GENES, $CONF{files}{mask}) or die "#! $CONF{files}{mask}\n";
|
|
|
72 while(<GENES>){
|
|
|
73 chomp;
|
|
|
74 $mask_genes{$_}++;
|
|
|
75 }
|
|
|
76 close GENES;
|
|
|
77
|
|
0
|
78 sub validateconfiguration {
|
|
|
79
|
|
|
80 }
|
|
|
81
|
|
|
82 ################################################################
|
|
|
83 # get files
|
|
|
84 my $table = read_file();
|
|
|
85
|
|
|
86 my $path = "$CONF{files}{root}/$CONF{files}{circos}";
|
|
|
87 # karyotype
|
|
|
88 open(F,">$path/karyotype.txt");
|
|
|
89 for my $chr (1..22,"X","Y") {
|
|
|
90 my $n = grep($_->{chr} eq $chr,@$table);
|
|
|
91 next unless $n;
|
|
|
92 printf F ("chr - hs%s %s 0 %d chr%s\n",$chr,$chr,$n,lc $chr);
|
|
|
93 }
|
|
|
94 close(F);
|
|
|
95
|
|
7
|
96 # large scale CNV
|
|
|
97 open(FCNVLG,">$path/cnv.tiles.txt");
|
|
|
98 # collect CNV region idx
|
|
|
99 my $cnvlg;
|
|
|
100 for my $gene (@$table) {
|
|
|
101 next unless $gene->{cnvlg};
|
|
|
102 for my $idx (keys %{$gene->{cnvlg}}) {
|
|
|
103 $cnvlg->{$idx} ||= { chr => $gene->{chr},
|
|
|
104 type => $gene->{cnvlg}{$idx}{type} };
|
|
|
105 push @{$cnvlg->{$idx}{pos}}, $gene->{pos};
|
|
|
106 }
|
|
|
107 }
|
|
|
108 my $cnvlg_seen;
|
|
|
109 for my $idx (sort {$a <=> $b} keys %{$cnvlg}) {
|
|
|
110 my $cnv = $cnvlg->{$idx};
|
|
|
111 my $key = join(",",
|
|
|
112 scalar(min @{$cnv->{pos}}),
|
|
|
113 scalar(max @{$cnv->{pos}}),
|
|
|
114 $cnv->{type});
|
|
|
115 next if $cnvlg_seen->{$key}++;
|
|
|
116 my $start = scalar(min @{$cnv->{pos}});
|
|
|
117 my $end = 1 + scalar(max @{$cnv->{pos}});
|
|
|
118 printf FCNVLG ("hs%s %d %d %s idx=%d\n",
|
|
|
119 $cnv->{chr},
|
|
|
120 $start,
|
|
|
121 $end,
|
|
|
122 $cnv->{type},
|
|
|
123 $idx);
|
|
|
124 }
|
|
|
125 close(FCNVLG);
|
|
|
126
|
|
0
|
127 # number of CNV
|
|
7
|
128 open(F, ">$path/mutations.txt");
|
|
|
129 open(FSV, ">$path/mutations.stacked.sv.txt");
|
|
|
130 open(FCNV, ">$path/mutations.stacked.cnv.txt");
|
|
0
|
131 for my $gene (@$table) {
|
|
7
|
132 my $name = $gene->{name};
|
|
|
133 if(defined $mask_genes{$name}){
|
|
|
134 next;
|
|
|
135 }
|
|
0
|
136 my @sv;
|
|
|
137 my @sv_vals;
|
|
|
138 my @cnv;
|
|
|
139 my @cnv_vals;
|
|
|
140 # number of samples for each SV type
|
|
|
141 for my $type (sort { $CONF{sv}{types}{$b} <=> $CONF{sv}{types}{$a}} keys %{$CONF{sv}{types}}) {
|
|
|
142 push @sv, sprintf("sv_%s=%d",lc $type,$gene->{sv}{$type}||0);
|
|
|
143 push @sv_vals, $gene->{sv}{$type}||0;
|
|
|
144 }
|
|
|
145 # number of samples for each CNV type
|
|
|
146 for my $type (sort { $CONF{cnv}{types}{$b} <=> $CONF{cnv}{types}{$a}} keys %{$CONF{cnv}{types}}) {
|
|
|
147 next unless $CONF{cnv}{types}{$type};
|
|
|
148 push @cnv, sprintf("cnv_%s=%d",lc $type,$gene->{cnv}{$type}{n}||0);
|
|
|
149 push @cnv_vals, $gene->{cnv}{$type}{n}||0;
|
|
|
150 }
|
|
|
151 my $cnv_plus = ($gene->{cnv}{amp}{n} ||0) + ($gene->{cnv}{gain}{n} ||0);
|
|
|
152 my $cnv_minus = ($gene->{cnv}{homd}{n}||0) + ($gene->{cnv}{hetd}{n} ||0);
|
|
7
|
153 my $label_flag = "label_gene=0";
|
|
|
154
|
|
|
155
|
|
|
156 if(defined $affected_genes{$name}){
|
|
|
157 $label_flag = "label_gene=1";
|
|
|
158 print "will label $name\n";
|
|
|
159 }
|
|
|
160 unless(defined $gene->{sv_top}){
|
|
|
161 $gene->{sv_top} = {"missense_mutation"=>0};
|
|
|
162 $gene->{sv} = {"missense_mutation"=>0};
|
|
|
163 $gene->{svaa_top} = {"*"=>0};
|
|
|
164 }
|
|
|
165 printf F ("hs%s %d %d %s size=%d,sv_top_type=%s,sv_top_n=%d,sv_tot=%d,svaa_max_pos=%s,svaa_max_n=%d,cnv_top_type=%s,cnv_top_n=%d,cnv_top_avg=%f,cnv_top_med=%f,cnv_plus=%d,cnv_minus=%d,%s,%s",
|
|
0
|
166 $gene->{chr},
|
|
|
167 $gene->{pos},
|
|
|
168 $gene->{pos}+1,
|
|
|
169 $gene->{name},
|
|
|
170 $gene->{size},
|
|
|
171 keys %{$gene->{sv_top}},
|
|
|
172 values %{$gene->{sv_top}},
|
|
|
173 $gene->{sv}{"*"} || 0,
|
|
|
174 (keys %{$gene->{svaa_top}})||0,
|
|
|
175 (values %{$gene->{svaa_top}})||0,
|
|
|
176 $gene->{cnv_top}{class} || "-",
|
|
|
177 $gene->{cnv_top}{n} || 0,
|
|
|
178 $gene->{cnv_top}{avg} || 0,
|
|
|
179 $gene->{cnv_top}{med} || 0,
|
|
|
180 $cnv_plus||0,
|
|
|
181 $cnv_minus||0,
|
|
|
182 join(",",@sv),
|
|
7
|
183 join(",",@cnv));
|
|
|
184 print F ",$label_flag\n";
|
|
0
|
185 # stacked histograms of number of samples with each SV type
|
|
|
186 printf FSV ("hs%s %d %d %s name=%s,sv_top_type=%s,sv_top_n=%d\n",
|
|
|
187 $gene->{chr},
|
|
|
188 $gene->{pos},
|
|
|
189 $gene->{pos}+1,
|
|
|
190 join(",",@sv_vals),
|
|
|
191 $gene->{name},
|
|
|
192 keys %{$gene->{sv_top}},
|
|
|
193 values %{$gene->{sv_top}},
|
|
|
194 );
|
|
|
195 printf FCNV ("hs%s %d %d %s name=%s,cnv_top_type=%s,cnv_top_n=%d,cnv_top_avg=%f,cnv_top_med=%f\n",
|
|
|
196 $gene->{chr},
|
|
|
197 $gene->{pos},
|
|
|
198 $gene->{pos}+1,
|
|
|
199 join(",",@cnv_vals),
|
|
|
200 $gene->{name},
|
|
|
201 $gene->{cnv_top}{class} || "-",
|
|
|
202 $gene->{cnv_top}{n} || 0,
|
|
|
203 $gene->{cnv_top}{avg} || 0,
|
|
|
204 $gene->{cnv_top}{med} || 0,
|
|
|
205
|
|
|
206 );
|
|
|
207 }
|
|
|
208 close(F);
|
|
|
209 close(FSV);
|
|
|
210 close(FCNV);
|
|
|
211
|
|
|
212 sub read_file {
|
|
|
213 my $fh = get_handle();
|
|
|
214 my @data;
|
|
|
215 my $chrpos;
|
|
|
216 while(<$fh>) {
|
|
7
|
217
|
|
0
|
218 chomp;
|
|
|
219 next if /^\#/;
|
|
7
|
220
|
|
0
|
221 my @tok = split;
|
|
|
222 my $gene = list2hash([qw(i id name chr start end size)],
|
|
|
223 [splice(@tok,0,7)]);
|
|
|
224 $gene->{pos} = $chrpos->{ $gene->{chr} }++;
|
|
7
|
225
|
|
0
|
226 # remaining tokens
|
|
|
227 for my $tok (@tok) {
|
|
|
228 my @subtok = split(":",$tok);
|
|
|
229 my $event = lc shift @subtok;
|
|
|
230 my $type = lc shift @subtok;
|
|
|
231 if($event =~ /sv/) {
|
|
|
232 $gene->{$event}{$type} = shift @subtok;
|
|
7
|
233 } elsif($event =~ /cnvlg/) {
|
|
|
234 my $h = { idx => $type,
|
|
|
235 type => shift @subtok };
|
|
|
236 $gene->{$event}{$type} = $h;
|
|
0
|
237 } elsif($event =~ /cnv/) {
|
|
|
238 my $h = { class=> $type,
|
|
|
239 n=> shift @subtok,
|
|
|
240 min=> shift @subtok,
|
|
|
241 avg=> shift @subtok,
|
|
|
242 med=> shift @subtok,
|
|
|
243 max=> shift @subtok};
|
|
|
244 if($event =~ /top/) {
|
|
|
245 $gene->{$event} = $h;
|
|
|
246 } else {
|
|
|
247 $gene->{$event}{$type} = $h;
|
|
|
248 }
|
|
|
249 }
|
|
|
250 }
|
|
7
|
251 #printdumper($gene);
|
|
0
|
252 push @data, $gene;
|
|
|
253 }
|
|
7
|
254
|
|
0
|
255 return \@data;
|
|
|
256 }
|
|
|
257
|
|
|
258 sub list2hash {
|
|
|
259 my ($names,$list) = @_;
|
|
|
260 my $h;
|
|
|
261 my $i = 0;
|
|
|
262 for my $name (@$names) {
|
|
|
263 $h->{$name} = $list->[$i++];
|
|
|
264 }
|
|
|
265 return $h;
|
|
|
266 }
|
|
|
267
|
|
|
268 sub get_handle {
|
|
|
269 my $h;
|
|
|
270 if(my $file = $CONF{file}) {
|
|
|
271 die "No such file [$file]" unless -e $file;
|
|
|
272 open(FILE,$file);
|
|
|
273 $h = \*FILE;
|
|
|
274 } else {
|
|
|
275 $h = \*STDIN;
|
|
|
276 }
|
|
|
277 return $h;
|
|
|
278 }
|
|
|
279
|
|
|
280 # HOUSEKEEPING ###############################################################
|
|
|
281
|
|
|
282 sub dump_config {
|
|
|
283 printdumper(\%OPT,\%CONF);
|
|
|
284 }
|
|
|
285
|
|
|
286 sub parse_config {
|
|
|
287 my $dump_debug_level = 3;
|
|
|
288 GetOptions(\%OPT,@COMMAND_LINE);
|
|
|
289 pod2usage() if $OPT{help};
|
|
|
290 pod2usage(-verbose=>2) if $OPT{man};
|
|
|
291 loadconfiguration($OPT{configfile});
|
|
|
292 populateconfiguration(); # copy command line options to config hash
|
|
|
293 validateconfiguration();
|
|
|
294 if ($CONF{cdump}) {
|
|
|
295 $Data::Dumper::Indent = 2;
|
|
|
296 $Data::Dumper::Quotekeys = 0;
|
|
|
297 $Data::Dumper::Terse = 0;
|
|
|
298 $Data::Dumper::Sortkeys = 1;
|
|
|
299 $Data::Dumper::Varname = "OPT";
|
|
|
300 printdumper(\%OPT);
|
|
|
301 $Data::Dumper::Varname = "CONF";
|
|
|
302 printdumper(\%CONF);
|
|
|
303 exit;
|
|
|
304 }
|
|
|
305 }
|
|
|
306
|
|
|
307 sub populateconfiguration {
|
|
|
308 for my $var (keys %OPT) {
|
|
|
309 $CONF{$var} = $OPT{$var};
|
|
|
310 }
|
|
|
311 repopulateconfiguration(\%CONF);
|
|
|
312 }
|
|
|
313
|
|
|
314 sub repopulateconfiguration {
|
|
|
315 my ($node,$parent_node_name) = shift;
|
|
|
316 return unless ref($node) eq "HASH";
|
|
|
317 for my $key (keys %$node) {
|
|
|
318 my $value = $node->{$key};
|
|
|
319 if (ref($value) eq "HASH") {
|
|
|
320 repopulateconfiguration($value,$key);
|
|
|
321 } elsif (ref($value) eq "ARRAY") {
|
|
|
322 for my $item (@$value) {
|
|
|
323 repopulateconfiguration($item,$key);
|
|
|
324 }
|
|
|
325 } elsif (defined $value) {
|
|
|
326 my $new_value = parse_field($value,$key,$parent_node_name,$node);
|
|
|
327 $node->{$key} = $new_value;
|
|
|
328 }
|
|
|
329 }
|
|
|
330 }
|
|
|
331
|
|
|
332 sub parse_field {
|
|
|
333 my ($str,$key,$parent_node_name,$node) = @_;
|
|
|
334 # replace configuration field
|
|
|
335 # conf(LEAF,LEAF,...)
|
|
|
336 while ( $str =~ /(conf\(\s*(.+?)\s*\))/g ) {
|
|
|
337 my ($template,$leaf) = ($1,$2);
|
|
|
338 if (defined $template && defined $leaf) {
|
|
|
339 my @leaf = split(/\s*,\s*/,$leaf);
|
|
|
340 my $new_template;
|
|
|
341 if (@leaf == 2 && $leaf[0] eq ".") {
|
|
|
342 $new_template = $node->{$leaf[1]};
|
|
|
343 } else {
|
|
|
344 $new_template = fetch_conf(@leaf);
|
|
|
345 }
|
|
|
346 $str =~ s/\Q$template\E/$new_template/g;
|
|
|
347 }
|
|
|
348 }
|
|
|
349 if ($str =~ /\s*eval\s*\(\s*(.+)\s*\)/) {
|
|
|
350 my $fn = $1;
|
|
|
351 $str = eval $fn;
|
|
|
352 if ($@) {
|
|
|
353 die "could not parse configuration parameter [$@]";
|
|
|
354 }
|
|
|
355 }
|
|
|
356 return $str;
|
|
|
357 }
|
|
|
358
|
|
|
359 sub fetch_configuration {
|
|
|
360 my @config_path = @_;
|
|
|
361 my $node = \%CONF;
|
|
|
362 if(! @config_path) {
|
|
|
363 return \%CONF;
|
|
|
364 }
|
|
|
365 for my $path_element (@config_path) {
|
|
|
366 if (! exists $node->{$path_element}) {
|
|
|
367 return undef;
|
|
|
368 } else {
|
|
|
369 $node = $node->{$path_element};
|
|
|
370 }
|
|
|
371 }
|
|
|
372 return $node;
|
|
|
373 }
|
|
|
374
|
|
|
375 sub fetch_conf {
|
|
|
376 return fetch_configuration(@_);
|
|
|
377 }
|
|
|
378
|
|
|
379 sub loadconfiguration {
|
|
|
380 my $file = shift;
|
|
|
381 if (defined $file) {
|
|
|
382 if (-e $file && -r _) {
|
|
|
383 # provided configuration file exists and can be read
|
|
|
384 $file = abs_path($file);
|
|
|
385 } else {
|
|
|
386 confess "The configuration file [$file] passed with -configfile does not exist or cannot be read.";
|
|
|
387 }
|
|
|
388 } else {
|
|
|
389 # otherwise, try to automatically find a configuration file
|
|
|
390 my ($scriptname,$path,$suffix) = fileparse($0);
|
|
|
391 my $cwd = getcwd();
|
|
|
392 my $bindir = $FindBin::RealBin;
|
|
|
393 my $userdir = $ENV{HOME};
|
|
|
394 my @candidate_files = (
|
|
|
395 "$cwd/$scriptname.conf",
|
|
|
396 "$cwd/etc/$scriptname.conf",
|
|
|
397 "$cwd/../etc/$scriptname.conf",
|
|
|
398 "$bindir/$scriptname.conf",
|
|
|
399 "$bindir/etc/$scriptname.conf",
|
|
|
400 "$bindir/../etc/$scriptname.conf",
|
|
|
401 "$userdir/.$scriptname.conf",
|
|
|
402 );
|
|
|
403 my @additional_files = ();
|
|
|
404 for my $candidate_file (@additional_files,@candidate_files) {
|
|
|
405 #printinfo("configsearch",$candidate_file);
|
|
|
406 if (-e $candidate_file && -r _) {
|
|
|
407 $file = $candidate_file;
|
|
|
408 #printinfo("configfound",$candidate_file);
|
|
|
409 last;
|
|
|
410 }
|
|
|
411 }
|
|
|
412 }
|
|
|
413 if (defined $file) {
|
|
|
414 $OPT{configfile} = $file;
|
|
|
415 $conf = new Config::General(
|
|
|
416 -ConfigFile=>$file,
|
|
|
417 -IncludeRelative=>1,
|
|
|
418 -IncludeAgain=>1,
|
|
|
419 -ExtendedAccess=>1,
|
|
|
420 -AllowMultiOptions=>"yes",
|
|
|
421 #-LowerCaseNames=>1,
|
|
|
422 -AutoTrue=>1
|
|
|
423 );
|
|
|
424 %CONF = $conf->getall;
|
|
|
425 }
|
|
|
426 }
|
|
|
427
|
|
|
428 sub printdebug {
|
|
|
429 printinfo("debug",@_) if defined $CONF{debug};
|
|
|
430 }
|
|
|
431
|
|
|
432 sub printinfo {
|
|
|
433 print join(" ",map { defined $_ ? $_ : "_undef_" } @_),"\n";
|
|
|
434 }
|
|
|
435
|
|
|
436 sub printfinfo {
|
|
|
437 my ($fmt,@args) = @_;
|
|
|
438 @args = map { defined $_ ? $_ : "_undef_" } @args;
|
|
|
439 printf("$fmt\n",@args);
|
|
|
440 }
|
|
|
441
|
|
|
442 sub printerr {
|
|
|
443 print STDERR join(" ",map { defined $_ ? $_ : "_undef_" } @_),"\n";
|
|
|
444 }
|
|
|
445
|
|
|
446 sub printdumper {
|
|
|
447 print Dumper(@_);
|
|
|
448 }
|
|
|
449
|
|
|
450 =pod
|
|
|
451
|
|
|
452 =head1 HISTORY
|
|
|
453
|
|
|
454 =over
|
|
|
455
|
|
|
456 =item * 30 Nov 2015
|
|
|
457
|
|
|
458 Started.
|
|
|
459
|
|
|
460 =back
|
|
|
461
|
|
|
462 =head1 AUTHOR
|
|
|
463
|
|
|
464 Martin Krzywinski
|
|
|
465
|
|
|
466 =head1 CONTACT
|
|
|
467
|
|
|
468 Martin Krzywinski
|
|
|
469 Genome Sciences Center
|
|
|
470 BC Cancer Research Center
|
|
|
471 100-570 W 7th Ave
|
|
|
472 Vancouver BC V5Z 4S6
|
|
|
473
|
|
|
474 mkweb.bcgsc.ca
|
|
|
475 martink@bcgsc.ca
|
|
|
476
|
|
|
477 =cut
|