|
0
|
1 #!/bin/env perl
|
|
|
2
|
|
|
3 =pod
|
|
|
4
|
|
|
5 =head1 NAME
|
|
|
6
|
|
|
7 make.circos.data - create Circos data files from summary tables of SV/CNV mutations
|
|
|
8
|
|
|
9 =head1 SYNOPSIS
|
|
|
10
|
|
|
11 bin/parse > table.txt
|
|
|
12
|
|
|
13 # uses same config file as parse
|
|
|
14 cat table.txt | bin/make.circos.data
|
|
|
15
|
|
|
16 =head1 DESCRIPTION
|
|
|
17
|
|
|
18 =head1 OPTIONS
|
|
|
19
|
|
|
20 =cut
|
|
|
21
|
|
|
22 use strict;
|
|
|
23 use warnings FATAL=>"all";
|
|
|
24
|
|
|
25 use Carp;
|
|
|
26 use Config::General;
|
|
|
27 use Cwd qw(getcwd abs_path);
|
|
|
28 use Data::Dumper;
|
|
|
29 use File::Basename;
|
|
|
30 use FindBin;
|
|
|
31 use Getopt::Long;
|
|
|
32 use Math::Round qw(round nearest);
|
|
|
33 use Math::VecStat qw(sum min max average);
|
|
|
34 use Pod::Usage;
|
|
|
35 use Time::HiRes qw(gettimeofday tv_interval);
|
|
|
36 use Storable;
|
|
|
37 use lib "$FindBin::RealBin";
|
|
|
38 use lib "$FindBin::RealBin/../lib";
|
|
|
39 use lib "$FindBin::RealBin/lib";
|
|
|
40
|
|
|
41 our (%OPT,%CONF,$conf);
|
|
|
42 our @COMMAND_LINE = ("file=s",
|
|
|
43 "configfile=s",
|
|
|
44 "help",
|
|
|
45 "cdump",
|
|
|
46 "man",
|
|
|
47 "debug");
|
|
|
48 our $VERSION = 0.02;
|
|
|
49
|
|
|
50 # common and custom module imports below
|
|
|
51 #use Regexp::Common;
|
|
|
52 #use IO::File;
|
|
|
53 #use List::Util;
|
|
|
54 #use List::MoreUtils;
|
|
|
55 use Set::IntSpan;
|
|
|
56 #use Statistics::Descriptive;
|
|
|
57
|
|
|
58 # read and parse configuration file
|
|
|
59 parse_config();
|
|
|
60
|
|
|
61 sub validateconfiguration {
|
|
|
62
|
|
|
63 }
|
|
|
64
|
|
|
65 ################################################################
|
|
|
66 # get files
|
|
|
67 my $table = read_file();
|
|
|
68
|
|
|
69 my $path = "$CONF{files}{root}/$CONF{files}{circos}";
|
|
|
70 # karyotype
|
|
|
71 open(F,">$path/karyotype.txt");
|
|
|
72 for my $chr (1..22,"X","Y") {
|
|
|
73 my $n = grep($_->{chr} eq $chr,@$table);
|
|
|
74 next unless $n;
|
|
|
75 printf F ("chr - hs%s %s 0 %d chr%s\n",$chr,$chr,$n,lc $chr);
|
|
|
76 }
|
|
|
77 close(F);
|
|
|
78
|
|
|
79 # number of CNV
|
|
|
80 open(F,">$path/mutations.txt");
|
|
|
81 open(FSV,">$path/mutations.stacked.sv.txt");
|
|
|
82 open(FCNV,">$path/mutations.stacked.cnv.txt");
|
|
|
83 for my $gene (@$table) {
|
|
|
84 my @sv;
|
|
|
85 my @sv_vals;
|
|
|
86 my @cnv;
|
|
|
87 my @cnv_vals;
|
|
|
88 # number of samples for each SV type
|
|
|
89 for my $type (sort { $CONF{sv}{types}{$b} <=> $CONF{sv}{types}{$a}} keys %{$CONF{sv}{types}}) {
|
|
|
90 push @sv, sprintf("sv_%s=%d",lc $type,$gene->{sv}{$type}||0);
|
|
|
91 push @sv_vals, $gene->{sv}{$type}||0;
|
|
|
92 }
|
|
|
93 # number of samples for each CNV type
|
|
|
94 for my $type (sort { $CONF{cnv}{types}{$b} <=> $CONF{cnv}{types}{$a}} keys %{$CONF{cnv}{types}}) {
|
|
|
95 next unless $CONF{cnv}{types}{$type};
|
|
|
96 push @cnv, sprintf("cnv_%s=%d",lc $type,$gene->{cnv}{$type}{n}||0);
|
|
|
97 push @cnv_vals, $gene->{cnv}{$type}{n}||0;
|
|
|
98 }
|
|
|
99 my $cnv_plus = ($gene->{cnv}{amp}{n} ||0) + ($gene->{cnv}{gain}{n} ||0);
|
|
|
100 my $cnv_minus = ($gene->{cnv}{homd}{n}||0) + ($gene->{cnv}{hetd}{n} ||0);
|
|
|
101 printf F ("hs%s %d %d %s size=%d,sv_top_type=%s,sv_top_n=%d,sv_tot=%d,svaa_max_pos=%s,svaa_max_n=%d,cnv_top_type=%s,cnv_top_n=%d,cnv_top_avg=%f,cnv_top_med=%f,cnv_plus=%d,cnv_minus=%d,%s,%s\n",
|
|
|
102 $gene->{chr},
|
|
|
103 $gene->{pos},
|
|
|
104 $gene->{pos}+1,
|
|
|
105 $gene->{name},
|
|
|
106 $gene->{size},
|
|
|
107 keys %{$gene->{sv_top}},
|
|
|
108 values %{$gene->{sv_top}},
|
|
|
109 $gene->{sv}{"*"} || 0,
|
|
|
110 (keys %{$gene->{svaa_top}})||0,
|
|
|
111 (values %{$gene->{svaa_top}})||0,
|
|
|
112 $gene->{cnv_top}{class} || "-",
|
|
|
113 $gene->{cnv_top}{n} || 0,
|
|
|
114 $gene->{cnv_top}{avg} || 0,
|
|
|
115 $gene->{cnv_top}{med} || 0,
|
|
|
116 $cnv_plus||0,
|
|
|
117 $cnv_minus||0,
|
|
|
118 join(",",@sv),
|
|
|
119 join(",",@cnv));
|
|
|
120 # stacked histograms of number of samples with each SV type
|
|
|
121 printf FSV ("hs%s %d %d %s name=%s,sv_top_type=%s,sv_top_n=%d\n",
|
|
|
122 $gene->{chr},
|
|
|
123 $gene->{pos},
|
|
|
124 $gene->{pos}+1,
|
|
|
125 join(",",@sv_vals),
|
|
|
126 $gene->{name},
|
|
|
127 keys %{$gene->{sv_top}},
|
|
|
128 values %{$gene->{sv_top}},
|
|
|
129 );
|
|
|
130 printf FCNV ("hs%s %d %d %s name=%s,cnv_top_type=%s,cnv_top_n=%d,cnv_top_avg=%f,cnv_top_med=%f\n",
|
|
|
131 $gene->{chr},
|
|
|
132 $gene->{pos},
|
|
|
133 $gene->{pos}+1,
|
|
|
134 join(",",@cnv_vals),
|
|
|
135 $gene->{name},
|
|
|
136 $gene->{cnv_top}{class} || "-",
|
|
|
137 $gene->{cnv_top}{n} || 0,
|
|
|
138 $gene->{cnv_top}{avg} || 0,
|
|
|
139 $gene->{cnv_top}{med} || 0,
|
|
|
140
|
|
|
141 );
|
|
|
142 }
|
|
|
143 close(F);
|
|
|
144 close(FSV);
|
|
|
145 close(FCNV);
|
|
|
146
|
|
|
147 sub read_file {
|
|
|
148 my $fh = get_handle();
|
|
|
149 my @data;
|
|
|
150 my $chrpos;
|
|
|
151 while(<$fh>) {
|
|
|
152 chomp;
|
|
|
153 next if /^\#/;
|
|
|
154 my @tok = split;
|
|
|
155 my $gene = list2hash([qw(i id name chr start end size)],
|
|
|
156 [splice(@tok,0,7)]);
|
|
|
157 $gene->{pos} = $chrpos->{ $gene->{chr} }++;
|
|
|
158 # remaining tokens
|
|
|
159 for my $tok (@tok) {
|
|
|
160 my @subtok = split(":",$tok);
|
|
|
161 my $event = lc shift @subtok;
|
|
|
162 my $type = lc shift @subtok;
|
|
|
163 if($event =~ /sv/) {
|
|
|
164 $gene->{$event}{$type} = shift @subtok;
|
|
|
165 } elsif($event =~ /cnv/) {
|
|
|
166 my $h = { class=> $type,
|
|
|
167 n=> shift @subtok,
|
|
|
168 min=> shift @subtok,
|
|
|
169 avg=> shift @subtok,
|
|
|
170 med=> shift @subtok,
|
|
|
171 max=> shift @subtok};
|
|
|
172 if($event =~ /top/) {
|
|
|
173 $gene->{$event} = $h;
|
|
|
174 } else {
|
|
|
175 $gene->{$event}{$type} = $h;
|
|
|
176 }
|
|
|
177 }
|
|
|
178 }
|
|
|
179 printdumper($gene);
|
|
|
180 push @data, $gene;
|
|
|
181 }
|
|
|
182 return \@data;
|
|
|
183 }
|
|
|
184
|
|
|
185 sub list2hash {
|
|
|
186 my ($names,$list) = @_;
|
|
|
187 my $h;
|
|
|
188 my $i = 0;
|
|
|
189 for my $name (@$names) {
|
|
|
190 $h->{$name} = $list->[$i++];
|
|
|
191 }
|
|
|
192 return $h;
|
|
|
193 }
|
|
|
194
|
|
|
195 sub get_handle {
|
|
|
196 my $h;
|
|
|
197 if(my $file = $CONF{file}) {
|
|
|
198 die "No such file [$file]" unless -e $file;
|
|
|
199 open(FILE,$file);
|
|
|
200 $h = \*FILE;
|
|
|
201 } else {
|
|
|
202 $h = \*STDIN;
|
|
|
203 }
|
|
|
204 return $h;
|
|
|
205 }
|
|
|
206
|
|
|
207 # HOUSEKEEPING ###############################################################
|
|
|
208
|
|
|
209 sub dump_config {
|
|
|
210 printdumper(\%OPT,\%CONF);
|
|
|
211 }
|
|
|
212
|
|
|
213 sub parse_config {
|
|
|
214 my $dump_debug_level = 3;
|
|
|
215 GetOptions(\%OPT,@COMMAND_LINE);
|
|
|
216 pod2usage() if $OPT{help};
|
|
|
217 pod2usage(-verbose=>2) if $OPT{man};
|
|
|
218 loadconfiguration($OPT{configfile});
|
|
|
219 populateconfiguration(); # copy command line options to config hash
|
|
|
220 validateconfiguration();
|
|
|
221 if ($CONF{cdump}) {
|
|
|
222 $Data::Dumper::Indent = 2;
|
|
|
223 $Data::Dumper::Quotekeys = 0;
|
|
|
224 $Data::Dumper::Terse = 0;
|
|
|
225 $Data::Dumper::Sortkeys = 1;
|
|
|
226 $Data::Dumper::Varname = "OPT";
|
|
|
227 printdumper(\%OPT);
|
|
|
228 $Data::Dumper::Varname = "CONF";
|
|
|
229 printdumper(\%CONF);
|
|
|
230 exit;
|
|
|
231 }
|
|
|
232 }
|
|
|
233
|
|
|
234 sub populateconfiguration {
|
|
|
235 for my $var (keys %OPT) {
|
|
|
236 $CONF{$var} = $OPT{$var};
|
|
|
237 }
|
|
|
238 repopulateconfiguration(\%CONF);
|
|
|
239 }
|
|
|
240
|
|
|
241 sub repopulateconfiguration {
|
|
|
242 my ($node,$parent_node_name) = shift;
|
|
|
243 return unless ref($node) eq "HASH";
|
|
|
244 for my $key (keys %$node) {
|
|
|
245 my $value = $node->{$key};
|
|
|
246 if (ref($value) eq "HASH") {
|
|
|
247 repopulateconfiguration($value,$key);
|
|
|
248 } elsif (ref($value) eq "ARRAY") {
|
|
|
249 for my $item (@$value) {
|
|
|
250 repopulateconfiguration($item,$key);
|
|
|
251 }
|
|
|
252 } elsif (defined $value) {
|
|
|
253 my $new_value = parse_field($value,$key,$parent_node_name,$node);
|
|
|
254 $node->{$key} = $new_value;
|
|
|
255 }
|
|
|
256 }
|
|
|
257 }
|
|
|
258
|
|
|
259 sub parse_field {
|
|
|
260 my ($str,$key,$parent_node_name,$node) = @_;
|
|
|
261 # replace configuration field
|
|
|
262 # conf(LEAF,LEAF,...)
|
|
|
263 while ( $str =~ /(conf\(\s*(.+?)\s*\))/g ) {
|
|
|
264 my ($template,$leaf) = ($1,$2);
|
|
|
265 if (defined $template && defined $leaf) {
|
|
|
266 my @leaf = split(/\s*,\s*/,$leaf);
|
|
|
267 my $new_template;
|
|
|
268 if (@leaf == 2 && $leaf[0] eq ".") {
|
|
|
269 $new_template = $node->{$leaf[1]};
|
|
|
270 } else {
|
|
|
271 $new_template = fetch_conf(@leaf);
|
|
|
272 }
|
|
|
273 $str =~ s/\Q$template\E/$new_template/g;
|
|
|
274 }
|
|
|
275 }
|
|
|
276 if ($str =~ /\s*eval\s*\(\s*(.+)\s*\)/) {
|
|
|
277 my $fn = $1;
|
|
|
278 $str = eval $fn;
|
|
|
279 if ($@) {
|
|
|
280 die "could not parse configuration parameter [$@]";
|
|
|
281 }
|
|
|
282 }
|
|
|
283 return $str;
|
|
|
284 }
|
|
|
285
|
|
|
286 sub fetch_configuration {
|
|
|
287 my @config_path = @_;
|
|
|
288 my $node = \%CONF;
|
|
|
289 if(! @config_path) {
|
|
|
290 return \%CONF;
|
|
|
291 }
|
|
|
292 for my $path_element (@config_path) {
|
|
|
293 if (! exists $node->{$path_element}) {
|
|
|
294 return undef;
|
|
|
295 } else {
|
|
|
296 $node = $node->{$path_element};
|
|
|
297 }
|
|
|
298 }
|
|
|
299 return $node;
|
|
|
300 }
|
|
|
301
|
|
|
302 sub fetch_conf {
|
|
|
303 return fetch_configuration(@_);
|
|
|
304 }
|
|
|
305
|
|
|
306 sub loadconfiguration {
|
|
|
307 my $file = shift;
|
|
|
308 if (defined $file) {
|
|
|
309 if (-e $file && -r _) {
|
|
|
310 # provided configuration file exists and can be read
|
|
|
311 $file = abs_path($file);
|
|
|
312 } else {
|
|
|
313 confess "The configuration file [$file] passed with -configfile does not exist or cannot be read.";
|
|
|
314 }
|
|
|
315 } else {
|
|
|
316 # otherwise, try to automatically find a configuration file
|
|
|
317 my ($scriptname,$path,$suffix) = fileparse($0);
|
|
|
318 my $cwd = getcwd();
|
|
|
319 my $bindir = $FindBin::RealBin;
|
|
|
320 my $userdir = $ENV{HOME};
|
|
|
321 my @candidate_files = (
|
|
|
322 "$cwd/$scriptname.conf",
|
|
|
323 "$cwd/etc/$scriptname.conf",
|
|
|
324 "$cwd/../etc/$scriptname.conf",
|
|
|
325 "$bindir/$scriptname.conf",
|
|
|
326 "$bindir/etc/$scriptname.conf",
|
|
|
327 "$bindir/../etc/$scriptname.conf",
|
|
|
328 "$userdir/.$scriptname.conf",
|
|
|
329 );
|
|
|
330 my @additional_files = ();
|
|
|
331 for my $candidate_file (@additional_files,@candidate_files) {
|
|
|
332 #printinfo("configsearch",$candidate_file);
|
|
|
333 if (-e $candidate_file && -r _) {
|
|
|
334 $file = $candidate_file;
|
|
|
335 #printinfo("configfound",$candidate_file);
|
|
|
336 last;
|
|
|
337 }
|
|
|
338 }
|
|
|
339 }
|
|
|
340 if (defined $file) {
|
|
|
341 $OPT{configfile} = $file;
|
|
|
342 $conf = new Config::General(
|
|
|
343 -ConfigFile=>$file,
|
|
|
344 -IncludeRelative=>1,
|
|
|
345 -IncludeAgain=>1,
|
|
|
346 -ExtendedAccess=>1,
|
|
|
347 -AllowMultiOptions=>"yes",
|
|
|
348 #-LowerCaseNames=>1,
|
|
|
349 -AutoTrue=>1
|
|
|
350 );
|
|
|
351 %CONF = $conf->getall;
|
|
|
352 }
|
|
|
353 }
|
|
|
354
|
|
|
355 sub printdebug {
|
|
|
356 printinfo("debug",@_) if defined $CONF{debug};
|
|
|
357 }
|
|
|
358
|
|
|
359 sub printinfo {
|
|
|
360 print join(" ",map { defined $_ ? $_ : "_undef_" } @_),"\n";
|
|
|
361 }
|
|
|
362
|
|
|
363 sub printfinfo {
|
|
|
364 my ($fmt,@args) = @_;
|
|
|
365 @args = map { defined $_ ? $_ : "_undef_" } @args;
|
|
|
366 printf("$fmt\n",@args);
|
|
|
367 }
|
|
|
368
|
|
|
369 sub printerr {
|
|
|
370 print STDERR join(" ",map { defined $_ ? $_ : "_undef_" } @_),"\n";
|
|
|
371 }
|
|
|
372
|
|
|
373 sub printdumper {
|
|
|
374 print Dumper(@_);
|
|
|
375 }
|
|
|
376
|
|
|
377 =pod
|
|
|
378
|
|
|
379 =head1 HISTORY
|
|
|
380
|
|
|
381 =over
|
|
|
382
|
|
|
383 =item * 30 Nov 2015
|
|
|
384
|
|
|
385 Started.
|
|
|
386
|
|
|
387 =back
|
|
|
388
|
|
|
389 =head1 AUTHOR
|
|
|
390
|
|
|
391 Martin Krzywinski
|
|
|
392
|
|
|
393 =head1 CONTACT
|
|
|
394
|
|
|
395 Martin Krzywinski
|
|
|
396 Genome Sciences Center
|
|
|
397 BC Cancer Research Center
|
|
|
398 100-570 W 7th Ave
|
|
|
399 Vancouver BC V5Z 4S6
|
|
|
400
|
|
|
401 mkweb.bcgsc.ca
|
|
|
402 martink@bcgsc.ca
|
|
|
403
|
|
|
404 =cut
|