comparison variant_effect_predictor/Bio/EnsEMBL/IdMapping/SyntenyFramework.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 =head1 LICENSE
2
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
4 Genome Research Limited. All rights reserved.
5
6 This software is distributed under a modified Apache license.
7 For license details, please see
8
9 http://www.ensembl.org/info/about/code_licence.html
10
11 =head1 CONTACT
12
13 Please email comments or questions to the public Ensembl
14 developers list at <dev@ensembl.org>.
15
16 Questions may also be sent to the Ensembl help desk at
17 <helpdesk@ensembl.org>.
18
19 =cut
20
21 =head1 NAME
22
23 Bio::EnsEMBL::IdMapping::SyntenyFramework - framework representing syntenic
24 regions across the genome
25
26 =head1 SYNOPSIS
27
28 # build the SyntenyFramework from unambiguous gene mappings
29 my $sf = Bio::EnsEMBL::IdMapping::SyntenyFramework->new(
30 -DUMP_PATH => $dump_path,
31 -CACHE_FILE => 'synteny_framework.ser',
32 -LOGGER => $self->logger,
33 -CONF => $self->conf,
34 -CACHE => $self->cache,
35 );
36 $sf->build_synteny($gene_mappings);
37
38 # use it to rescore the genes
39 $gene_scores = $sf->rescore_gene_matrix_lsf($gene_scores);
40
41 =head1 DESCRIPTION
42
43 The SyntenyFramework is a set of SyntenyRegions. These are pairs of
44 locations very analoguous to the information in the assembly table (the
45 locations dont have to be the same length though). They are built from
46 genes that map uniquely between source and target.
47
48 Once built, the SyntenyFramework is used to score source and target gene
49 pairs to determine whether they are similar. This process is slow (it
50 involves testing all gene pairs against all SyntenyRegions), this module
51 therefor has built-in support to run the process in parallel via LSF.
52
53 =head1 METHODS
54
55 new
56 build_synteny
57 _by_overlap
58 add_SyntenyRegion
59 get_all_SyntenyRegions
60 rescore_gene_matrix_lsf
61 rescore_gene_matrix
62 logger
63 conf
64 cache
65
66 =cut
67
68 package Bio::EnsEMBL::IdMapping::SyntenyFramework;
69
70 use strict;
71 use warnings;
72 no warnings 'uninitialized';
73
74 use Bio::EnsEMBL::IdMapping::Serialisable;
75 our @ISA = qw(Bio::EnsEMBL::IdMapping::Serialisable);
76
77 use Bio::EnsEMBL::Utils::Argument qw(rearrange);
78 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
79 use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append);
80 use Bio::EnsEMBL::IdMapping::SyntenyRegion;
81 use Bio::EnsEMBL::IdMapping::ScoredMappingMatrix;
82
83 use FindBin qw($Bin);
84 FindBin->again;
85
86
87 =head2 new
88
89 Arg [LOGGER]: Bio::EnsEMBL::Utils::Logger $logger - a logger object
90 Arg [CONF] : Bio::EnsEMBL::Utils::ConfParser $conf - a configuration object
91 Arg [CACHE] : Bio::EnsEMBL::IdMapping::Cache $cache - a cache object
92 Arg [DUMP_PATH] : String - path for object serialisation
93 Arg [CACHE_FILE] : String - filename of serialised object
94 Example : my $sf = Bio::EnsEMBL::IdMapping::SyntenyFramework->new(
95 -DUMP_PATH => $dump_path,
96 -CACHE_FILE => 'synteny_framework.ser',
97 -LOGGER => $self->logger,
98 -CONF => $self->conf,
99 -CACHE => $self->cache,
100 );
101 Description : Constructor.
102 Return type : Bio::EnsEMBL::IdMapping::SyntenyFramework
103 Exceptions : thrown on wrong or missing arguments
104 Caller : InternalIdMapper plugins
105 Status : At Risk
106 : under development
107
108 =cut
109
110 sub new {
111 my $caller = shift;
112 my $class = ref($caller) || $caller;
113 my $self = $class->SUPER::new(@_);
114
115 my ($logger, $conf, $cache) = rearrange(['LOGGER', 'CONF', 'CACHE'], @_);
116
117 unless ($logger and ref($logger) and
118 $logger->isa('Bio::EnsEMBL::Utils::Logger')) {
119 throw("You must provide a Bio::EnsEMBL::Utils::Logger for logging.");
120 }
121
122 unless ($conf and ref($conf) and
123 $conf->isa('Bio::EnsEMBL::Utils::ConfParser')) {
124 throw("You must provide configuration as a Bio::EnsEMBL::Utils::ConfParser object.");
125 }
126
127 unless ($cache and ref($cache) and
128 $cache->isa('Bio::EnsEMBL::IdMapping::Cache')) {
129 throw("You must provide configuration as a Bio::EnsEMBL::IdMapping::Cache object.");
130 }
131
132 # initialise
133 $self->logger($logger);
134 $self->conf($conf);
135 $self->cache($cache);
136 $self->{'cache'} = [];
137
138 return $self;
139 }
140
141
142 =head2 build_synteny
143
144 Arg[1] : Bio::EnsEMBL::IdMapping::MappingList $mappings - gene mappings
145 to build the SyntenyFramework from
146 Example : $synteny_framework->build_synteny($gene_mappings);
147 Description : Builds the SyntenyFramework from unambiguous gene mappings.
148 SyntenyRegions are allowed to overlap. At most two overlapping
149 SyntenyRegions are merged (otherwise we'd get too large
150 SyntenyRegions with little information content).
151 Return type : none
152 Exceptions : thrown on wrong or missing argument
153 Caller : InternalIdMapper plugins
154 Status : At Risk
155 : under development
156
157 =cut
158
159 sub build_synteny {
160 my $self = shift;
161 my $mappings = shift;
162
163 unless ($mappings and
164 $mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) {
165 throw('Need a gene Bio::EnsEMBL::IdMapping::MappingList.');
166 }
167
168 # create a synteny region for each mapping
169 my @synteny_regions = ();
170
171 foreach my $entry (@{ $mappings->get_all_Entries }) {
172
173 my $source_gene = $self->cache->get_by_key('genes_by_id', 'source',
174 $entry->source);
175 my $target_gene = $self->cache->get_by_key('genes_by_id', 'target',
176 $entry->target);
177
178 my $sr = Bio::EnsEMBL::IdMapping::SyntenyRegion->new_fast([
179 $source_gene->start,
180 $source_gene->end,
181 $source_gene->strand,
182 $source_gene->seq_region_name,
183 $target_gene->start,
184 $target_gene->end,
185 $target_gene->strand,
186 $target_gene->seq_region_name,
187 $entry->score,
188 ]);
189
190 push @synteny_regions, $sr;
191 }
192
193 unless (@synteny_regions) {
194 $self->logger->warning("No synteny regions could be identified.\n");
195 return;
196 }
197
198 # sort synteny regions
199 #my @sorted = sort _by_overlap @synteny_regions;
200 my @sorted = reverse sort {
201 $a->source_seq_region_name cmp $b->source_seq_region_name ||
202 $a->source_start <=> $b->source_start ||
203 $a->source_end <=> $b->source_end } @synteny_regions;
204
205 $self->logger->info("SyntenyRegions before merging: ".scalar(@sorted)."\n");
206
207 # now create merged regions from overlapping syntenies, but only merge a
208 # maximum of 2 regions (otherwise you end up with large synteny blocks which
209 # won't contain much information in this context)
210 my $last_merged = 0;
211 my $last_sr = shift(@sorted);
212
213 while (my $sr = shift(@sorted)) {
214 #$self->logger->debug("this ".$sr->to_string."\n");
215
216 my $merged_sr = $last_sr->merge($sr);
217
218 if (! $merged_sr) {
219 unless ($last_merged) {
220 $self->add_SyntenyRegion($last_sr->stretch(2));
221 #$self->logger->debug("nnn ".$last_sr->to_string."\n");
222 }
223 $last_merged = 0;
224 } else {
225 $self->add_SyntenyRegion($merged_sr->stretch(2));
226 #$self->logger->debug("mmm ".$merged_sr->to_string."\n");
227 $last_merged = 1;
228 }
229
230 $last_sr = $sr;
231 }
232
233 # deal with last synteny region in @sorted
234 unless ($last_merged) {
235 $self->add_SyntenyRegion($last_sr->stretch(2));
236 $last_merged = 0;
237 }
238
239 #foreach my $sr (@{ $self->get_all_SyntenyRegions }) {
240 # $self->logger->debug("SRs ".$sr->to_string."\n");
241 #}
242
243 $self->logger->info("SyntenyRegions after merging: ".scalar(@{ $self->get_all_SyntenyRegions })."\n");
244
245 }
246
247
248 #
249 # sort SyntenyRegions by overlap
250 #
251 sub _by_overlap {
252 # first sort by seq_region
253 my $retval = ($b->source_seq_region_name cmp $a->source_seq_region_name);
254 return $retval if ($retval);
255
256 # then sort by overlap:
257 # return -1 if $a is downstream, 1 if it's upstream, 0 if they overlap
258 if ($a->source_end < $b->source_start) { return 1; }
259 if ($a->source_start < $b->source_end) { return -1; }
260 return 0;
261 }
262
263
264 =head2 add_SyntenyRegion
265
266 Arg[1] : Bio::EnsEMBL::IdMaping::SyntenyRegion - SyntenyRegion to add
267 Example : $synteny_framework->add_SyntenyRegion($synteny_region);
268 Description : Adds a SyntenyRegion to the framework. For speed reasons (and
269 since this is an internal method), no argument check is done.
270 Return type : none
271 Exceptions : none
272 Caller : internal
273 Status : At Risk
274 : under development
275
276 =cut
277
278 sub add_SyntenyRegion {
279 push @{ $_[0]->{'cache'} }, $_[1];
280 }
281
282
283 =head2 get_all_SyntenyRegions
284
285 Example : foreach my $sr (@{ $sf->get_all_SyntenyRegions }) {
286 # do something with the SyntenyRegion
287 }
288 Description : Get a list of all SyntenyRegions in the framework.
289 Return type : Arrayref of Bio::EnsEMBL::IdMapping::SyntenyRegion
290 Exceptions : none
291 Caller : general
292 Status : At Risk
293 : under development
294
295 =cut
296
297 sub get_all_SyntenyRegions {
298 return $_[0]->{'cache'};
299 }
300
301
302 =head2 rescore_gene_matrix_lsf
303
304 Arg[1] : Bio::EnsEMBL::IdMapping::ScoredmappingMatrix $matrix - gene
305 scores to rescore
306 Example : my $new_scores = $sf->rescore_gene_matrix_lsf($gene_scores);
307 Description : This method runs rescore_gene_matrix() (via the
308 synteny_resocre.pl script) in parallel with lsf, then combines
309 the results to return a single rescored scoring matrix.
310 Parallelisation is done by chunking the scoring matrix into
311 several pieces (determined by the --synteny_rescore_jobs
312 configuration option).
313 Return type : Bio::EnsEMBL::IdMapping::ScoredMappingMatrix
314 Exceptions : thrown on wrong or missing argument
315 thrown on filesystem I/O error
316 thrown on failure of one or mor lsf jobs
317 Caller : InternalIdMapper plugins
318 Status : At Risk
319 : under development
320
321 =cut
322
323 sub rescore_gene_matrix_lsf {
324 my $self = shift;
325 my $matrix = shift;
326
327 unless ($matrix and
328 $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
329 throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
330 }
331
332 # serialise SyntenyFramework to disk
333 $self->logger->debug("Serialising SyntenyFramework...\n", 0, 'stamped');
334 $self->write_to_file;
335 $self->logger->debug("Done.\n", 0, 'stamped');
336
337 # split the ScoredMappingMatrix into chunks and write to disk
338 my $matrix_size = $matrix->size;
339 $self->logger->debug("Scores before rescoring: $matrix_size.\n");
340
341 my $num_jobs = $self->conf->param('synteny_rescore_jobs') || 20;
342 $num_jobs++;
343
344 my $dump_path = path_append($self->conf->param('basedir'),
345 'matrix/synteny_rescore');
346
347 $self->logger->debug("Creating sub-matrices...\n", 0, 'stamped');
348 foreach my $i (1..$num_jobs) {
349 my $start = (int($matrix_size/($num_jobs-1)) * ($i - 1)) + 1;
350 my $end = int($matrix_size/($num_jobs-1)) * $i;
351 $self->logger->debug("$start-$end\n", 1);
352 my $sub_matrix = $matrix->sub_matrix($start, $end);
353
354 $sub_matrix->cache_file_name("gene_matrix_synteny$i.ser");
355 $sub_matrix->dump_path($dump_path);
356
357 $sub_matrix->write_to_file;
358 }
359 $self->logger->debug("Done.\n", 0, 'stamped');
360
361 # create an empty lsf log directory
362 my $logpath = path_append($self->logger->logpath, 'synteny_rescore');
363 system("rm -rf $logpath") == 0 or
364 $self->logger->error("Unable to delete lsf log dir $logpath: $!\n");
365 system("mkdir -p $logpath") == 0 or
366 $self->logger->error("Can't create lsf log dir $logpath: $!\n");
367
368 # build lsf command
369 my $lsf_name = 'idmapping_synteny_rescore_'.time;
370
371 my $options = $self->conf->create_commandline_options(
372 logauto => 1,
373 logautobase => "synteny_rescore",
374 logpath => $logpath,
375 interactive => 0,
376 is_component => 1,
377 );
378
379 my $cmd = qq{$Bin/synteny_rescore.pl $options --index \$LSB_JOBINDEX};
380
381 my $bsub_cmd =
382 sprintf( "|bsub -J%s[1-%d] "
383 . "-o %s/synteny_rescore.%%I.out "
384 . "-e %s/synteny_rescore.%%I.err %s",
385 $lsf_name, $num_jobs, $logpath, $logpath,
386 $self->conf()->param('lsf_opt_synteny_rescore') );
387
388 # run lsf job array
389 $self->logger->info("Submitting $num_jobs jobs to lsf.\n");
390 $self->logger->debug("$cmd\n\n");
391
392 local *BSUB;
393 open( BSUB, $bsub_cmd )
394 or $self->logger->error("Could not open open pipe to bsub: $!\n");
395
396 print BSUB $cmd;
397 $self->logger->error("Error submitting synteny rescoring jobs: $!\n")
398 unless ($? == 0);
399 close BSUB;
400
401 # submit dependent job to monitor finishing of jobs
402 $self->logger->info("Waiting for jobs to finish...\n", 0, 'stamped');
403
404 my $dependent_job = qq{bsub -K -w "ended($lsf_name)" -q small } .
405 qq{-o $logpath/synteny_rescore_depend.out /bin/true};
406
407 system($dependent_job) == 0 or
408 $self->logger->error("Error submitting dependent job: $!\n");
409
410 $self->logger->info("All jobs finished.\n", 0, 'stamped');
411
412 # check for lsf errors
413 sleep(5);
414 my $err;
415 foreach my $i (1..$num_jobs) {
416 $err++ unless (-e "$logpath/synteny_rescore.$i.success");
417 }
418
419 if ($err) {
420 $self->logger->error("At least one of your jobs failed.\nPlease check the logfiles at $logpath for errors.\n");
421 }
422
423 # merge and return matrix
424 $self->logger->debug("Merging rescored matrices...\n");
425 $matrix->flush;
426
427 foreach my $i (1..$num_jobs) {
428 # read partial matrix created by lsf job from file
429 my $sub_matrix = Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new(
430 -DUMP_PATH => $dump_path,
431 -CACHE_FILE => "gene_matrix_synteny$i.ser",
432 );
433 $sub_matrix->read_from_file;
434
435 # merge with main matrix
436 $matrix->merge($sub_matrix);
437 }
438
439 $self->logger->debug("Done.\n");
440 $self->logger->debug("Scores after rescoring: ".$matrix->size.".\n");
441
442 return $matrix;
443 }
444
445
446 #
447 #
448 =head2 rescore_gene_matrix
449
450 Arg[1] : Bio::EnsEMBL::IdMapping::ScoredmappingMatrix $matrix - gene
451 scores to rescore
452 Example : my $new_scores = $sf->rescore_gene_matrix($gene_scores);
453 Description : Rescores a gene matrix. Retains 70% of old score and builds
454 other 30% from the synteny match.
455 Return type : Bio::EnsEMBL::IdMapping::ScoredMappingMatrix
456 Exceptions : thrown on wrong or missing argument
457 Caller : InternalIdMapper plugins
458 Status : At Risk
459 : under development
460
461 =cut
462
463 sub rescore_gene_matrix {
464 my $self = shift;
465 my $matrix = shift;
466
467 unless ($matrix and
468 $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
469 throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
470 }
471
472 my $retain_factor = 0.7;
473
474 foreach my $entry (@{ $matrix->get_all_Entries }) {
475 my $source_gene = $self->cache->get_by_key('genes_by_id', 'source',
476 $entry->source);
477
478 my $target_gene = $self->cache->get_by_key('genes_by_id', 'target',
479 $entry->target);
480
481 my $highest_score = 0;
482
483 foreach my $sr (@{ $self->get_all_SyntenyRegions }) {
484 my $score = $sr->score_location_relationship($source_gene, $target_gene);
485 $highest_score = $score if ($score > $highest_score);
486 }
487
488 #$self->logger->debug("highscore ".$entry->to_string." ".
489 # sprintf("%.6f\n", $highest_score));
490
491 $matrix->set_score($entry->source, $entry->target,
492 ($entry->score * 0.7 + $highest_score * 0.3));
493 }
494
495 return $matrix;
496 }
497
498
499 =head2 logger
500
501 Arg[1] : (optional) Bio::EnsEMBL::Utils::Logger - the logger to set
502 Example : $object->logger->info("Starting ID mapping.\n");
503 Description : Getter/setter for logger object
504 Return type : Bio::EnsEMBL::Utils::Logger
505 Exceptions : none
506 Caller : constructor
507 Status : At Risk
508 : under development
509
510 =cut
511
512 sub logger {
513 my $self = shift;
514 $self->{'_logger'} = shift if (@_);
515 return $self->{'_logger'};
516 }
517
518
519 =head2 conf
520
521 Arg[1] : (optional) Bio::EnsEMBL::Utils::ConfParser - the configuration
522 to set
523 Example : my $basedir = $object->conf->param('basedir');
524 Description : Getter/setter for configuration object
525 Return type : Bio::EnsEMBL::Utils::ConfParser
526 Exceptions : none
527 Caller : constructor
528 Status : At Risk
529 : under development
530
531 =cut
532
533 sub conf {
534 my $self = shift;
535 $self->{'_conf'} = shift if (@_);
536 return $self->{'_conf'};
537 }
538
539
540 =head2 cache
541
542 Arg[1] : (optional) Bio::EnsEMBL::IdMapping::Cache - the cache to set
543 Example : $object->cache->read_from_file('source');
544 Description : Getter/setter for cache object
545 Return type : Bio::EnsEMBL::IdMapping::Cache
546 Exceptions : none
547 Caller : constructor
548 Status : At Risk
549 : under development
550
551 =cut
552
553 sub cache {
554 my $self = shift;
555 $self->{'_cache'} = shift if (@_);
556 return $self->{'_cache'};
557 }
558
559
560 1;
561