Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/EnsEMBL/IdMapping/SyntenyFramework.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 =head1 LICENSE | |
2 | |
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and | |
4 Genome Research Limited. All rights reserved. | |
5 | |
6 This software is distributed under a modified Apache license. | |
7 For license details, please see | |
8 | |
9 http://www.ensembl.org/info/about/code_licence.html | |
10 | |
11 =head1 CONTACT | |
12 | |
13 Please email comments or questions to the public Ensembl | |
14 developers list at <dev@ensembl.org>. | |
15 | |
16 Questions may also be sent to the Ensembl help desk at | |
17 <helpdesk@ensembl.org>. | |
18 | |
19 =cut | |
20 | |
21 =head1 NAME | |
22 | |
23 Bio::EnsEMBL::IdMapping::SyntenyFramework - framework representing syntenic | |
24 regions across the genome | |
25 | |
26 =head1 SYNOPSIS | |
27 | |
28 # build the SyntenyFramework from unambiguous gene mappings | |
29 my $sf = Bio::EnsEMBL::IdMapping::SyntenyFramework->new( | |
30 -DUMP_PATH => $dump_path, | |
31 -CACHE_FILE => 'synteny_framework.ser', | |
32 -LOGGER => $self->logger, | |
33 -CONF => $self->conf, | |
34 -CACHE => $self->cache, | |
35 ); | |
36 $sf->build_synteny($gene_mappings); | |
37 | |
38 # use it to rescore the genes | |
39 $gene_scores = $sf->rescore_gene_matrix_lsf($gene_scores); | |
40 | |
41 =head1 DESCRIPTION | |
42 | |
43 The SyntenyFramework is a set of SyntenyRegions. These are pairs of | |
44 locations very analoguous to the information in the assembly table (the | |
45 locations dont have to be the same length though). They are built from | |
46 genes that map uniquely between source and target. | |
47 | |
48 Once built, the SyntenyFramework is used to score source and target gene | |
49 pairs to determine whether they are similar. This process is slow (it | |
50 involves testing all gene pairs against all SyntenyRegions), this module | |
51 therefor has built-in support to run the process in parallel via LSF. | |
52 | |
53 =head1 METHODS | |
54 | |
55 new | |
56 build_synteny | |
57 _by_overlap | |
58 add_SyntenyRegion | |
59 get_all_SyntenyRegions | |
60 rescore_gene_matrix_lsf | |
61 rescore_gene_matrix | |
62 logger | |
63 conf | |
64 cache | |
65 | |
66 =cut | |
67 | |
68 package Bio::EnsEMBL::IdMapping::SyntenyFramework; | |
69 | |
70 use strict; | |
71 use warnings; | |
72 no warnings 'uninitialized'; | |
73 | |
74 use Bio::EnsEMBL::IdMapping::Serialisable; | |
75 our @ISA = qw(Bio::EnsEMBL::IdMapping::Serialisable); | |
76 | |
77 use Bio::EnsEMBL::Utils::Argument qw(rearrange); | |
78 use Bio::EnsEMBL::Utils::Exception qw(throw warning); | |
79 use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append); | |
80 use Bio::EnsEMBL::IdMapping::SyntenyRegion; | |
81 use Bio::EnsEMBL::IdMapping::ScoredMappingMatrix; | |
82 | |
83 use FindBin qw($Bin); | |
84 FindBin->again; | |
85 | |
86 | |
87 =head2 new | |
88 | |
89 Arg [LOGGER]: Bio::EnsEMBL::Utils::Logger $logger - a logger object | |
90 Arg [CONF] : Bio::EnsEMBL::Utils::ConfParser $conf - a configuration object | |
91 Arg [CACHE] : Bio::EnsEMBL::IdMapping::Cache $cache - a cache object | |
92 Arg [DUMP_PATH] : String - path for object serialisation | |
93 Arg [CACHE_FILE] : String - filename of serialised object | |
94 Example : my $sf = Bio::EnsEMBL::IdMapping::SyntenyFramework->new( | |
95 -DUMP_PATH => $dump_path, | |
96 -CACHE_FILE => 'synteny_framework.ser', | |
97 -LOGGER => $self->logger, | |
98 -CONF => $self->conf, | |
99 -CACHE => $self->cache, | |
100 ); | |
101 Description : Constructor. | |
102 Return type : Bio::EnsEMBL::IdMapping::SyntenyFramework | |
103 Exceptions : thrown on wrong or missing arguments | |
104 Caller : InternalIdMapper plugins | |
105 Status : At Risk | |
106 : under development | |
107 | |
108 =cut | |
109 | |
110 sub new { | |
111 my $caller = shift; | |
112 my $class = ref($caller) || $caller; | |
113 my $self = $class->SUPER::new(@_); | |
114 | |
115 my ($logger, $conf, $cache) = rearrange(['LOGGER', 'CONF', 'CACHE'], @_); | |
116 | |
117 unless ($logger and ref($logger) and | |
118 $logger->isa('Bio::EnsEMBL::Utils::Logger')) { | |
119 throw("You must provide a Bio::EnsEMBL::Utils::Logger for logging."); | |
120 } | |
121 | |
122 unless ($conf and ref($conf) and | |
123 $conf->isa('Bio::EnsEMBL::Utils::ConfParser')) { | |
124 throw("You must provide configuration as a Bio::EnsEMBL::Utils::ConfParser object."); | |
125 } | |
126 | |
127 unless ($cache and ref($cache) and | |
128 $cache->isa('Bio::EnsEMBL::IdMapping::Cache')) { | |
129 throw("You must provide configuration as a Bio::EnsEMBL::IdMapping::Cache object."); | |
130 } | |
131 | |
132 # initialise | |
133 $self->logger($logger); | |
134 $self->conf($conf); | |
135 $self->cache($cache); | |
136 $self->{'cache'} = []; | |
137 | |
138 return $self; | |
139 } | |
140 | |
141 | |
142 =head2 build_synteny | |
143 | |
144 Arg[1] : Bio::EnsEMBL::IdMapping::MappingList $mappings - gene mappings | |
145 to build the SyntenyFramework from | |
146 Example : $synteny_framework->build_synteny($gene_mappings); | |
147 Description : Builds the SyntenyFramework from unambiguous gene mappings. | |
148 SyntenyRegions are allowed to overlap. At most two overlapping | |
149 SyntenyRegions are merged (otherwise we'd get too large | |
150 SyntenyRegions with little information content). | |
151 Return type : none | |
152 Exceptions : thrown on wrong or missing argument | |
153 Caller : InternalIdMapper plugins | |
154 Status : At Risk | |
155 : under development | |
156 | |
157 =cut | |
158 | |
159 sub build_synteny { | |
160 my $self = shift; | |
161 my $mappings = shift; | |
162 | |
163 unless ($mappings and | |
164 $mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) { | |
165 throw('Need a gene Bio::EnsEMBL::IdMapping::MappingList.'); | |
166 } | |
167 | |
168 # create a synteny region for each mapping | |
169 my @synteny_regions = (); | |
170 | |
171 foreach my $entry (@{ $mappings->get_all_Entries }) { | |
172 | |
173 my $source_gene = $self->cache->get_by_key('genes_by_id', 'source', | |
174 $entry->source); | |
175 my $target_gene = $self->cache->get_by_key('genes_by_id', 'target', | |
176 $entry->target); | |
177 | |
178 my $sr = Bio::EnsEMBL::IdMapping::SyntenyRegion->new_fast([ | |
179 $source_gene->start, | |
180 $source_gene->end, | |
181 $source_gene->strand, | |
182 $source_gene->seq_region_name, | |
183 $target_gene->start, | |
184 $target_gene->end, | |
185 $target_gene->strand, | |
186 $target_gene->seq_region_name, | |
187 $entry->score, | |
188 ]); | |
189 | |
190 push @synteny_regions, $sr; | |
191 } | |
192 | |
193 unless (@synteny_regions) { | |
194 $self->logger->warning("No synteny regions could be identified.\n"); | |
195 return; | |
196 } | |
197 | |
198 # sort synteny regions | |
199 #my @sorted = sort _by_overlap @synteny_regions; | |
200 my @sorted = reverse sort { | |
201 $a->source_seq_region_name cmp $b->source_seq_region_name || | |
202 $a->source_start <=> $b->source_start || | |
203 $a->source_end <=> $b->source_end } @synteny_regions; | |
204 | |
205 $self->logger->info("SyntenyRegions before merging: ".scalar(@sorted)."\n"); | |
206 | |
207 # now create merged regions from overlapping syntenies, but only merge a | |
208 # maximum of 2 regions (otherwise you end up with large synteny blocks which | |
209 # won't contain much information in this context) | |
210 my $last_merged = 0; | |
211 my $last_sr = shift(@sorted); | |
212 | |
213 while (my $sr = shift(@sorted)) { | |
214 #$self->logger->debug("this ".$sr->to_string."\n"); | |
215 | |
216 my $merged_sr = $last_sr->merge($sr); | |
217 | |
218 if (! $merged_sr) { | |
219 unless ($last_merged) { | |
220 $self->add_SyntenyRegion($last_sr->stretch(2)); | |
221 #$self->logger->debug("nnn ".$last_sr->to_string."\n"); | |
222 } | |
223 $last_merged = 0; | |
224 } else { | |
225 $self->add_SyntenyRegion($merged_sr->stretch(2)); | |
226 #$self->logger->debug("mmm ".$merged_sr->to_string."\n"); | |
227 $last_merged = 1; | |
228 } | |
229 | |
230 $last_sr = $sr; | |
231 } | |
232 | |
233 # deal with last synteny region in @sorted | |
234 unless ($last_merged) { | |
235 $self->add_SyntenyRegion($last_sr->stretch(2)); | |
236 $last_merged = 0; | |
237 } | |
238 | |
239 #foreach my $sr (@{ $self->get_all_SyntenyRegions }) { | |
240 # $self->logger->debug("SRs ".$sr->to_string."\n"); | |
241 #} | |
242 | |
243 $self->logger->info("SyntenyRegions after merging: ".scalar(@{ $self->get_all_SyntenyRegions })."\n"); | |
244 | |
245 } | |
246 | |
247 | |
248 # | |
249 # sort SyntenyRegions by overlap | |
250 # | |
251 sub _by_overlap { | |
252 # first sort by seq_region | |
253 my $retval = ($b->source_seq_region_name cmp $a->source_seq_region_name); | |
254 return $retval if ($retval); | |
255 | |
256 # then sort by overlap: | |
257 # return -1 if $a is downstream, 1 if it's upstream, 0 if they overlap | |
258 if ($a->source_end < $b->source_start) { return 1; } | |
259 if ($a->source_start < $b->source_end) { return -1; } | |
260 return 0; | |
261 } | |
262 | |
263 | |
264 =head2 add_SyntenyRegion | |
265 | |
266 Arg[1] : Bio::EnsEMBL::IdMaping::SyntenyRegion - SyntenyRegion to add | |
267 Example : $synteny_framework->add_SyntenyRegion($synteny_region); | |
268 Description : Adds a SyntenyRegion to the framework. For speed reasons (and | |
269 since this is an internal method), no argument check is done. | |
270 Return type : none | |
271 Exceptions : none | |
272 Caller : internal | |
273 Status : At Risk | |
274 : under development | |
275 | |
276 =cut | |
277 | |
278 sub add_SyntenyRegion { | |
279 push @{ $_[0]->{'cache'} }, $_[1]; | |
280 } | |
281 | |
282 | |
283 =head2 get_all_SyntenyRegions | |
284 | |
285 Example : foreach my $sr (@{ $sf->get_all_SyntenyRegions }) { | |
286 # do something with the SyntenyRegion | |
287 } | |
288 Description : Get a list of all SyntenyRegions in the framework. | |
289 Return type : Arrayref of Bio::EnsEMBL::IdMapping::SyntenyRegion | |
290 Exceptions : none | |
291 Caller : general | |
292 Status : At Risk | |
293 : under development | |
294 | |
295 =cut | |
296 | |
297 sub get_all_SyntenyRegions { | |
298 return $_[0]->{'cache'}; | |
299 } | |
300 | |
301 | |
302 =head2 rescore_gene_matrix_lsf | |
303 | |
304 Arg[1] : Bio::EnsEMBL::IdMapping::ScoredmappingMatrix $matrix - gene | |
305 scores to rescore | |
306 Example : my $new_scores = $sf->rescore_gene_matrix_lsf($gene_scores); | |
307 Description : This method runs rescore_gene_matrix() (via the | |
308 synteny_resocre.pl script) in parallel with lsf, then combines | |
309 the results to return a single rescored scoring matrix. | |
310 Parallelisation is done by chunking the scoring matrix into | |
311 several pieces (determined by the --synteny_rescore_jobs | |
312 configuration option). | |
313 Return type : Bio::EnsEMBL::IdMapping::ScoredMappingMatrix | |
314 Exceptions : thrown on wrong or missing argument | |
315 thrown on filesystem I/O error | |
316 thrown on failure of one or mor lsf jobs | |
317 Caller : InternalIdMapper plugins | |
318 Status : At Risk | |
319 : under development | |
320 | |
321 =cut | |
322 | |
323 sub rescore_gene_matrix_lsf { | |
324 my $self = shift; | |
325 my $matrix = shift; | |
326 | |
327 unless ($matrix and | |
328 $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { | |
329 throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); | |
330 } | |
331 | |
332 # serialise SyntenyFramework to disk | |
333 $self->logger->debug("Serialising SyntenyFramework...\n", 0, 'stamped'); | |
334 $self->write_to_file; | |
335 $self->logger->debug("Done.\n", 0, 'stamped'); | |
336 | |
337 # split the ScoredMappingMatrix into chunks and write to disk | |
338 my $matrix_size = $matrix->size; | |
339 $self->logger->debug("Scores before rescoring: $matrix_size.\n"); | |
340 | |
341 my $num_jobs = $self->conf->param('synteny_rescore_jobs') || 20; | |
342 $num_jobs++; | |
343 | |
344 my $dump_path = path_append($self->conf->param('basedir'), | |
345 'matrix/synteny_rescore'); | |
346 | |
347 $self->logger->debug("Creating sub-matrices...\n", 0, 'stamped'); | |
348 foreach my $i (1..$num_jobs) { | |
349 my $start = (int($matrix_size/($num_jobs-1)) * ($i - 1)) + 1; | |
350 my $end = int($matrix_size/($num_jobs-1)) * $i; | |
351 $self->logger->debug("$start-$end\n", 1); | |
352 my $sub_matrix = $matrix->sub_matrix($start, $end); | |
353 | |
354 $sub_matrix->cache_file_name("gene_matrix_synteny$i.ser"); | |
355 $sub_matrix->dump_path($dump_path); | |
356 | |
357 $sub_matrix->write_to_file; | |
358 } | |
359 $self->logger->debug("Done.\n", 0, 'stamped'); | |
360 | |
361 # create an empty lsf log directory | |
362 my $logpath = path_append($self->logger->logpath, 'synteny_rescore'); | |
363 system("rm -rf $logpath") == 0 or | |
364 $self->logger->error("Unable to delete lsf log dir $logpath: $!\n"); | |
365 system("mkdir -p $logpath") == 0 or | |
366 $self->logger->error("Can't create lsf log dir $logpath: $!\n"); | |
367 | |
368 # build lsf command | |
369 my $lsf_name = 'idmapping_synteny_rescore_'.time; | |
370 | |
371 my $options = $self->conf->create_commandline_options( | |
372 logauto => 1, | |
373 logautobase => "synteny_rescore", | |
374 logpath => $logpath, | |
375 interactive => 0, | |
376 is_component => 1, | |
377 ); | |
378 | |
379 my $cmd = qq{$Bin/synteny_rescore.pl $options --index \$LSB_JOBINDEX}; | |
380 | |
381 my $bsub_cmd = | |
382 sprintf( "|bsub -J%s[1-%d] " | |
383 . "-o %s/synteny_rescore.%%I.out " | |
384 . "-e %s/synteny_rescore.%%I.err %s", | |
385 $lsf_name, $num_jobs, $logpath, $logpath, | |
386 $self->conf()->param('lsf_opt_synteny_rescore') ); | |
387 | |
388 # run lsf job array | |
389 $self->logger->info("Submitting $num_jobs jobs to lsf.\n"); | |
390 $self->logger->debug("$cmd\n\n"); | |
391 | |
392 local *BSUB; | |
393 open( BSUB, $bsub_cmd ) | |
394 or $self->logger->error("Could not open open pipe to bsub: $!\n"); | |
395 | |
396 print BSUB $cmd; | |
397 $self->logger->error("Error submitting synteny rescoring jobs: $!\n") | |
398 unless ($? == 0); | |
399 close BSUB; | |
400 | |
401 # submit dependent job to monitor finishing of jobs | |
402 $self->logger->info("Waiting for jobs to finish...\n", 0, 'stamped'); | |
403 | |
404 my $dependent_job = qq{bsub -K -w "ended($lsf_name)" -q small } . | |
405 qq{-o $logpath/synteny_rescore_depend.out /bin/true}; | |
406 | |
407 system($dependent_job) == 0 or | |
408 $self->logger->error("Error submitting dependent job: $!\n"); | |
409 | |
410 $self->logger->info("All jobs finished.\n", 0, 'stamped'); | |
411 | |
412 # check for lsf errors | |
413 sleep(5); | |
414 my $err; | |
415 foreach my $i (1..$num_jobs) { | |
416 $err++ unless (-e "$logpath/synteny_rescore.$i.success"); | |
417 } | |
418 | |
419 if ($err) { | |
420 $self->logger->error("At least one of your jobs failed.\nPlease check the logfiles at $logpath for errors.\n"); | |
421 } | |
422 | |
423 # merge and return matrix | |
424 $self->logger->debug("Merging rescored matrices...\n"); | |
425 $matrix->flush; | |
426 | |
427 foreach my $i (1..$num_jobs) { | |
428 # read partial matrix created by lsf job from file | |
429 my $sub_matrix = Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new( | |
430 -DUMP_PATH => $dump_path, | |
431 -CACHE_FILE => "gene_matrix_synteny$i.ser", | |
432 ); | |
433 $sub_matrix->read_from_file; | |
434 | |
435 # merge with main matrix | |
436 $matrix->merge($sub_matrix); | |
437 } | |
438 | |
439 $self->logger->debug("Done.\n"); | |
440 $self->logger->debug("Scores after rescoring: ".$matrix->size.".\n"); | |
441 | |
442 return $matrix; | |
443 } | |
444 | |
445 | |
446 # | |
447 # | |
448 =head2 rescore_gene_matrix | |
449 | |
450 Arg[1] : Bio::EnsEMBL::IdMapping::ScoredmappingMatrix $matrix - gene | |
451 scores to rescore | |
452 Example : my $new_scores = $sf->rescore_gene_matrix($gene_scores); | |
453 Description : Rescores a gene matrix. Retains 70% of old score and builds | |
454 other 30% from the synteny match. | |
455 Return type : Bio::EnsEMBL::IdMapping::ScoredMappingMatrix | |
456 Exceptions : thrown on wrong or missing argument | |
457 Caller : InternalIdMapper plugins | |
458 Status : At Risk | |
459 : under development | |
460 | |
461 =cut | |
462 | |
463 sub rescore_gene_matrix { | |
464 my $self = shift; | |
465 my $matrix = shift; | |
466 | |
467 unless ($matrix and | |
468 $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) { | |
469 throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'); | |
470 } | |
471 | |
472 my $retain_factor = 0.7; | |
473 | |
474 foreach my $entry (@{ $matrix->get_all_Entries }) { | |
475 my $source_gene = $self->cache->get_by_key('genes_by_id', 'source', | |
476 $entry->source); | |
477 | |
478 my $target_gene = $self->cache->get_by_key('genes_by_id', 'target', | |
479 $entry->target); | |
480 | |
481 my $highest_score = 0; | |
482 | |
483 foreach my $sr (@{ $self->get_all_SyntenyRegions }) { | |
484 my $score = $sr->score_location_relationship($source_gene, $target_gene); | |
485 $highest_score = $score if ($score > $highest_score); | |
486 } | |
487 | |
488 #$self->logger->debug("highscore ".$entry->to_string." ". | |
489 # sprintf("%.6f\n", $highest_score)); | |
490 | |
491 $matrix->set_score($entry->source, $entry->target, | |
492 ($entry->score * 0.7 + $highest_score * 0.3)); | |
493 } | |
494 | |
495 return $matrix; | |
496 } | |
497 | |
498 | |
499 =head2 logger | |
500 | |
501 Arg[1] : (optional) Bio::EnsEMBL::Utils::Logger - the logger to set | |
502 Example : $object->logger->info("Starting ID mapping.\n"); | |
503 Description : Getter/setter for logger object | |
504 Return type : Bio::EnsEMBL::Utils::Logger | |
505 Exceptions : none | |
506 Caller : constructor | |
507 Status : At Risk | |
508 : under development | |
509 | |
510 =cut | |
511 | |
512 sub logger { | |
513 my $self = shift; | |
514 $self->{'_logger'} = shift if (@_); | |
515 return $self->{'_logger'}; | |
516 } | |
517 | |
518 | |
519 =head2 conf | |
520 | |
521 Arg[1] : (optional) Bio::EnsEMBL::Utils::ConfParser - the configuration | |
522 to set | |
523 Example : my $basedir = $object->conf->param('basedir'); | |
524 Description : Getter/setter for configuration object | |
525 Return type : Bio::EnsEMBL::Utils::ConfParser | |
526 Exceptions : none | |
527 Caller : constructor | |
528 Status : At Risk | |
529 : under development | |
530 | |
531 =cut | |
532 | |
533 sub conf { | |
534 my $self = shift; | |
535 $self->{'_conf'} = shift if (@_); | |
536 return $self->{'_conf'}; | |
537 } | |
538 | |
539 | |
540 =head2 cache | |
541 | |
542 Arg[1] : (optional) Bio::EnsEMBL::IdMapping::Cache - the cache to set | |
543 Example : $object->cache->read_from_file('source'); | |
544 Description : Getter/setter for cache object | |
545 Return type : Bio::EnsEMBL::IdMapping::Cache | |
546 Exceptions : none | |
547 Caller : constructor | |
548 Status : At Risk | |
549 : under development | |
550 | |
551 =cut | |
552 | |
553 sub cache { | |
554 my $self = shift; | |
555 $self->{'_cache'} = shift if (@_); | |
556 return $self->{'_cache'}; | |
557 } | |
558 | |
559 | |
560 1; | |
561 |