0
|
1 =head1 LICENSE
|
|
2
|
|
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
|
|
4 Genome Research Limited. All rights reserved.
|
|
5
|
|
6 This software is distributed under a modified Apache license.
|
|
7 For license details, please see
|
|
8
|
|
9 http://www.ensembl.org/info/about/code_licence.html
|
|
10
|
|
11 =head1 CONTACT
|
|
12
|
|
13 Please email comments or questions to the public Ensembl
|
|
14 developers list at <dev@ensembl.org>.
|
|
15
|
|
16 Questions may also be sent to the Ensembl help desk at
|
|
17 <helpdesk@ensembl.org>.
|
|
18
|
|
19 =head1 NAME
|
|
20
|
|
21 Bio::EnsEMBL::Compara::AlignSlice - An AlignSlice can be used to map genes and features from one species onto another one
|
|
22
|
|
23 =head1 DESCRIPTION
|
|
24
|
|
25 INTRODUCTION
|
|
26
|
|
27 An AlignSlice is an object built with a reference Slice and the corresponding set of genomic alignments.
|
|
28 The genomic alignments are used to map features from one species onto another and viceversa.
|
|
29
|
|
30 STRUCTURE
|
|
31
|
|
32 Every Bio::EnsEMBL::Compara::AlignSlice contains a set of Bio::EnsEMBL::Compara::AlignSlice::Slice
|
|
33 objects, at least one by species involved in the alignments. For instance, if the reference Slice is a
|
|
34 human slice and the set of alignments corresponds to human-mouse BLASTZ_NET alignments, there will be at
|
|
35 least one Bio::EnsEMBL::Compara::AlignSlice::Slice for human and at least another one for mouse. The main
|
|
36 Bio::EnsEMBL::Compara::AlignSlice::Slice for the reference species will contain a single genomic sequence
|
|
37 whilst the other Bio::EnsEMBL::Compara::AlignSlice::Slice objects might be made of several pieces of
|
|
38 genomic sequences, depending on the set of alignments. Here is a graphical representation:
|
|
39
|
|
40 ref.Slice **************************************************************
|
|
41
|
|
42 alignments 11111111111
|
|
43 2222222222222222
|
|
44 33333333333333333
|
|
45
|
|
46 resulting Bio::EnsEMBL::Compara::AlignSlice:
|
|
47
|
|
48 AS::Slice 1 **************************************************************
|
|
49 AS::Slice 2 .11111111111....2222222222222222......33333333333333333.......
|
|
50
|
|
51 MODES
|
|
52
|
|
53 Two modes are currently available: condensed and expanded mode. The default mode is "condensed". In
|
|
54 condensed mode, no gaps are allowed in the reference Slice which means that information about deletions
|
|
55 in the reference species (i.e. insertions in the other species) are lost. On the other hand, the
|
|
56 first Bio::EnsEMBL::Compara::AlignSlice::Slice object corresponds to the original Bio::EnsEMBL::Slice.
|
|
57
|
|
58 In the expanded mode, the first Bio::EnsEMBL::Compara::AlignSlice::Slice is expanded in order to
|
|
59 accomodate the gaps corresponding to the deletions (insertions). Bear in mind that in expanded mode, the
|
|
60 length of the resulting AlignSlice will be most probably larger than the length of the reference
|
|
61 Bio::EnsEMBL::Slice.
|
|
62
|
|
63
|
|
64 OVERLAPPING ALIGNMENTS
|
|
65
|
|
66 No overlapping alignments are allowed by default. This means that if an alignment overlaps another one, the
|
|
67 second alignment is ignored. This is due to lack of information needed to reconciliate both alignment.
|
|
68 Here is a graphical example showing this problem:
|
|
69
|
|
70 ALN 1: Human (ref) CTGTGAAAA----CCCCATTAGG
|
|
71 Mouse (1) CTGAAAATTTTCCCC
|
|
72
|
|
73 ALN 2: Human (ref) CTGTGAAAA---CCCCATTAGG
|
|
74 Mouse (1) AAAGGGCCCCATTA
|
|
75
|
|
76 Possible solution 1:
|
|
77 Human (ref) CTGTGAAAA----CCCCATTAGG
|
|
78 Mouse (1) CTGAAAATTTTCCCC----
|
|
79 Mouse (1) ----AAA-GGGCCCCATTA
|
|
80
|
|
81 Possible solution 2:
|
|
82 Human (ref) CTGTGAAAA----CCCCATTAGG
|
|
83 Mouse (1) CTGAAAATTTTCCCC----
|
|
84 Mouse (1) ----AAAGGG-CCCCATTA
|
|
85
|
|
86 Possible solution 3:
|
|
87 Human (ref) CTGTGAAAA-------CCCCATTAGG
|
|
88 Mouse (1) CTGAAAATTTT---CCCC
|
|
89 Mouse (1) AAA----GGGCCCCATTA
|
|
90
|
|
91 There is no easy way to find which of these possible solution is the best without trying to realign the
|
|
92 three sequences together and this is far beyond the aim of this module. The best solution is to start with
|
|
93 multiple alignments instead of overlapping pairwise ones.
|
|
94
|
|
95 The third possibility is probably not the best alignment you can get although its implementation is
|
|
96 systematic (insert as many gaps as needed in order to accommodate the insertions and never ever overlap
|
|
97 them) and computationally cheap as no realignment is needed. You may ask this module to solve overlapping
|
|
98 alignments in this way using the "solve_overlapping" option.
|
|
99
|
|
100 RESULT
|
|
101
|
|
102 The AlignSlice results in a set of Bio::EnsEMBL::Compara::AlignSlice::Slice which are objects really
|
|
103 similar to the Bio::EnsEMBL::Slice. There are intended to be used just like the genuine
|
|
104 Bio::EnsEMBL::Slice but some methods do not work though. Some examples of non-ported methods are: expand()
|
|
105 and invert(). Some other methods work as expected (seq, subseq, get_all_Attributes,
|
|
106 get_all_VariationFeatures, get_all_RepeatFeatures...). All these Bio::EnsEMBL::Compara::AlignSlice::Slice
|
|
107 share the same fake coordinate system defined by the Bio::EnsEMBL::Compara::AlignSlice. This allows to
|
|
108 map features from one species onto the others.
|
|
109
|
|
110 =head1 SYNOPSIS
|
|
111
|
|
112 use Bio::EnsEMBL::Compara::AlignSlice;
|
|
113
|
|
114 ## You may create your own AlignSlice objects but if you are interested in
|
|
115 ## getting AlignSlice built with data from an EnsEMBL Compara database you
|
|
116 ## should consider using the Bio::EnsEMBL::Compara::DBSQL::AlignSliceAdaptor
|
|
117 ## instead
|
|
118 my $align_slice = new Bio::EnsEMBL::Compara::AlignSlice(
|
|
119 -adaptor => $align_slice_adaptor,
|
|
120 -reference_Slice => $reference_Slice,
|
|
121 -method_link_species_set => $all_genomic_align_blocks,
|
|
122 -genomicAlignBlocks => $all_genomic_align_blocks,
|
|
123 -expanded => 1
|
|
124 -solve_overlapping => 1
|
|
125 );
|
|
126
|
|
127 my $all_slices = $aling_slice->get_all_Slices();
|
|
128 foreach my $this_slice (@$all_slices) {
|
|
129 ## See also Bio::EnsEMBL::Compara::AlignSlice::Slice
|
|
130 my $species_name = $this_slice->genome_db->name()
|
|
131 my $all_mapped_genes = $this_slice->get_all_Genes();
|
|
132 }
|
|
133
|
|
134 my $simple_align = $align_slice->get_projected_SimpleAlign();
|
|
135
|
|
136 SET VALUES
|
|
137 $align_slice->adaptor($align_slice_adaptor);
|
|
138 $align_slice->reference_Slice($reference_slice);
|
|
139 $align_slice->add_GenomicAlignBlock($genomic_align_block_1);
|
|
140 $align_slice->add_GenomicAlignBlock($genomic_align_block_2);
|
|
141 $align_slice->add_GenomicAlignBlock($genomic_align_block_3);
|
|
142
|
|
143 GET VALUES
|
|
144 my $align_slice_adaptor = $align_slice->adaptor();
|
|
145 my $reference_slice = $align_slice->reference_Slice();
|
|
146 my $all_genomic_align_blocks = $align_slice->get_all_GenomicAlignBlock();
|
|
147 my $mapped_genes = $align_slice->get_all_Genes_by_genome_db_id(3);
|
|
148 my $simple_align = $align_slice->get_projected_SimpleAlign();
|
|
149
|
|
150 =head1 OBJECT ATTRIBUTES
|
|
151
|
|
152 =over
|
|
153
|
|
154 =item adaptor
|
|
155
|
|
156 Bio::EnsEMBL::Compara::DBSQL::AlignSliceAdaptor object to access DB
|
|
157
|
|
158 =item reference_slice
|
|
159
|
|
160 Bio::EnsEMBL::Slice object used to create this Bio::EnsEBML::Compara::AlignSlice object
|
|
161
|
|
162 =item all_genomic_align_blocks
|
|
163
|
|
164 a listref of Bio::EnsEMBL::Compara::GenomicAlignBlock objects found using the reference_Slice
|
|
165
|
|
166 =back
|
|
167
|
|
168 =head1 APPENDIX
|
|
169
|
|
170 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
|
|
171
|
|
172 =cut
|
|
173
|
|
174
|
|
175 # Let the code begin...
|
|
176
|
|
177
|
|
178 package Bio::EnsEMBL::Compara::AlignSlice;
|
|
179
|
|
180 use strict;
|
|
181 use Bio::EnsEMBL::Utils::Argument qw(rearrange);
|
|
182 use Bio::EnsEMBL::Utils::Exception qw(throw warning info verbose);
|
|
183 use Bio::EnsEMBL::Compara::AlignSlice::Exon;
|
|
184 use Bio::EnsEMBL::Compara::AlignSlice::Slice;
|
|
185 use Bio::EnsEMBL::Compara::GenomicAlignBlock;
|
|
186 use Bio::EnsEMBL::Compara::GenomicAlign;
|
|
187 use Bio::SimpleAlign;
|
|
188
|
|
189 ## Creates a new coordinate system for creating empty Slices.
|
|
190
|
|
191 my $aligngap_coord_system = new Bio::EnsEMBL::CoordSystem(
|
|
192 -NAME => 'alignment',
|
|
193 -VERSION => "none",
|
|
194 -TOP_LEVEL => 0,
|
|
195 -SEQUENCE_LEVEL => 1,
|
|
196 -RANK => 1,
|
|
197 );
|
|
198
|
|
199 =head2 new (CONSTRUCTOR)
|
|
200
|
|
201 Arg[1] : a reference to a hash where keys can be:
|
|
202 -adaptor
|
|
203 -reference_slice
|
|
204 -genomic_align_blocks
|
|
205 -method_link_species_set
|
|
206 -expanded
|
|
207 -solve_overlapping
|
|
208 -preserve_blocks
|
|
209 -adaptor: the Bio::EnsEMBL::Compara::DBSQL::AlignSliceAdaptor
|
|
210 -reference_slice:
|
|
211 Bio::EnsEMBL::Slice, the guide slice for this align_slice
|
|
212 -genomic_align_blocks:
|
|
213 listref of Bio::EnsEMBL::Compara::GenomicAlignBlock
|
|
214 objects containing to the alignments to be used for this
|
|
215 align_slice
|
|
216 -method_link_species_set;
|
|
217 Bio::EnsEMBL::Compara::MethodLinkSpeciesSet object for
|
|
218 all the previous genomic_align_blocks. At the moment all
|
|
219 the blocks should correspond to the same MethodLinkSpeciesSet
|
|
220 -expanded: boolean flag. If set to true, the AlignSlice will insert all
|
|
221 the gaps requiered by the alignments in the reference_slice
|
|
222 (see MODES elsewhere in this document)
|
|
223 -solve_overlapping:
|
|
224 boolean flag. If set to true, the AlignSlice will allow
|
|
225 overlapping alginments and solve indeterminations according
|
|
226 to the method described in OVERLAPPING ALIGNMENTS elsewhere
|
|
227 in this document
|
|
228 -preserve_blocks:
|
|
229 boolean flag. By default the AlignSlice trim the alignments
|
|
230 in order to fit the reference_slice. This flags tell the
|
|
231 AlignSlice to use the alignment block as they are (usually
|
|
232 this is only used by the AlignSliceAdaptor, use with care)
|
|
233 Example : my $align_slice =
|
|
234 new Bio::EnsEMBL::Compara::AlignSlice(
|
|
235 -adaptor => $align_slice_adaptor,
|
|
236 -reference_slice => $reference_slice,
|
|
237 -method_link_species_set => $method_link_species_set,
|
|
238 -genomic_align_blocks => [$gab1, $gab2],
|
|
239 -expanded => 1
|
|
240 );
|
|
241 Description: Creates a new Bio::EnsEMBL::Compara::AlignSlice object
|
|
242 Returntype : Bio::EnsEMBL::Compara::AlignSlice object
|
|
243 Exceptions : none
|
|
244 Caller : general
|
|
245
|
|
246 =cut
|
|
247
|
|
248 sub new {
|
|
249 my ($class, @args) = @_;
|
|
250
|
|
251 my $self = {};
|
|
252 bless $self,$class;
|
|
253
|
|
254 my ($adaptor, $reference_slice, $genomic_align_blocks, $genomic_align_trees,
|
|
255 $method_link_species_set, $expanded, $solve_overlapping, $preserve_blocks,
|
|
256 $species_order) =
|
|
257 rearrange([qw(
|
|
258 ADAPTOR REFERENCE_SLICE GENOMIC_ALIGN_BLOCKS GENOMIC_ALIGN_TREES
|
|
259 METHOD_LINK_SPECIES_SET EXPANDED SOLVE_OVERLAPPING PRESERVE_BLOCKS
|
|
260 SPECIES_ORDER
|
|
261 )], @args);
|
|
262
|
|
263 $self->adaptor($adaptor) if (defined ($adaptor));
|
|
264 $self->reference_Slice($reference_slice) if (defined ($reference_slice));
|
|
265 if (defined($genomic_align_blocks)) {
|
|
266 foreach my $this_genomic_align_block (@$genomic_align_blocks) {
|
|
267 $self->add_GenomicAlignBlock($this_genomic_align_block);
|
|
268 }
|
|
269 }
|
|
270 $self->{_method_link_species_set} = $method_link_species_set if (defined($method_link_species_set));
|
|
271
|
|
272 $self->{expanded} = 0;
|
|
273 if ($expanded) {
|
|
274 $self->{expanded} = 1;
|
|
275 }
|
|
276
|
|
277 $self->{solve_overlapping} = 0;
|
|
278 if ($solve_overlapping) {
|
|
279 $self->{solve_overlapping} = $solve_overlapping;
|
|
280 }
|
|
281
|
|
282 if ($genomic_align_trees) {
|
|
283 $self->_create_underlying_Slices($genomic_align_trees, $self->{expanded},
|
|
284 $self->{solve_overlapping}, $preserve_blocks, $species_order);
|
|
285
|
|
286 #Awful hack to store the _alignslice_from and _alignslice_to on the
|
|
287 #GenomicAlignBlock for use in get_all_ConservationScores which uses
|
|
288 #GenomicAlignBlock and not GenomicAlignTree
|
|
289 foreach my $tree (@$genomic_align_trees) {
|
|
290 foreach my $block (@$genomic_align_blocks) {
|
|
291
|
|
292 #my $gab_id = $tree->get_all_leaves->[0]->get_all_genomic_aligns_for_node->[0]->genomic_align_block_id;
|
|
293 #Hope always have a reference_genomic_align. The above doesn't work because only the reference species
|
|
294 #has the genomic_align_block_id set
|
|
295 my $gab_id = $tree->reference_genomic_align->genomic_align_block_id;
|
|
296 my $block_id = $block->dbID;
|
|
297
|
|
298 #if the block has been restricted, need to look at original_dbID
|
|
299 if (!defined $block_id) {
|
|
300 $block_id = $block->{original_dbID};
|
|
301 }
|
|
302 my $tree_ref_ga = $tree->{reference_genomic_align};
|
|
303 my $block_ref_ga = $block->{reference_genomic_align};
|
|
304
|
|
305 #Need to check the ref_ga details not just the block id because
|
|
306 #the original_dbID is not unique for 2x genome blocks
|
|
307 if ($gab_id == $block_id &&
|
|
308 $tree_ref_ga->dnafrag_start == $block_ref_ga->dnafrag_start &&
|
|
309 $tree_ref_ga->dnafrag_end == $block_ref_ga->dnafrag_end &&
|
|
310 $tree_ref_ga->dnafrag_strand == $block_ref_ga->dnafrag_strand) {
|
|
311
|
|
312 $block->{_alignslice_from} = $tree->{_alignslice_from};
|
|
313 $block->{_alignslice_to} = $tree->{_alignslice_to};
|
|
314 }
|
|
315 }
|
|
316 }
|
|
317 } else {
|
|
318 $self->_create_underlying_Slices($genomic_align_blocks, $self->{expanded},
|
|
319 $self->{solve_overlapping}, $preserve_blocks, $species_order);
|
|
320 }
|
|
321 return $self;
|
|
322 }
|
|
323
|
|
324
|
|
325 =head2 sub_AlignSlice
|
|
326
|
|
327 Arg 1 : int $start
|
|
328 Arg 2 : int $end
|
|
329 Example : my $sub_align_slice = $align_slice->sub_AlignSlice(10, 50);
|
|
330 Description: Creates a new Bio::EnsEMBL::Compara::AlignSlice object
|
|
331 corresponding to a sub region of this one
|
|
332 Returntype : Bio::EnsEMBL::Compara::AlignSlice object
|
|
333 Exceptions : return undef if no internal slices can be created (see
|
|
334 Bio::EnsEMBL::Compara::AlignSlice::Slice->sub_Slice)
|
|
335 Caller : $align_slice
|
|
336 Status : Testing
|
|
337
|
|
338 =cut
|
|
339
|
|
340 sub sub_AlignSlice {
|
|
341 my ($self, $start, $end) = @_;
|
|
342 my $sub_align_slice = {};
|
|
343
|
|
344 throw("Must provide START argument") if (!defined($start));
|
|
345 throw("Must provide END argument") if (!defined($end));
|
|
346
|
|
347 bless $sub_align_slice, ref($self);
|
|
348
|
|
349 $sub_align_slice->adaptor($self->adaptor);
|
|
350 $sub_align_slice->reference_Slice($self->reference_Slice);
|
|
351 foreach my $this_genomic_align_block (@{$self->get_all_GenomicAlignBlocks()}) {
|
|
352 $sub_align_slice->add_GenomicAlignBlock($this_genomic_align_block);
|
|
353 }
|
|
354 $sub_align_slice->{_method_link_species_set} = $self->{_method_link_species_set};
|
|
355
|
|
356 $sub_align_slice->{expanded} = $self->{expanded};
|
|
357
|
|
358 $sub_align_slice->{solve_overlapping} = $self->{solve_overlapping};
|
|
359
|
|
360 foreach my $this_slice (@{$self->get_all_Slices}) {
|
|
361 my $new_slice = $this_slice->sub_Slice($start, $end);
|
|
362 push(@{$sub_align_slice->{_slices}}, $new_slice) if ($new_slice);
|
|
363 }
|
|
364 return undef if (!$sub_align_slice->{_slices});
|
|
365
|
|
366 return $sub_align_slice;
|
|
367 }
|
|
368
|
|
369
|
|
370 =head2 adaptor
|
|
371
|
|
372 Arg[1] : (optional) Bio::EnsEMBL::Compara::DBSQL::AlignSliceAdaptor $align_slice_adaptor
|
|
373 Example : my $align_slice_adaptor = $align_slice->adaptor
|
|
374 Example : $align_slice->adaptor($align_slice_adaptor)
|
|
375 Description: getter/setter for the adaptor attribute
|
|
376 Returntype : Bio::EnsEMBL::Compara::DBSQL::AlignSliceAdaptor object
|
|
377 Exceptions : throw if arg is not a Bio::EnsEMBL::Compara::DBSQL::AlignSliceAdaptor
|
|
378 Caller : $object->methodname
|
|
379
|
|
380 =cut
|
|
381
|
|
382 sub adaptor {
|
|
383 my ($self, $adaptor) = @_;
|
|
384
|
|
385 if (defined($adaptor)) {
|
|
386 throw "[$adaptor] must be a Bio::EnsEMBL::Compara::DBSQL::AlignSliceAdaptor object"
|
|
387 unless ($adaptor and $adaptor->isa("Bio::EnsEMBL::Compara::DBSQL::AlignSliceAdaptor"));
|
|
388 $self->{'adaptor'} = $adaptor;
|
|
389 }
|
|
390
|
|
391 return $self->{'adaptor'};
|
|
392 }
|
|
393
|
|
394
|
|
395 =head2 get_all_Slices (experimental)
|
|
396
|
|
397 Arg[1] : [optional] string $species_name1
|
|
398 Arg[2] : [optional] string $species_name2
|
|
399 Arg[...] : [optional] string $species_nameN
|
|
400 Example : my $slices = $align_slice->get_all_Slices
|
|
401 Description: getter for all the Slices in this AlignSlice. If a list
|
|
402 of species is specified, returns only the slices for
|
|
403 these species. The slices are returned in a "smart"
|
|
404 order, i.e. the slice corresponding to the reference
|
|
405 species is returned first and then the remaining slices
|
|
406 depending on their phylogenetic distance to the first
|
|
407 one.
|
|
408 NB: You can use underscores instead of whitespaces for
|
|
409 the name of the species, i.e. Homo_sapiens will be
|
|
410 understood as "Homo sapiens". However if the GenomeDB is found
|
|
411 to already have _ defined in the name then this behaviour is
|
|
412 disabled.
|
|
413 Returntype : listref of Bio::EnsEMBL::Compara::AlignSlice::Slice
|
|
414 objects.
|
|
415 Exceptions :
|
|
416 Caller : $object->methodname
|
|
417
|
|
418 =cut
|
|
419
|
|
420 sub get_all_Slices {
|
|
421 my ( $self, @species_names ) = @_;
|
|
422 my $slices = [];
|
|
423
|
|
424 if (@species_names) {
|
|
425 foreach my $slice ( @{ $self->{_slices} } ) {
|
|
426 #Substitute _ for spaces & check if the current GenomeDB matches with
|
|
427 #or without them
|
|
428 foreach my $this_species_name (@species_names) {
|
|
429 ( my $space_species_name = $this_species_name ) =~ s/_/ /g;
|
|
430 push( @$slices, $slice )
|
|
431 if ( ( $this_species_name eq $slice->genome_db->name )
|
|
432 || ( $space_species_name eq $slice->genome_db->name ) );
|
|
433 }
|
|
434 }
|
|
435 }
|
|
436 else {
|
|
437 $slices = $self->{_slices};
|
|
438 }
|
|
439
|
|
440 return $slices;
|
|
441 }
|
|
442
|
|
443
|
|
444 =head2 reference_Slice
|
|
445
|
|
446 Arg[1] : (optional) Bio::EnsEMBL::Slice $slice
|
|
447 Example : my $reference_slice = $align_slice->reference_slice
|
|
448 Example : $align_slice->reference_Slice($reference_slice)
|
|
449 Description: getter/setter for the attribute reference_slice
|
|
450 Returntype : Bio::EnsEMBL::Slice object
|
|
451 Exceptions : throw if arg is not a Bio::EnsEMBL::Slice object
|
|
452 Caller : $object->methodname
|
|
453
|
|
454 =cut
|
|
455
|
|
456 sub reference_Slice {
|
|
457 my ($self, $reference_slice) = @_;
|
|
458
|
|
459 if (defined($reference_slice)) {
|
|
460 throw "[$reference_slice] must be a Bio::EnsEMBL::Slice object"
|
|
461 unless ($reference_slice and $reference_slice->isa("Bio::EnsEMBL::Slice"));
|
|
462 $self->{'reference_slice'} = $reference_slice;
|
|
463 }
|
|
464
|
|
465 return $self->{'reference_slice'};
|
|
466 }
|
|
467
|
|
468
|
|
469 =head2 add_GenomicAlignBlock
|
|
470
|
|
471 Arg[1] : Bio::EnsEMBL::Compara::GenomicAlignBlock $genomicAlignBlock
|
|
472 Example : $align_slice->add_GenomicAlignBlock($genomicAlignBlock)
|
|
473 Description: add a Bio::EnsEMBL::Compara::GenomicAlignBlock object to the array
|
|
474 stored in the attribute all_genomic_align_blocks
|
|
475 Returntype : none
|
|
476 Exceptions : throw if arg is not a Bio::EnsEMBL::Compara::GenomicAlignBlock
|
|
477 Caller : $object->methodname
|
|
478
|
|
479 =cut
|
|
480
|
|
481 sub add_GenomicAlignBlock {
|
|
482 my ($self, $genomic_align_block) = @_;
|
|
483
|
|
484 if (!defined($genomic_align_block)) {
|
|
485 throw "Too few arguments for Bio::EnsEMBL::Compara::AlignSlice->add_GenomicAlignBlock()";
|
|
486 }
|
|
487 if (!$genomic_align_block or !$genomic_align_block->isa("Bio::EnsEMBL::Compara::GenomicAlignBlock")) {
|
|
488 throw "[$genomic_align_block] must be a Bio::EnsEMBL::Compara::GenomicAlignBlock object";
|
|
489 }
|
|
490
|
|
491 push(@{$self->{'all_genomic_align_blocks'}}, $genomic_align_block);
|
|
492 }
|
|
493
|
|
494
|
|
495 =head2 get_all_GenomicAlignBlocks
|
|
496
|
|
497 Arg[1] : none
|
|
498 Example : my $all_genomic_align_blocks = $align_slice->get_all_GenomicAlignBlocks
|
|
499 Description: getter for the attribute all_genomic_align_blocks
|
|
500 Returntype : listref of Bio::EnsEMBL::Compara::GenomicAlignBlock objects
|
|
501 Exceptions : none
|
|
502 Caller : $object->methodname
|
|
503
|
|
504 =cut
|
|
505
|
|
506 sub get_all_GenomicAlignBlocks {
|
|
507 my ($self) = @_;
|
|
508
|
|
509 return ($self->{'all_genomic_align_blocks'} || []);
|
|
510 }
|
|
511
|
|
512
|
|
513 =head2 get_MethodLinkSpeciesSet
|
|
514
|
|
515 Arg[1] : none
|
|
516 Example : my $method_link_species_set = $align_slice->get_MethodLinkSpeciesSet
|
|
517 Description: getter for the Bio::EnsEMBL::Compara::MethodLinkSpeciesSet
|
|
518 used to create this object
|
|
519 Returntype : Bio::EnsEMBL::Compara::MethodLinkSpeciesSet
|
|
520 Exceptions : none
|
|
521 Caller : $object->methodname
|
|
522
|
|
523 =cut
|
|
524
|
|
525 sub get_MethodLinkSpeciesSet {
|
|
526 my ($self) = @_;
|
|
527
|
|
528 return $self->{'_method_link_species_set'};
|
|
529 }
|
|
530
|
|
531
|
|
532 =head2 get_SimpleAlign
|
|
533
|
|
534 Arg[1] : none
|
|
535 Example : use Bio::AlignIO;
|
|
536 my $out = Bio::AlignIO->newFh(-fh=>\*STDOUT, -format=> "clustalw");
|
|
537 print $out $align_slice->get_SimpleAlign();
|
|
538 Description : This method creates a Bio::SimpleAlign object using the
|
|
539 Bio::EnsEMBL::Compara::AlignSlice::Slices underlying this
|
|
540 Bio::EnsEMBL::Compara::AlignSlice object. The SimpleAlign
|
|
541 describes the alignment where the first sequence
|
|
542 corresponds to the reference Slice and the remaining
|
|
543 correspond to the other species.
|
|
544 Returntype : Bio::SimpleAlign object
|
|
545 Exceptions :
|
|
546 Caller : $object->methodname
|
|
547
|
|
548 =cut
|
|
549
|
|
550 sub get_SimpleAlign {
|
|
551 my ($self, @species) = @_;
|
|
552 my $simple_align;
|
|
553
|
|
554 ## Create a single Bio::SimpleAlign for the projection
|
|
555 $simple_align = Bio::SimpleAlign->new();
|
|
556 $simple_align->id("ProjectedMultiAlign");
|
|
557
|
|
558 my $genome_db_name_counter;
|
|
559 foreach my $slice (@{$self->get_all_Slices(@species)}) {
|
|
560 my $seq = Bio::LocatableSeq->new(
|
|
561 -SEQ => $slice->seq,
|
|
562 -START => $slice->start,
|
|
563 -END => $slice->end,
|
|
564 -ID => $slice->genome_db->name.($genome_db_name_counter->{$slice->genome_db->name} or ""),
|
|
565 -STRAND => $slice->strand
|
|
566 );
|
|
567 ## This allows to have several sequences for the same species. Bio::SimpleAlign complains
|
|
568 ## about having the same ID, START and END for two sequences...
|
|
569 if (!defined($genome_db_name_counter->{$slice->genome_db->name})) {
|
|
570 $genome_db_name_counter->{$slice->genome_db->name} = 2;
|
|
571 } else {
|
|
572 $genome_db_name_counter->{$slice->genome_db->name}++;
|
|
573 }
|
|
574
|
|
575 $simple_align->add_seq($seq);
|
|
576 }
|
|
577
|
|
578 return $simple_align;
|
|
579 }
|
|
580
|
|
581
|
|
582 =head2 get_all_ConservationScores
|
|
583
|
|
584 Arg 1 : (opt) integer $display_size (default 700)
|
|
585 Arg 2 : (opt) string $display_type (one of "AVERAGE" or "MAX") (default "MAX")
|
|
586 Arg 3 : (opt) integer $window_size
|
|
587 Example : my $conservation_scores =
|
|
588 $align_slice->get_all_ConservationScores(1000, "AVERAGE", 10);
|
|
589 Description: Retrieve the corresponding
|
|
590 Bio::EnsEMBL::Compara::ConservationScore objects for the
|
|
591 Bio::EnsEMBL::Compara::AlignSlice object. It calls either
|
|
592 _get_expanded_conservation_scores if the AlignSlice has
|
|
593 "expanded" set or _get_condensed_conservation_scores for
|
|
594 condensed mode.
|
|
595 It sets up the align_start, align_end and slice_length and map
|
|
596 the resulting objects onto the AlignSlice.
|
|
597 Please refer to the documentation in
|
|
598 Bio::EnsEMBL::Compara::DBSQL::ConservationScoreAdaptor
|
|
599 for more details.
|
|
600 Returntype : ref. to an array of Bio::EnsEMBL::Compara::ConservationScore
|
|
601 objects.
|
|
602 Caller : object::methodname
|
|
603 Status : At risk
|
|
604
|
|
605 =cut
|
|
606
|
|
607 sub get_all_ConservationScores {
|
|
608 my ($self, $display_size, $display_type, $window_size) = @_;
|
|
609 my $all_conservation_scores = [];
|
|
610 my $y_axis_min;
|
|
611 my $y_axis_max;
|
|
612
|
|
613 my $conservation_score_adaptor = $self->adaptor->db->get_ConservationScoreAdaptor();
|
|
614
|
|
615 #Get scores in either expanded or condensed mode
|
|
616 if ($self->{expanded}) {
|
|
617 $all_conservation_scores = $self->_get_expanded_conservation_scores($conservation_score_adaptor, $display_size, $display_type, $window_size);
|
|
618 } else {
|
|
619 $all_conservation_scores = $self->_get_condensed_conservation_scores($conservation_score_adaptor, $display_size, $display_type, $window_size);
|
|
620 }
|
|
621
|
|
622 return $all_conservation_scores;
|
|
623 }
|
|
624
|
|
625 =head2 _get_expanded_conservation_scores
|
|
626
|
|
627 Arg 1 : Bio::EnsEMBL::Compara::DBSQL::ConservationScoreAdaptor
|
|
628 Arg 2 : (opt) integer $display_size (default 700)
|
|
629 Arg 3 : (opt) string $display_type (one of "AVERAGE" or "MAX") (default "MAX")
|
|
630 Arg 4 : (opt) integer $window_size
|
|
631 Example : my $conservation_scores =
|
|
632 $self->_get_expanded_conservation_scores($cs_adaptor, 1000, "AVERAGE", 10);
|
|
633 Description: Retrieve the corresponding
|
|
634 Bio::EnsEMBL::Compara::ConservationScore objects for the
|
|
635 Bio::EnsEMBL::Compara::GenomicAlignBlock objects underlying
|
|
636 this Bio::EnsEMBL::Compara::AlignSlice object. This method
|
|
637 calls the Bio::EnsEMBL::Compara::DBSQL::ConservationScoreAdaptor->
|
|
638 fetch_all_by_GenomicAlignBlock() method. It sets up the align_start,
|
|
639 align_end and slice_length and map the resulting objects onto
|
|
640 the AlignSlice. $diaplay_slize, $display_type and $window_size
|
|
641 are passed to the fetch_all_by_GenomicAlignBlock() method.
|
|
642 Please refer to the documentation in
|
|
643 Bio::EnsEMBL::Compara::DBSQL::ConservationScoreAdaptor
|
|
644 for more details.
|
|
645 Returntype : ref. to an array of Bio::EnsEMBL::Compara::ConservationScore
|
|
646 objects.
|
|
647 Caller : object::methodname
|
|
648 Status : At risk
|
|
649
|
|
650 =cut
|
|
651 sub _get_expanded_conservation_scores {
|
|
652 my ($self, $conservation_score_adaptor, $display_size, $display_type, $window_size) = @_;
|
|
653 my $y_axis_min;
|
|
654 my $y_axis_max;
|
|
655
|
|
656 my $all_conservation_scores = [];
|
|
657 my $offset = 0; #the start of each gab in alignment coords
|
|
658 my $prev_gab_end;
|
|
659 foreach my $this_genomic_align_block (@{$self->get_all_GenomicAlignBlocks()}) {
|
|
660 $this_genomic_align_block->{restricted_aln_start} = $this_genomic_align_block->{_alignslice_from};
|
|
661 $this_genomic_align_block->{restricted_aln_end} = $this_genomic_align_block->{_alignslice_to};
|
|
662
|
|
663 #Need to map the coords I get back from the conservation_score_adaptor onto the slice using the start position of the genomic_align_block
|
|
664 my $all_these_conservation_scores = $conservation_score_adaptor->fetch_all_by_GenomicAlignBlock(
|
|
665 $this_genomic_align_block, 1,$this_genomic_align_block->length, $self->get_all_Slices()->[0]->length,
|
|
666 $display_size, $display_type, $window_size);
|
|
667
|
|
668 #Need to account for gaps between gabs which will be slice end - start on the reference slice.
|
|
669 if ($prev_gab_end) {
|
|
670 my $gap = $this_genomic_align_block->reference_slice_start - $prev_gab_end - 1;
|
|
671 if ($gap) {
|
|
672 $offset += $gap;
|
|
673 }
|
|
674 }
|
|
675 $prev_gab_end = $this_genomic_align_block->reference_slice_end;
|
|
676
|
|
677 foreach my $score (@$all_these_conservation_scores) {
|
|
678 $score->position($score->position + $offset);
|
|
679 }
|
|
680
|
|
681 #offset is the sum of preceding gab lengths
|
|
682 $offset += ($this_genomic_align_block->{restricted_aln_end} - $this_genomic_align_block->{restricted_aln_start} + 1);
|
|
683
|
|
684 #initialise y axis min and max
|
|
685 if (!defined $y_axis_max) {
|
|
686 $y_axis_max = $all_these_conservation_scores->[0]->y_axis_max;
|
|
687 $y_axis_min = $all_these_conservation_scores->[0]->y_axis_min;
|
|
688 }
|
|
689 #find overall min and max
|
|
690 if ($y_axis_min > $all_these_conservation_scores->[0]->y_axis_min) {
|
|
691 $y_axis_min = $all_these_conservation_scores->[0]->y_axis_min;
|
|
692 }
|
|
693 if ($y_axis_max < $all_these_conservation_scores->[0]->y_axis_max) {
|
|
694 $y_axis_max = $all_these_conservation_scores->[0]->y_axis_max;
|
|
695 }
|
|
696 push (@$all_conservation_scores, @$all_these_conservation_scores);
|
|
697 }
|
|
698 #set overall min and max
|
|
699 $all_conservation_scores->[0]->y_axis_min($y_axis_min);
|
|
700 $all_conservation_scores->[0]->y_axis_max($y_axis_max);
|
|
701
|
|
702 return $all_conservation_scores;
|
|
703 }
|
|
704
|
|
705 =head2 _get_condensed_conservation_scores
|
|
706
|
|
707 Arg 1 : Bio::EnsEMBL::Compara::DBSQL::ConservationScoreAdaptor
|
|
708 Arg 2 : (opt) integer $display_size (default 700)
|
|
709 Arg 3 : (opt) string $display_type (one of "AVERAGE" or "MAX") (default "MAX")
|
|
710 Arg 4 : (opt) integer $window_size
|
|
711 Example : my $conservation_scores =
|
|
712 $self->_get_expanded_conservation_scores($cs_adaptor, 1000, "AVERAGE", 10);
|
|
713 Description: Retrieve the corresponding
|
|
714 Bio::EnsEMBL::Compara::ConservationScore objects for the
|
|
715 reference Bio::EnsEMBL::Slice object of
|
|
716 this Bio::EnsEMBL::Compara::AlignSlice object. This method
|
|
717 calls the Bio::EnsEMBL::Compara::DBSQL::ConservationScoreAdaptor->
|
|
718 fetch_all_by_MethodLinkSpeciesSet_Slice() method. It sets up
|
|
719 the align_start, align_end and slice_length and map the
|
|
720 resulting objects onto the AlignSlice. $display_slize,
|
|
721 $display_type and $window_size are passed to the
|
|
722 fetch_all_by_MethodLinkSpeciesSet_Slice() method.
|
|
723 Please refer to the documentation in
|
|
724 Bio::EnsEMBL::Compara::DBSQL::ConservationScoreAdaptor
|
|
725 for more details.
|
|
726 Returntype : ref. to an array of Bio::EnsEMBL::Compara::ConservationScore
|
|
727 objects.
|
|
728 Caller : object::methodname
|
|
729 Status : At risk
|
|
730
|
|
731 =cut
|
|
732 sub _get_condensed_conservation_scores {
|
|
733 my ($self, $conservation_score_adaptor, $display_size, $display_type, $window_size) = @_;
|
|
734
|
|
735 my $all_conservation_scores = [];
|
|
736
|
|
737 throw ("Must have method_link_species_set defined to retrieve conservation scores for a condensed AlignSlice") if (!defined $self->{_method_link_species_set});
|
|
738 throw ("Must have reference slice defined to retrieve conservation scores for a condensed AlignSlice") if (!defined $self->{'reference_slice'});
|
|
739
|
|
740 # select the conservation score mlss_id linked to the msa
|
|
741 my $sql = 'SELECT method_link_species_set_id FROM method_link_species_set_tag JOIN method_link_species_set USING (method_link_species_set_id) JOIN method_link USING (method_link_id) WHERE type LIKE "%CONSERVATION\_SCORE" AND tag = "msa_mlss_id" AND value = ?';
|
|
742 my $sth = $conservation_score_adaptor->prepare($sql);
|
|
743 $sth->execute($self->{_method_link_species_set}->dbID);
|
|
744
|
|
745 my ($cs_mlss_id) = @{$sth->fetchrow_arrayref};
|
|
746 $sth->finish;
|
|
747
|
|
748 throw ("Unable to find conservation score method_link_species_set for this multiple alignment " . $self->{_method_link_species_set}->dbID) if (!defined $cs_mlss_id);
|
|
749 my $mlss_adaptor = $self->adaptor->db->get_MethodLinkSpeciesSetAdaptor();
|
|
750
|
|
751 my $cs_mlss = $mlss_adaptor->fetch_by_dbID($cs_mlss_id);
|
|
752
|
|
753 $all_conservation_scores = $conservation_score_adaptor->fetch_all_by_MethodLinkSpeciesSet_Slice($cs_mlss, $self->{'reference_slice'}, $display_size, $display_type, $window_size);
|
|
754
|
|
755 return $all_conservation_scores;
|
|
756 }
|
|
757
|
|
758
|
|
759 =head2 get_all_ConstrainedElements
|
|
760
|
|
761 Arg 1 : (opt) string $method_link_type (default = GERP_CONSTRAINED_ELEMENT)
|
|
762 Arg 2 : (opt) listref Bio::EnsEMBL::Compara::GenomeDB $species_set
|
|
763 (default, the set of species from the MethodLinkSpeciesSet used
|
|
764 to build this AlignSlice)
|
|
765 Example : my $constrained_elements =
|
|
766 $align_slice->get_all_ConstrainedElements();
|
|
767 Description: Retrieve the corresponding constrained elements for these alignments.
|
|
768 Objects will be located on this AlignSlice, i.e. the
|
|
769 reference_slice, reference_slice_start, reference_slice_end
|
|
770 and reference_slice_strand will refer to this AlignSlice
|
|
771 object
|
|
772 Returntype : ref. to an array of Bio::EnsEMBL::Compara::ConstrainedElement
|
|
773 objects.
|
|
774 Caller : object::methodname
|
|
775 Status : At risk
|
|
776
|
|
777 =cut
|
|
778
|
|
779 sub get_all_ConstrainedElements {
|
|
780 my ($self, $method_link_type, $species_set) = @_;
|
|
781 my $all_constrained_elements = [];
|
|
782
|
|
783 $method_link_type ||= "GERP_CONSTRAINED_ELEMENT";
|
|
784 my $key_cache = "_constrained_elements_".$method_link_type;
|
|
785 if ($species_set) {
|
|
786 $key_cache .= "::" . join("-", sort map {s/\W/_/g} map {$_->name} @$species_set);
|
|
787 } else {
|
|
788 $species_set = $self->{_method_link_species_set}->species_set;
|
|
789 }
|
|
790
|
|
791 if (!defined($self->{$key_cache})) {
|
|
792 my $method_link_species_set_adaptor = $self->adaptor->db->get_MethodLinkSpeciesSetAdaptor();
|
|
793 my $method_link_species_set = $method_link_species_set_adaptor->fetch_by_method_link_type_GenomeDBs(
|
|
794 $method_link_type, $self->{_method_link_species_set}->species_set);
|
|
795
|
|
796 if ($method_link_species_set) {
|
|
797 my $constrained_element_adaptor = $self->adaptor->db->get_ConstrainedElementAdaptor();
|
|
798 $all_constrained_elements = $constrained_element_adaptor->fetch_all_by_MethodLinkSpeciesSet_Slice(
|
|
799 $method_link_species_set, $self->reference_Slice);
|
|
800 my $big_mapper = $self->{_reference_Mapper};
|
|
801 foreach my $this_constrained_element (@{$all_constrained_elements}) {
|
|
802 my $reference_slice_start;
|
|
803 my $reference_slice_end;
|
|
804 my $reference_slice_strand;
|
|
805
|
|
806 my @alignment_coords = $big_mapper->map_coordinates(
|
|
807 "sequence", # $self->genomic_align->dbID,
|
|
808 $this_constrained_element->start + $this_constrained_element->slice->start - 1,
|
|
809 $this_constrained_element->end + $this_constrained_element->slice->start - 1,
|
|
810 $this_constrained_element->strand,
|
|
811 "sequence" # $from_mapper->from
|
|
812 );
|
|
813 foreach my $alignment_coord (@alignment_coords) {
|
|
814 next if (!$alignment_coord->isa("Bio::EnsEMBL::Mapper::Coordinate"));
|
|
815 if (!defined($reference_slice_strand)) {
|
|
816 $reference_slice_start = $alignment_coord->start;
|
|
817 $reference_slice_end = $alignment_coord->end;
|
|
818 $reference_slice_strand = $alignment_coord->strand;
|
|
819 } else {
|
|
820 if ($alignment_coord->start < $reference_slice_start) {
|
|
821 $reference_slice_start = $alignment_coord->start;
|
|
822 }
|
|
823 if ($alignment_coord->end > $reference_slice_end) {
|
|
824 $reference_slice_end = $alignment_coord->end;
|
|
825 }
|
|
826 }
|
|
827 }
|
|
828 $this_constrained_element->slice($self);
|
|
829 $this_constrained_element->start($reference_slice_start);
|
|
830 $this_constrained_element->end($reference_slice_end);
|
|
831 $this_constrained_element->strand($reference_slice_strand);
|
|
832 }
|
|
833 }
|
|
834 $self->{$key_cache} = $all_constrained_elements;
|
|
835 }
|
|
836
|
|
837 return $self->{$key_cache};
|
|
838 }
|
|
839
|
|
840
|
|
841 =head2 _create_underlying_Slices (experimental)
|
|
842
|
|
843 Arg[1] : listref of Bio::EnsEMBL::Compara::GenomicAlignBlocks
|
|
844 $genomic_align_blocks
|
|
845 Arg[2] : [optional] boolean $expanded (default = FALSE)
|
|
846 Arg[3] : [optional] boolean $solve_overlapping (default = FALSE)
|
|
847 Arg[4] : [optional] boolean $preserve_blocks (default = FALSE)
|
|
848 Example :
|
|
849 Description: Creates a set of Bio::EnsEMBL::Compara::AlignSlice::Slices
|
|
850 and attach it to this object.
|
|
851 Returntype :
|
|
852 Exceptions : warns about overlapping GenomicAlignBlocks
|
|
853 Caller :
|
|
854
|
|
855 =cut
|
|
856
|
|
857 sub _create_underlying_Slices {
|
|
858 my ($self, $genomic_align_blocks, $expanded, $solve_overlapping, $preserve_blocks, $species_order) = @_;
|
|
859 my $strand = $self->reference_Slice->strand;
|
|
860
|
|
861 my $align_slice_length = 0;
|
|
862 my $last_ref_pos;
|
|
863 if ($strand == 1) {
|
|
864 $last_ref_pos = $self->reference_Slice->start;
|
|
865 } else {
|
|
866 $last_ref_pos = $self->reference_Slice->end;
|
|
867 }
|
|
868
|
|
869 my $ref_genome_db = $self->adaptor->db->get_GenomeDBAdaptor->fetch_by_Slice($self->reference_Slice);
|
|
870 my $big_mapper = Bio::EnsEMBL::Mapper->new("sequence", "alignment");
|
|
871
|
|
872 my $sorted_genomic_align_blocks;
|
|
873
|
|
874 if ($solve_overlapping eq "restrict") {
|
|
875 $sorted_genomic_align_blocks = _sort_and_restrict_GenomicAlignBlocks($genomic_align_blocks);
|
|
876 } elsif ($solve_overlapping) {
|
|
877 $sorted_genomic_align_blocks = _sort_and_compile_GenomicAlignBlocks($genomic_align_blocks);
|
|
878 } else {
|
|
879 $sorted_genomic_align_blocks = _sort_GenomicAlignBlocks($genomic_align_blocks);
|
|
880 }
|
|
881 @$sorted_genomic_align_blocks = reverse(@$sorted_genomic_align_blocks) if ($strand == -1);
|
|
882 foreach my $this_genomic_align_block (@$sorted_genomic_align_blocks) {
|
|
883 my $original_genomic_align_block = $this_genomic_align_block;
|
|
884 my ($from, $to);
|
|
885
|
|
886 if ($preserve_blocks) {
|
|
887 ## Don't restrict the block. Set from and to to 1 and length respectively
|
|
888 $from = 1;
|
|
889 $to = $this_genomic_align_block->length;
|
|
890 } else {
|
|
891 #need to check that the block is still overlapping the slice - it may
|
|
892 #have already been restricted by the options above.
|
|
893 if ($this_genomic_align_block->reference_genomic_align->dnafrag_start > $self->reference_Slice->end || $this_genomic_align_block->reference_genomic_align->dnafrag_end < $self->reference_Slice->start) {
|
|
894 next;
|
|
895 }
|
|
896 ($this_genomic_align_block, $from, $to) = $this_genomic_align_block->restrict_between_reference_positions(
|
|
897 $self->reference_Slice->start, $self->reference_Slice->end);
|
|
898 }
|
|
899
|
|
900 $original_genomic_align_block->{_alignslice_from} = $from;
|
|
901 $original_genomic_align_block->{_alignslice_to} = $to;
|
|
902
|
|
903 my $reference_genomic_align = $this_genomic_align_block->reference_genomic_align;
|
|
904
|
|
905 #If I haven't needed to restrict, I don't gain this link so add it here
|
|
906 if (!defined $reference_genomic_align->genomic_align_block->reference_genomic_align) {
|
|
907 $reference_genomic_align->genomic_align_block($this_genomic_align_block);
|
|
908 }
|
|
909
|
|
910 my ($this_pos, $this_gap_between_genomic_align_blocks);
|
|
911 if ($strand == 1) {
|
|
912 $this_pos = $reference_genomic_align->dnafrag_start;
|
|
913 $this_gap_between_genomic_align_blocks = $this_pos - $last_ref_pos;
|
|
914 } else {
|
|
915 $this_pos = $reference_genomic_align->dnafrag_end;
|
|
916 $this_gap_between_genomic_align_blocks = $last_ref_pos - $this_pos;
|
|
917 }
|
|
918
|
|
919 if ($this_gap_between_genomic_align_blocks > 0) {
|
|
920 ## Add mapper info for inter-genomic_align_block space
|
|
921 if ($strand == 1) {
|
|
922 $big_mapper->add_map_coordinates(
|
|
923 'sequence',
|
|
924 $last_ref_pos,
|
|
925 $this_pos - 1,
|
|
926 $strand,
|
|
927 'alignment',
|
|
928 $align_slice_length + 1,
|
|
929 $align_slice_length + $this_gap_between_genomic_align_blocks,
|
|
930 );
|
|
931 } else {
|
|
932 $big_mapper->add_map_coordinates(
|
|
933 'sequence',
|
|
934 $this_pos + 1,
|
|
935 $last_ref_pos,
|
|
936 $strand,
|
|
937 'alignment',
|
|
938 $align_slice_length + 1,
|
|
939 $align_slice_length + $this_gap_between_genomic_align_blocks,
|
|
940 );
|
|
941 }
|
|
942 $align_slice_length += $this_gap_between_genomic_align_blocks;
|
|
943 }
|
|
944 $reference_genomic_align->genomic_align_block->start($align_slice_length + 1);
|
|
945 $original_genomic_align_block->{_alignslice_start} = $align_slice_length;
|
|
946 if ($expanded) {
|
|
947 $align_slice_length += CORE::length($reference_genomic_align->aligned_sequence("+FAKE_SEQ"));
|
|
948 $reference_genomic_align->genomic_align_block->end($align_slice_length);
|
|
949 $big_mapper->add_Mapper($reference_genomic_align->get_Mapper(0));
|
|
950 } else {
|
|
951 $align_slice_length += $reference_genomic_align->dnafrag_end - $reference_genomic_align->dnafrag_start + 1;
|
|
952 $reference_genomic_align->genomic_align_block->end($align_slice_length);
|
|
953 $big_mapper->add_Mapper($reference_genomic_align->get_Mapper(0,1));
|
|
954 }
|
|
955 $reference_genomic_align->genomic_align_block->slice($self);
|
|
956
|
|
957 if ($strand == 1) {
|
|
958 $last_ref_pos = $reference_genomic_align->dnafrag_end + 1;
|
|
959 } else {
|
|
960 $last_ref_pos = $reference_genomic_align->dnafrag_start - 1;
|
|
961 }
|
|
962 }
|
|
963 my ($this_pos, $this_gap_between_genomic_align_blocks);
|
|
964 if ($strand == 1) {
|
|
965 $this_pos = $self->reference_Slice->end;
|
|
966 $this_gap_between_genomic_align_blocks = $this_pos - ($last_ref_pos - 1);
|
|
967 } else {
|
|
968 $this_pos = $self->reference_Slice->start;
|
|
969 $this_gap_between_genomic_align_blocks = $last_ref_pos + 1 - $this_pos;
|
|
970 }
|
|
971
|
|
972 ## $last_ref_pos is the next nucleotide position after the last mapped one.
|
|
973 if ($this_gap_between_genomic_align_blocks > 0) {
|
|
974 if ($strand == 1) {
|
|
975 $big_mapper->add_map_coordinates(
|
|
976 'sequence',
|
|
977 $last_ref_pos,
|
|
978 $this_pos,
|
|
979 $strand,
|
|
980 'alignment',
|
|
981 $align_slice_length + 1,
|
|
982 $align_slice_length + $this_gap_between_genomic_align_blocks,
|
|
983 );
|
|
984 } else {
|
|
985 $big_mapper->add_map_coordinates(
|
|
986 'sequence',
|
|
987 $this_pos,
|
|
988 $last_ref_pos,
|
|
989 $strand,
|
|
990 'alignment',
|
|
991 $align_slice_length + 1,
|
|
992 $align_slice_length + $this_gap_between_genomic_align_blocks,
|
|
993 );
|
|
994 }
|
|
995 $align_slice_length += $this_gap_between_genomic_align_blocks;
|
|
996 }
|
|
997
|
|
998 if ($species_order) {
|
|
999 foreach my $species_def (@$species_order) {
|
|
1000 my $genome_db_name = $species_def->{genome_db}->name;
|
|
1001 # print STDERR "SPECIES:: ", $genome_db_name, "\n";
|
|
1002 my $new_slice = new Bio::EnsEMBL::Compara::AlignSlice::Slice(
|
|
1003 -length => $align_slice_length,
|
|
1004 -requesting_slice => $self->reference_Slice,
|
|
1005 -align_slice => $self,
|
|
1006 -method_link_species_set => $self->{_method_link_species_set},
|
|
1007 -genome_db => $species_def->{genome_db},
|
|
1008 -expanded => $expanded,
|
|
1009 );
|
|
1010 $new_slice->{genomic_align_ids} = $species_def->{genomic_align_ids};
|
|
1011 push(@{$self->{slices}->{lc($genome_db_name)}}, $new_slice);
|
|
1012 push(@{$self->{_slices}}, $new_slice);
|
|
1013 }
|
|
1014 } else {
|
|
1015 # print STDERR "SPECIES:: ", $ref_genome_db->name, "\n";
|
|
1016 $self->{slices}->{lc($ref_genome_db->name)} = [new Bio::EnsEMBL::Compara::AlignSlice::Slice(
|
|
1017 -length => $align_slice_length,
|
|
1018 -requesting_slice => $self->reference_Slice,
|
|
1019 -align_slice => $self,
|
|
1020 -method_link_species_set => $self->{_method_link_species_set},
|
|
1021 -genome_db => $ref_genome_db,
|
|
1022 -expanded => $expanded,
|
|
1023 )];
|
|
1024 $self->{_slices} = [$self->{slices}->{lc($ref_genome_db->name)}->[0]];
|
|
1025 }
|
|
1026
|
|
1027 $self->{slices}->{lc($ref_genome_db->name)}->[0]->add_Slice_Mapper_pair(
|
|
1028 $self->reference_Slice,
|
|
1029 $big_mapper,
|
|
1030 1,
|
|
1031 $align_slice_length,
|
|
1032 $self->reference_Slice->strand
|
|
1033 );
|
|
1034 $self->{_reference_Mapper} = $big_mapper;
|
|
1035
|
|
1036 foreach my $this_genomic_align_block (@$sorted_genomic_align_blocks) {
|
|
1037 if (UNIVERSAL::isa($this_genomic_align_block, "Bio::EnsEMBL::Compara::GenomicAlignTree")) {
|
|
1038 ## For trees, loop through all nodes (internal and leaves) to add the GenomicAligns
|
|
1039 foreach my $this_genomic_align_node (@{$this_genomic_align_block->get_all_nodes}) {
|
|
1040 # but we have to skip the reference node as this has already been added to the guide Slice
|
|
1041 next if ($this_genomic_align_node eq $this_genomic_align_block->reference_genomic_align_node);
|
|
1042
|
|
1043 # For composite segments (2X genomes), the node will link to several GenomicAligns.
|
|
1044 # Add each of them to one of the AS:Slice objects
|
|
1045 foreach my $this_genomic_align (@{$this_genomic_align_node->get_all_genomic_aligns_for_node}) {
|
|
1046 # Link to genomic_align_block may have been lost during tree minimization
|
|
1047 $this_genomic_align->genomic_align_block_id(0);
|
|
1048 $this_genomic_align->genomic_align_block($this_genomic_align_block);
|
|
1049 $self->_add_GenomicAlign_to_a_Slice($this_genomic_align, $this_genomic_align_block,
|
|
1050 $species_order, $align_slice_length);
|
|
1051 }
|
|
1052 }
|
|
1053 } else {
|
|
1054 ## For plain alignments, just use all non-reference GenomicAlign objects
|
|
1055 foreach my $this_genomic_align
|
|
1056 (@{$this_genomic_align_block->get_all_non_reference_genomic_aligns}) {
|
|
1057 $self->_add_GenomicAlign_to_a_Slice($this_genomic_align, $this_genomic_align_block,
|
|
1058 $species_order, $align_slice_length);
|
|
1059 }
|
|
1060 }
|
|
1061 }
|
|
1062 #It is possible for the same region in the ref species (eg Gibbon) to align to 2 different blocks (pairwise to human in the case of the EPO low coverage alignments). In this case, although the incoming $genomic_align_blocks will have 2 blocks, the $sorted_genomic_align_blocks will only contain 1 of the blocks. It may happen that one species occurs in one block (eg gorilla) but not in the other. However, the $species_order will contain gorilla but the $sorted_genomic_align_blocks may not. This results in a slice being created for gorilla but it has no slice_mapper_pairs. Must check the slices and remove any that have {'slice_mapper_pairs'} as undef (no alignment comes out as a GAP).
|
|
1063
|
|
1064 if ($species_order) {
|
|
1065 my $slices = $self->{_slices};
|
|
1066 for (my $i = (@$slices-1); $i >= 0; --$i) {
|
|
1067 if (@{$slices->[$i]->{'slice_mapper_pairs'}} == 0) {
|
|
1068 #remove from {slices}
|
|
1069 delete $self->{slices}->{$slices->[$i]->genome_db->name};
|
|
1070 #remove from {_slices}
|
|
1071 splice @$slices, $i, 1;
|
|
1072 }
|
|
1073 }
|
|
1074 }
|
|
1075
|
|
1076
|
|
1077
|
|
1078 return $self;
|
|
1079 }
|
|
1080
|
|
1081 =head2 _add_GenomicAlign_to_a_Slice
|
|
1082
|
|
1083 =cut
|
|
1084
|
|
1085 sub _add_GenomicAlign_to_a_Slice {
|
|
1086 my ($self, $this_genomic_align, $this_genomic_align_block, $species_order, $align_slice_length) = @_;
|
|
1087
|
|
1088 my $expanded = $self->{expanded};
|
|
1089 my $species = $this_genomic_align->dnafrag->genome_db->name;
|
|
1090
|
|
1091 if (!defined($self->{slices}->{lc($species)})) {
|
|
1092 $self->{slices}->{lc($species)} = [new Bio::EnsEMBL::Compara::AlignSlice::Slice(
|
|
1093 -length => $align_slice_length,
|
|
1094 -requesting_slice => $self->reference_Slice,
|
|
1095 -align_slice => $self,
|
|
1096 -method_link_species_set => $self->{_method_link_species_set},
|
|
1097 -genome_db => $this_genomic_align->dnafrag->genome_db,
|
|
1098 -expanded => $expanded,
|
|
1099 )];
|
|
1100 push(@{$self->{_slices}}, $self->{slices}->{lc($species)}->[0]);
|
|
1101 }
|
|
1102
|
|
1103 my $this_block_start = $this_genomic_align_block->start;
|
|
1104 my $this_block_end = $this_genomic_align_block->end;
|
|
1105 my $this_core_slice = $this_genomic_align->get_Slice();
|
|
1106 if (!$this_core_slice) {
|
|
1107 $this_core_slice = new Bio::EnsEMBL::Slice(
|
|
1108 -coord_system => $aligngap_coord_system,
|
|
1109 -seq_region_name => "GAP",
|
|
1110 -start => $this_block_start,
|
|
1111 -end => $this_block_end,
|
|
1112 -strand => 0
|
|
1113 );
|
|
1114 $this_core_slice->{seq} = "." x ($this_block_end - $this_block_start + 1);
|
|
1115 }
|
|
1116 return if (!$this_core_slice); ## The restriction of the GenomicAlignBlock may return a void GenomicAlign
|
|
1117
|
|
1118 ## This creates a link between the slice and the tree node. This is required to display
|
|
1119 ## the tree on the web interface.
|
|
1120 if ($this_genomic_align->genome_db->name eq "ancestral_sequences") {
|
|
1121 foreach my $genomic_align_node (@{$this_genomic_align_block->get_all_sorted_genomic_align_nodes}) {
|
|
1122 my $genomic_align_group = $genomic_align_node->genomic_align_group;
|
|
1123 next if (!$genomic_align_group);
|
|
1124
|
|
1125 foreach my $genomic_align (@{$genomic_align_group->get_all_GenomicAligns}) {
|
|
1126 if ($this_genomic_align == $genomic_align) {
|
|
1127 my $simple_tree = $genomic_align_node->newick_simple_format();
|
|
1128 $simple_tree =~ s/\_[^\_]+\_\d+\_\d+\[[\+\-]\]//g;
|
|
1129 $simple_tree =~ s/\:[\d\.]+//g;
|
|
1130 $this_core_slice->{_tree} = $simple_tree;
|
|
1131 last;
|
|
1132 }
|
|
1133 }
|
|
1134 }
|
|
1135 }
|
|
1136
|
|
1137 my $this_mapper = $this_genomic_align->get_Mapper(0, !$expanded);
|
|
1138 # Fix block start and block end for composite segments (2X genomes)
|
|
1139 if ($this_genomic_align->cigar_line =~ /^(\d*)X/ or $this_genomic_align->cigar_line =~ /(\d*)X$/) {
|
|
1140 $this_block_start = undef;
|
|
1141 $this_block_end = undef;
|
|
1142 my @blocks = $this_mapper->map_coordinates("sequence", $this_genomic_align->dnafrag_start,
|
|
1143 $this_genomic_align->dnafrag_end, $this_genomic_align->dnafrag_strand, "sequence");
|
|
1144 foreach my $this_block (@blocks) {
|
|
1145 next if ($this_block->isa("Bio::EnsEMBL::Mapper::Gap"));
|
|
1146 $this_block_start = $this_block->start if (!defined($this_block_start) or $this_block->start < $this_block_start);
|
|
1147 $this_block_end = $this_block->end if (!defined($this_block_end) or $this_block->end > $this_block_end);
|
|
1148 }
|
|
1149 }
|
|
1150
|
|
1151 # Choose the appropriate AS::Slice for adding this bit of the alignment
|
|
1152 my $this_underlying_slice = $self->_choose_underlying_Slice($this_genomic_align, $this_block_start,
|
|
1153 $this_block_end, $align_slice_length, $species_order);
|
|
1154
|
|
1155 # Add a Slice, Mapper, and start-end-strand coordinates to an underlying AS::Slice
|
|
1156 $this_underlying_slice->add_Slice_Mapper_pair(
|
|
1157 $this_core_slice,
|
|
1158 $this_mapper,
|
|
1159 $this_block_start,
|
|
1160 $this_block_end,
|
|
1161 $this_genomic_align->dnafrag_strand
|
|
1162 );
|
|
1163 return;
|
|
1164 }
|
|
1165
|
|
1166
|
|
1167 sub _choose_underlying_Slice {
|
|
1168 my ($self, $this_genomic_align, $this_block_start, $this_block_end, $align_slice_length, $species_order) = @_;
|
|
1169 my $underlying_slice = undef;
|
|
1170
|
|
1171 my $expanded = $self->{expanded};
|
|
1172 my $species = $this_genomic_align->dnafrag->genome_db->name;
|
|
1173
|
|
1174 if (defined($this_genomic_align->{_temporary_AS_underlying_Slice})) {
|
|
1175 my $preset_underlying_slice = $this_genomic_align->{_temporary_AS_underlying_Slice};
|
|
1176 delete($this_genomic_align->{_temporary_AS_underlying_Slice});
|
|
1177 return $preset_underlying_slice;
|
|
1178 }
|
|
1179
|
|
1180 if (!defined($self->{slices}->{lc($species)})) {
|
|
1181 ## No slice for this species yet. Create, store and return it
|
|
1182 $underlying_slice = new Bio::EnsEMBL::Compara::AlignSlice::Slice(
|
|
1183 -length => $align_slice_length,
|
|
1184 -requesting_slice => $self->reference_Slice,
|
|
1185 -align_slice => $self,
|
|
1186 -method_link_species_set => $self->{_method_link_species_set},
|
|
1187 -genome_db => $this_genomic_align->dnafrag->genome_db,
|
|
1188 -expanded => $expanded,
|
|
1189 );
|
|
1190 push(@{$self->{_slices}}, $underlying_slice);
|
|
1191 push(@{$self->{slices}->{lc($species)}}, $underlying_slice);
|
|
1192 return $underlying_slice;
|
|
1193 }
|
|
1194
|
|
1195 if ($species_order) {
|
|
1196 my $preset_underlying_slice = undef;
|
|
1197 foreach my $this_underlying_slice (@{$self->{_slices}}) {
|
|
1198 if (!$this_genomic_align->{original_dbID} and $this_genomic_align->dbID) {
|
|
1199 $this_genomic_align->{original_dbID} = $this_genomic_align->dbID;
|
|
1200 }
|
|
1201 if (grep {$_ == $this_genomic_align->{original_dbID}}
|
|
1202 @{$this_underlying_slice->{genomic_align_ids}}) {
|
|
1203 $preset_underlying_slice = $this_underlying_slice;
|
|
1204 }
|
|
1205 }
|
|
1206 if ($preset_underlying_slice) {
|
|
1207 my $overlap = 0;
|
|
1208 my $slice_mapper_pairs = $preset_underlying_slice->get_all_Slice_Mapper_pairs();
|
|
1209 foreach my $slice_mapper_pair (@$slice_mapper_pairs) {
|
|
1210 my $block_start = $slice_mapper_pair->{start};
|
|
1211 my $block_end = $slice_mapper_pair->{end};
|
|
1212 #a block may not have a start and end if there is no sequence
|
|
1213 #eg the cigar_line looks like 139D17186X
|
|
1214 next if (!defined $this_block_start || !defined $this_block_end);
|
|
1215 if ($this_block_start <= $block_end and $this_block_end >= $block_start) {
|
|
1216 $overlap = 1;
|
|
1217 last;
|
|
1218 }
|
|
1219 }
|
|
1220 if (!$overlap) {
|
|
1221 ## This block does not overlap any previous block: add it!
|
|
1222 $underlying_slice = $preset_underlying_slice;
|
|
1223 }
|
|
1224 }
|
|
1225 }
|
|
1226
|
|
1227 if (!$underlying_slice) {
|
|
1228 ## Try to add this alignment to an existing underlying Bio::EnsEMBL::Compara::AlignSlice::Slice
|
|
1229 SLICE: foreach my $this_underlying_slice (@{$self->{slices}->{lc($species)}}) {
|
|
1230 my $slice_mapper_pairs = $this_underlying_slice->get_all_Slice_Mapper_pairs();
|
|
1231 PAIRS: foreach my $slice_mapper_pair (@$slice_mapper_pairs) {
|
|
1232 my $block_start = $slice_mapper_pair->{start};
|
|
1233 my $block_end = $slice_mapper_pair->{end};
|
|
1234 if ($this_block_start <= $block_end and $this_block_end >= $block_start) {
|
|
1235 next SLICE; ## This block overlaps a previous block
|
|
1236 }
|
|
1237 }
|
|
1238 ## This block does not overlap any previous block: add it!
|
|
1239 $underlying_slice = $this_underlying_slice;
|
|
1240 }
|
|
1241 }
|
|
1242
|
|
1243 if (!$underlying_slice) {
|
|
1244 ## This block overlaps at least one block in every available underlying
|
|
1245 ## Bio::EnsEMBL::Compara::AlignSlice::Slice. Create a new one!
|
|
1246 $underlying_slice = new Bio::EnsEMBL::Compara::AlignSlice::Slice(
|
|
1247 -length => $align_slice_length,
|
|
1248 -requesting_slice => $self->reference_Slice,
|
|
1249 -align_slice => $self,
|
|
1250 -method_link_species_set => $self->{_method_link_species_set},
|
|
1251 -genome_db => $this_genomic_align->dnafrag->genome_db,
|
|
1252 -expanded => $expanded,
|
|
1253 );
|
|
1254 push(@{$self->{_slices}}, $underlying_slice);
|
|
1255 push(@{$self->{slices}->{lc($species)}}, $underlying_slice);
|
|
1256 }
|
|
1257
|
|
1258 # if ($this_genomic_align->cigar_line =~ /X/) {
|
|
1259 # ## This GenomicAlign is part of a composite alignment
|
|
1260 # my $genomic_align_group = $this_genomic_align->genomic_align_group_by_type("composite");
|
|
1261 # foreach my $this_genomic_align (@{$genomic_align_group->genomic_align_array}) {
|
|
1262 # # next if ($this_genomic_align
|
|
1263 # }
|
|
1264 # }
|
|
1265
|
|
1266 return $underlying_slice;
|
|
1267 }
|
|
1268
|
|
1269
|
|
1270 =head2 _sort_and_restrict_GenomicAlignBlocks
|
|
1271
|
|
1272 Arg[1] : listref of Bio::EnsEMBL::Compara::GenomicAlignBlocks $gabs
|
|
1273 Example : $sorted_gabs = _sort_GenomicAlignBlocks($gabs);
|
|
1274 Description : This method returns the original list of
|
|
1275 Bio::EnsEMBL::Compara::GenomicAlignBlock objects in order
|
|
1276 Returntype : listref of Bio::EnsEMBL::Compara::GenomicAlignBlock objects
|
|
1277 Exceptions :
|
|
1278 Caller : methodname()
|
|
1279
|
|
1280 =cut
|
|
1281
|
|
1282 sub _sort_and_restrict_GenomicAlignBlocks {
|
|
1283 my ($genomic_align_blocks) = @_;
|
|
1284 my $sorted_genomic_align_blocks = [];
|
|
1285 return $sorted_genomic_align_blocks if (!$genomic_align_blocks);
|
|
1286
|
|
1287 my $last_end;
|
|
1288 foreach my $this_genomic_align_block (sort _sort_gabs @{$genomic_align_blocks}) {
|
|
1289 if (defined($last_end) and
|
|
1290 $this_genomic_align_block->reference_genomic_align->dnafrag_start <= $last_end) {
|
|
1291 if ($this_genomic_align_block->reference_genomic_align->dnafrag_end > $last_end) {
|
|
1292 $this_genomic_align_block = $this_genomic_align_block->restrict_between_reference_positions($last_end + 1, undef);
|
|
1293 } else {
|
|
1294 warning("Ignoring GenomicAlignBlock because it overlaps".
|
|
1295 " previous GenomicAlignBlock");
|
|
1296 next;
|
|
1297 }
|
|
1298 }
|
|
1299 $last_end = $this_genomic_align_block->reference_genomic_align->dnafrag_end;
|
|
1300 push(@$sorted_genomic_align_blocks, $this_genomic_align_block);
|
|
1301 }
|
|
1302
|
|
1303 return $sorted_genomic_align_blocks;
|
|
1304 }
|
|
1305
|
|
1306 =head2 _sort_GenomicAlignBlocks
|
|
1307
|
|
1308 Arg[1] : listref of Bio::EnsEMBL::Compara::GenomicAlignBlocks $gabs
|
|
1309 Example : $sorted_gabs = _sort_GenomicAlignBlocks($gabs);
|
|
1310 Description : This method returns the original list of
|
|
1311 Bio::EnsEMBL::Compara::GenomicAlignBlock objects in order
|
|
1312 Returntype : listref of Bio::EnsEMBL::Compara::GenomicAlignBlock objects
|
|
1313 Exceptions :
|
|
1314 Caller : methodname()
|
|
1315
|
|
1316 =cut
|
|
1317
|
|
1318 sub _sort_GenomicAlignBlocks {
|
|
1319 my ($genomic_align_blocks) = @_;
|
|
1320 my $sorted_genomic_align_blocks = [];
|
|
1321 return $sorted_genomic_align_blocks if (!$genomic_align_blocks);
|
|
1322
|
|
1323 my $last_end;
|
|
1324 foreach my $this_genomic_align_block (sort _sort_gabs @{$genomic_align_blocks}) {
|
|
1325 if (!defined($last_end) or
|
|
1326 $this_genomic_align_block->reference_genomic_align->dnafrag_start > $last_end) {
|
|
1327 push(@$sorted_genomic_align_blocks, $this_genomic_align_block);
|
|
1328 $last_end = $this_genomic_align_block->reference_genomic_align->dnafrag_end;
|
|
1329 } else {
|
|
1330 my $block_id;
|
|
1331 if (UNIVERSAL::isa($a, "Bio::EnsEMBL::Compara::GenomicAlignBlock")) {
|
|
1332 warning("Ignoring Bio::EnsEMBL::Compara::GenomicAlignBlock #".
|
|
1333 ($this_genomic_align_block->dbID or "-unknown")." because it overlaps".
|
|
1334 " previous Bio::EnsEMBL::Compara::GenomicAlignBlock");
|
|
1335 } else {
|
|
1336 warning("Ignoring Bio::EnsEMBL::Compara::GenomicAlignTree #".
|
|
1337 ($this_genomic_align_block->node_id or "-unknown")." because it overlaps".
|
|
1338 " previous Bio::EnsEMBL::Compara::GenomicAlignTree");
|
|
1339 }
|
|
1340
|
|
1341 }
|
|
1342 }
|
|
1343
|
|
1344 return $sorted_genomic_align_blocks;
|
|
1345 }
|
|
1346
|
|
1347 sub _sort_gabs {
|
|
1348
|
|
1349 if (UNIVERSAL::isa($a, "Bio::EnsEMBL::Compara::GenomicAlignBlock")) {
|
|
1350 _sort_genomic_align_block();
|
|
1351 } else {
|
|
1352 _sort_genomic_align_tree();
|
|
1353 }
|
|
1354 }
|
|
1355
|
|
1356 sub _sort_genomic_align_block {
|
|
1357
|
|
1358 if ($a->reference_genomic_align->dnafrag_start == $b->reference_genomic_align->dnafrag_start) {
|
|
1359 ## This may happen when a block has been split into small pieces and some of them contain
|
|
1360 ## gaps only for the reference species. In this case, use another species for sorting these
|
|
1361 ## genomic_align_blocks
|
|
1362 for (my $i = 0; $i<@{$a->get_all_non_reference_genomic_aligns()}; $i++) {
|
|
1363 for (my $j = 0; $j<@{$b->get_all_non_reference_genomic_aligns()}; $j++) {
|
|
1364 next if ($a->get_all_non_reference_genomic_aligns->[$i]->dnafrag_id !=
|
|
1365 $b->get_all_non_reference_genomic_aligns->[$j]->dnafrag_id);
|
|
1366 if (($a->get_all_non_reference_genomic_aligns->[$i]->dnafrag_start !=
|
|
1367 $b->get_all_non_reference_genomic_aligns->[$j]->dnafrag_start) and
|
|
1368 ($a->get_all_non_reference_genomic_aligns->[$i]->dnafrag_strand ==
|
|
1369 $b->get_all_non_reference_genomic_aligns->[$j]->dnafrag_strand)) {
|
|
1370 ## This other genomic_align is not a full gap and ca be used to sort these blocks
|
|
1371 if ($a->get_all_non_reference_genomic_aligns->[$i]->dnafrag_strand == 1) {
|
|
1372 return $a->get_all_non_reference_genomic_aligns->[$i]->dnafrag_start <=>
|
|
1373 $b->get_all_non_reference_genomic_aligns->[$j]->dnafrag_start;
|
|
1374 } else {
|
|
1375 return $b->get_all_non_reference_genomic_aligns->[$j]->dnafrag_start <=>
|
|
1376 $a->get_all_non_reference_genomic_aligns->[$i]->dnafrag_start;
|
|
1377 }
|
|
1378 }
|
|
1379 }
|
|
1380 }
|
|
1381 } else {
|
|
1382 return $a->reference_genomic_align->dnafrag_start <=> $b->reference_genomic_align->dnafrag_start
|
|
1383 }
|
|
1384 }
|
|
1385
|
|
1386 sub _sort_genomic_align_tree {
|
|
1387
|
|
1388 if ($a->reference_genomic_align->dnafrag_start == $b->reference_genomic_align->dnafrag_start) {
|
|
1389 ## This may happen when a block has been split into small pieces and some of them contain
|
|
1390 ## gaps only for the reference species. In this case, use another species for sorting these
|
|
1391 ## genomic_align_blocks
|
|
1392 my $a_leaves = $a->get_all_leaves;
|
|
1393 my $b_leaves = $b->get_all_leaves;
|
|
1394
|
|
1395 for (my $i = 0; $i < @$a_leaves; $i++) {
|
|
1396 for (my $j = 0; $j < @$b_leaves; $j++) {
|
|
1397 #look at high coverage sequences only
|
|
1398 my $a_gas = $a_leaves->[$i]->get_all_genomic_aligns_for_node;
|
|
1399 next if (@$a_gas > 1);
|
|
1400 my $a_ga = $a_gas->[0];
|
|
1401
|
|
1402 my $b_gas = $b_leaves->[$j]->get_all_genomic_aligns_for_node;
|
|
1403 next if (@$b_gas > 1);
|
|
1404 my $b_ga = $b_gas->[0];
|
|
1405
|
|
1406 next if ($a_ga->dnafrag_id != $b_ga->dnafrag_id);
|
|
1407 if (($a_ga->dnafrag_start != $b_ga->dnafrag_start) and ($a_ga->dnafrag_strand == $b_ga->dnafrag_strand)) {
|
|
1408 ## This other genomic_align is not a full gap and ca be used to sort these blocks
|
|
1409 if ($a_ga->dnafrag_strand == 1) {
|
|
1410 return $a_ga->dnafrag_start <=> $b_ga->dnafrag_start;
|
|
1411 } else {
|
|
1412 return $b_ga->dnafrag_start <=> $a_ga->dnafrag_start;
|
|
1413 }
|
|
1414 }
|
|
1415 }
|
|
1416 }
|
|
1417 } else {
|
|
1418 return $a->reference_genomic_align->dnafrag_start <=> $b->reference_genomic_align->dnafrag_start
|
|
1419 }
|
|
1420 }
|
|
1421
|
|
1422 =head2 _sort_and_compile_GenomicAlignBlocks
|
|
1423
|
|
1424 Arg[1] : listref of Bio::EnsEMBL::Compara::GenomicAlignBlocks $gabs
|
|
1425 Example : $sorted_fake_gabs = _sort_and_compile_GenomicAlignBlocks($gabs);
|
|
1426 Description : This method returns a list of
|
|
1427 Bio::EnsEMBL::Compara::GenomicAlignBlock objects sorted by
|
|
1428 position on the reference Bio::EnsEMBL::Compara::DnaFrag. If two
|
|
1429 or more Bio::EnsEMBL::Compara::GenomicAlignBlock objects
|
|
1430 overlap, it compile them, using the _compile_GenomicAlignBlocks
|
|
1431 method.
|
|
1432 Returntype : listref of Bio::EnsEMBL::Compara::GenomicAlignBlock objects
|
|
1433 Exceptions :
|
|
1434 Caller : methodname()
|
|
1435
|
|
1436 =cut
|
|
1437
|
|
1438 sub _sort_and_compile_GenomicAlignBlocks {
|
|
1439 my ($genomic_align_blocks) = @_;
|
|
1440 my $sorted_genomic_align_blocks = [];
|
|
1441 return $sorted_genomic_align_blocks if (!$genomic_align_blocks);
|
|
1442
|
|
1443 ##############################################################################################
|
|
1444 ##
|
|
1445 ## Compile GenomicAlignBlocks in group of GenomicAlignBlocks based on reference coordinates
|
|
1446 ##
|
|
1447 my $sets_of_genomic_align_blocks = [];
|
|
1448 my $start_pos;
|
|
1449 my $end_pos;
|
|
1450 my $this_set_of_genomic_align_blocks = [];
|
|
1451 foreach my $this_genomic_align_block (sort _sort_gabs @$genomic_align_blocks) {
|
|
1452 my $this_start_pos = $this_genomic_align_block->reference_genomic_align->dnafrag_start;
|
|
1453 my $this_end_pos = $this_genomic_align_block->reference_genomic_align->dnafrag_end;
|
|
1454 if (defined($end_pos) and ($this_start_pos <= $end_pos)) {
|
|
1455 # this genomic_align_block overlaps previous one. Extend this set_of_coordinates
|
|
1456 $end_pos = $this_end_pos if ($this_end_pos > $end_pos);
|
|
1457 } else {
|
|
1458 # there is a gap between this genomic_align_block and the previous one. Close and save
|
|
1459 # this set_of_genomic_align_blocks (if it exists) and start a new one.
|
|
1460 push(@{$sets_of_genomic_align_blocks}, [$start_pos, $end_pos, $this_set_of_genomic_align_blocks])
|
|
1461 if (defined(@$this_set_of_genomic_align_blocks));
|
|
1462 $start_pos = $this_start_pos;
|
|
1463 $end_pos = $this_end_pos;
|
|
1464 $this_set_of_genomic_align_blocks = [];
|
|
1465 }
|
|
1466 push(@$this_set_of_genomic_align_blocks, $this_genomic_align_block);
|
|
1467 }
|
|
1468 push(@{$sets_of_genomic_align_blocks}, [$start_pos, $end_pos, $this_set_of_genomic_align_blocks])
|
|
1469 if (defined(@$this_set_of_genomic_align_blocks));
|
|
1470 ##
|
|
1471 ##############################################################################################
|
|
1472
|
|
1473 foreach my $this_set_of_genomic_align_blocks (@$sets_of_genomic_align_blocks) {
|
|
1474 my $this_compiled_genomic_align_block;
|
|
1475 if (@$this_set_of_genomic_align_blocks == 1) {
|
|
1476 $this_compiled_genomic_align_block = $this_set_of_genomic_align_blocks->[0];
|
|
1477 } else {
|
|
1478 $this_compiled_genomic_align_block =
|
|
1479 _compile_GenomicAlignBlocks(@$this_set_of_genomic_align_blocks);
|
|
1480 }
|
|
1481 push(@{$sorted_genomic_align_blocks}, $this_compiled_genomic_align_block);
|
|
1482 }
|
|
1483
|
|
1484 return $sorted_genomic_align_blocks;
|
|
1485 }
|
|
1486
|
|
1487 =head2 _compile_GenomicAlignBlocks
|
|
1488
|
|
1489 Arg [1] : integer $start_pos (the start of the fake genomic_align)
|
|
1490 Arg [2] : integer $end_pos (the end of the fake genomic_align)
|
|
1491 Arg [3] : listref of Bio::EnsEMBL::Compara::GenomicAlignBlocks $set_of_genomic_align_blocks
|
|
1492 $all_genomic_align_blocks (the pairwise genomic_align_blocks used for
|
|
1493 this fake multiple genomic_aling_block)
|
|
1494 Example :
|
|
1495 Description :
|
|
1496 Returntype : Bio::EnsEMBL::Compara::GenomicAlignBlock object
|
|
1497 Exceptions :
|
|
1498 Caller : methodname
|
|
1499
|
|
1500 =cut
|
|
1501
|
|
1502 sub _compile_GenomicAlignBlocks {
|
|
1503 my ($start_pos, $end_pos, $all_genomic_align_blocks) = @_;
|
|
1504
|
|
1505 ############################################################################################
|
|
1506 ##
|
|
1507 ## Change strands in order to have all reference genomic aligns on the forward strand
|
|
1508 ##
|
|
1509 my $strand;
|
|
1510 foreach my $this_genomic_align_block (@$all_genomic_align_blocks) {
|
|
1511 my $this_genomic_align = $this_genomic_align_block->reference_genomic_align;
|
|
1512 if (!defined($strand)) {
|
|
1513 $strand = $this_genomic_align->dnafrag_strand;
|
|
1514 } elsif ($strand != $this_genomic_align->dnafrag_strand) {
|
|
1515 $strand = 0;
|
|
1516 }
|
|
1517 if ($this_genomic_align->dnafrag_strand == -1) {
|
|
1518
|
|
1519 if (UNIVERSAL::isa($this_genomic_align_block, "Bio::EnsEMBL::Compara::GenomicAlignTree")) {
|
|
1520 foreach my $this_node (@{$this_genomic_align_block->get_all_nodes}) {
|
|
1521 my $genomic_align_group = $this_node->genomic_align_group;
|
|
1522 next if (!$genomic_align_group);
|
|
1523 foreach my $genomic_align (@{$genomic_align_group->get_all_GenomicAligns}) {
|
|
1524 $genomic_align->reverse_complement;
|
|
1525 }
|
|
1526 }
|
|
1527 } else {
|
|
1528 foreach my $genomic_align (@{$this_genomic_align_block->genomic_align_array}) {
|
|
1529 $genomic_align->reverse_complement;
|
|
1530 }
|
|
1531 }
|
|
1532 }
|
|
1533 }
|
|
1534 ##
|
|
1535 ############################################################################################
|
|
1536
|
|
1537 ## Nothing has to be compiled if there is one single GenomicAlignBlock!
|
|
1538 $all_genomic_align_blocks->[0]->reverse_complement;
|
|
1539 return $all_genomic_align_blocks->[0] if (scalar(@$all_genomic_align_blocks) == 1);
|
|
1540
|
|
1541 ############################################################################################
|
|
1542 ##
|
|
1543 ## Fix all sequences
|
|
1544 ##
|
|
1545 foreach my $this_genomic_align_block (@$all_genomic_align_blocks) {
|
|
1546 my $this_genomic_align = $this_genomic_align_block->reference_genomic_align;
|
|
1547 my $this_start_pos = $this_genomic_align->dnafrag_start;
|
|
1548 my $this_end_pos = $this_genomic_align->dnafrag_end;
|
|
1549 my $starting_gap = $this_start_pos - $start_pos;
|
|
1550 my $ending_gap = $end_pos - $this_end_pos;
|
|
1551
|
|
1552 my $this_cigar_line = $this_genomic_align->cigar_line;
|
|
1553 my $this_original_sequence = $this_genomic_align->original_sequence;
|
|
1554 $this_genomic_align->aligned_sequence("");
|
|
1555 if ($starting_gap) {
|
|
1556 $this_cigar_line = $starting_gap."M".$this_cigar_line;
|
|
1557 $this_original_sequence = ("N" x $starting_gap).$this_original_sequence;
|
|
1558 }
|
|
1559 if ($ending_gap) {
|
|
1560 $this_cigar_line .= $ending_gap."M";
|
|
1561 $this_original_sequence .= ("N" x $ending_gap);
|
|
1562 }
|
|
1563 $this_genomic_align->cigar_line($this_cigar_line);
|
|
1564 $this_genomic_align->original_sequence($this_original_sequence);
|
|
1565
|
|
1566 foreach my $this_genomic_align (@{$this_genomic_align_block->get_all_non_reference_genomic_aligns}) {
|
|
1567 $this_genomic_align->aligned_sequence("");
|
|
1568 my $this_cigar_line = $this_genomic_align->cigar_line;
|
|
1569 $this_cigar_line = $starting_gap."D".$this_cigar_line if ($starting_gap);
|
|
1570 $this_cigar_line .= $ending_gap."D" if ($ending_gap);
|
|
1571 $this_genomic_align->cigar_line($this_cigar_line);
|
|
1572 $this_genomic_align->aligned_sequence(); # compute aligned_sequence using cigar_line
|
|
1573 }
|
|
1574 }
|
|
1575 ##
|
|
1576 ############################################################################################
|
|
1577
|
|
1578 ############################################################################################
|
|
1579 ##
|
|
1580 ## Distribute gaps
|
|
1581 ##
|
|
1582 my $aln_pos = 0;
|
|
1583 my $gap;
|
|
1584 do {
|
|
1585 my $gap_pos;
|
|
1586 my $genomic_align_block;
|
|
1587 $gap = undef;
|
|
1588
|
|
1589 ## Get the (next) first gap from all the alignments (sets: $gap_pos, $gap and $genomic_align_block_id)
|
|
1590 foreach my $this_genomic_align_block (@$all_genomic_align_blocks) {
|
|
1591 my $this_gap_pos = index($this_genomic_align_block->reference_genomic_align->aligned_sequence,
|
|
1592 "-", $aln_pos);
|
|
1593 if ($this_gap_pos > 0 and (!defined($gap_pos) or $this_gap_pos < $gap_pos)) {
|
|
1594 $gap_pos = $this_gap_pos;
|
|
1595 my $gap_string = substr($this_genomic_align_block->reference_genomic_align->aligned_sequence,
|
|
1596 $gap_pos);
|
|
1597 ($gap) = $gap_string =~ /^(\-+)/;
|
|
1598 $genomic_align_block = $this_genomic_align_block;
|
|
1599 }
|
|
1600 }
|
|
1601
|
|
1602 ## If a gap has been found, apply it to the other GAB
|
|
1603 if ($gap) {
|
|
1604 $aln_pos = $gap_pos + length($gap);
|
|
1605 foreach my $this_genomic_align_block (@$all_genomic_align_blocks) {
|
|
1606 next if ($genomic_align_block eq $this_genomic_align_block); # Do not add gap to itself!!
|
|
1607 if (UNIVERSAL::isa($this_genomic_align_block, "Bio::EnsEMBL::Compara::GenomicAlignTree")) {
|
|
1608 foreach my $this_node (@{$this_genomic_align_block->get_all_nodes}) {
|
|
1609 my $genomic_align_group = $this_node->genomic_align_group;
|
|
1610 next if (!$genomic_align_group);
|
|
1611 foreach my $this_genomic_align (@{$genomic_align_group->get_all_GenomicAligns}) {
|
|
1612 # insert gap in the aligned_sequence
|
|
1613 my $aligned_sequence = $this_genomic_align->aligned_sequence;
|
|
1614 substr($aligned_sequence, $gap_pos, 0, $gap);
|
|
1615 $this_genomic_align->aligned_sequence($aligned_sequence);
|
|
1616 }
|
|
1617 }
|
|
1618 } else {
|
|
1619 foreach my $this_genomic_align (@{$this_genomic_align_block->genomic_align_array}) {
|
|
1620 # insert gap in the aligned_sequence
|
|
1621 my $aligned_sequence = $this_genomic_align->aligned_sequence;
|
|
1622 substr($aligned_sequence, $gap_pos, 0, $gap);
|
|
1623 $this_genomic_align->aligned_sequence($aligned_sequence);
|
|
1624 }
|
|
1625 }
|
|
1626 }
|
|
1627 }
|
|
1628
|
|
1629 } while ($gap); # exit loop if no gap has been found
|
|
1630
|
|
1631 ## Fix all cigar_lines in order to match new aligned_sequences
|
|
1632 foreach my $this_genomic_align_block (@$all_genomic_align_blocks) {
|
|
1633 foreach my $this_genomic_align (@{$this_genomic_align_block->genomic_align_array}) {
|
|
1634 $this_genomic_align->cigar_line(""); # undef old cigar_line
|
|
1635 $this_genomic_align->cigar_line(); # compute cigar_line from aligned_sequence
|
|
1636 }
|
|
1637 }
|
|
1638 ##
|
|
1639 ############################################################################################
|
|
1640
|
|
1641 ############################################################################################
|
|
1642 ##
|
|
1643 ## Create the reference_genomic_align for this fake genomic_align_block
|
|
1644 ##
|
|
1645 ## All the blocks have been edited and all the reference genomic_aling
|
|
1646 ## should be equivalent. Here, we create a new one with no fixed sequence.
|
|
1647 ## This permits to retrieve the real sequence when needed
|
|
1648 ##
|
|
1649 my $reference_genomic_align;
|
|
1650 if (@$all_genomic_align_blocks) {
|
|
1651 my $this_genomic_align = $all_genomic_align_blocks->[0]->reference_genomic_align;
|
|
1652 $reference_genomic_align = new Bio::EnsEMBL::Compara::GenomicAlign(
|
|
1653 -dbID => -1,
|
|
1654 -dnafrag => $this_genomic_align->dnafrag,
|
|
1655 -dnafrag_start => $start_pos,
|
|
1656 -dnafrag_end => $end_pos,
|
|
1657 -dnafrag_strand => 1,
|
|
1658 -cigar_line => $this_genomic_align->cigar_line,
|
|
1659 -method_link_species_set => $this_genomic_align->method_link_species_set,
|
|
1660 -visible => 1
|
|
1661 );
|
|
1662 }
|
|
1663 ##
|
|
1664 ############################################################################################
|
|
1665
|
|
1666 ## Create the genomic_align_array (the list of genomic_aling for this fake gab
|
|
1667 my $genomic_align_array = [$reference_genomic_align];
|
|
1668 foreach my $this_genomic_align_block (@$all_genomic_align_blocks) {
|
|
1669 foreach my $this_genomic_align (@{$this_genomic_align_block->get_all_non_reference_genomic_aligns}) {
|
|
1670 $this_genomic_align->genomic_align_block_id(0); # undef old genomic_align_block_id
|
|
1671 push(@$genomic_align_array, $this_genomic_align);
|
|
1672 }
|
|
1673 }
|
|
1674
|
|
1675 ## Create the fake multiple Bio::EnsEMBL::Compara::GenomicAlignBlock
|
|
1676 my $fake_genomic_align_block = new Bio::EnsEMBL::Compara::GenomicAlignBlock(
|
|
1677 -length => ($end_pos - $start_pos + 1),
|
|
1678 -genomic_align_array => $genomic_align_array,
|
|
1679 -reference_genomic_align => $reference_genomic_align,
|
|
1680 -level_id => 0
|
|
1681 );
|
|
1682
|
|
1683 if ($strand == -1) {
|
|
1684 $fake_genomic_align_block->reverse_complement;
|
|
1685 }
|
|
1686
|
|
1687 return $fake_genomic_align_block;
|
|
1688 }
|
|
1689
|
|
1690
|
|
1691 sub DESTROY {
|
|
1692 my $self = shift;
|
|
1693 ## Remove circular reference in order to allow Perl to clear the object
|
|
1694 $self->{all_genomic_align_blocks} = undef;
|
|
1695 }
|
|
1696
|
|
1697 1;
|