0
|
1 =head1 LICENSE
|
|
2
|
|
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
|
|
4 Genome Research Limited. All rights reserved.
|
|
5
|
|
6 This software is distributed under a modified Apache license.
|
|
7 For license details, please see
|
|
8
|
|
9 http://www.ensembl.org/info/about/code_licence.html
|
|
10
|
|
11 =head1 CONTACT
|
|
12
|
|
13 Please email comments or questions to the public Ensembl
|
|
14 developers list at <dev@ensembl.org>.
|
|
15
|
|
16 Questions may also be sent to the Ensembl help desk at
|
|
17 <helpdesk@ensembl.org>.
|
|
18
|
|
19 =head1 NAME
|
|
20
|
|
21 Bio::EnsEMBL::Compara::BaseGenomicAlignSet - Base class for GenomicAlignBlock and GenomicAlignTree
|
|
22
|
|
23 =head1 APPENDIX
|
|
24
|
|
25 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
|
|
26
|
|
27 =cut
|
|
28
|
|
29
|
|
30 # Let the code begin...
|
|
31
|
|
32 package Bio::EnsEMBL::Compara::BaseGenomicAlignSet;
|
|
33 use strict;
|
|
34
|
|
35 # Object preamble
|
|
36 use Bio::EnsEMBL::Utils::Argument qw(rearrange);
|
|
37 use Bio::EnsEMBL::Utils::Exception qw(throw warning info deprecate verbose);
|
|
38
|
|
39
|
|
40 =head2 slice
|
|
41
|
|
42 Arg [1] : (optional) Bio::EnsEMBL::Slice $reference_slice
|
|
43 Example : my $slice = $genomic_align_block->slice;
|
|
44 Example : $genomic_align_block->slice($slice);
|
|
45 Description: get/set for attribute slice.
|
|
46 Returntype : Bio::EnsEMBL::Slice object
|
|
47 Exceptions :
|
|
48 Caller : general
|
|
49 Status : Stable
|
|
50
|
|
51 =cut
|
|
52
|
|
53 sub slice {
|
|
54 my ($self, $reference_slice) = @_;
|
|
55
|
|
56 if (defined($reference_slice)) {
|
|
57 # throw "[$reference_slice] is not a Bio::EnsEMBL::Slice"
|
|
58 # unless $reference_slice->isa("Bio::EnsEMBL::Slice");
|
|
59 $self->{'reference_slice'} = $reference_slice;
|
|
60 }
|
|
61
|
|
62 return $self->{'reference_slice'};
|
|
63 }
|
|
64
|
|
65 =head2 reference_slice
|
|
66
|
|
67 Arg [1] : (optional) Bio::EnsEMBL::Slice $reference_slice
|
|
68 Example : my $reference_slice = $genomic_align_block->reference_slice;
|
|
69 Example : $genomic_align_block->reference_slice($slice);
|
|
70 Description: Alias for slice method. TO BE DEPRECATED.
|
|
71 Returntype : Bio::EnsEMBL::Slice object
|
|
72 Exceptions :
|
|
73 Caller : general
|
|
74 Status : Stable
|
|
75
|
|
76 =cut
|
|
77
|
|
78 sub reference_slice {
|
|
79 my ($self, $reference_slice) = @_;
|
|
80
|
|
81 return $self->slice($reference_slice);
|
|
82 }
|
|
83
|
|
84 =head2 start
|
|
85
|
|
86 Arg [1] : (optional) integer $start
|
|
87 Example : my $start = $genomic_align_block->start;
|
|
88 Example : $genomic_align_block->start(1035);
|
|
89 Description: get/set for attribute reference_slice_start. A value of 0 will set
|
|
90 the attribute to undefined.
|
|
91 Returntype : integer
|
|
92 Exceptions : none
|
|
93 Caller : general
|
|
94
|
|
95 =cut
|
|
96
|
|
97 sub start {
|
|
98 my $self = shift;
|
|
99 return $self->reference_slice_start(@_);
|
|
100 }
|
|
101
|
|
102
|
|
103 =head2 reference_slice_start
|
|
104
|
|
105 Arg [1] : integer $reference_slice_start
|
|
106 Example : my $reference_slice_start = $genomic_align_block->reference_slice_start;
|
|
107 Example : $genomic_align_block->reference_slice_start(1035);
|
|
108 Description: get/set for attribute reference_slice_start. A value of 0 will set
|
|
109 the attribute to undefined.
|
|
110 Returntype : integer
|
|
111 Exceptions : none
|
|
112 Caller : general
|
|
113 Status : Stable
|
|
114
|
|
115 =cut
|
|
116
|
|
117 sub reference_slice_start {
|
|
118 my ($self, $reference_slice_start) = @_;
|
|
119
|
|
120 if (defined($reference_slice_start)) {
|
|
121 $self->{'reference_slice_start'} = ($reference_slice_start or undef);
|
|
122 }
|
|
123
|
|
124 return $self->{'reference_slice_start'};
|
|
125 }
|
|
126
|
|
127
|
|
128 =head2 end
|
|
129
|
|
130 Arg [1] : (optional) integer $end
|
|
131 Example : my $end = $genomic_align_block->end;
|
|
132 Example : $genomic_align_block->end(1283);
|
|
133 Description: get/set for attribute reference_slice_end. A value of 0 will set
|
|
134 the attribute to undefined.
|
|
135 Returntype : integer
|
|
136 Exceptions : none
|
|
137 Caller : general
|
|
138
|
|
139 =cut
|
|
140
|
|
141 sub end {
|
|
142 my $self = shift;
|
|
143 return $self->reference_slice_end(@_);
|
|
144 }
|
|
145
|
|
146
|
|
147 =head2 reference_slice_end
|
|
148
|
|
149 Arg [1] : integer $reference_slice_end
|
|
150 Example : my $reference_slice_end = $genomic_align_block->reference_slice_end;
|
|
151 Example : $genomic_align_block->reference_slice_end(1283);
|
|
152 Description: get/set for attribute reference_slice_end. A value of 0 will set
|
|
153 the attribute to undefined.
|
|
154 Returntype : integer
|
|
155 Exceptions : none
|
|
156 Caller : general
|
|
157 Status : Stable
|
|
158
|
|
159 =cut
|
|
160
|
|
161 sub reference_slice_end {
|
|
162 my ($self, $reference_slice_end) = @_;
|
|
163
|
|
164 if (defined($reference_slice_end)) {
|
|
165 $self->{'reference_slice_end'} = ($reference_slice_end or undef);
|
|
166 }
|
|
167
|
|
168 return $self->{'reference_slice_end'};
|
|
169
|
|
170 }
|
|
171
|
|
172 =head2 strand
|
|
173
|
|
174 Arg [1] : integer $strand
|
|
175 Example : my $strand = $genomic_align_block->strand;
|
|
176 Example : $genomic_align_block->strand(-1);
|
|
177 Description: get/set for attribute strand. A value of 0 will set
|
|
178 the attribute to undefined.
|
|
179 Returntype : integer
|
|
180 Exceptions : none
|
|
181 Caller : general
|
|
182 Status : Stable
|
|
183
|
|
184 =cut
|
|
185
|
|
186 sub strand {
|
|
187 my ($self, $reference_slice_strand) = @_;
|
|
188
|
|
189 if (defined($reference_slice_strand)) {
|
|
190 $self->{'reference_slice_strand'} = ($reference_slice_strand or undef);
|
|
191 }
|
|
192
|
|
193 return $self->{'reference_slice_strand'};
|
|
194 }
|
|
195
|
|
196 =head2 reference_slice_strand
|
|
197
|
|
198 Arg [1] : integer $reference_slice_strand
|
|
199 Example : my $reference_slice_strand = $genomic_align_block->reference_slice_strand;
|
|
200 Example : $genomic_align_block->reference_slice_strand(-1);
|
|
201 Description: get/set for attribute reference_slice_strand. A value of 0 will set
|
|
202 the attribute to undefined.
|
|
203 Returntype : integer
|
|
204 Exceptions : none
|
|
205 Caller : general
|
|
206 Status : Stable
|
|
207
|
|
208 =cut
|
|
209
|
|
210 sub reference_slice_strand {
|
|
211 my ($self, $reference_slice_strand) = @_;
|
|
212
|
|
213 if (defined($reference_slice_strand)) {
|
|
214 $self->{'reference_slice_strand'} = ($reference_slice_strand or undef);
|
|
215 }
|
|
216
|
|
217 return $self->{'reference_slice_strand'};
|
|
218 }
|
|
219
|
|
220 =head2 get_original_strand
|
|
221
|
|
222 Args : none
|
|
223 Example : if (!$genomic_align_block->get_original_strand()) {
|
|
224 # original GenomicAlignBlock has been reverse-complemented
|
|
225 }
|
|
226 Description: getter for the _orignal_strand attribute
|
|
227 Returntype : none
|
|
228 Exceptions : none
|
|
229 Caller : general
|
|
230 Status : Stable
|
|
231
|
|
232 =cut
|
|
233
|
|
234 sub get_original_strand {
|
|
235 my ($self) = @_;
|
|
236
|
|
237 if (!defined($self->{_original_strand})) {
|
|
238 $self->{_original_strand} = 1;
|
|
239 }
|
|
240
|
|
241 return $self->{_original_strand};
|
|
242 }
|
|
243
|
|
244 =head2 restrict_between_reference_positions
|
|
245
|
|
246 Arg[1] : [optional] int $start, refers to the reference_dnafrag
|
|
247 Arg[2] : [optional] int $end, refers to the reference_dnafrag
|
|
248 Arg[3] : [optional] Bio::EnsEMBL::Compara::GenomicAlign $reference_GenomicAlign
|
|
249 Arg[4] : [optional] boolean $skip_empty_GenomicAligns
|
|
250 Example : none
|
|
251 Description: restrict this GenomicAlignBlock. It returns a new object unless no
|
|
252 restriction is needed. In that case, it returns the original unchanged
|
|
253 object
|
|
254 It might be the case that the restricted region coincide with a gap
|
|
255 in one or several GenomicAligns. By default these GenomicAligns are
|
|
256 returned with a dnafrag_end equals to its dnafrag_start + 1. For instance,
|
|
257 a GenomicAlign with dnafrag_start = 12345 and dnafrag_end = 12344
|
|
258 correspond to a block which goes on this region from before 12345 to
|
|
259 after 12344, ie just between 12344 and 12345. You can choose to remove
|
|
260 these empty GenomicAligns by setting $skip_empty_GenomicAligns to any
|
|
261 true value.
|
|
262 Returntype : Bio::EnsEMBL::Compara::GenomicAlignBlock object in scalar context. In
|
|
263 list context, returns the previous object and the start and end
|
|
264 positions of the restriction in alignment coordinates (from 1 to
|
|
265 alignment_length)
|
|
266 Exceptions : return undef if reference positions lie outside of the alignment
|
|
267 Caller : general
|
|
268 Status : At risk
|
|
269
|
|
270 =cut
|
|
271
|
|
272 sub restrict_between_reference_positions {
|
|
273 my ($self, $start, $end, $reference_genomic_align, $skip_empty_GenomicAligns) = @_;
|
|
274 my $genomic_align_set;
|
|
275 my $new_reference_genomic_align;
|
|
276 my $new_genomic_aligns = [];
|
|
277
|
|
278 $reference_genomic_align ||= $self->reference_genomic_align;
|
|
279 throw("A reference Bio::EnsEMBL::Compara::GenomicAlign must be given") if (!$reference_genomic_align);
|
|
280 $start = $reference_genomic_align->dnafrag_start if (!defined($start));
|
|
281 $end = $reference_genomic_align->dnafrag_end if (!defined($end));
|
|
282
|
|
283 if ($start > $reference_genomic_align->dnafrag_end or $end < $reference_genomic_align->dnafrag_start) {
|
|
284 # restricting outside of boundaries => return undef object
|
|
285 warn("restricting outside of boundaries => return undef object: $start-$end (".$reference_genomic_align->dnafrag_start."-".$reference_genomic_align->dnafrag_end.")");
|
|
286 return wantarray ? (undef, undef, undef) : undef;
|
|
287 }
|
|
288 my $number_of_base_pairs_to_trim_from_the_start = $start - $reference_genomic_align->dnafrag_start;
|
|
289 my $number_of_base_pairs_to_trim_from_the_end = $reference_genomic_align->dnafrag_end - $end;
|
|
290
|
|
291 my $is_ref_low_coverage = 0;
|
|
292 if ($reference_genomic_align->cigar_line =~ /X/) {
|
|
293 $is_ref_low_coverage = 1;
|
|
294 }
|
|
295
|
|
296 ## Skip if no restriction is needed. Return original object! We are still going on with the
|
|
297 ## restriction when either excess_at_the_start or excess_at_the_end are 0 as a (multiple)
|
|
298 ## alignment may start or end with gaps in the reference species. In that case, we want to
|
|
299 ## trim these gaps from the alignment as they fall just outside of the region of interest
|
|
300
|
|
301 ##Exception if the reference species is low coverage, then need to continue
|
|
302 ##with this routine to find out the correct align positions
|
|
303 if ($number_of_base_pairs_to_trim_from_the_start < 0 and $number_of_base_pairs_to_trim_from_the_end < 0 and !$is_ref_low_coverage) {
|
|
304 return wantarray ? ($self, 1, $self->length) : $self;
|
|
305 }
|
|
306
|
|
307 my $negative_strand = ($reference_genomic_align->dnafrag_strand == -1);
|
|
308
|
|
309 ## Create a new Bio::EnsEMBL::Compara::GenomicAlignBlock object
|
|
310 throw("Reference GenomicAlign not found!") if (!$reference_genomic_align);
|
|
311
|
|
312 my @reference_cigar = grep {$_} split(/(\d*[GDMXI])/, $reference_genomic_align->cigar_line);
|
|
313 if ($negative_strand) {
|
|
314 @reference_cigar = reverse @reference_cigar;
|
|
315 }
|
|
316
|
|
317 #If this is negative, eg when a slice starts in one block and ends in
|
|
318 #another, need to set to 0 to ensure we enter the loop below. This
|
|
319 #fixes a bug when using a 2x species as the reference and fetching using
|
|
320 #an expanded align slice.
|
|
321 if ($number_of_base_pairs_to_trim_from_the_start < 0) {
|
|
322 $number_of_base_pairs_to_trim_from_the_start = 0;
|
|
323 }
|
|
324
|
|
325 ## Parse start of cigar_line for the reference GenomicAlign
|
|
326 my $counter_of_trimmed_columns_from_the_start = 0; # num. of bp in the alignment to be trimmed
|
|
327
|
|
328 if ($number_of_base_pairs_to_trim_from_the_start >= 0) {
|
|
329 my $counter_of_trimmed_base_pairs = 0; # num of bp in the reference sequence we trim (from the start)
|
|
330 ## Loop through the cigar pieces
|
|
331 while (my $cigar = shift(@reference_cigar)) {
|
|
332 # Parse each cigar piece
|
|
333 my ($num, $type) = ($cigar =~ /^(\d*)([GDMXI])/);
|
|
334 $num = 1 if ($num eq "");
|
|
335
|
|
336 # Insertions are not part of the alignment, don't count them
|
|
337 if ($type ne "I") {
|
|
338 $counter_of_trimmed_columns_from_the_start += $num;
|
|
339 }
|
|
340
|
|
341 # Matches and insertions are actual base pairs in the reference
|
|
342 if ($type eq "M" or $type eq "I") {
|
|
343 $counter_of_trimmed_base_pairs += $num;
|
|
344 # If this cigar piece is too long and we overshoot the number of base pairs we want to trim,
|
|
345 # we substitute this cigar piece by a shorter one
|
|
346 if ($counter_of_trimmed_base_pairs > $number_of_base_pairs_to_trim_from_the_start) {
|
|
347 my $new_cigar_piece = "";
|
|
348 # length of the new cigar piece
|
|
349 my $length = $counter_of_trimmed_base_pairs - $number_of_base_pairs_to_trim_from_the_start;
|
|
350 if ($length > 1) {
|
|
351 $new_cigar_piece = $length.$type;
|
|
352 } elsif ($length == 1) {
|
|
353 $new_cigar_piece = $type;
|
|
354 }
|
|
355 unshift(@reference_cigar, $new_cigar_piece) if ($new_cigar_piece);
|
|
356
|
|
357 # There is no need to correct the counter of trimmed columns if we are in an insertion
|
|
358 # when we overshoot
|
|
359 if ($type eq "M") {
|
|
360 $counter_of_trimmed_columns_from_the_start -= $length;
|
|
361 }
|
|
362
|
|
363 ## We don't want to start with an insertion or a deletion. Trim them!
|
|
364 while (@reference_cigar and $reference_cigar[0] =~ /[DI]/) {
|
|
365 my ($num, $type) = ($reference_cigar[0] =~ /^(\d*)([DIGMX])/);
|
|
366 $num = 1 if ($num eq "");
|
|
367 # only counts deletions, insertions are not part of the aligment
|
|
368 $counter_of_trimmed_columns_from_the_start += $num if ($type eq "D");
|
|
369 shift(@reference_cigar);
|
|
370 }
|
|
371 last;
|
|
372 }
|
|
373 }
|
|
374 }
|
|
375 }
|
|
376
|
|
377 #If this is negative, eg when a slice starts in one block and ends in
|
|
378 #another, need to set to 0 to ensure we enter the loop below. This
|
|
379 #fixes a bug when using a 2x species as the reference and fetching using
|
|
380 #an expanded align slice.
|
|
381 if ($number_of_base_pairs_to_trim_from_the_end < 0) {
|
|
382 $number_of_base_pairs_to_trim_from_the_end = 0;
|
|
383 }
|
|
384
|
|
385 ## Parse end of cigar_line for the reference GenomicAlign
|
|
386 my $counter_of_trimmed_columns_from_the_end = 0; # num. of bp in the alignment to be trimmed
|
|
387 if ($number_of_base_pairs_to_trim_from_the_end >= 0) {
|
|
388 my $counter_of_trimmed_base_pairs = 0; # num of bp in the reference sequence we trim (from the end)
|
|
389 ## Loop through the cigar pieces
|
|
390 while (my $cigar = pop(@reference_cigar)) {
|
|
391 # Parse each cigar piece
|
|
392 my ($num, $type) = ($cigar =~ /^(\d*)([DIGMX])/);
|
|
393 $num = 1 if ($num eq "");
|
|
394
|
|
395 # Insertions are not part of the alignment, don't count them
|
|
396 if ($type ne "I") {
|
|
397 $counter_of_trimmed_columns_from_the_end += $num;
|
|
398 }
|
|
399
|
|
400 # Matches and insertions are actual base pairs in the reference
|
|
401 if ($type eq "M" or $type eq "I") {
|
|
402 $counter_of_trimmed_base_pairs += $num;
|
|
403 # If this cigar piece is too long and we overshoot the number of base pairs we want to trim,
|
|
404 # we substitute this cigar piece by a shorter one
|
|
405 if ($counter_of_trimmed_base_pairs > $number_of_base_pairs_to_trim_from_the_end) {
|
|
406 my $new_cigar_piece = "";
|
|
407 # length of the new cigar piece
|
|
408 my $length = $counter_of_trimmed_base_pairs - $number_of_base_pairs_to_trim_from_the_end;
|
|
409 if ($length > 1) {
|
|
410 $new_cigar_piece = $length.$type;
|
|
411 } elsif ($length == 1) {
|
|
412 $new_cigar_piece = $type;
|
|
413 }
|
|
414 push(@reference_cigar, $new_cigar_piece) if ($new_cigar_piece);
|
|
415
|
|
416 # There is no need to correct the counter of trimmed columns if we are in an insertion
|
|
417 # when we overshoot
|
|
418 if ($type eq "M") {
|
|
419 $counter_of_trimmed_columns_from_the_end -= $length;
|
|
420 }
|
|
421
|
|
422 ## We don't want to end with an insertion or a deletion. Trim them!
|
|
423 while (@reference_cigar and $reference_cigar[-1] =~ /[DI]/) {
|
|
424 my ($num, $type) = ($reference_cigar[-1] =~ /^(\d*)([DIGMX])/);
|
|
425 $num = 1 if ($num eq "");
|
|
426 # only counts deletions, insertions are not part of the aligment
|
|
427 $counter_of_trimmed_columns_from_the_end += $num if ($type eq "D");
|
|
428 pop(@reference_cigar);
|
|
429 }
|
|
430 last;
|
|
431 }
|
|
432 }
|
|
433 }
|
|
434 }
|
|
435
|
|
436 ## Skip if no restriction is needed. Return original object! This may happen when
|
|
437 ## either excess_at_the_start or excess_at_the_end are 0 but the alignment does not
|
|
438 ## start or end with gaps in the reference species.
|
|
439 if ($counter_of_trimmed_columns_from_the_start <= 0 and $counter_of_trimmed_columns_from_the_end <= 0) {
|
|
440 return wantarray ? ($self, 1, $self->length) : $self;
|
|
441 }
|
|
442
|
|
443 my ($aln_start, $aln_end);
|
|
444 if ($negative_strand) {
|
|
445 $aln_start = $counter_of_trimmed_columns_from_the_end + 1;
|
|
446 $aln_end = $self->length - $counter_of_trimmed_columns_from_the_start;
|
|
447 } else {
|
|
448 $aln_start = $counter_of_trimmed_columns_from_the_start + 1;
|
|
449 $aln_end = $self->length - $counter_of_trimmed_columns_from_the_end;
|
|
450 }
|
|
451
|
|
452 $genomic_align_set = $self->restrict_between_alignment_positions($aln_start, $aln_end, $skip_empty_GenomicAligns);
|
|
453 $new_reference_genomic_align = $genomic_align_set->reference_genomic_align;
|
|
454
|
|
455 if (!defined $self->{'restricted_aln_start'}) {
|
|
456 $self->{'restricted_aln_start'} = 0;
|
|
457 }
|
|
458 if (!defined $self->{'restricted_aln_end'}) {
|
|
459 $self->{'restricted_aln_end'} = 0;
|
|
460 }
|
|
461 $genomic_align_set->{'restricted_aln_start'} = $counter_of_trimmed_columns_from_the_start + $self->{'restricted_aln_start'};
|
|
462 $genomic_align_set->{'restricted_aln_end'} = $counter_of_trimmed_columns_from_the_end + $self->{'restricted_aln_end'};
|
|
463 #$genomic_align_set->{'original_length'} = $self->length;
|
|
464
|
|
465 #Need to use original gab length. If original_length is not set, length has
|
|
466 #not changed. Needed when use 2X genome as reference.
|
|
467 if (defined $self->{'original_length'}) {
|
|
468 $genomic_align_set->{'original_length'} = $self->{'original_length'};
|
|
469 } else {
|
|
470 $genomic_align_set->{'original_length'} = $self->length;
|
|
471 }
|
|
472
|
|
473 if (defined $self->slice) {
|
|
474 if ($self->strand == 1) {
|
|
475 $genomic_align_set->start($new_reference_genomic_align->dnafrag_start -
|
|
476 $self->slice->start + 1);
|
|
477 $genomic_align_set->end($new_reference_genomic_align->dnafrag_end -
|
|
478 $self->slice->start + 1);
|
|
479 $genomic_align_set->strand(1);
|
|
480 } else {
|
|
481 $genomic_align_set->start($self->{reference_slice}->{end} -
|
|
482 $new_reference_genomic_align->{dnafrag_end} + 1);
|
|
483 $genomic_align_set->end($self->{reference_slice}->{end} -
|
|
484 $new_reference_genomic_align->{dnafrag_start} + 1);
|
|
485 $genomic_align_set->strand(-1);
|
|
486 }
|
|
487 }
|
|
488
|
|
489 return wantarray ? ($genomic_align_set, $aln_start, $aln_end) : $genomic_align_set;
|
|
490 }
|
|
491
|
|
492 1;
|