comparison variant_effect_predictor/Bio/EnsEMBL/Compara/BaseGenomicAlignSet.pm @ 0:21066c0abaf5 draft

Uploaded
author willmclaren
date Fri, 03 Aug 2012 10:04:48 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:21066c0abaf5
1 =head1 LICENSE
2
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
4 Genome Research Limited. All rights reserved.
5
6 This software is distributed under a modified Apache license.
7 For license details, please see
8
9 http://www.ensembl.org/info/about/code_licence.html
10
11 =head1 CONTACT
12
13 Please email comments or questions to the public Ensembl
14 developers list at <dev@ensembl.org>.
15
16 Questions may also be sent to the Ensembl help desk at
17 <helpdesk@ensembl.org>.
18
19 =head1 NAME
20
21 Bio::EnsEMBL::Compara::BaseGenomicAlignSet - Base class for GenomicAlignBlock and GenomicAlignTree
22
23 =head1 APPENDIX
24
25 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
26
27 =cut
28
29
30 # Let the code begin...
31
32 package Bio::EnsEMBL::Compara::BaseGenomicAlignSet;
33 use strict;
34
35 # Object preamble
36 use Bio::EnsEMBL::Utils::Argument qw(rearrange);
37 use Bio::EnsEMBL::Utils::Exception qw(throw warning info deprecate verbose);
38
39
40 =head2 slice
41
42 Arg [1] : (optional) Bio::EnsEMBL::Slice $reference_slice
43 Example : my $slice = $genomic_align_block->slice;
44 Example : $genomic_align_block->slice($slice);
45 Description: get/set for attribute slice.
46 Returntype : Bio::EnsEMBL::Slice object
47 Exceptions :
48 Caller : general
49 Status : Stable
50
51 =cut
52
53 sub slice {
54 my ($self, $reference_slice) = @_;
55
56 if (defined($reference_slice)) {
57 # throw "[$reference_slice] is not a Bio::EnsEMBL::Slice"
58 # unless $reference_slice->isa("Bio::EnsEMBL::Slice");
59 $self->{'reference_slice'} = $reference_slice;
60 }
61
62 return $self->{'reference_slice'};
63 }
64
65 =head2 reference_slice
66
67 Arg [1] : (optional) Bio::EnsEMBL::Slice $reference_slice
68 Example : my $reference_slice = $genomic_align_block->reference_slice;
69 Example : $genomic_align_block->reference_slice($slice);
70 Description: Alias for slice method. TO BE DEPRECATED.
71 Returntype : Bio::EnsEMBL::Slice object
72 Exceptions :
73 Caller : general
74 Status : Stable
75
76 =cut
77
78 sub reference_slice {
79 my ($self, $reference_slice) = @_;
80
81 return $self->slice($reference_slice);
82 }
83
84 =head2 start
85
86 Arg [1] : (optional) integer $start
87 Example : my $start = $genomic_align_block->start;
88 Example : $genomic_align_block->start(1035);
89 Description: get/set for attribute reference_slice_start. A value of 0 will set
90 the attribute to undefined.
91 Returntype : integer
92 Exceptions : none
93 Caller : general
94
95 =cut
96
97 sub start {
98 my $self = shift;
99 return $self->reference_slice_start(@_);
100 }
101
102
103 =head2 reference_slice_start
104
105 Arg [1] : integer $reference_slice_start
106 Example : my $reference_slice_start = $genomic_align_block->reference_slice_start;
107 Example : $genomic_align_block->reference_slice_start(1035);
108 Description: get/set for attribute reference_slice_start. A value of 0 will set
109 the attribute to undefined.
110 Returntype : integer
111 Exceptions : none
112 Caller : general
113 Status : Stable
114
115 =cut
116
117 sub reference_slice_start {
118 my ($self, $reference_slice_start) = @_;
119
120 if (defined($reference_slice_start)) {
121 $self->{'reference_slice_start'} = ($reference_slice_start or undef);
122 }
123
124 return $self->{'reference_slice_start'};
125 }
126
127
128 =head2 end
129
130 Arg [1] : (optional) integer $end
131 Example : my $end = $genomic_align_block->end;
132 Example : $genomic_align_block->end(1283);
133 Description: get/set for attribute reference_slice_end. A value of 0 will set
134 the attribute to undefined.
135 Returntype : integer
136 Exceptions : none
137 Caller : general
138
139 =cut
140
141 sub end {
142 my $self = shift;
143 return $self->reference_slice_end(@_);
144 }
145
146
147 =head2 reference_slice_end
148
149 Arg [1] : integer $reference_slice_end
150 Example : my $reference_slice_end = $genomic_align_block->reference_slice_end;
151 Example : $genomic_align_block->reference_slice_end(1283);
152 Description: get/set for attribute reference_slice_end. A value of 0 will set
153 the attribute to undefined.
154 Returntype : integer
155 Exceptions : none
156 Caller : general
157 Status : Stable
158
159 =cut
160
161 sub reference_slice_end {
162 my ($self, $reference_slice_end) = @_;
163
164 if (defined($reference_slice_end)) {
165 $self->{'reference_slice_end'} = ($reference_slice_end or undef);
166 }
167
168 return $self->{'reference_slice_end'};
169
170 }
171
172 =head2 strand
173
174 Arg [1] : integer $strand
175 Example : my $strand = $genomic_align_block->strand;
176 Example : $genomic_align_block->strand(-1);
177 Description: get/set for attribute strand. A value of 0 will set
178 the attribute to undefined.
179 Returntype : integer
180 Exceptions : none
181 Caller : general
182 Status : Stable
183
184 =cut
185
186 sub strand {
187 my ($self, $reference_slice_strand) = @_;
188
189 if (defined($reference_slice_strand)) {
190 $self->{'reference_slice_strand'} = ($reference_slice_strand or undef);
191 }
192
193 return $self->{'reference_slice_strand'};
194 }
195
196 =head2 reference_slice_strand
197
198 Arg [1] : integer $reference_slice_strand
199 Example : my $reference_slice_strand = $genomic_align_block->reference_slice_strand;
200 Example : $genomic_align_block->reference_slice_strand(-1);
201 Description: get/set for attribute reference_slice_strand. A value of 0 will set
202 the attribute to undefined.
203 Returntype : integer
204 Exceptions : none
205 Caller : general
206 Status : Stable
207
208 =cut
209
210 sub reference_slice_strand {
211 my ($self, $reference_slice_strand) = @_;
212
213 if (defined($reference_slice_strand)) {
214 $self->{'reference_slice_strand'} = ($reference_slice_strand or undef);
215 }
216
217 return $self->{'reference_slice_strand'};
218 }
219
220 =head2 get_original_strand
221
222 Args : none
223 Example : if (!$genomic_align_block->get_original_strand()) {
224 # original GenomicAlignBlock has been reverse-complemented
225 }
226 Description: getter for the _orignal_strand attribute
227 Returntype : none
228 Exceptions : none
229 Caller : general
230 Status : Stable
231
232 =cut
233
234 sub get_original_strand {
235 my ($self) = @_;
236
237 if (!defined($self->{_original_strand})) {
238 $self->{_original_strand} = 1;
239 }
240
241 return $self->{_original_strand};
242 }
243
244 =head2 restrict_between_reference_positions
245
246 Arg[1] : [optional] int $start, refers to the reference_dnafrag
247 Arg[2] : [optional] int $end, refers to the reference_dnafrag
248 Arg[3] : [optional] Bio::EnsEMBL::Compara::GenomicAlign $reference_GenomicAlign
249 Arg[4] : [optional] boolean $skip_empty_GenomicAligns
250 Example : none
251 Description: restrict this GenomicAlignBlock. It returns a new object unless no
252 restriction is needed. In that case, it returns the original unchanged
253 object
254 It might be the case that the restricted region coincide with a gap
255 in one or several GenomicAligns. By default these GenomicAligns are
256 returned with a dnafrag_end equals to its dnafrag_start + 1. For instance,
257 a GenomicAlign with dnafrag_start = 12345 and dnafrag_end = 12344
258 correspond to a block which goes on this region from before 12345 to
259 after 12344, ie just between 12344 and 12345. You can choose to remove
260 these empty GenomicAligns by setting $skip_empty_GenomicAligns to any
261 true value.
262 Returntype : Bio::EnsEMBL::Compara::GenomicAlignBlock object in scalar context. In
263 list context, returns the previous object and the start and end
264 positions of the restriction in alignment coordinates (from 1 to
265 alignment_length)
266 Exceptions : return undef if reference positions lie outside of the alignment
267 Caller : general
268 Status : At risk
269
270 =cut
271
272 sub restrict_between_reference_positions {
273 my ($self, $start, $end, $reference_genomic_align, $skip_empty_GenomicAligns) = @_;
274 my $genomic_align_set;
275 my $new_reference_genomic_align;
276 my $new_genomic_aligns = [];
277
278 $reference_genomic_align ||= $self->reference_genomic_align;
279 throw("A reference Bio::EnsEMBL::Compara::GenomicAlign must be given") if (!$reference_genomic_align);
280 $start = $reference_genomic_align->dnafrag_start if (!defined($start));
281 $end = $reference_genomic_align->dnafrag_end if (!defined($end));
282
283 if ($start > $reference_genomic_align->dnafrag_end or $end < $reference_genomic_align->dnafrag_start) {
284 # restricting outside of boundaries => return undef object
285 warn("restricting outside of boundaries => return undef object: $start-$end (".$reference_genomic_align->dnafrag_start."-".$reference_genomic_align->dnafrag_end.")");
286 return wantarray ? (undef, undef, undef) : undef;
287 }
288 my $number_of_base_pairs_to_trim_from_the_start = $start - $reference_genomic_align->dnafrag_start;
289 my $number_of_base_pairs_to_trim_from_the_end = $reference_genomic_align->dnafrag_end - $end;
290
291 my $is_ref_low_coverage = 0;
292 if ($reference_genomic_align->cigar_line =~ /X/) {
293 $is_ref_low_coverage = 1;
294 }
295
296 ## Skip if no restriction is needed. Return original object! We are still going on with the
297 ## restriction when either excess_at_the_start or excess_at_the_end are 0 as a (multiple)
298 ## alignment may start or end with gaps in the reference species. In that case, we want to
299 ## trim these gaps from the alignment as they fall just outside of the region of interest
300
301 ##Exception if the reference species is low coverage, then need to continue
302 ##with this routine to find out the correct align positions
303 if ($number_of_base_pairs_to_trim_from_the_start < 0 and $number_of_base_pairs_to_trim_from_the_end < 0 and !$is_ref_low_coverage) {
304 return wantarray ? ($self, 1, $self->length) : $self;
305 }
306
307 my $negative_strand = ($reference_genomic_align->dnafrag_strand == -1);
308
309 ## Create a new Bio::EnsEMBL::Compara::GenomicAlignBlock object
310 throw("Reference GenomicAlign not found!") if (!$reference_genomic_align);
311
312 my @reference_cigar = grep {$_} split(/(\d*[GDMXI])/, $reference_genomic_align->cigar_line);
313 if ($negative_strand) {
314 @reference_cigar = reverse @reference_cigar;
315 }
316
317 #If this is negative, eg when a slice starts in one block and ends in
318 #another, need to set to 0 to ensure we enter the loop below. This
319 #fixes a bug when using a 2x species as the reference and fetching using
320 #an expanded align slice.
321 if ($number_of_base_pairs_to_trim_from_the_start < 0) {
322 $number_of_base_pairs_to_trim_from_the_start = 0;
323 }
324
325 ## Parse start of cigar_line for the reference GenomicAlign
326 my $counter_of_trimmed_columns_from_the_start = 0; # num. of bp in the alignment to be trimmed
327
328 if ($number_of_base_pairs_to_trim_from_the_start >= 0) {
329 my $counter_of_trimmed_base_pairs = 0; # num of bp in the reference sequence we trim (from the start)
330 ## Loop through the cigar pieces
331 while (my $cigar = shift(@reference_cigar)) {
332 # Parse each cigar piece
333 my ($num, $type) = ($cigar =~ /^(\d*)([GDMXI])/);
334 $num = 1 if ($num eq "");
335
336 # Insertions are not part of the alignment, don't count them
337 if ($type ne "I") {
338 $counter_of_trimmed_columns_from_the_start += $num;
339 }
340
341 # Matches and insertions are actual base pairs in the reference
342 if ($type eq "M" or $type eq "I") {
343 $counter_of_trimmed_base_pairs += $num;
344 # If this cigar piece is too long and we overshoot the number of base pairs we want to trim,
345 # we substitute this cigar piece by a shorter one
346 if ($counter_of_trimmed_base_pairs > $number_of_base_pairs_to_trim_from_the_start) {
347 my $new_cigar_piece = "";
348 # length of the new cigar piece
349 my $length = $counter_of_trimmed_base_pairs - $number_of_base_pairs_to_trim_from_the_start;
350 if ($length > 1) {
351 $new_cigar_piece = $length.$type;
352 } elsif ($length == 1) {
353 $new_cigar_piece = $type;
354 }
355 unshift(@reference_cigar, $new_cigar_piece) if ($new_cigar_piece);
356
357 # There is no need to correct the counter of trimmed columns if we are in an insertion
358 # when we overshoot
359 if ($type eq "M") {
360 $counter_of_trimmed_columns_from_the_start -= $length;
361 }
362
363 ## We don't want to start with an insertion or a deletion. Trim them!
364 while (@reference_cigar and $reference_cigar[0] =~ /[DI]/) {
365 my ($num, $type) = ($reference_cigar[0] =~ /^(\d*)([DIGMX])/);
366 $num = 1 if ($num eq "");
367 # only counts deletions, insertions are not part of the aligment
368 $counter_of_trimmed_columns_from_the_start += $num if ($type eq "D");
369 shift(@reference_cigar);
370 }
371 last;
372 }
373 }
374 }
375 }
376
377 #If this is negative, eg when a slice starts in one block and ends in
378 #another, need to set to 0 to ensure we enter the loop below. This
379 #fixes a bug when using a 2x species as the reference and fetching using
380 #an expanded align slice.
381 if ($number_of_base_pairs_to_trim_from_the_end < 0) {
382 $number_of_base_pairs_to_trim_from_the_end = 0;
383 }
384
385 ## Parse end of cigar_line for the reference GenomicAlign
386 my $counter_of_trimmed_columns_from_the_end = 0; # num. of bp in the alignment to be trimmed
387 if ($number_of_base_pairs_to_trim_from_the_end >= 0) {
388 my $counter_of_trimmed_base_pairs = 0; # num of bp in the reference sequence we trim (from the end)
389 ## Loop through the cigar pieces
390 while (my $cigar = pop(@reference_cigar)) {
391 # Parse each cigar piece
392 my ($num, $type) = ($cigar =~ /^(\d*)([DIGMX])/);
393 $num = 1 if ($num eq "");
394
395 # Insertions are not part of the alignment, don't count them
396 if ($type ne "I") {
397 $counter_of_trimmed_columns_from_the_end += $num;
398 }
399
400 # Matches and insertions are actual base pairs in the reference
401 if ($type eq "M" or $type eq "I") {
402 $counter_of_trimmed_base_pairs += $num;
403 # If this cigar piece is too long and we overshoot the number of base pairs we want to trim,
404 # we substitute this cigar piece by a shorter one
405 if ($counter_of_trimmed_base_pairs > $number_of_base_pairs_to_trim_from_the_end) {
406 my $new_cigar_piece = "";
407 # length of the new cigar piece
408 my $length = $counter_of_trimmed_base_pairs - $number_of_base_pairs_to_trim_from_the_end;
409 if ($length > 1) {
410 $new_cigar_piece = $length.$type;
411 } elsif ($length == 1) {
412 $new_cigar_piece = $type;
413 }
414 push(@reference_cigar, $new_cigar_piece) if ($new_cigar_piece);
415
416 # There is no need to correct the counter of trimmed columns if we are in an insertion
417 # when we overshoot
418 if ($type eq "M") {
419 $counter_of_trimmed_columns_from_the_end -= $length;
420 }
421
422 ## We don't want to end with an insertion or a deletion. Trim them!
423 while (@reference_cigar and $reference_cigar[-1] =~ /[DI]/) {
424 my ($num, $type) = ($reference_cigar[-1] =~ /^(\d*)([DIGMX])/);
425 $num = 1 if ($num eq "");
426 # only counts deletions, insertions are not part of the aligment
427 $counter_of_trimmed_columns_from_the_end += $num if ($type eq "D");
428 pop(@reference_cigar);
429 }
430 last;
431 }
432 }
433 }
434 }
435
436 ## Skip if no restriction is needed. Return original object! This may happen when
437 ## either excess_at_the_start or excess_at_the_end are 0 but the alignment does not
438 ## start or end with gaps in the reference species.
439 if ($counter_of_trimmed_columns_from_the_start <= 0 and $counter_of_trimmed_columns_from_the_end <= 0) {
440 return wantarray ? ($self, 1, $self->length) : $self;
441 }
442
443 my ($aln_start, $aln_end);
444 if ($negative_strand) {
445 $aln_start = $counter_of_trimmed_columns_from_the_end + 1;
446 $aln_end = $self->length - $counter_of_trimmed_columns_from_the_start;
447 } else {
448 $aln_start = $counter_of_trimmed_columns_from_the_start + 1;
449 $aln_end = $self->length - $counter_of_trimmed_columns_from_the_end;
450 }
451
452 $genomic_align_set = $self->restrict_between_alignment_positions($aln_start, $aln_end, $skip_empty_GenomicAligns);
453 $new_reference_genomic_align = $genomic_align_set->reference_genomic_align;
454
455 if (!defined $self->{'restricted_aln_start'}) {
456 $self->{'restricted_aln_start'} = 0;
457 }
458 if (!defined $self->{'restricted_aln_end'}) {
459 $self->{'restricted_aln_end'} = 0;
460 }
461 $genomic_align_set->{'restricted_aln_start'} = $counter_of_trimmed_columns_from_the_start + $self->{'restricted_aln_start'};
462 $genomic_align_set->{'restricted_aln_end'} = $counter_of_trimmed_columns_from_the_end + $self->{'restricted_aln_end'};
463 #$genomic_align_set->{'original_length'} = $self->length;
464
465 #Need to use original gab length. If original_length is not set, length has
466 #not changed. Needed when use 2X genome as reference.
467 if (defined $self->{'original_length'}) {
468 $genomic_align_set->{'original_length'} = $self->{'original_length'};
469 } else {
470 $genomic_align_set->{'original_length'} = $self->length;
471 }
472
473 if (defined $self->slice) {
474 if ($self->strand == 1) {
475 $genomic_align_set->start($new_reference_genomic_align->dnafrag_start -
476 $self->slice->start + 1);
477 $genomic_align_set->end($new_reference_genomic_align->dnafrag_end -
478 $self->slice->start + 1);
479 $genomic_align_set->strand(1);
480 } else {
481 $genomic_align_set->start($self->{reference_slice}->{end} -
482 $new_reference_genomic_align->{dnafrag_end} + 1);
483 $genomic_align_set->end($self->{reference_slice}->{end} -
484 $new_reference_genomic_align->{dnafrag_start} + 1);
485 $genomic_align_set->strand(-1);
486 }
487 }
488
489 return wantarray ? ($genomic_align_set, $aln_start, $aln_end) : $genomic_align_set;
490 }
491
492 1;