Mercurial > repos > willmclaren > ensembl_vep
comparison variant_effect_predictor/Bio/EnsEMBL/Compara/BaseGenomicAlignSet.pm @ 0:21066c0abaf5 draft
Uploaded
author | willmclaren |
---|---|
date | Fri, 03 Aug 2012 10:04:48 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:21066c0abaf5 |
---|---|
1 =head1 LICENSE | |
2 | |
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and | |
4 Genome Research Limited. All rights reserved. | |
5 | |
6 This software is distributed under a modified Apache license. | |
7 For license details, please see | |
8 | |
9 http://www.ensembl.org/info/about/code_licence.html | |
10 | |
11 =head1 CONTACT | |
12 | |
13 Please email comments or questions to the public Ensembl | |
14 developers list at <dev@ensembl.org>. | |
15 | |
16 Questions may also be sent to the Ensembl help desk at | |
17 <helpdesk@ensembl.org>. | |
18 | |
19 =head1 NAME | |
20 | |
21 Bio::EnsEMBL::Compara::BaseGenomicAlignSet - Base class for GenomicAlignBlock and GenomicAlignTree | |
22 | |
23 =head1 APPENDIX | |
24 | |
25 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ | |
26 | |
27 =cut | |
28 | |
29 | |
30 # Let the code begin... | |
31 | |
32 package Bio::EnsEMBL::Compara::BaseGenomicAlignSet; | |
33 use strict; | |
34 | |
35 # Object preamble | |
36 use Bio::EnsEMBL::Utils::Argument qw(rearrange); | |
37 use Bio::EnsEMBL::Utils::Exception qw(throw warning info deprecate verbose); | |
38 | |
39 | |
40 =head2 slice | |
41 | |
42 Arg [1] : (optional) Bio::EnsEMBL::Slice $reference_slice | |
43 Example : my $slice = $genomic_align_block->slice; | |
44 Example : $genomic_align_block->slice($slice); | |
45 Description: get/set for attribute slice. | |
46 Returntype : Bio::EnsEMBL::Slice object | |
47 Exceptions : | |
48 Caller : general | |
49 Status : Stable | |
50 | |
51 =cut | |
52 | |
53 sub slice { | |
54 my ($self, $reference_slice) = @_; | |
55 | |
56 if (defined($reference_slice)) { | |
57 # throw "[$reference_slice] is not a Bio::EnsEMBL::Slice" | |
58 # unless $reference_slice->isa("Bio::EnsEMBL::Slice"); | |
59 $self->{'reference_slice'} = $reference_slice; | |
60 } | |
61 | |
62 return $self->{'reference_slice'}; | |
63 } | |
64 | |
65 =head2 reference_slice | |
66 | |
67 Arg [1] : (optional) Bio::EnsEMBL::Slice $reference_slice | |
68 Example : my $reference_slice = $genomic_align_block->reference_slice; | |
69 Example : $genomic_align_block->reference_slice($slice); | |
70 Description: Alias for slice method. TO BE DEPRECATED. | |
71 Returntype : Bio::EnsEMBL::Slice object | |
72 Exceptions : | |
73 Caller : general | |
74 Status : Stable | |
75 | |
76 =cut | |
77 | |
78 sub reference_slice { | |
79 my ($self, $reference_slice) = @_; | |
80 | |
81 return $self->slice($reference_slice); | |
82 } | |
83 | |
84 =head2 start | |
85 | |
86 Arg [1] : (optional) integer $start | |
87 Example : my $start = $genomic_align_block->start; | |
88 Example : $genomic_align_block->start(1035); | |
89 Description: get/set for attribute reference_slice_start. A value of 0 will set | |
90 the attribute to undefined. | |
91 Returntype : integer | |
92 Exceptions : none | |
93 Caller : general | |
94 | |
95 =cut | |
96 | |
97 sub start { | |
98 my $self = shift; | |
99 return $self->reference_slice_start(@_); | |
100 } | |
101 | |
102 | |
103 =head2 reference_slice_start | |
104 | |
105 Arg [1] : integer $reference_slice_start | |
106 Example : my $reference_slice_start = $genomic_align_block->reference_slice_start; | |
107 Example : $genomic_align_block->reference_slice_start(1035); | |
108 Description: get/set for attribute reference_slice_start. A value of 0 will set | |
109 the attribute to undefined. | |
110 Returntype : integer | |
111 Exceptions : none | |
112 Caller : general | |
113 Status : Stable | |
114 | |
115 =cut | |
116 | |
117 sub reference_slice_start { | |
118 my ($self, $reference_slice_start) = @_; | |
119 | |
120 if (defined($reference_slice_start)) { | |
121 $self->{'reference_slice_start'} = ($reference_slice_start or undef); | |
122 } | |
123 | |
124 return $self->{'reference_slice_start'}; | |
125 } | |
126 | |
127 | |
128 =head2 end | |
129 | |
130 Arg [1] : (optional) integer $end | |
131 Example : my $end = $genomic_align_block->end; | |
132 Example : $genomic_align_block->end(1283); | |
133 Description: get/set for attribute reference_slice_end. A value of 0 will set | |
134 the attribute to undefined. | |
135 Returntype : integer | |
136 Exceptions : none | |
137 Caller : general | |
138 | |
139 =cut | |
140 | |
141 sub end { | |
142 my $self = shift; | |
143 return $self->reference_slice_end(@_); | |
144 } | |
145 | |
146 | |
147 =head2 reference_slice_end | |
148 | |
149 Arg [1] : integer $reference_slice_end | |
150 Example : my $reference_slice_end = $genomic_align_block->reference_slice_end; | |
151 Example : $genomic_align_block->reference_slice_end(1283); | |
152 Description: get/set for attribute reference_slice_end. A value of 0 will set | |
153 the attribute to undefined. | |
154 Returntype : integer | |
155 Exceptions : none | |
156 Caller : general | |
157 Status : Stable | |
158 | |
159 =cut | |
160 | |
161 sub reference_slice_end { | |
162 my ($self, $reference_slice_end) = @_; | |
163 | |
164 if (defined($reference_slice_end)) { | |
165 $self->{'reference_slice_end'} = ($reference_slice_end or undef); | |
166 } | |
167 | |
168 return $self->{'reference_slice_end'}; | |
169 | |
170 } | |
171 | |
172 =head2 strand | |
173 | |
174 Arg [1] : integer $strand | |
175 Example : my $strand = $genomic_align_block->strand; | |
176 Example : $genomic_align_block->strand(-1); | |
177 Description: get/set for attribute strand. A value of 0 will set | |
178 the attribute to undefined. | |
179 Returntype : integer | |
180 Exceptions : none | |
181 Caller : general | |
182 Status : Stable | |
183 | |
184 =cut | |
185 | |
186 sub strand { | |
187 my ($self, $reference_slice_strand) = @_; | |
188 | |
189 if (defined($reference_slice_strand)) { | |
190 $self->{'reference_slice_strand'} = ($reference_slice_strand or undef); | |
191 } | |
192 | |
193 return $self->{'reference_slice_strand'}; | |
194 } | |
195 | |
196 =head2 reference_slice_strand | |
197 | |
198 Arg [1] : integer $reference_slice_strand | |
199 Example : my $reference_slice_strand = $genomic_align_block->reference_slice_strand; | |
200 Example : $genomic_align_block->reference_slice_strand(-1); | |
201 Description: get/set for attribute reference_slice_strand. A value of 0 will set | |
202 the attribute to undefined. | |
203 Returntype : integer | |
204 Exceptions : none | |
205 Caller : general | |
206 Status : Stable | |
207 | |
208 =cut | |
209 | |
210 sub reference_slice_strand { | |
211 my ($self, $reference_slice_strand) = @_; | |
212 | |
213 if (defined($reference_slice_strand)) { | |
214 $self->{'reference_slice_strand'} = ($reference_slice_strand or undef); | |
215 } | |
216 | |
217 return $self->{'reference_slice_strand'}; | |
218 } | |
219 | |
220 =head2 get_original_strand | |
221 | |
222 Args : none | |
223 Example : if (!$genomic_align_block->get_original_strand()) { | |
224 # original GenomicAlignBlock has been reverse-complemented | |
225 } | |
226 Description: getter for the _orignal_strand attribute | |
227 Returntype : none | |
228 Exceptions : none | |
229 Caller : general | |
230 Status : Stable | |
231 | |
232 =cut | |
233 | |
234 sub get_original_strand { | |
235 my ($self) = @_; | |
236 | |
237 if (!defined($self->{_original_strand})) { | |
238 $self->{_original_strand} = 1; | |
239 } | |
240 | |
241 return $self->{_original_strand}; | |
242 } | |
243 | |
244 =head2 restrict_between_reference_positions | |
245 | |
246 Arg[1] : [optional] int $start, refers to the reference_dnafrag | |
247 Arg[2] : [optional] int $end, refers to the reference_dnafrag | |
248 Arg[3] : [optional] Bio::EnsEMBL::Compara::GenomicAlign $reference_GenomicAlign | |
249 Arg[4] : [optional] boolean $skip_empty_GenomicAligns | |
250 Example : none | |
251 Description: restrict this GenomicAlignBlock. It returns a new object unless no | |
252 restriction is needed. In that case, it returns the original unchanged | |
253 object | |
254 It might be the case that the restricted region coincide with a gap | |
255 in one or several GenomicAligns. By default these GenomicAligns are | |
256 returned with a dnafrag_end equals to its dnafrag_start + 1. For instance, | |
257 a GenomicAlign with dnafrag_start = 12345 and dnafrag_end = 12344 | |
258 correspond to a block which goes on this region from before 12345 to | |
259 after 12344, ie just between 12344 and 12345. You can choose to remove | |
260 these empty GenomicAligns by setting $skip_empty_GenomicAligns to any | |
261 true value. | |
262 Returntype : Bio::EnsEMBL::Compara::GenomicAlignBlock object in scalar context. In | |
263 list context, returns the previous object and the start and end | |
264 positions of the restriction in alignment coordinates (from 1 to | |
265 alignment_length) | |
266 Exceptions : return undef if reference positions lie outside of the alignment | |
267 Caller : general | |
268 Status : At risk | |
269 | |
270 =cut | |
271 | |
272 sub restrict_between_reference_positions { | |
273 my ($self, $start, $end, $reference_genomic_align, $skip_empty_GenomicAligns) = @_; | |
274 my $genomic_align_set; | |
275 my $new_reference_genomic_align; | |
276 my $new_genomic_aligns = []; | |
277 | |
278 $reference_genomic_align ||= $self->reference_genomic_align; | |
279 throw("A reference Bio::EnsEMBL::Compara::GenomicAlign must be given") if (!$reference_genomic_align); | |
280 $start = $reference_genomic_align->dnafrag_start if (!defined($start)); | |
281 $end = $reference_genomic_align->dnafrag_end if (!defined($end)); | |
282 | |
283 if ($start > $reference_genomic_align->dnafrag_end or $end < $reference_genomic_align->dnafrag_start) { | |
284 # restricting outside of boundaries => return undef object | |
285 warn("restricting outside of boundaries => return undef object: $start-$end (".$reference_genomic_align->dnafrag_start."-".$reference_genomic_align->dnafrag_end.")"); | |
286 return wantarray ? (undef, undef, undef) : undef; | |
287 } | |
288 my $number_of_base_pairs_to_trim_from_the_start = $start - $reference_genomic_align->dnafrag_start; | |
289 my $number_of_base_pairs_to_trim_from_the_end = $reference_genomic_align->dnafrag_end - $end; | |
290 | |
291 my $is_ref_low_coverage = 0; | |
292 if ($reference_genomic_align->cigar_line =~ /X/) { | |
293 $is_ref_low_coverage = 1; | |
294 } | |
295 | |
296 ## Skip if no restriction is needed. Return original object! We are still going on with the | |
297 ## restriction when either excess_at_the_start or excess_at_the_end are 0 as a (multiple) | |
298 ## alignment may start or end with gaps in the reference species. In that case, we want to | |
299 ## trim these gaps from the alignment as they fall just outside of the region of interest | |
300 | |
301 ##Exception if the reference species is low coverage, then need to continue | |
302 ##with this routine to find out the correct align positions | |
303 if ($number_of_base_pairs_to_trim_from_the_start < 0 and $number_of_base_pairs_to_trim_from_the_end < 0 and !$is_ref_low_coverage) { | |
304 return wantarray ? ($self, 1, $self->length) : $self; | |
305 } | |
306 | |
307 my $negative_strand = ($reference_genomic_align->dnafrag_strand == -1); | |
308 | |
309 ## Create a new Bio::EnsEMBL::Compara::GenomicAlignBlock object | |
310 throw("Reference GenomicAlign not found!") if (!$reference_genomic_align); | |
311 | |
312 my @reference_cigar = grep {$_} split(/(\d*[GDMXI])/, $reference_genomic_align->cigar_line); | |
313 if ($negative_strand) { | |
314 @reference_cigar = reverse @reference_cigar; | |
315 } | |
316 | |
317 #If this is negative, eg when a slice starts in one block and ends in | |
318 #another, need to set to 0 to ensure we enter the loop below. This | |
319 #fixes a bug when using a 2x species as the reference and fetching using | |
320 #an expanded align slice. | |
321 if ($number_of_base_pairs_to_trim_from_the_start < 0) { | |
322 $number_of_base_pairs_to_trim_from_the_start = 0; | |
323 } | |
324 | |
325 ## Parse start of cigar_line for the reference GenomicAlign | |
326 my $counter_of_trimmed_columns_from_the_start = 0; # num. of bp in the alignment to be trimmed | |
327 | |
328 if ($number_of_base_pairs_to_trim_from_the_start >= 0) { | |
329 my $counter_of_trimmed_base_pairs = 0; # num of bp in the reference sequence we trim (from the start) | |
330 ## Loop through the cigar pieces | |
331 while (my $cigar = shift(@reference_cigar)) { | |
332 # Parse each cigar piece | |
333 my ($num, $type) = ($cigar =~ /^(\d*)([GDMXI])/); | |
334 $num = 1 if ($num eq ""); | |
335 | |
336 # Insertions are not part of the alignment, don't count them | |
337 if ($type ne "I") { | |
338 $counter_of_trimmed_columns_from_the_start += $num; | |
339 } | |
340 | |
341 # Matches and insertions are actual base pairs in the reference | |
342 if ($type eq "M" or $type eq "I") { | |
343 $counter_of_trimmed_base_pairs += $num; | |
344 # If this cigar piece is too long and we overshoot the number of base pairs we want to trim, | |
345 # we substitute this cigar piece by a shorter one | |
346 if ($counter_of_trimmed_base_pairs > $number_of_base_pairs_to_trim_from_the_start) { | |
347 my $new_cigar_piece = ""; | |
348 # length of the new cigar piece | |
349 my $length = $counter_of_trimmed_base_pairs - $number_of_base_pairs_to_trim_from_the_start; | |
350 if ($length > 1) { | |
351 $new_cigar_piece = $length.$type; | |
352 } elsif ($length == 1) { | |
353 $new_cigar_piece = $type; | |
354 } | |
355 unshift(@reference_cigar, $new_cigar_piece) if ($new_cigar_piece); | |
356 | |
357 # There is no need to correct the counter of trimmed columns if we are in an insertion | |
358 # when we overshoot | |
359 if ($type eq "M") { | |
360 $counter_of_trimmed_columns_from_the_start -= $length; | |
361 } | |
362 | |
363 ## We don't want to start with an insertion or a deletion. Trim them! | |
364 while (@reference_cigar and $reference_cigar[0] =~ /[DI]/) { | |
365 my ($num, $type) = ($reference_cigar[0] =~ /^(\d*)([DIGMX])/); | |
366 $num = 1 if ($num eq ""); | |
367 # only counts deletions, insertions are not part of the aligment | |
368 $counter_of_trimmed_columns_from_the_start += $num if ($type eq "D"); | |
369 shift(@reference_cigar); | |
370 } | |
371 last; | |
372 } | |
373 } | |
374 } | |
375 } | |
376 | |
377 #If this is negative, eg when a slice starts in one block and ends in | |
378 #another, need to set to 0 to ensure we enter the loop below. This | |
379 #fixes a bug when using a 2x species as the reference and fetching using | |
380 #an expanded align slice. | |
381 if ($number_of_base_pairs_to_trim_from_the_end < 0) { | |
382 $number_of_base_pairs_to_trim_from_the_end = 0; | |
383 } | |
384 | |
385 ## Parse end of cigar_line for the reference GenomicAlign | |
386 my $counter_of_trimmed_columns_from_the_end = 0; # num. of bp in the alignment to be trimmed | |
387 if ($number_of_base_pairs_to_trim_from_the_end >= 0) { | |
388 my $counter_of_trimmed_base_pairs = 0; # num of bp in the reference sequence we trim (from the end) | |
389 ## Loop through the cigar pieces | |
390 while (my $cigar = pop(@reference_cigar)) { | |
391 # Parse each cigar piece | |
392 my ($num, $type) = ($cigar =~ /^(\d*)([DIGMX])/); | |
393 $num = 1 if ($num eq ""); | |
394 | |
395 # Insertions are not part of the alignment, don't count them | |
396 if ($type ne "I") { | |
397 $counter_of_trimmed_columns_from_the_end += $num; | |
398 } | |
399 | |
400 # Matches and insertions are actual base pairs in the reference | |
401 if ($type eq "M" or $type eq "I") { | |
402 $counter_of_trimmed_base_pairs += $num; | |
403 # If this cigar piece is too long and we overshoot the number of base pairs we want to trim, | |
404 # we substitute this cigar piece by a shorter one | |
405 if ($counter_of_trimmed_base_pairs > $number_of_base_pairs_to_trim_from_the_end) { | |
406 my $new_cigar_piece = ""; | |
407 # length of the new cigar piece | |
408 my $length = $counter_of_trimmed_base_pairs - $number_of_base_pairs_to_trim_from_the_end; | |
409 if ($length > 1) { | |
410 $new_cigar_piece = $length.$type; | |
411 } elsif ($length == 1) { | |
412 $new_cigar_piece = $type; | |
413 } | |
414 push(@reference_cigar, $new_cigar_piece) if ($new_cigar_piece); | |
415 | |
416 # There is no need to correct the counter of trimmed columns if we are in an insertion | |
417 # when we overshoot | |
418 if ($type eq "M") { | |
419 $counter_of_trimmed_columns_from_the_end -= $length; | |
420 } | |
421 | |
422 ## We don't want to end with an insertion or a deletion. Trim them! | |
423 while (@reference_cigar and $reference_cigar[-1] =~ /[DI]/) { | |
424 my ($num, $type) = ($reference_cigar[-1] =~ /^(\d*)([DIGMX])/); | |
425 $num = 1 if ($num eq ""); | |
426 # only counts deletions, insertions are not part of the aligment | |
427 $counter_of_trimmed_columns_from_the_end += $num if ($type eq "D"); | |
428 pop(@reference_cigar); | |
429 } | |
430 last; | |
431 } | |
432 } | |
433 } | |
434 } | |
435 | |
436 ## Skip if no restriction is needed. Return original object! This may happen when | |
437 ## either excess_at_the_start or excess_at_the_end are 0 but the alignment does not | |
438 ## start or end with gaps in the reference species. | |
439 if ($counter_of_trimmed_columns_from_the_start <= 0 and $counter_of_trimmed_columns_from_the_end <= 0) { | |
440 return wantarray ? ($self, 1, $self->length) : $self; | |
441 } | |
442 | |
443 my ($aln_start, $aln_end); | |
444 if ($negative_strand) { | |
445 $aln_start = $counter_of_trimmed_columns_from_the_end + 1; | |
446 $aln_end = $self->length - $counter_of_trimmed_columns_from_the_start; | |
447 } else { | |
448 $aln_start = $counter_of_trimmed_columns_from_the_start + 1; | |
449 $aln_end = $self->length - $counter_of_trimmed_columns_from_the_end; | |
450 } | |
451 | |
452 $genomic_align_set = $self->restrict_between_alignment_positions($aln_start, $aln_end, $skip_empty_GenomicAligns); | |
453 $new_reference_genomic_align = $genomic_align_set->reference_genomic_align; | |
454 | |
455 if (!defined $self->{'restricted_aln_start'}) { | |
456 $self->{'restricted_aln_start'} = 0; | |
457 } | |
458 if (!defined $self->{'restricted_aln_end'}) { | |
459 $self->{'restricted_aln_end'} = 0; | |
460 } | |
461 $genomic_align_set->{'restricted_aln_start'} = $counter_of_trimmed_columns_from_the_start + $self->{'restricted_aln_start'}; | |
462 $genomic_align_set->{'restricted_aln_end'} = $counter_of_trimmed_columns_from_the_end + $self->{'restricted_aln_end'}; | |
463 #$genomic_align_set->{'original_length'} = $self->length; | |
464 | |
465 #Need to use original gab length. If original_length is not set, length has | |
466 #not changed. Needed when use 2X genome as reference. | |
467 if (defined $self->{'original_length'}) { | |
468 $genomic_align_set->{'original_length'} = $self->{'original_length'}; | |
469 } else { | |
470 $genomic_align_set->{'original_length'} = $self->length; | |
471 } | |
472 | |
473 if (defined $self->slice) { | |
474 if ($self->strand == 1) { | |
475 $genomic_align_set->start($new_reference_genomic_align->dnafrag_start - | |
476 $self->slice->start + 1); | |
477 $genomic_align_set->end($new_reference_genomic_align->dnafrag_end - | |
478 $self->slice->start + 1); | |
479 $genomic_align_set->strand(1); | |
480 } else { | |
481 $genomic_align_set->start($self->{reference_slice}->{end} - | |
482 $new_reference_genomic_align->{dnafrag_end} + 1); | |
483 $genomic_align_set->end($self->{reference_slice}->{end} - | |
484 $new_reference_genomic_align->{dnafrag_start} + 1); | |
485 $genomic_align_set->strand(-1); | |
486 } | |
487 } | |
488 | |
489 return wantarray ? ($genomic_align_set, $aln_start, $aln_end) : $genomic_align_set; | |
490 } | |
491 | |
492 1; |