comparison variant_effect_predictor/Bio/EnsEMBL/Compara/AlignedMember.pm @ 0:21066c0abaf5 draft

Uploaded
author willmclaren
date Fri, 03 Aug 2012 10:04:48 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:21066c0abaf5
1 =head1 LICENSE
2
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
4 Genome Research Limited. All rights reserved.
5
6 This software is distributed under a modified Apache license.
7 For license details, please see
8
9 http://www.ensembl.org/info/about/code_licence.html
10
11 =head1 CONTACT
12
13 Please email comments or questions to the public Ensembl
14 developers list at <dev@ensembl.org>.
15
16 Questions may also be sent to the Ensembl help desk at
17 <helpdesk@ensembl.org>.
18
19 =head1 AUTHORSHIP
20
21 Ensembl Team. Individual contributions can be found in the CVS log.
22
23 =cut
24
25 =head1 NAME
26
27 AlignedMember - DESCRIPTION of Object
28
29 =head1 DESCRIPTION
30
31 A subclass of Member which extends it to allow it to be aligned with other AlignedMember objects.
32 General enough to allow for global, local, pair-wise and multiple alignments.
33 At the moment used primarily in NestedSet Tree data-structure, but there are plans to extend its usage.
34
35 =head1 INHERITANCE TREE
36
37 Bio::EnsEMBL::Compara::AlignedMember
38 +- Bio::EnsEMBL::Compara::Member
39
40 =head1 METHODS
41
42 =cut
43
44 package Bio::EnsEMBL::Compara::AlignedMember;
45
46 use strict;
47 use Bio::EnsEMBL::Utils::Exception;
48 use Bio::EnsEMBL::Compara::Member;
49
50 use base ('Bio::EnsEMBL::Compara::Member');
51
52
53 ##################################
54 # overriden superclass methods
55 ##################################
56
57 =head2 copy
58
59 Arg [1] : none
60 Example : $copy = $aligned_member->copy();
61 Description : Creates a new AlignedMember object from an existing one
62 Returntype : Bio::EnsEMBL::Compara::AlignedMember
63 Exceptions : none
64 Caller : general
65 Status : Stable
66
67 =cut
68
69 sub copy {
70 my $self = shift;
71
72 my $mycopy = @_ ? shift : {}; # extending or from scratch?
73 $self->SUPER::copy($mycopy);
74 bless $mycopy, 'Bio::EnsEMBL::Compara::AlignedMember';
75
76 # The following does not Work if the initial object is only a Member
77 if (UNIVERSAL::isa($self, 'Bio::EnsEMBL::Compara::AlignedMember')) {
78 $mycopy->cigar_line($self->cigar_line);
79 $mycopy->cigar_start($self->cigar_start);
80 $mycopy->cigar_end($self->cigar_end);
81 $mycopy->perc_cov($self->perc_cov);
82 $mycopy->perc_id($self->perc_id);
83 $mycopy->perc_pos($self->perc_pos);
84 $mycopy->method_link_species_set_id($self->method_link_species_set_id);
85 }
86
87 return $mycopy;
88 }
89
90
91 #####################################################
92
93
94 =head2 cigar_line
95
96 Arg [1] : (optional) $cigar_line
97 Example : $object->cigar_line($cigar_line);
98 Example : $cigar_line = $object->cigar_line();
99 Description : Getter/setter for the cigar_line attribute. The cigar line
100 represents the modifications that are required to go from
101 the original sequence to the aligned sequence. In particular,
102 it shows the location of the gaps in the sequence. The cigar
103 line is built with a series of numbers and characters where
104 the number represents the number of positions in the mode
105 defined by the next charcater. When the number is 1, it can be
106 omitted. For example, the cigar line '23MD4M' means that there
107 are 23 matches or mismatches, then 1 deletion (gap) and then
108 another 4 matches or mismatches. The aligned sequence is
109 obtained by inserting 1 gap at the right location.
110 Returntype : string
111 Exceptions : none
112 Caller : general
113 Status : Stable
114
115 =cut
116
117 sub cigar_line {
118 my $self = shift;
119 $self->{'_cigar_line'} = shift if(@_);
120 return $self->{'_cigar_line'};
121 }
122
123
124 =head2 cigar_start
125
126 Arg [1] : (optional) $cigar_start
127 Example : $object->cigar_start($cigar_start);
128 Example : $cigar_start = $object->cigar_start();
129 Description : Getter/setter for the cigar_start attribute. For non-global
130 alignments, this represent the starting point of the local
131 alignment.
132 Currently the data provided as AlignedMembers (leaves of the
133 GeneTree) are obtained using global alignments and the
134 cigar_start is always undefined.
135 Returntype : integer
136 Exceptions : none
137 Caller : general
138 Status : Stable
139
140 =cut
141
142 sub cigar_start {
143 my $self = shift;
144 $self->{'_cigar_start'} = shift if(@_);
145 return $self->{'_cigar_start'};
146 }
147
148
149 =head2 cigar_end
150
151 Arg [1] : (optional) $cigar_end
152 Example : $object->cigar_end($cigar_end);
153 Example : $cigar_end = $object->cigar_end();
154 Description : Getter/setter for the cigar_end attribute. For non-global
155 alignments, this represent the ending point of the local
156 alignment.
157 Currently the data provided as AlignedMembers (leaves of the
158 GeneTree) are obtained using global alignments and the
159 cigar_end is always undefined.
160 Returntype : integer
161 Exceptions : none
162 Caller : general
163 Status : Stable
164
165 =cut
166
167 sub cigar_end {
168 my $self = shift;
169 $self->{'_cigar_end'} = shift if(@_);
170 return $self->{'_cigar_end'};
171 }
172
173
174 =head2 perc_cov
175
176 Arg [1] : (optional) $perc_cov
177 Example : $object->perc_cov($perc_cov);
178 Example : $perc_cov = $object->perc_cov();
179 Description : Getter/setter for the perc_cov attribute. For non-global
180 alignments, this represent the coverage of the alignment in
181 percentage of the total length of the sequence.
182 Currently the data provided as AlignedMembers (leaves of the
183 GeneTree) are obtained using global alignments (the whole
184 sequence is always included) and the perc_cov is always undefined.
185 Returntype : integer
186 Exceptions : none
187 Caller : general
188 Status : Stable
189
190 =cut
191
192 sub perc_cov {
193 my $self = shift;
194 $self->{'perc_cov'} = shift if(@_);
195 return $self->{'perc_cov'};
196 }
197
198
199 =head2 perc_id
200
201 Arg [1] : (optional) $perc_id
202 Example : $object->perc_id($perc_id);
203 Example : $perc_id = $object->perc_id();
204 Description : Getter/setter for the perc_id attribute. This is generally
205 used for pairwise relationships. The percentage identity
206 reprensents the number of positions that are identical in
207 the alignment in both sequences.
208 Currently the data provided as AlignedMembers (leaves of the
209 GeneTree) are obtained using multiple alignments and the
210 perc_id is always undefined.
211 Returntype : integer
212 Exceptions : none
213 Caller : general
214 Status : Stable
215
216 =cut
217
218 sub perc_id {
219 my $self = shift;
220 $self->{'perc_id'} = shift if(@_);
221 return $self->{'perc_id'};
222 }
223
224
225 =head2 perc_pos
226
227 Arg [1] : (optional) $perc_pos
228 Example : $object->perc_pos($perc_pos);
229 Example : $perc_pos = $object->perc_pos();
230 Description : Getter/setter for the perc_pos attribute. This is generally
231 used for pairwise relationships. The percentage positivity
232 reprensents the number of positions that are positive in
233 the alignment in both sequences. Currently, this is calculated
234 for protein sequences using the BLOSUM62 scoring matrix.
235 Currently the data provided as AlignedMembers (leaves of the
236 GeneTree) are obtained using multiple alignments and the
237 perc_cov is always undefined.
238 Returntype : integer
239 Exceptions : none
240 Caller : general
241 Status : Stable
242
243 =cut
244
245 sub perc_pos {
246 my $self = shift;
247 $self->{'perc_pos'} = shift if(@_);
248 return $self->{'perc_pos'};
249 }
250
251
252 =head2 method_link_species_set_id
253
254 Arg [1] : (optional) $method_link_species_set_id
255 Example : $object->method_link_species_set_id($method_link_species_set_id);
256 Example : $method_link_species_set_id = $object->method_link_species_set_id();
257 Description : Getter/setter for the method_link_species_set_id attribute. Please,
258 refer to the Bio::EnsEMBL::Compara::MethodLinkSpeciesSet module
259 for more information on the method_link_species_set_id.
260 Returntype : int
261 Exceptions : Returns 0 if the method_link_species_set_id is not defined.
262 Caller : general
263 Status : Stable
264
265 =cut
266
267 sub method_link_species_set_id {
268 my $self = shift;
269 $self->{'method_link_species_set_id'} = shift if(@_);
270 $self->{'method_link_species_set_id'} = 0 unless(defined($self->{'method_link_species_set_id'}));
271 return $self->{'method_link_species_set_id'};
272 }
273
274
275 sub set {
276 my $self = shift;
277 return $self->{'set'};
278 }
279
280
281 =head2 alignment_string
282
283 Arg [1] : (optional) bool $exon_cased
284 Example : my $alignment_string = $object->alignment_string();
285 Example : my $alignment_string = $object->alignment_string(1);
286 Description : Returns the aligned sequence for this object. For sequences
287 split in exons, the $exon_cased flag permits to request
288 that each exon is represented in alternative upper and lower
289 case.
290 For local alignments, when the alignment does not cover the
291 whole protein, only the part of the sequence in the alignemnt
292 is returned. Currently only global alignments are provided.
293 Therefore the alignment_string always returns the whole aligned
294 sequence.
295 Returntype : string
296 Exceptions : throws if the cigar_line is not defined for this object.
297 Caller : general
298 Status : Stable
299
300 =cut
301
302 sub alignment_string {
303 my $self = shift;
304 my $exon_cased = shift;
305
306 unless (defined $self->cigar_line && $self->cigar_line ne "") {
307 throw("To get an alignment_string, the cigar_line needs to be define\n");
308 }
309
310 # Use different keys for exon-cased and non exon-cased sequences
311 my $key = 'alignment_string';
312 if ($exon_cased) {
313 $key = 'alignment_string_cased';
314 } elsif (defined $self->{'alignment_string_cased'} and !defined($self->{'alignment_string'})) {
315 # non exon-cased sequence can be easily obtained from the exon-cased one.
316 $self->{'alignment_string'} = uc($self->{'alignment_string_cased'})
317 }
318
319 unless (defined $self->{$key}) {
320 my $sequence;
321 if ($exon_cased) {
322 $sequence = $self->sequence_exon_cased;
323 } else {
324 $sequence = $self->sequence;
325 }
326 if (defined $self->cigar_start || defined $self->cigar_end) {
327 unless (defined $self->cigar_start && defined $self->cigar_end) {
328 throw("both cigar_start and cigar_end should be defined");
329 }
330 my $offset = $self->cigar_start - 1;
331 my $length = $self->cigar_end - $self->cigar_start + 1;
332 $sequence = substr($sequence, $offset, $length);
333 }
334
335 my $cigar_line = $self->cigar_line;
336 $cigar_line =~ s/([MD])/$1 /g;
337
338 my @cigar_segments = split " ",$cigar_line;
339 my $alignment_string = "";
340 my $seq_start = 0;
341 foreach my $segment (@cigar_segments) {
342 if ($segment =~ /^(\d*)D$/) {
343 my $length = $1;
344 $length = 1 if ($length eq "");
345 $alignment_string .= "-" x $length;
346 } elsif ($segment =~ /^(\d*)M$/) {
347 my $length = $1;
348 $length = 1 if ($length eq "");
349 $alignment_string .= substr($sequence,$seq_start,$length);
350 $seq_start += $length;
351 }
352 }
353 $self->{$key} = $alignment_string;
354 }
355
356 return $self->{$key};
357 }
358
359
360 =head2 alignment_string_bounded
361
362 Arg [1] : none
363 Example : my $alignment_string_bounded = $object->alignment_string_bounded();
364 Description : Returns the aligned sequence for this object with padding characters
365 representing the introns.
366 Returntype : string
367 Exceptions : throws if the cigar_line is not defined for this object or if the
368 cigar_start or cigar_end are defined.
369 Caller : general
370 Status : Stable
371
372 =cut
373
374 sub alignment_string_bounded {
375 my $self = shift;
376
377 unless (defined $self->cigar_line && $self->cigar_line ne "") {
378 throw("To get an alignment_string, the cigar_line needs to be define\n");
379 }
380 unless (defined $self->{'alignment_string_bounded'}) {
381 my $sequence_exon_bounded = $self->sequence_exon_bounded;
382 if (defined $self->cigar_start || defined $self->cigar_end) {
383 throw("method doesnt implement defined cigar_start and cigar_end");
384 }
385 $sequence_exon_bounded =~ s/b|o|j/\ /g;
386 my $cigar_line = $self->cigar_line;
387 $cigar_line =~ s/([MD])/$1 /g;
388
389 my @cigar_segments = split " ",$cigar_line;
390 my $alignment_string_bounded = "";
391 my $seq_start = 0;
392 my $exon_count = 1;
393 foreach my $segment (@cigar_segments) {
394 if ($segment =~ /^(\d*)D$/) {
395 my $length = $1;
396 $length = 1 if ($length eq "");
397 $alignment_string_bounded .= "-" x $length;
398 } elsif ($segment =~ /^(\d*)M$/) {
399 my $length = $1;
400 $length = 1 if ($length eq "");
401 my $substring = substr($sequence_exon_bounded,$seq_start,$length);
402 if ($substring =~ /\ /) {
403 my $num_boundaries = $substring =~ s/(\ )/$1/g;
404 $length += $num_boundaries;
405 $substring = substr($sequence_exon_bounded,$seq_start,$length);
406 }
407 $alignment_string_bounded .= $substring;
408 $seq_start += $length;
409 }
410 }
411 $self->{'alignment_string_bounded'} = $alignment_string_bounded;
412 }
413
414 return $self->{'alignment_string_bounded'};
415 }
416
417
418 =head2 cdna_alignment_string
419
420 Arg [1] : none
421 Example : my $cdna_alignment = $aligned_member->cdna_alignment_string();
422 Description: Converts the peptide alignment string to a cdna alignment
423 string. This only works for EnsEMBL peptides whose cdna can
424 be retrieved from the attached core databse.
425 If the cdna cannot be retrieved undef is returned and a
426 warning is thrown.
427 Returntype : string
428 Exceptions : none
429 Caller : general
430
431 =cut
432
433 sub cdna_alignment_string {
434 my ($self, $changeSelenos) = @_;
435 $changeSelenos = 0 unless (defined $changeSelenos);
436
437 unless (defined $self->{'cdna_alignment_string'}) {
438
439 my $cdna;
440 eval { $cdna = $self->sequence_cds;};
441 if ($@) {
442 throw("can't connect to CORE to get transcript and cdna for "
443 . "genome_db_id:" . $self->genome_db_id )
444 unless($self->get_Transcript);
445 $cdna = $self->get_Transcript->translateable_seq;
446 }
447
448 if (defined $self->cigar_start || defined $self->cigar_end) {
449 unless (defined $self->cigar_start && defined $self->cigar_end) {
450 throw("both cigar_start and cigar_end should be defined");
451 }
452 my $offset = $self->cigar_start * 3 - 3;
453 my $length = ($self->cigar_end - $self->cigar_start + 1) * 3;
454 $cdna = substr($cdna, $offset, $length);
455 }
456
457 my $start = 0;
458 my $cdna_align_string = '';
459
460 # foreach my $pep (split(//, $self->alignment_string)) { # Speed up below
461 my $alignment_string = $self->alignment_string;
462 foreach my $pep (unpack("A1" x length($alignment_string), $alignment_string)) {
463 if($pep eq '-') {
464 $cdna_align_string .= '--- ';
465 } elsif ((($pep eq 'U') and $changeSelenos) or ($pep eq '*')) {
466 $cdna_align_string .= 'NNN ';
467 $start += 3;
468 } else {
469 my $codon = substr($cdna, $start, 3);
470 unless (length($codon) == 3) {
471 # sometimes the last codon contains only 1 or 2 nucleotides.
472 # making sure that it has 3 by adding as many Ns as necessary
473 $codon .= 'N' x (3 - length($codon));
474 }
475 $cdna_align_string .= $codon . ' ';
476 $start += 3;
477 }
478 }
479 $self->{'cdna_alignment_string'} = $cdna_align_string;
480 }
481
482 return $self->{'cdna_alignment_string'};
483 }
484
485
486 1;