0
|
1 =head1 LICENSE
|
|
2
|
|
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
|
|
4 Genome Research Limited. All rights reserved.
|
|
5
|
|
6 This software is distributed under a modified Apache license.
|
|
7 For license details, please see
|
|
8
|
|
9 http://www.ensembl.org/info/about/code_licence.html
|
|
10
|
|
11 =head1 CONTACT
|
|
12
|
|
13 Please email comments or questions to the public Ensembl
|
|
14 developers list at <dev@ensembl.org>.
|
|
15
|
|
16 Questions may also be sent to the Ensembl help desk at
|
|
17 <helpdesk@ensembl.org>.
|
|
18
|
|
19 =head1 AUTHORSHIP
|
|
20
|
|
21 Ensembl Team. Individual contributions can be found in the CVS log.
|
|
22
|
|
23 =cut
|
|
24
|
|
25 =head1 NAME
|
|
26
|
|
27 AlignedMember - DESCRIPTION of Object
|
|
28
|
|
29 =head1 DESCRIPTION
|
|
30
|
|
31 A subclass of Member which extends it to allow it to be aligned with other AlignedMember objects.
|
|
32 General enough to allow for global, local, pair-wise and multiple alignments.
|
|
33 At the moment used primarily in NestedSet Tree data-structure, but there are plans to extend its usage.
|
|
34
|
|
35 =head1 INHERITANCE TREE
|
|
36
|
|
37 Bio::EnsEMBL::Compara::AlignedMember
|
|
38 +- Bio::EnsEMBL::Compara::Member
|
|
39
|
|
40 =head1 METHODS
|
|
41
|
|
42 =cut
|
|
43
|
|
44 package Bio::EnsEMBL::Compara::AlignedMember;
|
|
45
|
|
46 use strict;
|
|
47 use Bio::EnsEMBL::Utils::Exception;
|
|
48 use Bio::EnsEMBL::Compara::Member;
|
|
49
|
|
50 use base ('Bio::EnsEMBL::Compara::Member');
|
|
51
|
|
52
|
|
53 ##################################
|
|
54 # overriden superclass methods
|
|
55 ##################################
|
|
56
|
|
57 =head2 copy
|
|
58
|
|
59 Arg [1] : none
|
|
60 Example : $copy = $aligned_member->copy();
|
|
61 Description : Creates a new AlignedMember object from an existing one
|
|
62 Returntype : Bio::EnsEMBL::Compara::AlignedMember
|
|
63 Exceptions : none
|
|
64 Caller : general
|
|
65 Status : Stable
|
|
66
|
|
67 =cut
|
|
68
|
|
69 sub copy {
|
|
70 my $self = shift;
|
|
71
|
|
72 my $mycopy = @_ ? shift : {}; # extending or from scratch?
|
|
73 $self->SUPER::copy($mycopy);
|
|
74 bless $mycopy, 'Bio::EnsEMBL::Compara::AlignedMember';
|
|
75
|
|
76 # The following does not Work if the initial object is only a Member
|
|
77 if (UNIVERSAL::isa($self, 'Bio::EnsEMBL::Compara::AlignedMember')) {
|
|
78 $mycopy->cigar_line($self->cigar_line);
|
|
79 $mycopy->cigar_start($self->cigar_start);
|
|
80 $mycopy->cigar_end($self->cigar_end);
|
|
81 $mycopy->perc_cov($self->perc_cov);
|
|
82 $mycopy->perc_id($self->perc_id);
|
|
83 $mycopy->perc_pos($self->perc_pos);
|
|
84 $mycopy->method_link_species_set_id($self->method_link_species_set_id);
|
|
85 }
|
|
86
|
|
87 return $mycopy;
|
|
88 }
|
|
89
|
|
90
|
|
91 #####################################################
|
|
92
|
|
93
|
|
94 =head2 cigar_line
|
|
95
|
|
96 Arg [1] : (optional) $cigar_line
|
|
97 Example : $object->cigar_line($cigar_line);
|
|
98 Example : $cigar_line = $object->cigar_line();
|
|
99 Description : Getter/setter for the cigar_line attribute. The cigar line
|
|
100 represents the modifications that are required to go from
|
|
101 the original sequence to the aligned sequence. In particular,
|
|
102 it shows the location of the gaps in the sequence. The cigar
|
|
103 line is built with a series of numbers and characters where
|
|
104 the number represents the number of positions in the mode
|
|
105 defined by the next charcater. When the number is 1, it can be
|
|
106 omitted. For example, the cigar line '23MD4M' means that there
|
|
107 are 23 matches or mismatches, then 1 deletion (gap) and then
|
|
108 another 4 matches or mismatches. The aligned sequence is
|
|
109 obtained by inserting 1 gap at the right location.
|
|
110 Returntype : string
|
|
111 Exceptions : none
|
|
112 Caller : general
|
|
113 Status : Stable
|
|
114
|
|
115 =cut
|
|
116
|
|
117 sub cigar_line {
|
|
118 my $self = shift;
|
|
119 $self->{'_cigar_line'} = shift if(@_);
|
|
120 return $self->{'_cigar_line'};
|
|
121 }
|
|
122
|
|
123
|
|
124 =head2 cigar_start
|
|
125
|
|
126 Arg [1] : (optional) $cigar_start
|
|
127 Example : $object->cigar_start($cigar_start);
|
|
128 Example : $cigar_start = $object->cigar_start();
|
|
129 Description : Getter/setter for the cigar_start attribute. For non-global
|
|
130 alignments, this represent the starting point of the local
|
|
131 alignment.
|
|
132 Currently the data provided as AlignedMembers (leaves of the
|
|
133 GeneTree) are obtained using global alignments and the
|
|
134 cigar_start is always undefined.
|
|
135 Returntype : integer
|
|
136 Exceptions : none
|
|
137 Caller : general
|
|
138 Status : Stable
|
|
139
|
|
140 =cut
|
|
141
|
|
142 sub cigar_start {
|
|
143 my $self = shift;
|
|
144 $self->{'_cigar_start'} = shift if(@_);
|
|
145 return $self->{'_cigar_start'};
|
|
146 }
|
|
147
|
|
148
|
|
149 =head2 cigar_end
|
|
150
|
|
151 Arg [1] : (optional) $cigar_end
|
|
152 Example : $object->cigar_end($cigar_end);
|
|
153 Example : $cigar_end = $object->cigar_end();
|
|
154 Description : Getter/setter for the cigar_end attribute. For non-global
|
|
155 alignments, this represent the ending point of the local
|
|
156 alignment.
|
|
157 Currently the data provided as AlignedMembers (leaves of the
|
|
158 GeneTree) are obtained using global alignments and the
|
|
159 cigar_end is always undefined.
|
|
160 Returntype : integer
|
|
161 Exceptions : none
|
|
162 Caller : general
|
|
163 Status : Stable
|
|
164
|
|
165 =cut
|
|
166
|
|
167 sub cigar_end {
|
|
168 my $self = shift;
|
|
169 $self->{'_cigar_end'} = shift if(@_);
|
|
170 return $self->{'_cigar_end'};
|
|
171 }
|
|
172
|
|
173
|
|
174 =head2 perc_cov
|
|
175
|
|
176 Arg [1] : (optional) $perc_cov
|
|
177 Example : $object->perc_cov($perc_cov);
|
|
178 Example : $perc_cov = $object->perc_cov();
|
|
179 Description : Getter/setter for the perc_cov attribute. For non-global
|
|
180 alignments, this represent the coverage of the alignment in
|
|
181 percentage of the total length of the sequence.
|
|
182 Currently the data provided as AlignedMembers (leaves of the
|
|
183 GeneTree) are obtained using global alignments (the whole
|
|
184 sequence is always included) and the perc_cov is always undefined.
|
|
185 Returntype : integer
|
|
186 Exceptions : none
|
|
187 Caller : general
|
|
188 Status : Stable
|
|
189
|
|
190 =cut
|
|
191
|
|
192 sub perc_cov {
|
|
193 my $self = shift;
|
|
194 $self->{'perc_cov'} = shift if(@_);
|
|
195 return $self->{'perc_cov'};
|
|
196 }
|
|
197
|
|
198
|
|
199 =head2 perc_id
|
|
200
|
|
201 Arg [1] : (optional) $perc_id
|
|
202 Example : $object->perc_id($perc_id);
|
|
203 Example : $perc_id = $object->perc_id();
|
|
204 Description : Getter/setter for the perc_id attribute. This is generally
|
|
205 used for pairwise relationships. The percentage identity
|
|
206 reprensents the number of positions that are identical in
|
|
207 the alignment in both sequences.
|
|
208 Currently the data provided as AlignedMembers (leaves of the
|
|
209 GeneTree) are obtained using multiple alignments and the
|
|
210 perc_id is always undefined.
|
|
211 Returntype : integer
|
|
212 Exceptions : none
|
|
213 Caller : general
|
|
214 Status : Stable
|
|
215
|
|
216 =cut
|
|
217
|
|
218 sub perc_id {
|
|
219 my $self = shift;
|
|
220 $self->{'perc_id'} = shift if(@_);
|
|
221 return $self->{'perc_id'};
|
|
222 }
|
|
223
|
|
224
|
|
225 =head2 perc_pos
|
|
226
|
|
227 Arg [1] : (optional) $perc_pos
|
|
228 Example : $object->perc_pos($perc_pos);
|
|
229 Example : $perc_pos = $object->perc_pos();
|
|
230 Description : Getter/setter for the perc_pos attribute. This is generally
|
|
231 used for pairwise relationships. The percentage positivity
|
|
232 reprensents the number of positions that are positive in
|
|
233 the alignment in both sequences. Currently, this is calculated
|
|
234 for protein sequences using the BLOSUM62 scoring matrix.
|
|
235 Currently the data provided as AlignedMembers (leaves of the
|
|
236 GeneTree) are obtained using multiple alignments and the
|
|
237 perc_cov is always undefined.
|
|
238 Returntype : integer
|
|
239 Exceptions : none
|
|
240 Caller : general
|
|
241 Status : Stable
|
|
242
|
|
243 =cut
|
|
244
|
|
245 sub perc_pos {
|
|
246 my $self = shift;
|
|
247 $self->{'perc_pos'} = shift if(@_);
|
|
248 return $self->{'perc_pos'};
|
|
249 }
|
|
250
|
|
251
|
|
252 =head2 method_link_species_set_id
|
|
253
|
|
254 Arg [1] : (optional) $method_link_species_set_id
|
|
255 Example : $object->method_link_species_set_id($method_link_species_set_id);
|
|
256 Example : $method_link_species_set_id = $object->method_link_species_set_id();
|
|
257 Description : Getter/setter for the method_link_species_set_id attribute. Please,
|
|
258 refer to the Bio::EnsEMBL::Compara::MethodLinkSpeciesSet module
|
|
259 for more information on the method_link_species_set_id.
|
|
260 Returntype : int
|
|
261 Exceptions : Returns 0 if the method_link_species_set_id is not defined.
|
|
262 Caller : general
|
|
263 Status : Stable
|
|
264
|
|
265 =cut
|
|
266
|
|
267 sub method_link_species_set_id {
|
|
268 my $self = shift;
|
|
269 $self->{'method_link_species_set_id'} = shift if(@_);
|
|
270 $self->{'method_link_species_set_id'} = 0 unless(defined($self->{'method_link_species_set_id'}));
|
|
271 return $self->{'method_link_species_set_id'};
|
|
272 }
|
|
273
|
|
274
|
|
275 sub set {
|
|
276 my $self = shift;
|
|
277 return $self->{'set'};
|
|
278 }
|
|
279
|
|
280
|
|
281 =head2 alignment_string
|
|
282
|
|
283 Arg [1] : (optional) bool $exon_cased
|
|
284 Example : my $alignment_string = $object->alignment_string();
|
|
285 Example : my $alignment_string = $object->alignment_string(1);
|
|
286 Description : Returns the aligned sequence for this object. For sequences
|
|
287 split in exons, the $exon_cased flag permits to request
|
|
288 that each exon is represented in alternative upper and lower
|
|
289 case.
|
|
290 For local alignments, when the alignment does not cover the
|
|
291 whole protein, only the part of the sequence in the alignemnt
|
|
292 is returned. Currently only global alignments are provided.
|
|
293 Therefore the alignment_string always returns the whole aligned
|
|
294 sequence.
|
|
295 Returntype : string
|
|
296 Exceptions : throws if the cigar_line is not defined for this object.
|
|
297 Caller : general
|
|
298 Status : Stable
|
|
299
|
|
300 =cut
|
|
301
|
|
302 sub alignment_string {
|
|
303 my $self = shift;
|
|
304 my $exon_cased = shift;
|
|
305
|
|
306 unless (defined $self->cigar_line && $self->cigar_line ne "") {
|
|
307 throw("To get an alignment_string, the cigar_line needs to be define\n");
|
|
308 }
|
|
309
|
|
310 # Use different keys for exon-cased and non exon-cased sequences
|
|
311 my $key = 'alignment_string';
|
|
312 if ($exon_cased) {
|
|
313 $key = 'alignment_string_cased';
|
|
314 } elsif (defined $self->{'alignment_string_cased'} and !defined($self->{'alignment_string'})) {
|
|
315 # non exon-cased sequence can be easily obtained from the exon-cased one.
|
|
316 $self->{'alignment_string'} = uc($self->{'alignment_string_cased'})
|
|
317 }
|
|
318
|
|
319 unless (defined $self->{$key}) {
|
|
320 my $sequence;
|
|
321 if ($exon_cased) {
|
|
322 $sequence = $self->sequence_exon_cased;
|
|
323 } else {
|
|
324 $sequence = $self->sequence;
|
|
325 }
|
|
326 if (defined $self->cigar_start || defined $self->cigar_end) {
|
|
327 unless (defined $self->cigar_start && defined $self->cigar_end) {
|
|
328 throw("both cigar_start and cigar_end should be defined");
|
|
329 }
|
|
330 my $offset = $self->cigar_start - 1;
|
|
331 my $length = $self->cigar_end - $self->cigar_start + 1;
|
|
332 $sequence = substr($sequence, $offset, $length);
|
|
333 }
|
|
334
|
|
335 my $cigar_line = $self->cigar_line;
|
|
336 $cigar_line =~ s/([MD])/$1 /g;
|
|
337
|
|
338 my @cigar_segments = split " ",$cigar_line;
|
|
339 my $alignment_string = "";
|
|
340 my $seq_start = 0;
|
|
341 foreach my $segment (@cigar_segments) {
|
|
342 if ($segment =~ /^(\d*)D$/) {
|
|
343 my $length = $1;
|
|
344 $length = 1 if ($length eq "");
|
|
345 $alignment_string .= "-" x $length;
|
|
346 } elsif ($segment =~ /^(\d*)M$/) {
|
|
347 my $length = $1;
|
|
348 $length = 1 if ($length eq "");
|
|
349 $alignment_string .= substr($sequence,$seq_start,$length);
|
|
350 $seq_start += $length;
|
|
351 }
|
|
352 }
|
|
353 $self->{$key} = $alignment_string;
|
|
354 }
|
|
355
|
|
356 return $self->{$key};
|
|
357 }
|
|
358
|
|
359
|
|
360 =head2 alignment_string_bounded
|
|
361
|
|
362 Arg [1] : none
|
|
363 Example : my $alignment_string_bounded = $object->alignment_string_bounded();
|
|
364 Description : Returns the aligned sequence for this object with padding characters
|
|
365 representing the introns.
|
|
366 Returntype : string
|
|
367 Exceptions : throws if the cigar_line is not defined for this object or if the
|
|
368 cigar_start or cigar_end are defined.
|
|
369 Caller : general
|
|
370 Status : Stable
|
|
371
|
|
372 =cut
|
|
373
|
|
374 sub alignment_string_bounded {
|
|
375 my $self = shift;
|
|
376
|
|
377 unless (defined $self->cigar_line && $self->cigar_line ne "") {
|
|
378 throw("To get an alignment_string, the cigar_line needs to be define\n");
|
|
379 }
|
|
380 unless (defined $self->{'alignment_string_bounded'}) {
|
|
381 my $sequence_exon_bounded = $self->sequence_exon_bounded;
|
|
382 if (defined $self->cigar_start || defined $self->cigar_end) {
|
|
383 throw("method doesnt implement defined cigar_start and cigar_end");
|
|
384 }
|
|
385 $sequence_exon_bounded =~ s/b|o|j/\ /g;
|
|
386 my $cigar_line = $self->cigar_line;
|
|
387 $cigar_line =~ s/([MD])/$1 /g;
|
|
388
|
|
389 my @cigar_segments = split " ",$cigar_line;
|
|
390 my $alignment_string_bounded = "";
|
|
391 my $seq_start = 0;
|
|
392 my $exon_count = 1;
|
|
393 foreach my $segment (@cigar_segments) {
|
|
394 if ($segment =~ /^(\d*)D$/) {
|
|
395 my $length = $1;
|
|
396 $length = 1 if ($length eq "");
|
|
397 $alignment_string_bounded .= "-" x $length;
|
|
398 } elsif ($segment =~ /^(\d*)M$/) {
|
|
399 my $length = $1;
|
|
400 $length = 1 if ($length eq "");
|
|
401 my $substring = substr($sequence_exon_bounded,$seq_start,$length);
|
|
402 if ($substring =~ /\ /) {
|
|
403 my $num_boundaries = $substring =~ s/(\ )/$1/g;
|
|
404 $length += $num_boundaries;
|
|
405 $substring = substr($sequence_exon_bounded,$seq_start,$length);
|
|
406 }
|
|
407 $alignment_string_bounded .= $substring;
|
|
408 $seq_start += $length;
|
|
409 }
|
|
410 }
|
|
411 $self->{'alignment_string_bounded'} = $alignment_string_bounded;
|
|
412 }
|
|
413
|
|
414 return $self->{'alignment_string_bounded'};
|
|
415 }
|
|
416
|
|
417
|
|
418 =head2 cdna_alignment_string
|
|
419
|
|
420 Arg [1] : none
|
|
421 Example : my $cdna_alignment = $aligned_member->cdna_alignment_string();
|
|
422 Description: Converts the peptide alignment string to a cdna alignment
|
|
423 string. This only works for EnsEMBL peptides whose cdna can
|
|
424 be retrieved from the attached core databse.
|
|
425 If the cdna cannot be retrieved undef is returned and a
|
|
426 warning is thrown.
|
|
427 Returntype : string
|
|
428 Exceptions : none
|
|
429 Caller : general
|
|
430
|
|
431 =cut
|
|
432
|
|
433 sub cdna_alignment_string {
|
|
434 my ($self, $changeSelenos) = @_;
|
|
435 $changeSelenos = 0 unless (defined $changeSelenos);
|
|
436
|
|
437 unless (defined $self->{'cdna_alignment_string'}) {
|
|
438
|
|
439 my $cdna;
|
|
440 eval { $cdna = $self->sequence_cds;};
|
|
441 if ($@) {
|
|
442 throw("can't connect to CORE to get transcript and cdna for "
|
|
443 . "genome_db_id:" . $self->genome_db_id )
|
|
444 unless($self->get_Transcript);
|
|
445 $cdna = $self->get_Transcript->translateable_seq;
|
|
446 }
|
|
447
|
|
448 if (defined $self->cigar_start || defined $self->cigar_end) {
|
|
449 unless (defined $self->cigar_start && defined $self->cigar_end) {
|
|
450 throw("both cigar_start and cigar_end should be defined");
|
|
451 }
|
|
452 my $offset = $self->cigar_start * 3 - 3;
|
|
453 my $length = ($self->cigar_end - $self->cigar_start + 1) * 3;
|
|
454 $cdna = substr($cdna, $offset, $length);
|
|
455 }
|
|
456
|
|
457 my $start = 0;
|
|
458 my $cdna_align_string = '';
|
|
459
|
|
460 # foreach my $pep (split(//, $self->alignment_string)) { # Speed up below
|
|
461 my $alignment_string = $self->alignment_string;
|
|
462 foreach my $pep (unpack("A1" x length($alignment_string), $alignment_string)) {
|
|
463 if($pep eq '-') {
|
|
464 $cdna_align_string .= '--- ';
|
|
465 } elsif ((($pep eq 'U') and $changeSelenos) or ($pep eq '*')) {
|
|
466 $cdna_align_string .= 'NNN ';
|
|
467 $start += 3;
|
|
468 } else {
|
|
469 my $codon = substr($cdna, $start, 3);
|
|
470 unless (length($codon) == 3) {
|
|
471 # sometimes the last codon contains only 1 or 2 nucleotides.
|
|
472 # making sure that it has 3 by adding as many Ns as necessary
|
|
473 $codon .= 'N' x (3 - length($codon));
|
|
474 }
|
|
475 $cdna_align_string .= $codon . ' ';
|
|
476 $start += 3;
|
|
477 }
|
|
478 }
|
|
479 $self->{'cdna_alignment_string'} = $cdna_align_string;
|
|
480 }
|
|
481
|
|
482 return $self->{'cdna_alignment_string'};
|
|
483 }
|
|
484
|
|
485
|
|
486 1;
|