Mercurial > repos > willmclaren > ensembl_vep
comparison variant_effect_predictor/Bio/EnsEMBL/Compara/AlignedMember.pm @ 0:21066c0abaf5 draft
Uploaded
author | willmclaren |
---|---|
date | Fri, 03 Aug 2012 10:04:48 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:21066c0abaf5 |
---|---|
1 =head1 LICENSE | |
2 | |
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and | |
4 Genome Research Limited. All rights reserved. | |
5 | |
6 This software is distributed under a modified Apache license. | |
7 For license details, please see | |
8 | |
9 http://www.ensembl.org/info/about/code_licence.html | |
10 | |
11 =head1 CONTACT | |
12 | |
13 Please email comments or questions to the public Ensembl | |
14 developers list at <dev@ensembl.org>. | |
15 | |
16 Questions may also be sent to the Ensembl help desk at | |
17 <helpdesk@ensembl.org>. | |
18 | |
19 =head1 AUTHORSHIP | |
20 | |
21 Ensembl Team. Individual contributions can be found in the CVS log. | |
22 | |
23 =cut | |
24 | |
25 =head1 NAME | |
26 | |
27 AlignedMember - DESCRIPTION of Object | |
28 | |
29 =head1 DESCRIPTION | |
30 | |
31 A subclass of Member which extends it to allow it to be aligned with other AlignedMember objects. | |
32 General enough to allow for global, local, pair-wise and multiple alignments. | |
33 At the moment used primarily in NestedSet Tree data-structure, but there are plans to extend its usage. | |
34 | |
35 =head1 INHERITANCE TREE | |
36 | |
37 Bio::EnsEMBL::Compara::AlignedMember | |
38 +- Bio::EnsEMBL::Compara::Member | |
39 | |
40 =head1 METHODS | |
41 | |
42 =cut | |
43 | |
44 package Bio::EnsEMBL::Compara::AlignedMember; | |
45 | |
46 use strict; | |
47 use Bio::EnsEMBL::Utils::Exception; | |
48 use Bio::EnsEMBL::Compara::Member; | |
49 | |
50 use base ('Bio::EnsEMBL::Compara::Member'); | |
51 | |
52 | |
53 ################################## | |
54 # overriden superclass methods | |
55 ################################## | |
56 | |
57 =head2 copy | |
58 | |
59 Arg [1] : none | |
60 Example : $copy = $aligned_member->copy(); | |
61 Description : Creates a new AlignedMember object from an existing one | |
62 Returntype : Bio::EnsEMBL::Compara::AlignedMember | |
63 Exceptions : none | |
64 Caller : general | |
65 Status : Stable | |
66 | |
67 =cut | |
68 | |
69 sub copy { | |
70 my $self = shift; | |
71 | |
72 my $mycopy = @_ ? shift : {}; # extending or from scratch? | |
73 $self->SUPER::copy($mycopy); | |
74 bless $mycopy, 'Bio::EnsEMBL::Compara::AlignedMember'; | |
75 | |
76 # The following does not Work if the initial object is only a Member | |
77 if (UNIVERSAL::isa($self, 'Bio::EnsEMBL::Compara::AlignedMember')) { | |
78 $mycopy->cigar_line($self->cigar_line); | |
79 $mycopy->cigar_start($self->cigar_start); | |
80 $mycopy->cigar_end($self->cigar_end); | |
81 $mycopy->perc_cov($self->perc_cov); | |
82 $mycopy->perc_id($self->perc_id); | |
83 $mycopy->perc_pos($self->perc_pos); | |
84 $mycopy->method_link_species_set_id($self->method_link_species_set_id); | |
85 } | |
86 | |
87 return $mycopy; | |
88 } | |
89 | |
90 | |
91 ##################################################### | |
92 | |
93 | |
94 =head2 cigar_line | |
95 | |
96 Arg [1] : (optional) $cigar_line | |
97 Example : $object->cigar_line($cigar_line); | |
98 Example : $cigar_line = $object->cigar_line(); | |
99 Description : Getter/setter for the cigar_line attribute. The cigar line | |
100 represents the modifications that are required to go from | |
101 the original sequence to the aligned sequence. In particular, | |
102 it shows the location of the gaps in the sequence. The cigar | |
103 line is built with a series of numbers and characters where | |
104 the number represents the number of positions in the mode | |
105 defined by the next charcater. When the number is 1, it can be | |
106 omitted. For example, the cigar line '23MD4M' means that there | |
107 are 23 matches or mismatches, then 1 deletion (gap) and then | |
108 another 4 matches or mismatches. The aligned sequence is | |
109 obtained by inserting 1 gap at the right location. | |
110 Returntype : string | |
111 Exceptions : none | |
112 Caller : general | |
113 Status : Stable | |
114 | |
115 =cut | |
116 | |
117 sub cigar_line { | |
118 my $self = shift; | |
119 $self->{'_cigar_line'} = shift if(@_); | |
120 return $self->{'_cigar_line'}; | |
121 } | |
122 | |
123 | |
124 =head2 cigar_start | |
125 | |
126 Arg [1] : (optional) $cigar_start | |
127 Example : $object->cigar_start($cigar_start); | |
128 Example : $cigar_start = $object->cigar_start(); | |
129 Description : Getter/setter for the cigar_start attribute. For non-global | |
130 alignments, this represent the starting point of the local | |
131 alignment. | |
132 Currently the data provided as AlignedMembers (leaves of the | |
133 GeneTree) are obtained using global alignments and the | |
134 cigar_start is always undefined. | |
135 Returntype : integer | |
136 Exceptions : none | |
137 Caller : general | |
138 Status : Stable | |
139 | |
140 =cut | |
141 | |
142 sub cigar_start { | |
143 my $self = shift; | |
144 $self->{'_cigar_start'} = shift if(@_); | |
145 return $self->{'_cigar_start'}; | |
146 } | |
147 | |
148 | |
149 =head2 cigar_end | |
150 | |
151 Arg [1] : (optional) $cigar_end | |
152 Example : $object->cigar_end($cigar_end); | |
153 Example : $cigar_end = $object->cigar_end(); | |
154 Description : Getter/setter for the cigar_end attribute. For non-global | |
155 alignments, this represent the ending point of the local | |
156 alignment. | |
157 Currently the data provided as AlignedMembers (leaves of the | |
158 GeneTree) are obtained using global alignments and the | |
159 cigar_end is always undefined. | |
160 Returntype : integer | |
161 Exceptions : none | |
162 Caller : general | |
163 Status : Stable | |
164 | |
165 =cut | |
166 | |
167 sub cigar_end { | |
168 my $self = shift; | |
169 $self->{'_cigar_end'} = shift if(@_); | |
170 return $self->{'_cigar_end'}; | |
171 } | |
172 | |
173 | |
174 =head2 perc_cov | |
175 | |
176 Arg [1] : (optional) $perc_cov | |
177 Example : $object->perc_cov($perc_cov); | |
178 Example : $perc_cov = $object->perc_cov(); | |
179 Description : Getter/setter for the perc_cov attribute. For non-global | |
180 alignments, this represent the coverage of the alignment in | |
181 percentage of the total length of the sequence. | |
182 Currently the data provided as AlignedMembers (leaves of the | |
183 GeneTree) are obtained using global alignments (the whole | |
184 sequence is always included) and the perc_cov is always undefined. | |
185 Returntype : integer | |
186 Exceptions : none | |
187 Caller : general | |
188 Status : Stable | |
189 | |
190 =cut | |
191 | |
192 sub perc_cov { | |
193 my $self = shift; | |
194 $self->{'perc_cov'} = shift if(@_); | |
195 return $self->{'perc_cov'}; | |
196 } | |
197 | |
198 | |
199 =head2 perc_id | |
200 | |
201 Arg [1] : (optional) $perc_id | |
202 Example : $object->perc_id($perc_id); | |
203 Example : $perc_id = $object->perc_id(); | |
204 Description : Getter/setter for the perc_id attribute. This is generally | |
205 used for pairwise relationships. The percentage identity | |
206 reprensents the number of positions that are identical in | |
207 the alignment in both sequences. | |
208 Currently the data provided as AlignedMembers (leaves of the | |
209 GeneTree) are obtained using multiple alignments and the | |
210 perc_id is always undefined. | |
211 Returntype : integer | |
212 Exceptions : none | |
213 Caller : general | |
214 Status : Stable | |
215 | |
216 =cut | |
217 | |
218 sub perc_id { | |
219 my $self = shift; | |
220 $self->{'perc_id'} = shift if(@_); | |
221 return $self->{'perc_id'}; | |
222 } | |
223 | |
224 | |
225 =head2 perc_pos | |
226 | |
227 Arg [1] : (optional) $perc_pos | |
228 Example : $object->perc_pos($perc_pos); | |
229 Example : $perc_pos = $object->perc_pos(); | |
230 Description : Getter/setter for the perc_pos attribute. This is generally | |
231 used for pairwise relationships. The percentage positivity | |
232 reprensents the number of positions that are positive in | |
233 the alignment in both sequences. Currently, this is calculated | |
234 for protein sequences using the BLOSUM62 scoring matrix. | |
235 Currently the data provided as AlignedMembers (leaves of the | |
236 GeneTree) are obtained using multiple alignments and the | |
237 perc_cov is always undefined. | |
238 Returntype : integer | |
239 Exceptions : none | |
240 Caller : general | |
241 Status : Stable | |
242 | |
243 =cut | |
244 | |
245 sub perc_pos { | |
246 my $self = shift; | |
247 $self->{'perc_pos'} = shift if(@_); | |
248 return $self->{'perc_pos'}; | |
249 } | |
250 | |
251 | |
252 =head2 method_link_species_set_id | |
253 | |
254 Arg [1] : (optional) $method_link_species_set_id | |
255 Example : $object->method_link_species_set_id($method_link_species_set_id); | |
256 Example : $method_link_species_set_id = $object->method_link_species_set_id(); | |
257 Description : Getter/setter for the method_link_species_set_id attribute. Please, | |
258 refer to the Bio::EnsEMBL::Compara::MethodLinkSpeciesSet module | |
259 for more information on the method_link_species_set_id. | |
260 Returntype : int | |
261 Exceptions : Returns 0 if the method_link_species_set_id is not defined. | |
262 Caller : general | |
263 Status : Stable | |
264 | |
265 =cut | |
266 | |
267 sub method_link_species_set_id { | |
268 my $self = shift; | |
269 $self->{'method_link_species_set_id'} = shift if(@_); | |
270 $self->{'method_link_species_set_id'} = 0 unless(defined($self->{'method_link_species_set_id'})); | |
271 return $self->{'method_link_species_set_id'}; | |
272 } | |
273 | |
274 | |
275 sub set { | |
276 my $self = shift; | |
277 return $self->{'set'}; | |
278 } | |
279 | |
280 | |
281 =head2 alignment_string | |
282 | |
283 Arg [1] : (optional) bool $exon_cased | |
284 Example : my $alignment_string = $object->alignment_string(); | |
285 Example : my $alignment_string = $object->alignment_string(1); | |
286 Description : Returns the aligned sequence for this object. For sequences | |
287 split in exons, the $exon_cased flag permits to request | |
288 that each exon is represented in alternative upper and lower | |
289 case. | |
290 For local alignments, when the alignment does not cover the | |
291 whole protein, only the part of the sequence in the alignemnt | |
292 is returned. Currently only global alignments are provided. | |
293 Therefore the alignment_string always returns the whole aligned | |
294 sequence. | |
295 Returntype : string | |
296 Exceptions : throws if the cigar_line is not defined for this object. | |
297 Caller : general | |
298 Status : Stable | |
299 | |
300 =cut | |
301 | |
302 sub alignment_string { | |
303 my $self = shift; | |
304 my $exon_cased = shift; | |
305 | |
306 unless (defined $self->cigar_line && $self->cigar_line ne "") { | |
307 throw("To get an alignment_string, the cigar_line needs to be define\n"); | |
308 } | |
309 | |
310 # Use different keys for exon-cased and non exon-cased sequences | |
311 my $key = 'alignment_string'; | |
312 if ($exon_cased) { | |
313 $key = 'alignment_string_cased'; | |
314 } elsif (defined $self->{'alignment_string_cased'} and !defined($self->{'alignment_string'})) { | |
315 # non exon-cased sequence can be easily obtained from the exon-cased one. | |
316 $self->{'alignment_string'} = uc($self->{'alignment_string_cased'}) | |
317 } | |
318 | |
319 unless (defined $self->{$key}) { | |
320 my $sequence; | |
321 if ($exon_cased) { | |
322 $sequence = $self->sequence_exon_cased; | |
323 } else { | |
324 $sequence = $self->sequence; | |
325 } | |
326 if (defined $self->cigar_start || defined $self->cigar_end) { | |
327 unless (defined $self->cigar_start && defined $self->cigar_end) { | |
328 throw("both cigar_start and cigar_end should be defined"); | |
329 } | |
330 my $offset = $self->cigar_start - 1; | |
331 my $length = $self->cigar_end - $self->cigar_start + 1; | |
332 $sequence = substr($sequence, $offset, $length); | |
333 } | |
334 | |
335 my $cigar_line = $self->cigar_line; | |
336 $cigar_line =~ s/([MD])/$1 /g; | |
337 | |
338 my @cigar_segments = split " ",$cigar_line; | |
339 my $alignment_string = ""; | |
340 my $seq_start = 0; | |
341 foreach my $segment (@cigar_segments) { | |
342 if ($segment =~ /^(\d*)D$/) { | |
343 my $length = $1; | |
344 $length = 1 if ($length eq ""); | |
345 $alignment_string .= "-" x $length; | |
346 } elsif ($segment =~ /^(\d*)M$/) { | |
347 my $length = $1; | |
348 $length = 1 if ($length eq ""); | |
349 $alignment_string .= substr($sequence,$seq_start,$length); | |
350 $seq_start += $length; | |
351 } | |
352 } | |
353 $self->{$key} = $alignment_string; | |
354 } | |
355 | |
356 return $self->{$key}; | |
357 } | |
358 | |
359 | |
360 =head2 alignment_string_bounded | |
361 | |
362 Arg [1] : none | |
363 Example : my $alignment_string_bounded = $object->alignment_string_bounded(); | |
364 Description : Returns the aligned sequence for this object with padding characters | |
365 representing the introns. | |
366 Returntype : string | |
367 Exceptions : throws if the cigar_line is not defined for this object or if the | |
368 cigar_start or cigar_end are defined. | |
369 Caller : general | |
370 Status : Stable | |
371 | |
372 =cut | |
373 | |
374 sub alignment_string_bounded { | |
375 my $self = shift; | |
376 | |
377 unless (defined $self->cigar_line && $self->cigar_line ne "") { | |
378 throw("To get an alignment_string, the cigar_line needs to be define\n"); | |
379 } | |
380 unless (defined $self->{'alignment_string_bounded'}) { | |
381 my $sequence_exon_bounded = $self->sequence_exon_bounded; | |
382 if (defined $self->cigar_start || defined $self->cigar_end) { | |
383 throw("method doesnt implement defined cigar_start and cigar_end"); | |
384 } | |
385 $sequence_exon_bounded =~ s/b|o|j/\ /g; | |
386 my $cigar_line = $self->cigar_line; | |
387 $cigar_line =~ s/([MD])/$1 /g; | |
388 | |
389 my @cigar_segments = split " ",$cigar_line; | |
390 my $alignment_string_bounded = ""; | |
391 my $seq_start = 0; | |
392 my $exon_count = 1; | |
393 foreach my $segment (@cigar_segments) { | |
394 if ($segment =~ /^(\d*)D$/) { | |
395 my $length = $1; | |
396 $length = 1 if ($length eq ""); | |
397 $alignment_string_bounded .= "-" x $length; | |
398 } elsif ($segment =~ /^(\d*)M$/) { | |
399 my $length = $1; | |
400 $length = 1 if ($length eq ""); | |
401 my $substring = substr($sequence_exon_bounded,$seq_start,$length); | |
402 if ($substring =~ /\ /) { | |
403 my $num_boundaries = $substring =~ s/(\ )/$1/g; | |
404 $length += $num_boundaries; | |
405 $substring = substr($sequence_exon_bounded,$seq_start,$length); | |
406 } | |
407 $alignment_string_bounded .= $substring; | |
408 $seq_start += $length; | |
409 } | |
410 } | |
411 $self->{'alignment_string_bounded'} = $alignment_string_bounded; | |
412 } | |
413 | |
414 return $self->{'alignment_string_bounded'}; | |
415 } | |
416 | |
417 | |
418 =head2 cdna_alignment_string | |
419 | |
420 Arg [1] : none | |
421 Example : my $cdna_alignment = $aligned_member->cdna_alignment_string(); | |
422 Description: Converts the peptide alignment string to a cdna alignment | |
423 string. This only works for EnsEMBL peptides whose cdna can | |
424 be retrieved from the attached core databse. | |
425 If the cdna cannot be retrieved undef is returned and a | |
426 warning is thrown. | |
427 Returntype : string | |
428 Exceptions : none | |
429 Caller : general | |
430 | |
431 =cut | |
432 | |
433 sub cdna_alignment_string { | |
434 my ($self, $changeSelenos) = @_; | |
435 $changeSelenos = 0 unless (defined $changeSelenos); | |
436 | |
437 unless (defined $self->{'cdna_alignment_string'}) { | |
438 | |
439 my $cdna; | |
440 eval { $cdna = $self->sequence_cds;}; | |
441 if ($@) { | |
442 throw("can't connect to CORE to get transcript and cdna for " | |
443 . "genome_db_id:" . $self->genome_db_id ) | |
444 unless($self->get_Transcript); | |
445 $cdna = $self->get_Transcript->translateable_seq; | |
446 } | |
447 | |
448 if (defined $self->cigar_start || defined $self->cigar_end) { | |
449 unless (defined $self->cigar_start && defined $self->cigar_end) { | |
450 throw("both cigar_start and cigar_end should be defined"); | |
451 } | |
452 my $offset = $self->cigar_start * 3 - 3; | |
453 my $length = ($self->cigar_end - $self->cigar_start + 1) * 3; | |
454 $cdna = substr($cdna, $offset, $length); | |
455 } | |
456 | |
457 my $start = 0; | |
458 my $cdna_align_string = ''; | |
459 | |
460 # foreach my $pep (split(//, $self->alignment_string)) { # Speed up below | |
461 my $alignment_string = $self->alignment_string; | |
462 foreach my $pep (unpack("A1" x length($alignment_string), $alignment_string)) { | |
463 if($pep eq '-') { | |
464 $cdna_align_string .= '--- '; | |
465 } elsif ((($pep eq 'U') and $changeSelenos) or ($pep eq '*')) { | |
466 $cdna_align_string .= 'NNN '; | |
467 $start += 3; | |
468 } else { | |
469 my $codon = substr($cdna, $start, 3); | |
470 unless (length($codon) == 3) { | |
471 # sometimes the last codon contains only 1 or 2 nucleotides. | |
472 # making sure that it has 3 by adding as many Ns as necessary | |
473 $codon .= 'N' x (3 - length($codon)); | |
474 } | |
475 $cdna_align_string .= $codon . ' '; | |
476 $start += 3; | |
477 } | |
478 } | |
479 $self->{'cdna_alignment_string'} = $cdna_align_string; | |
480 } | |
481 | |
482 return $self->{'cdna_alignment_string'}; | |
483 } | |
484 | |
485 | |
486 1; |