Mercurial > repos > willmclaren > ensembl_vep
comparison variant_effect_predictor/Bio/EnsEMBL/Utils/TranscriptAlleles.pm @ 0:21066c0abaf5 draft
Uploaded
author | willmclaren |
---|---|
date | Fri, 03 Aug 2012 10:04:48 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:21066c0abaf5 |
---|---|
1 =head1 LICENSE | |
2 | |
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and | |
4 Genome Research Limited. All rights reserved. | |
5 | |
6 This software is distributed under a modified Apache license. | |
7 For license details, please see | |
8 | |
9 http://www.ensembl.org/info/about/code_licence.html | |
10 | |
11 =head1 CONTACT | |
12 | |
13 Please email comments or questions to the public Ensembl | |
14 developers list at <dev@ensembl.org>. | |
15 | |
16 Questions may also be sent to the Ensembl help desk at | |
17 <helpdesk@ensembl.org>. | |
18 | |
19 =cut | |
20 | |
21 =head1 NAME | |
22 | |
23 TranscriptAlleles - A utility class used to obtain information about the | |
24 relationships between a transcript and Alleles | |
25 | |
26 =head1 SYNOPSIS | |
27 | |
28 use Bio::EnsEMBL::Utils::TranscriptAlleles; | |
29 | |
30 # get the peptide variations caused by a set of Alleles | |
31 | |
32 %variations = %{ | |
33 Bio::EnsEMBL::Utils::TranscriptAlleles::get_all_peptide_variations( | |
34 $transcript, $alleles ) }; | |
35 | |
36 =head1 DESCRIPTION | |
37 | |
38 This is a utility class which can be used to find consequence type of an | |
39 AlleleFeature in a transcript, and to determine the amino acid changes | |
40 caused by the AlleleFeature in the Transcript | |
41 | |
42 | |
43 =head1 METHODS | |
44 | |
45 =cut | |
46 | |
47 package Bio::EnsEMBL::Utils::TranscriptAlleles; | |
48 | |
49 use strict; | |
50 use warnings; | |
51 | |
52 use Bio::EnsEMBL::Utils::Exception qw(throw warning); | |
53 use Bio::EnsEMBL::Variation::ConsequenceType; | |
54 use vars qw(@ISA @EXPORT_OK); | |
55 | |
56 use Data::Dumper; | |
57 | |
58 @ISA = qw(Exporter); | |
59 | |
60 @EXPORT_OK = qw(&get_all_ConsequenceType &type_variation); | |
61 | |
62 | |
63 =head2 get_all_ConsequenceType | |
64 | |
65 Arg [1] : $transcript the transcript to obtain the peptide variations for | |
66 Arg [2] : $alleles listref of AlleleFeatures | |
67 Example : $consequence_types = get_all_ConsequenceType($transcript, \@alleles); | |
68 foreach my $ct (@{$consequence_types}){ | |
69 print "Allele : ", $ct->allele_string, " has a consequence type of :",$ct->type; | |
70 print " and is affecting the transcript with ",@{$ct->aa_alleles}, "in position ", | |
71 $ct->aa_start,"-", $ct->aa_end if (defined $ct->aa_alleles); | |
72 print "\n"; | |
73 } | |
74 Description: Takes a list of AlleleFeatures and a Transcritpt, and return a list | |
75 of ConsequenceType of the alleles in the given Transcript | |
76 Returntype : listref of Bio::EnsEMBL::Variation::ConsequenceType | |
77 Exceptions : none | |
78 Caller : general | |
79 | |
80 =cut | |
81 | |
82 sub get_all_ConsequenceType { | |
83 my $transcript = shift; | |
84 my $alleles = shift; | |
85 | |
86 if(!ref($transcript) || !$transcript->isa('Bio::EnsEMBL::Transcript')) { | |
87 throw('Bio::EnsEMBL::Transcript argument is required.'); | |
88 } | |
89 | |
90 if(!ref($alleles) || (ref($alleles) ne 'ARRAY')) { | |
91 throw('Reference to a list of Bio::EnsEMBL::Variation::AlleleFeature objects is required'); | |
92 } | |
93 | |
94 | |
95 my @alleles_ordered = sort { $a->start <=> $b->start} @$alleles; #sort the alleles by the genomic position | |
96 my @same_codon; #contains up to 3 allele features, that are in the same codon, but each position can contain more than 1 allele | |
97 my @out; #array containing the consequence types of the alleles in the transcript | |
98 foreach my $allele (@alleles_ordered) { | |
99 # foreach my $allele (@{$alleles}) { | |
100 #get consequence type of the AlleleFeature | |
101 # my $new_allele = $allele->transform('chromosome'); | |
102 #my $consequence_type = Bio::EnsEMBL::Variation::ConsequenceType->new($transcript->dbID(),'',$allele->start,$allele->end,$allele->strand,[$allele->allele_string]); | |
103 ### REAL HACK BY js5 because something is borked in TranscriptMapper | |
104 ### This relies on the Allele being of the form i.e. a SNP! [ACGT-](/[ACGT-])+ | |
105 ### The rest don't work anyway until we have a AlignStrainSlice | |
106 ### MUST BE SORTED.... | |
107 | |
108 #we have to consider het alleles | |
109 my $allele_string; | |
110 if ($allele->allele_string =~ /[\|\\\/]/){ | |
111 my @alleles = split /[\|\\\/]/,$allele->allele_string; | |
112 if ($alleles[0] ne $allele->ref_allele_string){ | |
113 $allele_string = $alleles[0]; | |
114 } | |
115 else{ | |
116 $allele_string = $alleles[1]; | |
117 } | |
118 } | |
119 else{ | |
120 $allele_string = $allele->allele_string; | |
121 } | |
122 my $opposite_strand = 0; #to indicate wether transcript and allele and in different strands | |
123 my $transcript_allele = $allele_string; | |
124 if( $transcript->strand != $allele->strand ) { | |
125 $transcript_allele =~tr/ACGT/TGCA/; | |
126 $opposite_strand = 1; | |
127 } | |
128 | |
129 my $consequence_type = Bio::EnsEMBL::Variation::ConsequenceType->new($transcript->dbID(),'',$allele->start, $allele->end, $transcript->strand, [$transcript_allele]); | |
130 #calculate the consequence type of the Allele if different from the reference Allele | |
131 #if (($opposite_strand && $allele->ref_allele_string eq $allele_string) || (!$opposite_strand && $allele->ref_allele_string eq $allele_string)){ #same allele as reference, there is no consequence, called SARA | |
132 if ($allele->ref_allele_string eq $allele_string) { #same allele as reference, there is no consequence, called SARA | |
133 #same allele as reference, there is no consequence, called SARA | |
134 #we have to calculate if there are more than 2 in the same codon | |
135 empty_codon(\@out,\@same_codon); | |
136 $consequence_type->type('SARA'); | |
137 push @out, $consequence_type; | |
138 next; | |
139 } | |
140 | |
141 my $ref_consequences = type_variation($transcript,"",$consequence_type); | |
142 if ($allele->start != $allele->end){ | |
143 empty_codon(\@out,\@same_codon); | |
144 #do not calculate for indels effects of 2 or more in same codon | |
145 push @out, @{$ref_consequences}; | |
146 next; | |
147 } | |
148 | |
149 my $new_consequence = shift @{$ref_consequences}; | |
150 if (! defined $new_consequence ) { | |
151 empty_codon(\@out,\@same_codon); | |
152 push @out, $consequence_type; # should be empty | |
153 next; | |
154 } | |
155 | |
156 if ( !defined $new_consequence->aa_start){ | |
157 empty_codon(\@out,\@same_codon); | |
158 push @out, $new_consequence; | |
159 next; | |
160 } | |
161 #first element of the codon | |
162 if (!defined $same_codon[0]){ | |
163 push @{$same_codon[0]}, $new_consequence; #goes to the first position | |
164 next; | |
165 } | |
166 #for alleles with aa effect, find out if they are in the same codon | |
167 if ($same_codon[-1]->[0]->aa_start == $new_consequence->aa_start){ | |
168 #they are in the same codon, find out if it is the same position | |
169 if ($same_codon[-1]->[0]->start == $new_consequence->start){ | |
170 #it is the same position | |
171 push @{$same_codon[-1]},$new_consequence; #push in the last | |
172 } | |
173 else{ | |
174 push @{$same_codon[$#same_codon + 1]},$new_consequence; #this is a new element in the codon | |
175 } | |
176 | |
177 } | |
178 else{ | |
179 #if there is more than one element in the same_codon array, calculate the effect of the codon | |
180 if (@same_codon > 1){ | |
181 calculate_same_codon(\@same_codon); | |
182 } | |
183 map {push @out, @{$_}} @same_codon; | |
184 @same_codon = (); | |
185 push @{$same_codon[0]}, $new_consequence; #push the element not in the same codon | |
186 } | |
187 } | |
188 #add last consequence_type | |
189 empty_codon(\@out,\@same_codon); | |
190 | |
191 return \@out; | |
192 } | |
193 | |
194 sub empty_codon{ | |
195 my $out = shift; | |
196 my $same_codon = shift; | |
197 | |
198 if (@{$same_codon} == 1){ | |
199 map {push @{$out}, @{$_}} @{$same_codon}; | |
200 } | |
201 elsif (@{$same_codon} > 1){ | |
202 calculate_same_codon($same_codon); | |
203 map {push @{$out}, @{$_}} @{$same_codon}; | |
204 } | |
205 @{$same_codon} = (); | |
206 } | |
207 | |
208 # recalculates the effect of 2 or 3 SNPs in the same codon | |
209 sub calculate_same_codon{ | |
210 my $same_codon = shift; | |
211 my $new_codon; | |
212 my $old_aa; | |
213 my $codon_table = Bio::Tools::CodonTable->new; | |
214 if (@{$same_codon} == 3){ | |
215 #if there are 3 alleles in the same codon | |
216 map {$new_codon .= @{$_->[0]->alleles};$old_aa = $_->[0]->aa_alleles()->[0]} @{$same_codon}; | |
217 } | |
218 else{ | |
219 #if there are 2 alleles affecting the same codon | |
220 my $first_pos = ($same_codon->[0]->[0]->cdna_start -1) % 3; #position of the first allele in the codon | |
221 my $second_pos = ($same_codon->[1]->[0]->cdna_start -1)% 3; #position of the second allele in the codon | |
222 if ($first_pos == 0){ | |
223 #codon starts with first allele | |
224 $new_codon = $same_codon->[0]->[0]->alleles->[0]; #first base in the codon | |
225 if ($second_pos == 1){ | |
226 $new_codon .= $same_codon->[1]->[0]->alleles->[0]; #second base in the codon | |
227 $new_codon .= substr($same_codon->[1]->[0]->codon,2,1); #third base in the codon | |
228 } | |
229 else{ | |
230 $new_codon .= substr($same_codon->[1]->[0]->codon,1,1); #second base in the codon | |
231 $new_codon .= $same_codon->[1]->[0]->alleles->[0]; #third base in the codon | |
232 } | |
233 } | |
234 else{ | |
235 #alleles are in position 1 and 2 in the codon | |
236 $new_codon = substr($same_codon->[1]->[0]->codon,0,1); #first base in the codon | |
237 $new_codon .= $same_codon->[0]->[0]->alleles->[0]; #second base in the codon | |
238 $new_codon .= $same_codon->[1]->[0]->alleles->[0]; #third base in the codon | |
239 } | |
240 $old_aa = $same_codon->[0]->[0]->aa_alleles->[0]; | |
241 } | |
242 #calculate the new_aa | |
243 my $new_aa = $codon_table->translate($new_codon); | |
244 #and update the aa_alleles field in all the codons | |
245 foreach my $codon (@{$same_codon}){ | |
246 map {$_->aa_alleles([$old_aa,$new_aa])} @{$codon}; | |
247 } | |
248 | |
249 } | |
250 # | |
251 # Classifies a variation which is in the vicinity of a transcript | |
252 # | |
253 sub type_variation { | |
254 my $tr = shift; | |
255 my $g = shift; | |
256 my $var = shift; | |
257 | |
258 my $UPSTREAM = 5000; | |
259 my $DOWNSTREAM = 5000; | |
260 | |
261 #empty type first in the case of recursive call | |
262 $var->empty_type if defined $var->type; | |
263 | |
264 if (!$var->isa('Bio::EnsEMBL::Variation::ConsequenceType')) { | |
265 throw("Not possible to calculate the consequence type for ",ref($var)," : Bio::EnsEMBL::Variation::ConsequenceType object expected"); | |
266 } | |
267 | |
268 if (($var->start < $tr->start - $UPSTREAM) || ($var->end > $tr->end + $DOWNSTREAM)){ | |
269 #since the variation is more than UPSTREAM and DOWNSTREAM of the transcript, there is no effect in the transcript | |
270 return []; | |
271 } | |
272 | |
273 | |
274 # check the cache | |
275 my $tran_features = $tr->{_variation_effect_feature_cache}; | |
276 | |
277 # populate it if not found | |
278 unless ($tran_features) { | |
279 $tran_features = { | |
280 mapper => $tr->get_TranscriptMapper, | |
281 }; | |
282 | |
283 my ($attrib) = @{$tr->slice()->get_all_Attributes('codon_table')}; #for mithocondrial dna it is necessary to change the table | |
284 | |
285 my $codon_table; | |
286 $codon_table = $attrib->value() if($attrib); | |
287 $codon_table ||= 1; # default vertebrate codon table | |
288 | |
289 if ($tran_features->{translation} = $tr->translate(undef, undef, undef, $codon_table)) { | |
290 $tran_features->{translateable_seq} = $tr->translateable_seq; | |
291 | |
292 # to include the stop codon we need to translate the Bio::Seq sequence, not just | |
293 # $tr->translation, this is the source of the missing STOP_LOSTs | |
294 my $mrna_seqobj = Bio::Seq->new( | |
295 -seq => $tran_features->{translateable_seq}, | |
296 -moltype => 'dna', | |
297 -alphabet => 'dna' | |
298 ); | |
299 | |
300 $tran_features->{peptide} = $mrna_seqobj->translate(undef, undef, undef, $codon_table)->seq; | |
301 } | |
302 | |
303 $tr->{_variation_effect_feature_cache} = $tran_features; | |
304 } | |
305 | |
306 if ( !defined( $tran_features->{translation} ) ) | |
307 { # for other biotype rather than coding/IG genes | |
308 # check if the variation is completely outside the transcript: | |
309 | |
310 if ( $var->end() < $tr->start() ) { | |
311 $var->type( ( $tr->strand() == 1 ) ? 'UPSTREAM' : 'DOWNSTREAM' ); | |
312 return [$var]; | |
313 } | |
314 if ( $var->start() > $tr->end() ) { | |
315 $var->type( ( $tr->strand() == 1 ) ? 'DOWNSTREAM' : 'UPSTREAM' ); | |
316 return [$var]; | |
317 } | |
318 | |
319 if ( $var->start() >= $tr->start() and $var->end() <= $tr->end() ) | |
320 { # within the transcript | |
321 if ( $tr->biotype() eq "miRNA" ) { | |
322 my ($attribute) = @{ $tr->get_all_Attributes('miRNA') }; | |
323 | |
324 # the value is the mature miRNA coordinate within miRNA | |
325 # transcript | |
326 if ( defined($attribute) | |
327 && $attribute->value() =~ /(\d+)-(\d+)/ ) | |
328 { | |
329 # transfer cdna value to genomic coordinates | |
330 my @mapper_objs = $tr->cdna2genomic( $1, $2, $tr->strand() ); | |
331 | |
332 foreach my $obj (@mapper_objs) | |
333 { #Note you can get more than one mature seq per miRNA | |
334 if ( $obj->isa("Bio::EnsEMBL::Mapper::Coordinate") ) { | |
335 if ( $var->start() >= $obj->start() | |
336 and $var->end() <= $obj->end() ) | |
337 { | |
338 $var->type("WITHIN_MATURE_miRNA"); | |
339 return [$var]; | |
340 } | |
341 } | |
342 } | |
343 } | |
344 } | |
345 | |
346 $var->type("WITHIN_NON_CODING_GENE"); | |
347 return [$var]; | |
348 | |
349 } ## end if ( $var->start() >= ...) | |
350 } ## end if ( !defined( $tr->translation...)) | |
351 | |
352 # get a transcript mapper object | |
353 my $tm = $tran_features->{mapper}; | |
354 | |
355 # map to CDNA coords | |
356 my @cdna_coords = $tm->genomic2cdna($var->start,$var->end,$var->strand); | |
357 | |
358 # map to CDS cooords | |
359 my @cds_coords = $tm->genomic2cds($var->start, $var->end,$var->strand); | |
360 | |
361 # map to peptide coords | |
362 my @pep_coords = $tm->genomic2pep($var->start, $var->end, $var->strand); | |
363 | |
364 # get the phase of the first exon | |
365 my $exon_phase = $tr->start_Exon->phase; | |
366 | |
367 # check for partial codon consequence | |
368 if( | |
369 @pep_coords == 1 | |
370 && @cds_coords == 1 | |
371 && !($cds_coords[0]->isa('Bio::EnsEMBL::Mapper::Gap')) | |
372 && !($pep_coords[0]->isa('Bio::EnsEMBL::Mapper::Gap')) | |
373 ) { | |
374 | |
375 # get the CDS sequence | |
376 my $cds = $tran_features->{translateable_seq}; | |
377 | |
378 my $start = $pep_coords[0]->start(); | |
379 my $codon_cds_start = ($start * 3) - 2; | |
380 | |
381 my $last_codon_length = length($cds) - ($codon_cds_start - 1); | |
382 | |
383 if($last_codon_length < 3 && $last_codon_length > 0) { | |
384 $var->type("PARTIAL_CODON"); | |
385 | |
386 # add the CDS coords | |
387 $var->cds_start($cds_coords[0]->start + ($exon_phase > 0 ? $exon_phase : 0)); | |
388 $var->cds_end($cds_coords[0]->end + ($exon_phase > 0 ? $exon_phase : 0)); | |
389 | |
390 # add the cDNA coords | |
391 $var->cdna_start($cdna_coords[0]->start); | |
392 $var->cdna_end($cdna_coords[0]->end); | |
393 | |
394 return [$var]; | |
395 } | |
396 } | |
397 | |
398 | |
399 # Handle simple cases where the variation is not split into parts. | |
400 # Call method recursively with component parts in complicated case. | |
401 # E.g. a single multi-base variation may be both intronic and coding | |
402 | |
403 if(@cdna_coords > 1) { | |
404 my @out; | |
405 #this will be a new type, complex_indel | |
406 $var->type('COMPLEX_INDEL'); | |
407 return [$var]; | |
408 # foreach my $c (@coords) { | |
409 # my %new_var = %{$var}; | |
410 # $new_var{'end'} = $var->start + $c->length() - 1; | |
411 # $var->start( $new_var{'end'} + 1); | |
412 # #empty the type before re-run | |
413 # $var->empty_type ; | |
414 # push @out, @{type_variation($tr, $g, bless \%new_var, ref($var))}; | |
415 # } | |
416 # return \@out; | |
417 | |
418 | |
419 } | |
420 | |
421 # look at different splice distances | |
422 my @coords_splice_2 = $tm->genomic2cdna($var->start -2, $var->end +2, $var->strand); | |
423 my @coords_splice_3 = $tm->genomic2cdna($var->start -3, $var->end +3, $var->strand); | |
424 my @coords_splice_8 = $tm->genomic2cdna($var->start -8, $var->end +8, $var->strand); | |
425 | |
426 my ($splice_site_2, $splice_site_3, $splice_site_8); | |
427 | |
428 if (scalar @coords_splice_2 >1) { | |
429 $splice_site_2=1; | |
430 } | |
431 elsif (scalar @coords_splice_3 >1) { | |
432 $splice_site_3=1; | |
433 } | |
434 elsif (scalar @coords_splice_8 >1) { | |
435 $splice_site_8=1; | |
436 } | |
437 | |
438 | |
439 my $c = $cdna_coords[0]; | |
440 if($c->isa('Bio::EnsEMBL::Mapper::Gap')) { | |
441 | |
442 # check if the variation is completely outside the transcript: | |
443 | |
444 if($var->end < $tr->start()) { | |
445 $var->type( ($tr->strand() == 1) ? 'UPSTREAM' : 'DOWNSTREAM' ); | |
446 return [$var]; | |
447 } | |
448 if($var->start > $tr->end()) { | |
449 $var->type( ($tr->strand() == 1) ? 'DOWNSTREAM' : 'UPSTREAM' ); | |
450 return [$var]; | |
451 } | |
452 | |
453 # nonsense-mediated decay transcript | |
454 if($tr->biotype() eq 'nonsense_mediated_decay') { | |
455 $var->type("NMD_TRANSCRIPT"); | |
456 #return [$var]; | |
457 } | |
458 | |
459 # variation must be intronic since mapped to cdna gap, but is within | |
460 # transcript, note that ESSENTIAL_SPLICE_SITE only consider first (AG) and last (GT) 2 bases inside the intron. | |
461 # if variation is in intron, we need to check the lenth of intron, if it's shoter than 6, we call it SYNONYMOUS_CODING rather then INTRONIC | |
462 | |
463 foreach my $intron (@{$tran_features->{introns}}) { | |
464 if ($intron->length <=5) {#the length of frameshift intron could be 1,2,4,5 bases | |
465 if ($var->start>=$intron->start and $var->end<=$intron->end) { | |
466 #this is a type of SYNONYMOUS_CODING since changes happen in frameshift intron, which don't change exon structure | |
467 $var->type('SYNONYMOUS_CODING'); | |
468 return [$var]; | |
469 } | |
470 } | |
471 } | |
472 #if it's not in frameshift intron, then it's in normal intron | |
473 $var->type('INTRONIC'); | |
474 | |
475 if ($splice_site_2) { | |
476 $var->type('ESSENTIAL_SPLICE_SITE'); | |
477 } | |
478 elsif ($splice_site_3 or $splice_site_8) { | |
479 $var->type('SPLICE_SITE'); | |
480 } | |
481 return [$var]; | |
482 } | |
483 | |
484 # nonsense-mediated decay transcript | |
485 if($tr->biotype() eq 'nonsense_mediated_decay') { | |
486 $var->type("NMD_TRANSCRIPT"); | |
487 #return [$var]; | |
488 } | |
489 | |
490 #now variation must be in exons, the first 3 bs into exon could be splice_site | |
491 | |
492 if ($splice_site_2 or $splice_site_3) { | |
493 | |
494 my ($se_s, $se_e, $ee_s, $ee_e) = ($tr->start_Exon->start, $tr->start_Exon->end, $tr->end_Exon->start, $tr->end_Exon->end); | |
495 ($se_s, $se_e, $ee_s, $ee_e) = ($se_e, $se_s, $ee_e, $ee_s) if $tr->strand < 0; | |
496 | |
497 # check coord relative to first exon | |
498 # near beginning of first exon is obv not a splice site | |
499 if($var->start <= $se_e) { | |
500 if(abs($se_e - $var->start) <= 3) { | |
501 $var->type('SPLICE_SITE'); | |
502 } | |
503 } | |
504 | |
505 # also check relative to last exon | |
506 # near end of last exon is also not a splice site | |
507 elsif($var->start >= $ee_s) { | |
508 if(abs($ee_s - $var->start) <= 3) { | |
509 $var->type('SPLICE_SITE'); | |
510 } | |
511 } | |
512 | |
513 # if not near either end of transcript, then it is definitely a splice site | |
514 else { | |
515 $var->type('SPLICE_SITE'); | |
516 } | |
517 } | |
518 | |
519 $var->cdna_start( $c->start() ); | |
520 $var->cdna_end( $c->end() ); | |
521 | |
522 if(@cds_coords > 1) { | |
523 # my @out; | |
524 #this is a new type, complex_indel | |
525 $var->type('COMPLEX_INDEL'); | |
526 return [$var]; | |
527 # foreach my $c (@coords) { | |
528 # my %new_var = %{$var}; | |
529 # $new_var{'end'} = $var->start + $c->length() - 1; | |
530 # $var->start( $new_var{'end'} + 1); | |
531 # #empty the type before re-run | |
532 # $var->empty_type ; | |
533 # push @out, @{type_variation($tr, $g, bless \%new_var, ref($var))}; | |
534 # } | |
535 # return \@out; | |
536 } | |
537 | |
538 $c = $cds_coords[0]; | |
539 | |
540 if($c->isa('Bio::EnsEMBL::Mapper::Gap')) { | |
541 # mapped successfully to CDNA but not to CDS, must be UTR | |
542 | |
543 if($var->end < $tr->coding_region_start()) { | |
544 $var->type( ($tr->strand() == 1) ? '5PRIME_UTR' : '3PRIME_UTR' ); | |
545 } | |
546 elsif($var->start > $tr->coding_region_end()) { | |
547 $var->type( ($tr->strand() == 1) ? '3PRIME_UTR' : '5PRIME_UTR'); | |
548 } | |
549 else { | |
550 throw('Unexpected: CDNA variation which is not in CDS is not in UTR'); | |
551 } | |
552 return [$var]; | |
553 } | |
554 | |
555 # we need to add the exon phase on in case of weird transcripts | |
556 # where the first exon is not in normal phase | |
557 $var->cds_start( $c->start() + ($exon_phase > 0 ? $exon_phase : 0)); | |
558 $var->cds_end( $c->end() + ($exon_phase > 0 ? $exon_phase : 0)); | |
559 | |
560 | |
561 if(@pep_coords != 1 || $pep_coords[0]->isa('Bio::EnsEMBL::Mapper::Gap')) { | |
562 throw("Unexpected: Could map to CDS but not to peptide coordinates."); | |
563 } | |
564 | |
565 $c = $pep_coords[0]; | |
566 | |
567 $var->aa_start( $c->start()); | |
568 $var->aa_end( $c->end()); | |
569 | |
570 apply_aa_change($tr, $var); | |
571 | |
572 return [$var]; | |
573 } | |
574 | |
575 # | |
576 # Determines the effect of a coding variation on the peptide sequence | |
577 # | |
578 | |
579 sub apply_aa_change { | |
580 my $tr = shift; | |
581 my $var = shift; | |
582 | |
583 my ($attrib) = @{$tr->slice()->get_all_Attributes('codon_table')}; #for mithocondrial dna it is necessary to change the table | |
584 | |
585 my $codon_table; | |
586 $codon_table = $attrib->value() if($attrib); | |
587 $codon_table ||= 1; # default vertebrate codon table | |
588 | |
589 # check the cache | |
590 my $tran_features = $tr->{_variation_effect_feature_cache}; | |
591 | |
592 # populate it if not found | |
593 unless ($tran_features) { | |
594 $tran_features = { | |
595 mapper => $tr->get_TranscriptMapper, | |
596 }; | |
597 | |
598 if ($tran_features->{translation} = $tr->translate(undef, undef, undef, $codon_table)) { | |
599 $tran_features->{translateable_seq} = $tr->translateable_seq; | |
600 | |
601 # to include the stop codon we need to translate the Bio::Seq sequence, not just | |
602 # $tr->translation, this is the source of the missing STOP_LOSTs | |
603 my $mrna_seqobj = Bio::Seq->new( | |
604 -seq => $tran_features->{translateable_seq}, | |
605 -moltype => 'dna', | |
606 -alphabet => 'dna' | |
607 ); | |
608 | |
609 $tran_features->{peptide} = $mrna_seqobj->translate(undef, undef, undef, $codon_table)->seq; | |
610 } | |
611 | |
612 $tr->{_variation_effect_feature_cache} = $tran_features; | |
613 } | |
614 | |
615 my $mrna = $tran_features->{translateable_seq}; # get from cache | |
616 | |
617 my $peptide = $tran_features->{peptide}; # get from cache | |
618 | |
619 my $len = $var->aa_end - $var->aa_start + 1; | |
620 my $old_aa = substr($peptide, $var->aa_start -1 , $len); | |
621 | |
622 my $codon_cds_start = $var->aa_start * 3 - 2; | |
623 my $codon_cds_end = $var->aa_end * 3; | |
624 my $codon_len = $codon_cds_end - $codon_cds_start + 1; | |
625 | |
626 my @alleles = @{$var->alleles}; | |
627 | |
628 my $var_len = $var->cds_end - $var->cds_start + 1; | |
629 | |
630 my @aa_alleles = ($old_aa); | |
631 | |
632 my $ref_codon = substr($mrna, $codon_cds_start-1, $codon_len); | |
633 my @codons; | |
634 push @codons, $ref_codon; | |
635 | |
636 #here could generate multi type if have multi-allele change: "ACTAGT/-/T" | |
637 foreach my $a (@alleles) { | |
638 $a =~ s/\-//; | |
639 my $cds = $mrna; | |
640 | |
641 if($var_len != length($a)) { | |
642 if(abs(length($a) - $var_len) % 3) { | |
643 # frameshifting variation, do not set peptide_allele string | |
644 # since too complicated and could be very long | |
645 | |
646 $var->type('FRAMESHIFT_CODING'); | |
647 return [$var]; | |
648 } | |
649 | |
650 if($codon_len == 0) { # insertion | |
651 $aa_alleles[0] = '-'; | |
652 $old_aa = '-'; | |
653 } | |
654 } | |
655 | |
656 my $new_aa; | |
657 | |
658 # change sequence | |
659 substr($cds, $var->cds_start-1, $var_len) = $a; | |
660 | |
661 # get the new codon | |
662 my $codon_str = substr($cds, $codon_cds_start-1, $codon_len + length($a)-$var_len); | |
663 | |
664 push @codons, $codon_str; | |
665 $var->codon($codon_str); #add the codon to the ConsequenceType object | |
666 my $codon_seq = Bio::Seq->new(-seq => $codon_str, | |
667 -moltype => 'dna', | |
668 -alphabet => 'dna'); | |
669 | |
670 $new_aa = $codon_seq->translate(undef,undef,undef,$codon_table)->seq(); | |
671 | |
672 if(length($new_aa)<1){ | |
673 $new_aa='-'; | |
674 } | |
675 | |
676 if(uc($new_aa) ne uc($old_aa)) { | |
677 push @aa_alleles, $new_aa; | |
678 if ($new_aa =~ /\*/) { | |
679 $var->type('STOP_GAINED'); | |
680 } | |
681 elsif ($old_aa =~ /\*/) { | |
682 $var->type('STOP_LOST'); | |
683 } | |
684 } | |
685 } | |
686 | |
687 #note if type is already defined as SOTP_GAINED OR STOP_LOST, then even @aa_alleles > 1, we are not given type | |
688 # of 'NON_SYNONYMOUS_CODING' | |
689 if(@aa_alleles > 1) { | |
690 if (!$var->type or (join ' ',@{$var->type}) !~ /STOP/) { | |
691 $var->type('NON_SYNONYMOUS_CODING'); | |
692 } | |
693 } | |
694 else { | |
695 $var->type('SYNONYMOUS_CODING'); | |
696 } | |
697 | |
698 #$var->codons(\@codons); | |
699 $var->aa_alleles(\@aa_alleles); | |
700 } | |
701 | |
702 | |
703 1; |