comparison variant_effect_predictor/Bio/EnsEMBL/Upstream.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 =head1 LICENSE
2
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
4 Genome Research Limited. All rights reserved.
5
6 This software is distributed under a modified Apache license.
7 For license details, please see
8
9 http://www.ensembl.org/info/about/code_licence.html
10
11 =head1 CONTACT
12
13 Please email comments or questions to the public Ensembl
14 developers list at <dev@ensembl.org>.
15
16 Questions may also be sent to the Ensembl help desk at
17 <helpdesk@ensembl.org>.
18
19 =cut
20
21 =head1 NAME
22
23 Bio::EnsEMBL::Upstream - Object that defines an upstream region
24
25 =head1 SYNOPSIS
26
27 use Bio::EnsEMBL::Upstream;
28
29 my $upstream = Bio::EnsEMBL::Upstream->new(
30 -transcript => $transcript,
31 -length => 2000 # bp
32 );
33
34 # Retrieve coordinates of upstream region
35 my $upstream_region_start = $upstream->upstart;
36 my $upstream_region_end = $upstream->upend;
37
38 # Retrieve coordinates in 'downstream' first intron
39 my $intron_region_start = $upstream->downstart;
40 my $intron_region_end = $upstream->downend;
41
42 # Coordinates are returned in the same scheme as the input transcript.
43 # However, the coordinates of an upstream region can be transformed to
44 # any other scheme using a slice
45
46 $upstream->transform($slice);
47
48 # Coordinates can be retrieved in scheme in the same manner as the
49 # above.
50
51 =head1 DESCRIPTION
52
53 An object that determines the upstream region of a transcript. Such a
54 region is non-coding and ensures that other genes or transcripts are
55 not present. Ultimately, these objects can be used to looking for
56 promoter elements. To this end, it is also possible to derive a region
57 downstream of the first exon, within the first intron and where promoter
58 elements sometimes are found.
59
60 =head1 METHODS
61
62 =cut
63
64 package Bio::EnsEMBL::Upstream;
65
66 use strict;
67 use vars qw(@ISA);
68 use Bio::EnsEMBL::DBSQL::DBAdaptor;
69 use Bio::EnsEMBL::DBSQL::SimpleFeatureAdaptor;
70
71 use Bio::EnsEMBL::Utils::Exception qw(throw);
72 use Bio::EnsEMBL::Utils::Argument qw(rearrange);
73
74 @ISA = qw(Bio::EnsEMBL::Feature);
75
76
77 =head2 new
78
79 Arg [transcript] : (optional) Bio::EnsEMBL::Transcript
80 Arg [length] : (optional) int $length
81 Example : $upstream = Bio::EnsEMBL::Upstream->new(-transcript => $transcript,
82 -length => 2000);
83 Description: Creates a new upstream object
84 Returntype : Bio::EnsEMBL::Upstream
85 Exceptions : none
86 Caller : Bio::EnsEMBL::Transcript, general
87 Status : Stable
88
89 =cut
90
91 sub new {
92 my ($class, @args) = @_;
93 my $self = {};
94
95 bless $self, $class;
96
97 my ($transcript,
98 $length) = rearrange([qw(TRANSCRIPT
99 LENGTH
100 )],@args);
101
102 $self->transcript($transcript) if defined $transcript;
103 $self->length($length) if $length;
104
105 return $self
106 }
107
108 =head2 transcript
109
110 Arg : (optional) Bio::EnsEMBL::Transcript
111 Example : $self->transcript($transcript);
112 Description: Getter/setter for transcript object
113 Returntype : Bio::EnsEMBL::Transcript
114 Exceptions : Throws if argument is not undefined
115 or a Bio::EnsEMBL::Transcript
116 Caller : $self->new, $self->_derive_coords,
117 $self->_first_coding_Exon
118 Status : Stable
119
120 =cut
121
122
123 sub transcript {
124 my $self = shift;
125
126 if (@_){
127 $self->{_transcript} = shift;
128
129 if (defined $self->{_transcript}) {
130 throw("Transcript is not a Bio::EnsEMBL::Transcript")
131 if (! $self->{_transcript}->isa("Bio::EnsEMBL::Transcript"));
132 $self->_flush_cache;
133 }
134 }
135
136 return $self->{_transcript}
137 }
138
139 =head2 length
140
141 Arg : (optional) int $length
142 Example : $self->length(2000); # bp
143 Description: Getter/setter for upstream region length.
144 Returntype : int
145 Exceptions : Throws if length is requested before it has been set.
146 Caller : $self->new, $self->_derive_coords
147 Status : Stable
148
149 =cut
150
151 sub length {
152 my $self = shift;
153
154 if (@_){
155 $self->{_length} = shift;
156 $self->_flush_cache;
157 }
158
159 throw("Region length has not been set.")
160 unless $self->{_length};
161
162 return $self->{_length}
163 }
164
165 =head2 _flush_cache
166
167 Arg : none
168 Example : $self->_flush_cache;
169 Description: Empties cached coordinates (called when
170 coordinate scheme or region length has changed).
171 Returntype : none
172 Exceptions : none
173 Caller : $self->length, $self->transform
174 Status : Stable
175
176 =cut
177
178 sub _flush_cache {
179 my $self = shift;
180
181 $self->upstart(undef);
182 $self->upend(undef);
183 $self->downstart(undef);
184 $self->downend(undef);
185 }
186
187 =head2 upstart
188
189 Arg : none
190 Example : $self->upstart;
191 Description: Returns the start coordinate of the region
192 upstream of the transcript. This coordinate
193 is always the furthest from the translation
194 initiation codon, whereas upend always abutts
195 the translation initiation codon.
196 Returntype : int
197 Exceptions : none
198 Caller : general
199 Status : Stable
200
201 =cut
202
203 sub upstart {
204 my $self = shift;
205
206 if (@_) {
207 $self->{_upstart} = shift @_;
208 return
209 }
210
211 if (! defined $self->{_upstart}) {
212 $self->_derive_coords('up');
213 }
214
215 return $self->{_upstart}
216 }
217
218 =head2 upend
219
220 Arg : none
221 Example : $self->upend;
222 Description: Returns the end coordinate of the region
223 upstream of the transcript. This coordinate
224 always always abutts the translation
225 initiation codon, whereas upstart always
226 returns the coorindate furthest from the
227 translation initiation codon.
228 Returntype : int
229 Exceptions : none
230 Caller : general
231 Status : Stable
232
233 =cut
234
235 sub upend {
236 my $self = shift;
237
238 if (@_) {
239 $self->{_upend} = shift @_;
240 return
241 }
242
243 if (! defined $self->{_upend}) {
244 $self->_derive_coords('up');
245 }
246
247 return $self->{_upend}
248 }
249
250 =head2 downstart
251
252 Arg : none
253 Example : $self->downstart;
254 Description: Returns the start coordinate of the region
255 in the first intron of the transcript. This
256 coordinate is always closest to the first
257 exon (irregardless of strand).
258 Returntype : int
259 Exceptions : none
260 Caller : general
261 Status : Stable
262
263 =cut
264
265 sub downstart {
266 my $self = shift;
267
268 if (@_) {
269 $self->{_downstart} = shift @_;
270 return
271 }
272
273 if (! defined $self->{_downstart}) {
274 $self->_derive_coords('down');
275 }
276
277 return $self->{_downstart}
278 }
279
280 =head2 downend
281
282 Arg : none
283 Example : $self->downend;
284 Description: Returns the end coordinate of the region
285 in the first intron of the transcript. This
286 coordinate is always furthest from the first
287 exon (irregardless of strand).
288 Returntype : int
289 Exceptions : none
290 Caller : general
291 Status : Stable
292
293 =cut
294
295 sub downend {
296 my $self = shift;
297
298 if (@_) {
299 $self->{_downend} = shift @_;
300 return
301 }
302
303 if (! defined $self->{_downend}) {
304 $self->_derive_coords('down');
305 }
306
307 return $self->{_downend}
308 }
309
310 =head2 transform
311
312 Arg :
313 Example :
314 Description: Not yet implemented
315 Returntype :
316 Exceptions :
317 Caller :
318 Status : At Risk
319
320 =cut
321
322
323 # Over-riding inherited class. As yet unimplemented.
324
325 sub transform {
326 my $self = shift;
327
328 throw("No transform method implemented for " . $self);
329 }
330
331 =head2 derive_upstream_coords
332
333 Arg : none
334 Example : my ($upstart, $upend)
335 = $self->derive_upstream_coords;
336 Description: Derives upstream coordinates (for
337 compatability with older scripts).
338 Returntype : arrayref
339 Exceptions : none
340 Caller : general
341 Status : Stable
342
343 =cut
344
345 sub derive_upstream_coords {
346 my $self = shift;
347
348 return [$self->upstart, $self->upend]
349 }
350
351 =head2 derive_downstream_coords
352
353 Arg : none
354 Example : my ($downstart, $downend)
355 = $self->derive_downstream_coords;
356 Description: Derives downstream coordinates (for
357 compatability with older scripts).
358 Returntype : arrayref
359 Exceptions : none
360 Caller : general
361 Status : Stable
362
363 =cut
364
365 sub derive_downstream_coords {
366 my $self = shift;
367
368 return [$self->downstart, $self->downend]
369 }
370
371 =head2 _derive_coords
372
373 Arg : string $direction (either 'up' or 'down').
374 Example : $self->_derive_coords('up');
375 Description: Determines the coordinates of either upstream
376 or downstream region.
377 Returntype : none
378 Exceptions : Throws if argument is not either 'up' or 'down'
379 Caller : $self->upstart, $self->upend, $self->downstart,
380 $self->downend
381 Status : Stable
382
383 =cut
384
385 sub _derive_coords {
386 my ($self, $direction) = @_;
387
388 # Check direction
389 throw("Must specify either \'up\' of \'down\'-stream direction to derive coords.")
390 unless (($direction eq 'up')||($direction eq 'down'));
391
392 # Put things in easily accessible places.
393 my $core_db_slice_adaptor = $self->transcript->slice->adaptor;
394 my $region_length = $self->length;
395
396 # Whatever coord system the gene is currently is, transform to the toplevel.
397 my $transcript = $self->transcript->transform('toplevel');
398
399 # Use our transformed transcript to determine the upstream region coords.
400 # End should always be just before the coding start (like ATG), including 3' UTR.
401 # Start is the outer limit of the region upstream (furthest from ATG).
402
403 my $region_start;
404 my $region_end;
405
406 if ($transcript->strand == 1){
407 if ($direction eq 'up'){
408 $region_end = $transcript->coding_region_start - 1;
409 $region_start = $region_end - $region_length;
410 } elsif ($direction eq 'down'){
411 $region_end = $self->_first_coding_Exon->end + 1;
412 $region_start = $region_end + $region_length;
413 }
414 } elsif ($transcript->strand == -1) {
415 if ($direction eq 'up'){
416 $region_end = $transcript->coding_region_end + 1;
417 $region_start = $region_end + $region_length;
418
419 } elsif ($direction eq 'down'){
420 $region_end = $self->_first_coding_Exon->start - 1;
421 $region_start = $region_end - $region_length;
422 }
423 }
424
425 # Trim the upstream/downstream region to remove extraneous coding sequences
426 # from other genes and/or transcripts.
427
428 my ($slice_low_coord, $slice_high_coord) = sort {$a <=> $b} ($region_start, $region_end);
429
430 my $region_slice
431 = $core_db_slice_adaptor->fetch_by_region($transcript->slice->coord_system->name,
432 $transcript->slice->seq_region_name,
433 $slice_low_coord,
434 $slice_high_coord);
435
436 if ($transcript->strand == 1) {
437 if ($direction eq 'up') {
438 $region_start += $self->_bases_to_trim('left_end', $region_slice);
439 } elsif ($direction eq 'down') {
440 $region_start -= $self->_bases_to_trim('right_end', $region_slice);
441 }
442 } elsif ($transcript->strand == -1) {
443 if ($direction eq 'up') {
444 $region_start -= $self->_bases_to_trim('right_end', $region_slice);
445 } elsif ($direction eq 'down') {
446 $region_start += $self->_bases_to_trim('left_end', $region_slice);
447 }
448 }
449
450 # Always return start < end
451
452 ($region_start, $region_end) = sort {$a <=> $b} ($region_start, $region_end);
453
454 if ($direction eq 'up') {
455 $self->upstart($region_start);
456 $self->upend($region_end);
457 } elsif ($direction eq 'down') {
458 $self->downstart($region_start);
459 $self->downend($region_end);
460 }
461 }
462
463 =head2 _bases_to_trim
464
465 Arg : string $end_to_trim (either 'right_end' or
466 'left_end').
467 Arg : Bio::EnsEMBL::Slice
468 Example : $self->_derive_coords('right_end', $slice);
469 Description: Finds exons from other genes/transcripts that
470 invade our upstream/downstream slice and
471 returns the number of bases that should be
472 truncated from the appropriate end of the
473 upstream/downstream region.
474 Returntype : in
475 Exceptions : Throws if argument is not either 'right_end'
476 or 'left_end'
477 Caller : $self->_derive_coords
478 Status : Stable
479
480 =cut
481
482 # Method to look for coding regions that invade the upstream region. For
483 # now, this method returns the number of bases to trim. I doesn't yet
484 # do anything special if an exon is completely swallowed (truncates at
485 # the end of the overlapping exon and discards any non-coding sequence
486 # further upstream) or overlaps the 'wrong' end of the region (cases where
487 # two alternate exons share one end of sequence - does this happen?).
488
489 # The input argument 'end' defines the end of the slice that should be
490 # truncated.
491
492 sub _bases_to_trim {
493 my ($self, $end_to_trim, $slice) = @_;
494
495 throw "Slice end argument must be either left_end or right_end"
496 unless ($end_to_trim eq 'right_end' || $end_to_trim eq 'left_end');
497
498 my @overlap_coords;
499 my $slice_length = $slice->length;
500 my $right_trim = 0;
501 my $left_trim = 0;
502
503 foreach my $exon (@{$slice->get_all_Exons}){
504 next if $exon->stable_id eq $self->_first_coding_Exon->stable_id;
505
506 my $start = $exon->start;
507 my $end = $exon->end;
508
509 # Choose from four possible exon arrangements
510
511 # -----|********************|----- Slice
512 # --|=========================|--- Exon arrangement 1
513 # ----------|======|-------------- Exon arrangement 2
514 # --|=======|--------------------- Exon arrangement 3
515 # -------------------|=========|-- Exon arrangement 4
516
517
518 if ($start <= 0 && $end >= $slice_length) { # exon arrangement 1
519 $right_trim = $slice_length - 1;
520 $left_trim = $slice_length - 1;
521 last;
522
523 } elsif ($start >= 0 && $end <= $slice_length) { # exon arrangement 2
524 my $this_right_trim = ($slice_length - $start) + 1;
525
526 $right_trim = $this_right_trim
527 if $this_right_trim > $right_trim;
528
529 $left_trim = $end
530 if $end > $left_trim;
531
532 } elsif ($start <= 0 && $end < $slice_length) { # exon arrangement 3
533 $right_trim = $slice_length; # a bit draconian
534 $left_trim = $end
535 if $end > $left_trim;
536
537 } elsif ($start > 0 && $end >= $slice_length) { # exon arrangement 4
538 my $this_right_trim = ($slice_length - $start) + 1;
539
540 $right_trim = $this_right_trim
541 if $this_right_trim > $right_trim;
542
543 $left_trim = $slice_length; # also a bit draconian
544 }
545
546 }
547
548 return $right_trim if $end_to_trim eq 'right_end';
549 return $left_trim if $end_to_trim eq 'left_end';
550 }
551
552 =head2 _first_coding_Exon
553
554 Arg : none
555 Example : $self->_first_coding_Exon;
556 Description: Finds the first exon of our transcript that
557 contains coding bases.
558 Returntype : Bio::EnsEMBL::Exon
559 Exceptions : none
560 Caller : $self->_derive_coords, $self->_bases_to_trim
561 Status : Stable
562
563 =cut
564
565 sub _first_coding_Exon {
566 my $self = shift;
567
568 unless ($self->{_first_coding_exon}){
569
570 my $exons = $self->transcript->get_all_translateable_Exons;
571
572 $self->{_first_coding_exon} = $exons->[0]
573 if $self->transcript->strand == 1;
574 $self->{_first_coding_exon} = $exons->[-1]
575 if $self->transcript->strand == -1;
576 }
577
578 return $self->{_first_coding_exon}
579 }
580
581
582 return 1;