0
|
1 =head1 LICENSE
|
|
2
|
|
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
|
|
4 Genome Research Limited. All rights reserved.
|
|
5
|
|
6 This software is distributed under a modified Apache license.
|
|
7 For license details, please see
|
|
8
|
|
9 http://www.ensembl.org/info/about/code_licence.html
|
|
10
|
|
11 =head1 CONTACT
|
|
12
|
|
13 Please email comments or questions to the public Ensembl
|
|
14 developers list at <dev@ensembl.org>.
|
|
15
|
|
16 Questions may also be sent to the Ensembl help desk at
|
|
17 <helpdesk@ensembl.org>.
|
|
18
|
|
19 =cut
|
|
20
|
|
21 =head1 NAME
|
|
22
|
|
23 Bio::EnsEMBL::Upstream - Object that defines an upstream region
|
|
24
|
|
25 =head1 SYNOPSIS
|
|
26
|
|
27 use Bio::EnsEMBL::Upstream;
|
|
28
|
|
29 my $upstream = Bio::EnsEMBL::Upstream->new(
|
|
30 -transcript => $transcript,
|
|
31 -length => 2000 # bp
|
|
32 );
|
|
33
|
|
34 # Retrieve coordinates of upstream region
|
|
35 my $upstream_region_start = $upstream->upstart;
|
|
36 my $upstream_region_end = $upstream->upend;
|
|
37
|
|
38 # Retrieve coordinates in 'downstream' first intron
|
|
39 my $intron_region_start = $upstream->downstart;
|
|
40 my $intron_region_end = $upstream->downend;
|
|
41
|
|
42 # Coordinates are returned in the same scheme as the input transcript.
|
|
43 # However, the coordinates of an upstream region can be transformed to
|
|
44 # any other scheme using a slice
|
|
45
|
|
46 $upstream->transform($slice);
|
|
47
|
|
48 # Coordinates can be retrieved in scheme in the same manner as the
|
|
49 # above.
|
|
50
|
|
51 =head1 DESCRIPTION
|
|
52
|
|
53 An object that determines the upstream region of a transcript. Such a
|
|
54 region is non-coding and ensures that other genes or transcripts are
|
|
55 not present. Ultimately, these objects can be used to looking for
|
|
56 promoter elements. To this end, it is also possible to derive a region
|
|
57 downstream of the first exon, within the first intron and where promoter
|
|
58 elements sometimes are found.
|
|
59
|
|
60 =head1 METHODS
|
|
61
|
|
62 =cut
|
|
63
|
|
64 package Bio::EnsEMBL::Upstream;
|
|
65
|
|
66 use strict;
|
|
67 use vars qw(@ISA);
|
|
68 use Bio::EnsEMBL::DBSQL::DBAdaptor;
|
|
69 use Bio::EnsEMBL::DBSQL::SimpleFeatureAdaptor;
|
|
70
|
|
71 use Bio::EnsEMBL::Utils::Exception qw(throw);
|
|
72 use Bio::EnsEMBL::Utils::Argument qw(rearrange);
|
|
73
|
|
74 @ISA = qw(Bio::EnsEMBL::Feature);
|
|
75
|
|
76
|
|
77 =head2 new
|
|
78
|
|
79 Arg [transcript] : (optional) Bio::EnsEMBL::Transcript
|
|
80 Arg [length] : (optional) int $length
|
|
81 Example : $upstream = Bio::EnsEMBL::Upstream->new(-transcript => $transcript,
|
|
82 -length => 2000);
|
|
83 Description: Creates a new upstream object
|
|
84 Returntype : Bio::EnsEMBL::Upstream
|
|
85 Exceptions : none
|
|
86 Caller : Bio::EnsEMBL::Transcript, general
|
|
87 Status : Stable
|
|
88
|
|
89 =cut
|
|
90
|
|
91 sub new {
|
|
92 my ($class, @args) = @_;
|
|
93 my $self = {};
|
|
94
|
|
95 bless $self, $class;
|
|
96
|
|
97 my ($transcript,
|
|
98 $length) = rearrange([qw(TRANSCRIPT
|
|
99 LENGTH
|
|
100 )],@args);
|
|
101
|
|
102 $self->transcript($transcript) if defined $transcript;
|
|
103 $self->length($length) if $length;
|
|
104
|
|
105 return $self
|
|
106 }
|
|
107
|
|
108 =head2 transcript
|
|
109
|
|
110 Arg : (optional) Bio::EnsEMBL::Transcript
|
|
111 Example : $self->transcript($transcript);
|
|
112 Description: Getter/setter for transcript object
|
|
113 Returntype : Bio::EnsEMBL::Transcript
|
|
114 Exceptions : Throws if argument is not undefined
|
|
115 or a Bio::EnsEMBL::Transcript
|
|
116 Caller : $self->new, $self->_derive_coords,
|
|
117 $self->_first_coding_Exon
|
|
118 Status : Stable
|
|
119
|
|
120 =cut
|
|
121
|
|
122
|
|
123 sub transcript {
|
|
124 my $self = shift;
|
|
125
|
|
126 if (@_){
|
|
127 $self->{_transcript} = shift;
|
|
128
|
|
129 if (defined $self->{_transcript}) {
|
|
130 throw("Transcript is not a Bio::EnsEMBL::Transcript")
|
|
131 if (! $self->{_transcript}->isa("Bio::EnsEMBL::Transcript"));
|
|
132 $self->_flush_cache;
|
|
133 }
|
|
134 }
|
|
135
|
|
136 return $self->{_transcript}
|
|
137 }
|
|
138
|
|
139 =head2 length
|
|
140
|
|
141 Arg : (optional) int $length
|
|
142 Example : $self->length(2000); # bp
|
|
143 Description: Getter/setter for upstream region length.
|
|
144 Returntype : int
|
|
145 Exceptions : Throws if length is requested before it has been set.
|
|
146 Caller : $self->new, $self->_derive_coords
|
|
147 Status : Stable
|
|
148
|
|
149 =cut
|
|
150
|
|
151 sub length {
|
|
152 my $self = shift;
|
|
153
|
|
154 if (@_){
|
|
155 $self->{_length} = shift;
|
|
156 $self->_flush_cache;
|
|
157 }
|
|
158
|
|
159 throw("Region length has not been set.")
|
|
160 unless $self->{_length};
|
|
161
|
|
162 return $self->{_length}
|
|
163 }
|
|
164
|
|
165 =head2 _flush_cache
|
|
166
|
|
167 Arg : none
|
|
168 Example : $self->_flush_cache;
|
|
169 Description: Empties cached coordinates (called when
|
|
170 coordinate scheme or region length has changed).
|
|
171 Returntype : none
|
|
172 Exceptions : none
|
|
173 Caller : $self->length, $self->transform
|
|
174 Status : Stable
|
|
175
|
|
176 =cut
|
|
177
|
|
178 sub _flush_cache {
|
|
179 my $self = shift;
|
|
180
|
|
181 $self->upstart(undef);
|
|
182 $self->upend(undef);
|
|
183 $self->downstart(undef);
|
|
184 $self->downend(undef);
|
|
185 }
|
|
186
|
|
187 =head2 upstart
|
|
188
|
|
189 Arg : none
|
|
190 Example : $self->upstart;
|
|
191 Description: Returns the start coordinate of the region
|
|
192 upstream of the transcript. This coordinate
|
|
193 is always the furthest from the translation
|
|
194 initiation codon, whereas upend always abutts
|
|
195 the translation initiation codon.
|
|
196 Returntype : int
|
|
197 Exceptions : none
|
|
198 Caller : general
|
|
199 Status : Stable
|
|
200
|
|
201 =cut
|
|
202
|
|
203 sub upstart {
|
|
204 my $self = shift;
|
|
205
|
|
206 if (@_) {
|
|
207 $self->{_upstart} = shift @_;
|
|
208 return
|
|
209 }
|
|
210
|
|
211 if (! defined $self->{_upstart}) {
|
|
212 $self->_derive_coords('up');
|
|
213 }
|
|
214
|
|
215 return $self->{_upstart}
|
|
216 }
|
|
217
|
|
218 =head2 upend
|
|
219
|
|
220 Arg : none
|
|
221 Example : $self->upend;
|
|
222 Description: Returns the end coordinate of the region
|
|
223 upstream of the transcript. This coordinate
|
|
224 always always abutts the translation
|
|
225 initiation codon, whereas upstart always
|
|
226 returns the coorindate furthest from the
|
|
227 translation initiation codon.
|
|
228 Returntype : int
|
|
229 Exceptions : none
|
|
230 Caller : general
|
|
231 Status : Stable
|
|
232
|
|
233 =cut
|
|
234
|
|
235 sub upend {
|
|
236 my $self = shift;
|
|
237
|
|
238 if (@_) {
|
|
239 $self->{_upend} = shift @_;
|
|
240 return
|
|
241 }
|
|
242
|
|
243 if (! defined $self->{_upend}) {
|
|
244 $self->_derive_coords('up');
|
|
245 }
|
|
246
|
|
247 return $self->{_upend}
|
|
248 }
|
|
249
|
|
250 =head2 downstart
|
|
251
|
|
252 Arg : none
|
|
253 Example : $self->downstart;
|
|
254 Description: Returns the start coordinate of the region
|
|
255 in the first intron of the transcript. This
|
|
256 coordinate is always closest to the first
|
|
257 exon (irregardless of strand).
|
|
258 Returntype : int
|
|
259 Exceptions : none
|
|
260 Caller : general
|
|
261 Status : Stable
|
|
262
|
|
263 =cut
|
|
264
|
|
265 sub downstart {
|
|
266 my $self = shift;
|
|
267
|
|
268 if (@_) {
|
|
269 $self->{_downstart} = shift @_;
|
|
270 return
|
|
271 }
|
|
272
|
|
273 if (! defined $self->{_downstart}) {
|
|
274 $self->_derive_coords('down');
|
|
275 }
|
|
276
|
|
277 return $self->{_downstart}
|
|
278 }
|
|
279
|
|
280 =head2 downend
|
|
281
|
|
282 Arg : none
|
|
283 Example : $self->downend;
|
|
284 Description: Returns the end coordinate of the region
|
|
285 in the first intron of the transcript. This
|
|
286 coordinate is always furthest from the first
|
|
287 exon (irregardless of strand).
|
|
288 Returntype : int
|
|
289 Exceptions : none
|
|
290 Caller : general
|
|
291 Status : Stable
|
|
292
|
|
293 =cut
|
|
294
|
|
295 sub downend {
|
|
296 my $self = shift;
|
|
297
|
|
298 if (@_) {
|
|
299 $self->{_downend} = shift @_;
|
|
300 return
|
|
301 }
|
|
302
|
|
303 if (! defined $self->{_downend}) {
|
|
304 $self->_derive_coords('down');
|
|
305 }
|
|
306
|
|
307 return $self->{_downend}
|
|
308 }
|
|
309
|
|
310 =head2 transform
|
|
311
|
|
312 Arg :
|
|
313 Example :
|
|
314 Description: Not yet implemented
|
|
315 Returntype :
|
|
316 Exceptions :
|
|
317 Caller :
|
|
318 Status : At Risk
|
|
319
|
|
320 =cut
|
|
321
|
|
322
|
|
323 # Over-riding inherited class. As yet unimplemented.
|
|
324
|
|
325 sub transform {
|
|
326 my $self = shift;
|
|
327
|
|
328 throw("No transform method implemented for " . $self);
|
|
329 }
|
|
330
|
|
331 =head2 derive_upstream_coords
|
|
332
|
|
333 Arg : none
|
|
334 Example : my ($upstart, $upend)
|
|
335 = $self->derive_upstream_coords;
|
|
336 Description: Derives upstream coordinates (for
|
|
337 compatability with older scripts).
|
|
338 Returntype : arrayref
|
|
339 Exceptions : none
|
|
340 Caller : general
|
|
341 Status : Stable
|
|
342
|
|
343 =cut
|
|
344
|
|
345 sub derive_upstream_coords {
|
|
346 my $self = shift;
|
|
347
|
|
348 return [$self->upstart, $self->upend]
|
|
349 }
|
|
350
|
|
351 =head2 derive_downstream_coords
|
|
352
|
|
353 Arg : none
|
|
354 Example : my ($downstart, $downend)
|
|
355 = $self->derive_downstream_coords;
|
|
356 Description: Derives downstream coordinates (for
|
|
357 compatability with older scripts).
|
|
358 Returntype : arrayref
|
|
359 Exceptions : none
|
|
360 Caller : general
|
|
361 Status : Stable
|
|
362
|
|
363 =cut
|
|
364
|
|
365 sub derive_downstream_coords {
|
|
366 my $self = shift;
|
|
367
|
|
368 return [$self->downstart, $self->downend]
|
|
369 }
|
|
370
|
|
371 =head2 _derive_coords
|
|
372
|
|
373 Arg : string $direction (either 'up' or 'down').
|
|
374 Example : $self->_derive_coords('up');
|
|
375 Description: Determines the coordinates of either upstream
|
|
376 or downstream region.
|
|
377 Returntype : none
|
|
378 Exceptions : Throws if argument is not either 'up' or 'down'
|
|
379 Caller : $self->upstart, $self->upend, $self->downstart,
|
|
380 $self->downend
|
|
381 Status : Stable
|
|
382
|
|
383 =cut
|
|
384
|
|
385 sub _derive_coords {
|
|
386 my ($self, $direction) = @_;
|
|
387
|
|
388 # Check direction
|
|
389 throw("Must specify either \'up\' of \'down\'-stream direction to derive coords.")
|
|
390 unless (($direction eq 'up')||($direction eq 'down'));
|
|
391
|
|
392 # Put things in easily accessible places.
|
|
393 my $core_db_slice_adaptor = $self->transcript->slice->adaptor;
|
|
394 my $region_length = $self->length;
|
|
395
|
|
396 # Whatever coord system the gene is currently is, transform to the toplevel.
|
|
397 my $transcript = $self->transcript->transform('toplevel');
|
|
398
|
|
399 # Use our transformed transcript to determine the upstream region coords.
|
|
400 # End should always be just before the coding start (like ATG), including 3' UTR.
|
|
401 # Start is the outer limit of the region upstream (furthest from ATG).
|
|
402
|
|
403 my $region_start;
|
|
404 my $region_end;
|
|
405
|
|
406 if ($transcript->strand == 1){
|
|
407 if ($direction eq 'up'){
|
|
408 $region_end = $transcript->coding_region_start - 1;
|
|
409 $region_start = $region_end - $region_length;
|
|
410 } elsif ($direction eq 'down'){
|
|
411 $region_end = $self->_first_coding_Exon->end + 1;
|
|
412 $region_start = $region_end + $region_length;
|
|
413 }
|
|
414 } elsif ($transcript->strand == -1) {
|
|
415 if ($direction eq 'up'){
|
|
416 $region_end = $transcript->coding_region_end + 1;
|
|
417 $region_start = $region_end + $region_length;
|
|
418
|
|
419 } elsif ($direction eq 'down'){
|
|
420 $region_end = $self->_first_coding_Exon->start - 1;
|
|
421 $region_start = $region_end - $region_length;
|
|
422 }
|
|
423 }
|
|
424
|
|
425 # Trim the upstream/downstream region to remove extraneous coding sequences
|
|
426 # from other genes and/or transcripts.
|
|
427
|
|
428 my ($slice_low_coord, $slice_high_coord) = sort {$a <=> $b} ($region_start, $region_end);
|
|
429
|
|
430 my $region_slice
|
|
431 = $core_db_slice_adaptor->fetch_by_region($transcript->slice->coord_system->name,
|
|
432 $transcript->slice->seq_region_name,
|
|
433 $slice_low_coord,
|
|
434 $slice_high_coord);
|
|
435
|
|
436 if ($transcript->strand == 1) {
|
|
437 if ($direction eq 'up') {
|
|
438 $region_start += $self->_bases_to_trim('left_end', $region_slice);
|
|
439 } elsif ($direction eq 'down') {
|
|
440 $region_start -= $self->_bases_to_trim('right_end', $region_slice);
|
|
441 }
|
|
442 } elsif ($transcript->strand == -1) {
|
|
443 if ($direction eq 'up') {
|
|
444 $region_start -= $self->_bases_to_trim('right_end', $region_slice);
|
|
445 } elsif ($direction eq 'down') {
|
|
446 $region_start += $self->_bases_to_trim('left_end', $region_slice);
|
|
447 }
|
|
448 }
|
|
449
|
|
450 # Always return start < end
|
|
451
|
|
452 ($region_start, $region_end) = sort {$a <=> $b} ($region_start, $region_end);
|
|
453
|
|
454 if ($direction eq 'up') {
|
|
455 $self->upstart($region_start);
|
|
456 $self->upend($region_end);
|
|
457 } elsif ($direction eq 'down') {
|
|
458 $self->downstart($region_start);
|
|
459 $self->downend($region_end);
|
|
460 }
|
|
461 }
|
|
462
|
|
463 =head2 _bases_to_trim
|
|
464
|
|
465 Arg : string $end_to_trim (either 'right_end' or
|
|
466 'left_end').
|
|
467 Arg : Bio::EnsEMBL::Slice
|
|
468 Example : $self->_derive_coords('right_end', $slice);
|
|
469 Description: Finds exons from other genes/transcripts that
|
|
470 invade our upstream/downstream slice and
|
|
471 returns the number of bases that should be
|
|
472 truncated from the appropriate end of the
|
|
473 upstream/downstream region.
|
|
474 Returntype : in
|
|
475 Exceptions : Throws if argument is not either 'right_end'
|
|
476 or 'left_end'
|
|
477 Caller : $self->_derive_coords
|
|
478 Status : Stable
|
|
479
|
|
480 =cut
|
|
481
|
|
482 # Method to look for coding regions that invade the upstream region. For
|
|
483 # now, this method returns the number of bases to trim. I doesn't yet
|
|
484 # do anything special if an exon is completely swallowed (truncates at
|
|
485 # the end of the overlapping exon and discards any non-coding sequence
|
|
486 # further upstream) or overlaps the 'wrong' end of the region (cases where
|
|
487 # two alternate exons share one end of sequence - does this happen?).
|
|
488
|
|
489 # The input argument 'end' defines the end of the slice that should be
|
|
490 # truncated.
|
|
491
|
|
492 sub _bases_to_trim {
|
|
493 my ($self, $end_to_trim, $slice) = @_;
|
|
494
|
|
495 throw "Slice end argument must be either left_end or right_end"
|
|
496 unless ($end_to_trim eq 'right_end' || $end_to_trim eq 'left_end');
|
|
497
|
|
498 my @overlap_coords;
|
|
499 my $slice_length = $slice->length;
|
|
500 my $right_trim = 0;
|
|
501 my $left_trim = 0;
|
|
502
|
|
503 foreach my $exon (@{$slice->get_all_Exons}){
|
|
504 next if $exon->stable_id eq $self->_first_coding_Exon->stable_id;
|
|
505
|
|
506 my $start = $exon->start;
|
|
507 my $end = $exon->end;
|
|
508
|
|
509 # Choose from four possible exon arrangements
|
|
510
|
|
511 # -----|********************|----- Slice
|
|
512 # --|=========================|--- Exon arrangement 1
|
|
513 # ----------|======|-------------- Exon arrangement 2
|
|
514 # --|=======|--------------------- Exon arrangement 3
|
|
515 # -------------------|=========|-- Exon arrangement 4
|
|
516
|
|
517
|
|
518 if ($start <= 0 && $end >= $slice_length) { # exon arrangement 1
|
|
519 $right_trim = $slice_length - 1;
|
|
520 $left_trim = $slice_length - 1;
|
|
521 last;
|
|
522
|
|
523 } elsif ($start >= 0 && $end <= $slice_length) { # exon arrangement 2
|
|
524 my $this_right_trim = ($slice_length - $start) + 1;
|
|
525
|
|
526 $right_trim = $this_right_trim
|
|
527 if $this_right_trim > $right_trim;
|
|
528
|
|
529 $left_trim = $end
|
|
530 if $end > $left_trim;
|
|
531
|
|
532 } elsif ($start <= 0 && $end < $slice_length) { # exon arrangement 3
|
|
533 $right_trim = $slice_length; # a bit draconian
|
|
534 $left_trim = $end
|
|
535 if $end > $left_trim;
|
|
536
|
|
537 } elsif ($start > 0 && $end >= $slice_length) { # exon arrangement 4
|
|
538 my $this_right_trim = ($slice_length - $start) + 1;
|
|
539
|
|
540 $right_trim = $this_right_trim
|
|
541 if $this_right_trim > $right_trim;
|
|
542
|
|
543 $left_trim = $slice_length; # also a bit draconian
|
|
544 }
|
|
545
|
|
546 }
|
|
547
|
|
548 return $right_trim if $end_to_trim eq 'right_end';
|
|
549 return $left_trim if $end_to_trim eq 'left_end';
|
|
550 }
|
|
551
|
|
552 =head2 _first_coding_Exon
|
|
553
|
|
554 Arg : none
|
|
555 Example : $self->_first_coding_Exon;
|
|
556 Description: Finds the first exon of our transcript that
|
|
557 contains coding bases.
|
|
558 Returntype : Bio::EnsEMBL::Exon
|
|
559 Exceptions : none
|
|
560 Caller : $self->_derive_coords, $self->_bases_to_trim
|
|
561 Status : Stable
|
|
562
|
|
563 =cut
|
|
564
|
|
565 sub _first_coding_Exon {
|
|
566 my $self = shift;
|
|
567
|
|
568 unless ($self->{_first_coding_exon}){
|
|
569
|
|
570 my $exons = $self->transcript->get_all_translateable_Exons;
|
|
571
|
|
572 $self->{_first_coding_exon} = $exons->[0]
|
|
573 if $self->transcript->strand == 1;
|
|
574 $self->{_first_coding_exon} = $exons->[-1]
|
|
575 if $self->transcript->strand == -1;
|
|
576 }
|
|
577
|
|
578 return $self->{_first_coding_exon}
|
|
579 }
|
|
580
|
|
581
|
|
582 return 1;
|