0
|
1 # $Id: Transcript.pm,v 1.17 2002/09/25 08:58:23 heikki Exp $
|
|
2 #
|
|
3 # bioperl module for Bio::LiveSeq::Transcript
|
|
4 #
|
|
5 # Cared for by Joseph Insana <insana@ebi.ac.uk> <jinsana@gmx.net>
|
|
6 #
|
|
7 # Copyright Joseph Insana
|
|
8 #
|
|
9 # You may distribute this module under the same terms as perl itself
|
|
10 #
|
|
11 # POD documentation - main docs before the code
|
|
12
|
|
13 =head1 NAME
|
|
14
|
|
15 Bio::LiveSeq::Transcript - Transcript class for LiveSeq
|
|
16
|
|
17 =head1 SYNOPSIS
|
|
18
|
|
19 # documentation needed
|
|
20
|
|
21 =head1 DESCRIPTION
|
|
22
|
|
23 This stores informations about coding sequences (CDS).
|
|
24 The implementation is that a Transcript object accesses a collection of
|
|
25 Exon objects, inferring from them the nucleotide structure and sequence.
|
|
26
|
|
27 =head1 AUTHOR - Joseph A.L. Insana
|
|
28
|
|
29 Email: Insana@ebi.ac.uk, jinsana@gmx.net
|
|
30
|
|
31 Address:
|
|
32
|
|
33 EMBL Outstation, European Bioinformatics Institute
|
|
34 Wellcome Trust Genome Campus, Hinxton
|
|
35 Cambs. CB10 1SD, United Kingdom
|
|
36
|
|
37 =head1 APPENDIX
|
|
38
|
|
39 The rest of the documentation details each of the object
|
|
40 methods. Internal methods are usually preceded with a _
|
|
41
|
|
42 =cut
|
|
43
|
|
44 # Let the code begin...
|
|
45
|
|
46 package Bio::LiveSeq::Transcript;
|
|
47 $VERSION=5.2;
|
|
48
|
|
49 # Version history:
|
|
50 # Tue Mar 21 14:38:02 GMT 2000 v 1.0 begun
|
|
51 # Tue Mar 21 17:45:31 GMT 2000 v 1.1 new() created
|
|
52 # Wed Mar 22 19:40:13 GMT 2000 v 1.4 all_Exons() seq(), length(), all_labels()
|
|
53 # Thu Mar 23 19:08:36 GMT 2000 v 1.5 follows() replaces is_downstream()
|
|
54 # Thu Mar 23 20:59:02 GMT 2000 v 2.0 valid _inside_position label position
|
|
55 # Fri Mar 24 18:33:18 GMT 2000 v 2.2 rewritten position(), now should work with diverse coordinate_starts
|
|
56 # Sat Mar 25 04:08:18 GMT 2000 v 2.21 added firstlabel to position and label so that Translation can exploit it
|
|
57 # Sat Mar 25 06:39:27 GMT 2000 v 2.3 started override of subseq, works just internally
|
|
58 # Mon Mar 27 19:05:15 BST 2000 v 2.4 subseq finished, it works with coord_start
|
|
59 # Fri Mar 31 18:48:07 BST 2000 v 2.5 started downstream_seq()
|
|
60 # Mon Apr 3 17:37:34 BST 2000 v 2.52 upstream_seq added
|
|
61 # Fri Apr 7 03:29:43 BST 2000 v 2.6 up/downstream now can use Gene information
|
|
62 # Sat Apr 8 12:59:58 BST 2000 v 3.0 all_Exons now skips no more valid exons
|
|
63 # Sat Apr 8 13:32:08 BST 2000 v 3.1 get_Translation added
|
|
64 # Wed Apr 12 12:37:08 BST 2000 v 3.2 all_Exons updates Transcript's start/end
|
|
65 # Wed Apr 12 12:41:22 BST 2000 v 3.3 each Exon has "transcript" attribute added
|
|
66 # Wed Apr 12 16:35:56 BST 2000 v 3.4 started coding _deletecheck
|
|
67 # Wed Apr 12 23:40:19 BST 2000 v 3.5 start and end redefined here, no more checks after deletion to refix start/end attributes. And no need of those. Eliminated hence from new()
|
|
68 # Wed Apr 12 23:47:02 BST 2000 v 3.9 finished _deletecheck, debugging starts
|
|
69 # Thu Apr 13 00:37:16 BST 2000 v 4.0 debugging done: seems working OK
|
|
70 # Thu Apr 27 16:18:55 BST 2000 v 4.1 translation_table added
|
|
71 # Tue May 16 17:57:40 BST 2000 v 4.11 corrected bug in docs of downstream_seq
|
|
72 # Wed May 17 16:48:34 BST 2000 v 4.2 frame() added
|
|
73 # Mon May 22 15:22:12 BST 2000 v 4.21 labelsubseq tweaked for cases where startlabel==endlabel (no useless follow() query!)
|
|
74 # Thu Jun 22 20:02:39 BST 2000 v 4.3 valid() moved to SeqI, to be inherited as the general one
|
|
75 # Thu Jun 22 20:27:57 BST 2000 v 4.4 optimized labelsubseq coded!
|
|
76 # Thu Jun 22 21:17:51 BST 2000 v 4.44 in_which_Exon() added
|
|
77 # Sat Jun 24 00:49:55 BST 2000 v 4.5 new subseq() that exploits the new fast labelsubseq
|
|
78 # Thu Jun 29 16:31:19 BST 2000 v 5.0 downsream_seq and upstream_seq recoded so that if entry is RNA it will return sequences up to the entry limits -> it should be properly debugged, expecially the upstream_seq one
|
|
79 # Wed Jul 12 04:01:53 BST 2000 v 5.1 croak -> carp+return(-1)
|
|
80 # Wed Mar 28 15:16:21 BST 2001 v 5.2 carp -> warn,throw (coded methods in SeqI)
|
|
81
|
|
82 use strict;
|
|
83 # use Carp qw(carp cluck);
|
|
84 use vars qw($VERSION @ISA);
|
|
85 use Bio::LiveSeq::SeqI 3.2; # uses SeqI, inherits from it
|
|
86 use Bio::LiveSeq::Exon 1.0; # uses Exon to create new exon in case of deletion
|
|
87 @ISA=qw(Bio::LiveSeq::SeqI);
|
|
88
|
|
89 =head2 new
|
|
90
|
|
91 Title : new
|
|
92 Usage : $transcript = Bio::LiveSeq::Transcript->new(-exons => \@obj_refs);
|
|
93
|
|
94 Function: generates a new Bio::LiveSeq::Transcript
|
|
95 Returns : reference to a new object of class Transcript
|
|
96 Errorcode -1
|
|
97 Args : reference to an array of Exon object references
|
|
98
|
|
99 =cut
|
|
100
|
|
101 sub new {
|
|
102 my ($thing, %args) = @_;
|
|
103 my $class = ref($thing) || $thing;
|
|
104 my ($obj,%transcript);
|
|
105
|
|
106 my @exons=@{$args{-exons}};
|
|
107
|
|
108 $obj = \%transcript;
|
|
109 $obj = bless $obj, $class;
|
|
110
|
|
111 unless (@exons) {
|
|
112 $obj->warn("$class not initialised because exons array empty");
|
|
113 return(-1);
|
|
114 }
|
|
115
|
|
116 # now useless, after start and end methods have been overridden here
|
|
117 my $firstexon = $exons[0];
|
|
118 #my $lastexon = $exons[-1];
|
|
119 #my $start = $firstexon->start;
|
|
120 #my $end = $lastexon->end;
|
|
121 my $strand = $firstexon->strand;
|
|
122 my $seq = $firstexon->{'seq'};
|
|
123 $obj->alphabet('rna');
|
|
124
|
|
125 unless (_checkexons(\@exons)) {
|
|
126 $obj->warn("$class not initialised because of problems in the exon structure");
|
|
127 return(-1);
|
|
128 }
|
|
129 $obj->{'strand'}=$strand;
|
|
130 $obj->{'exons'}=\@exons;
|
|
131 $obj->{'seq'}=$seq;
|
|
132
|
|
133 # set Transcript into each Exon
|
|
134 my $exon;
|
|
135 foreach $exon (@exons) {
|
|
136 $exon->{'transcript'}=$obj;
|
|
137 }
|
|
138 return $obj;
|
|
139 }
|
|
140
|
|
141
|
|
142 =head2 all_Exons
|
|
143
|
|
144 Title : all_Exons
|
|
145 Usage : $transcript_obj->all_Exons()
|
|
146 Function: returns references to all Exon objects the Transcript is composed of
|
|
147 Example : foreach $exon ($transcript->all_Exons()) { do_something }
|
|
148 Returns : array of object references
|
|
149 Args : none
|
|
150
|
|
151 =cut
|
|
152
|
|
153 sub all_Exons {
|
|
154 my $self=shift;
|
|
155 my $exonsref=$self->{'exons'};
|
|
156 my @exons=@{$exonsref};
|
|
157 my @newexons;
|
|
158 my $exon;
|
|
159 foreach $exon (@exons) {
|
|
160 unless ($exon->obj_valid) {
|
|
161 $self->warn("$exon no more valid, start or end label lost, skipping....",1); # ignorable
|
|
162 } else {
|
|
163 push(@newexons,$exon);
|
|
164 }
|
|
165 }
|
|
166 if ($#exons != $#newexons) {
|
|
167 # update exons field
|
|
168 $self->{'exons'}=\@newexons;
|
|
169 }
|
|
170 return (@newexons);
|
|
171 }
|
|
172
|
|
173 =head2 downstream_seq
|
|
174
|
|
175 Title : downstream_seq
|
|
176 Usage : $transcript_obj->downstream_seq()
|
|
177 : $transcript_obj->downstream_seq(64)
|
|
178 Function: returns a string of nucleotides downstream of the end of the
|
|
179 CDS. If there is some information of the real mRNA, from features in
|
|
180 an attached Gene object, it will return up to those boundaries.
|
|
181 Otherwise it will return 1000 nucleotides.
|
|
182 If an argument is given it will override the default 1000 number
|
|
183 and return instead /that/ requested number of nucleotides.
|
|
184 But if a Gene object is attached, this argument will be ignored.
|
|
185 Returns : string
|
|
186 Args : an optional integer number of nucleotides to be returned instead of
|
|
187 the default if no gene attached
|
|
188
|
|
189 =cut
|
|
190
|
|
191 sub downstream_seq {
|
|
192 my ($self,$howmany)=@_;
|
|
193 my $str;
|
|
194 if (defined ($howmany)) {
|
|
195 unless ($howmany > 0) {
|
|
196 $self->throw("No sense in asking less than 1 downstream nucleotides!");
|
|
197 }
|
|
198 } else {
|
|
199 unless ($self->{'seq'}->alphabet eq 'rna') { # if rna retrieve until the end
|
|
200 #$str=$DNAobj->labelsubseq($self->end,undef,undef,"unsecuremoderequested");
|
|
201 #return(substr($str,1)); # delete first nucleotide that is the last of Transcript
|
|
202 if ($self->gene) { # if there is Gene object attached fetch relevant info
|
|
203 $str=$self->{'seq'}->labelsubseq($self->end,undef,$self->gene->maxtranscript->end); # retrieve from end of this Transcript to end of the maxtranscript
|
|
204 $str=substr($str,1); # delete first nucleotide that is the last of Transcript
|
|
205 if (CORE::length($str) > 0) {
|
|
206 return($str);
|
|
207 } else { # if there was no downstream through the gene's maxtranscript, go the usual way
|
|
208 $howmany = 1000;
|
|
209 }
|
|
210 } else {
|
|
211 $howmany = 1000;
|
|
212 }
|
|
213 }
|
|
214 }
|
|
215 my @exons=$self->all_Exons;
|
|
216 my $strand=$self->strand();
|
|
217 my $lastexon=$exons[-1];
|
|
218 my $lastexonlength=$lastexon->length;
|
|
219 # $howmany nucs after end of last exon
|
|
220 #my $downstream_seq=$lastexon->subseq($lastexonlength+1,undef,$howmany);
|
|
221 my $downstream_seq;
|
|
222
|
|
223 if ($howmany) {
|
|
224 $downstream_seq=substr($lastexon->labelsubseq($self->end,$howmany,undef,"unsecuremoderequested"),1);
|
|
225 } else {
|
|
226 if ($strand == 1) {
|
|
227 $downstream_seq=substr($lastexon->labelsubseq($self->end,undef,$self->{'seq'}->end,"unsecuremoderequested"),1);
|
|
228 } else {
|
|
229 $downstream_seq=substr($lastexon->labelsubseq($self->end,undef,$self->{'seq'}->start,"unsecuremoderequested"),1);
|
|
230 }
|
|
231 }
|
|
232 return $downstream_seq;
|
|
233 }
|
|
234
|
|
235 =head2 upstream_seq
|
|
236
|
|
237 Title : upstream_seq
|
|
238 Usage : $transcript_obj->upstream_seq()
|
|
239 : $transcript_obj->upstream_seq(64)
|
|
240 Function: just like downstream_seq but returns nucleotides before the ATG
|
|
241 Note : the default, if no Gene information present and no nucleotides
|
|
242 number given, is to return up to 400 nucleotides.
|
|
243
|
|
244 =cut
|
|
245
|
|
246 sub upstream_seq {
|
|
247 my ($self,$howmany)=@_;
|
|
248 if (defined ($howmany)) {
|
|
249 unless ($howmany > 0) {
|
|
250 $self->throw("No sense in asking less than 1 upstream nucleotides!");
|
|
251 }
|
|
252 } else {
|
|
253 unless ($self->{'seq'}->alphabet eq 'rna') { # if rna retrieve from the start
|
|
254 if ($self->gene) { # if there is Gene object attached fetch relevant info
|
|
255 my $str=$self->{'seq'}->labelsubseq($self->gene->maxtranscript->start,undef,$self->start); # retrieve from start of maxtranscript to start of this Transcript
|
|
256 chop $str; # delete last nucleotide that is the A of starting ATG
|
|
257 if (length($str) > 0) {
|
|
258 return($str);
|
|
259 } else { # if there was no upstream through the gene's maxtranscript, go the usual way
|
|
260 $howmany = 400;
|
|
261 }
|
|
262 } else {
|
|
263 $howmany = 400;
|
|
264 }
|
|
265 }
|
|
266 }
|
|
267 my @exons=$self->all_Exons;
|
|
268 my $firstexon=$exons[0];
|
|
269
|
|
270 my $upstream_seq;
|
|
271 my $strand=$self->strand();
|
|
272
|
|
273 if ($howmany) {# $howmany nucs before begin of first exon
|
|
274 my $labelbefore=$firstexon->label(-$howmany,$firstexon->start);
|
|
275 if ($labelbefore < 1) {
|
|
276 if ($strand == 1) {
|
|
277 $labelbefore=$self->{'seq'}->start;
|
|
278 } else {
|
|
279 $labelbefore=$self->{'seq'}->end;
|
|
280 }
|
|
281 }
|
|
282 $upstream_seq=$firstexon->labelsubseq($labelbefore,undef,$firstexon->start,"unsecuremoderequested");
|
|
283 chop $upstream_seq;
|
|
284 } else {
|
|
285 if ($strand == 1) {
|
|
286 $upstream_seq=$firstexon->labelsubseq($self->{'seq'}->start,undef,$self->start,"unsecuremoderequested");
|
|
287 chop $upstream_seq; # delete last nucleotide that is the A of starting ATG
|
|
288 } else {
|
|
289 $upstream_seq=$firstexon->labelsubseq($self->{'seq'}->end,undef,$self->start,"unsecuremoderequested");
|
|
290 chop $upstream_seq; # delete last nucleotide that is the A of starting ATG
|
|
291 }
|
|
292 }
|
|
293 return $upstream_seq;
|
|
294 }
|
|
295
|
|
296 # These get redefined here, overriding the SeqI one because they draw their
|
|
297 # information from the Exons a Transcript is built of
|
|
298 # optional argument: firstlabel. If not given, it checks coordinate_start
|
|
299 # This is useful when called by Translation
|
|
300 # also used by _delete
|
|
301 sub label {
|
|
302 my ($self,$position,$firstlabel)=@_;
|
|
303 unless ($position) { # if position = 0 complain ?
|
|
304 $self->warn("Position not given or position 0");
|
|
305 return (-1);
|
|
306 }
|
|
307 my ($start,$end,$strand)=($self->start(),$self->end(),$self->strand());
|
|
308 my ($label,@labels,$length,$arraypos);
|
|
309 unless (defined ($firstlabel)) {
|
|
310 $firstlabel=$self->coordinate_start; # this is inside Transcript obj
|
|
311 }
|
|
312 my $coord_pos=$self->_inside_position($firstlabel);
|
|
313 $length=$self->length;
|
|
314 #if ($strand == 1) {
|
|
315 if ($position < 1) {
|
|
316 $position++; # to account for missing of 0 position
|
|
317 }
|
|
318 $arraypos=$position+$coord_pos-2;
|
|
319 #print "\n=-=-=-=-DEBUG: arraypos $arraypos, pos $position, coordpos: $coord_pos";
|
|
320 if ($arraypos < 0) {
|
|
321 $label=$self->{'seq'}->label($arraypos,$start,$strand); #?
|
|
322 } elsif ($arraypos >= $length) {
|
|
323 $label=$self->{'seq'}->label($arraypos-$length+2,$end,$strand); #?
|
|
324 } else { # inside the Transcript
|
|
325 @labels=$self->all_labels;
|
|
326 $label=$labels[$arraypos];
|
|
327 }
|
|
328 #}
|
|
329 }
|
|
330
|
|
331 # argument: label
|
|
332 # returns: position of label according to coord_start
|
|
333 # errorcode: 0 label not found
|
|
334 # optional argument: firstlabel. If not given, it checks coordinate_start
|
|
335 # This is useful when called by Translation
|
|
336 sub position {
|
|
337 my ($self,$label,$firstlabel)=@_;
|
|
338 unless ($self->{'seq'}->valid($label)) {
|
|
339 $self->warn("label is not valid");
|
|
340 return (0);
|
|
341 }
|
|
342 unless (defined ($firstlabel)) {
|
|
343 $firstlabel=$self->coordinate_start; # this is inside Transcript obj
|
|
344 }
|
|
345 if ($label == $firstlabel) {
|
|
346 return (1);
|
|
347 }
|
|
348 my ($start,$end,$strand)=($self->start(),$self->end(),$self->strand());
|
|
349 my ($position,$in_pos,$out_pos,$coord_pos);
|
|
350 my $length=$self->length;
|
|
351 $coord_pos=$self->_inside_position($firstlabel);
|
|
352 if ($self->valid($label)) { # if label is inside the Transcript
|
|
353 $in_pos=$self->_inside_position($label);
|
|
354 $position=$in_pos-$coord_pos+1;
|
|
355 if ($position <= 0) {
|
|
356 return ($position-1); # accounts for the missing of the 0 position
|
|
357 }
|
|
358 } else {
|
|
359 if ($self->follows($end,$label)) { # label after end of transcript
|
|
360 $out_pos=$self->{'seq'}->position($label,$end,$strand);
|
|
361 #print "\n+++++++++DEBUG label $label FOLLOWS end $end outpos $out_pos coordpos $coord_pos";
|
|
362 $position=$out_pos+$length-$coord_pos;
|
|
363 } elsif ($self->follows($label,$start)) { # label before begin of transcript
|
|
364 #print "\n+++++++++DEBUG label $label BEFORE start $start outpos $out_pos coordpos $coord_pos";
|
|
365 $out_pos=$self->{'seq'}->position($label,$start,$strand);
|
|
366 $position=$out_pos-$coord_pos+1;
|
|
367 } else { # label is in intron (not valid, not after, not before)!
|
|
368 $self->warn("Cannot give position of label pointing to intron according to CDS numbering!",1);
|
|
369 return (0);
|
|
370 }
|
|
371 }
|
|
372 return ($position);
|
|
373 }
|
|
374
|
|
375 sub seq {
|
|
376 my $self=shift;
|
|
377 my ($exon,$str);
|
|
378 my @exons=$self->all_Exons();
|
|
379 foreach $exon (@exons) {
|
|
380 $str .= $exon->seq();
|
|
381 }
|
|
382 return $str;
|
|
383 }
|
|
384
|
|
385 sub length {
|
|
386 my $self=shift;
|
|
387 my ($exon,$length);
|
|
388 my @exons=$self->all_Exons();
|
|
389 foreach $exon (@exons) {
|
|
390 $length += $exon->length();
|
|
391 }
|
|
392 return $length;
|
|
393 }
|
|
394
|
|
395 sub all_labels {
|
|
396 my $self=shift;
|
|
397 my ($exon,@labels);
|
|
398 my @exons=$self->all_Exons();
|
|
399 foreach $exon (@exons) {
|
|
400 push (@labels,$exon->all_labels());
|
|
401 }
|
|
402 return @labels;
|
|
403 }
|
|
404
|
|
405 # redefined here so that it will retrieve effective subseq without introns
|
|
406 # otherwise it would have retrieved an underlying DNA (possibly with introns)
|
|
407 # subsequence
|
|
408 # Drawback: this is really bulky, label->position and then a call to
|
|
409 # subseq that will do the opposite position-> label
|
|
410 #
|
|
411 # one day this can be rewritten as the main one so that the normal subseq
|
|
412 # will rely on this one and hence avoid this double (useless and lengthy)
|
|
413 # conversion between labels and positions
|
|
414 sub old_labelsubseq {
|
|
415 my ($self,$start,$length,$end)=@_;
|
|
416 my ($pos1,$pos2);
|
|
417 if ($start) {
|
|
418 unless ($self->valid($start)) {
|
|
419 $self->warn("Start label not valid"); return (-1);
|
|
420 }
|
|
421 $pos1=$self->position($start);
|
|
422 }
|
|
423 if ($end) {
|
|
424 if ($end == $start) {
|
|
425 $length=1;
|
|
426 } else {
|
|
427 unless ($self->valid($end)) {
|
|
428 $self->warn("End label not valid"); return (-1);
|
|
429 }
|
|
430 unless ($self->follows($start,$end) == 1) {
|
|
431 $self->warn("End label does not follow Start label!"); return (-1);
|
|
432 }
|
|
433 $pos2=$self->position($end);
|
|
434 undef $length;
|
|
435 }
|
|
436 }
|
|
437 return ($self->subseq($pos1,$pos2,$length));
|
|
438 }
|
|
439
|
|
440 # rewritten, eventually
|
|
441
|
|
442 sub labelsubseq {
|
|
443 my ($self,$start,$length,$end,$unsecuremode)=@_;
|
|
444 unless (defined $unsecuremode &&
|
|
445 $unsecuremode eq "unsecuremoderequested")
|
|
446 { # to skip security checks (faster)
|
|
447 if ($start) {
|
|
448 unless ($self->valid($start)) {
|
|
449 $self->warn("Start label not valid"); return (-1);
|
|
450 }
|
|
451 } else {
|
|
452 $start=$self->start;
|
|
453 }
|
|
454 if ($end) {
|
|
455 if ($end == $start) {
|
|
456 $length=1;
|
|
457 undef $end;
|
|
458 } else {
|
|
459 undef $length; # end argument overrides length argument
|
|
460 unless ($self->valid($end)) {
|
|
461 $self->warn("End label not valid"); return (-1);
|
|
462 }
|
|
463 unless ($self->follows($start,$end) == 1) {
|
|
464 $self->warn("End label does not follow Start label!"); return (-1);
|
|
465 }
|
|
466 }
|
|
467 } else {
|
|
468 $end=$self->end;
|
|
469 }
|
|
470 }
|
|
471 my ($seq,$exon,$startexon,$endexon); my @exonlabels;
|
|
472 my @exons=$self->all_Exons;
|
|
473 EXONCHECK:
|
|
474 foreach $exon (@exons) {
|
|
475 if ((!(defined($startexon)))&&($exon->valid($start))) { # checks only if not yet found
|
|
476 $startexon=$exon;
|
|
477 }
|
|
478 if ($exon->valid($end)) {
|
|
479 $endexon=$exon;
|
|
480 }
|
|
481 if ((!(defined($seq)) && (defined($startexon)))) { # initializes only once
|
|
482 if ((defined($endexon)) && ($endexon eq $startexon)) { # then perfect, we are finished
|
|
483 if ($length) {
|
|
484 $seq = $startexon->labelsubseq($start,$length,undef,"unsecuremoderequested");
|
|
485
|
|
486
|
|
487 last EXONCHECK;
|
|
488 } else {
|
|
489 $seq = $startexon->labelsubseq($start,undef,$end,"unsecuremoderequested");
|
|
490 }
|
|
491 last EXONCHECK;
|
|
492 } else { # get up to the end of the exon
|
|
493 $seq = $startexon->labelsubseq($start,undef,undef,"unsecuremoderequested");
|
|
494 }
|
|
495 }
|
|
496 if (($startexon)&&($exon ne $startexon)) {
|
|
497 if (defined($endexon)) { # we arrived to the last exon
|
|
498 $seq .= $endexon->labelsubseq(undef,undef,$end,"unsecuremoderequested"); # get from the start of the exon
|
|
499 last EXONCHECK;
|
|
500
|
|
501 } elsif (defined($startexon)) { # we are in a whole-exon-in-the-middle case
|
|
502 $seq .= $exon->seq; # we add it completely to the seq
|
|
503 } # else, we still have to reach the start point, exon useless, we move on
|
|
504 if ($length) { # if length argument specified
|
|
505 if (($seq && (CORE::length($seq) >= $length))) {
|
|
506 last EXONCHECK;
|
|
507 }
|
|
508 }
|
|
509 }
|
|
510 }
|
|
511 if ($length) {
|
|
512 return (substr($seq,0,$length));
|
|
513 } else {
|
|
514 return ($seq);
|
|
515 }
|
|
516 }
|
|
517
|
|
518
|
|
519 # argument: label
|
|
520 # returns: the objref and progressive number of the Exon containing that label
|
|
521 # errorcode: -1
|
|
522 sub in_which_Exon {
|
|
523 my ($self,$label)=@_;
|
|
524 my ($count,$exon);
|
|
525 my @exons=$self->all_Exons;
|
|
526 foreach $exon (@exons) {
|
|
527 $count++; # 1st exon is numbered "1"
|
|
528 if ($exon->valid($label)) {
|
|
529 return ($exon,$count)
|
|
530 }
|
|
531 }
|
|
532 return (-1); # if nothing found
|
|
533 }
|
|
534
|
|
535 # recoded to exploit the new fast labelsubseq()
|
|
536 # valid only inside Transcript
|
|
537 sub subseq {
|
|
538 my ($self,$pos1,$pos2,$length) = @_;
|
|
539 my ($str,$startlabel,$endlabel);
|
|
540 if (defined ($pos1)) {
|
|
541 if ($pos1 == 0) { # if position = 0 complain
|
|
542 $self->warn("Position cannot be 0!"); return (-1);
|
|
543 }
|
|
544 if ((defined ($pos2))&&($pos1>$pos2)) {
|
|
545 $self->warn("1st position($pos1) cannot be > 2nd position($pos2)!"); return (-1);
|
|
546 }
|
|
547 $startlabel=$self->label($pos1);
|
|
548 unless ($self->valid($startlabel)) {
|
|
549 $self->warn("Start label not valid"); return (-1);
|
|
550 }
|
|
551 if ($startlabel < 1) {
|
|
552 $self->warn("position $pos1 not valid as start of subseq!"); return (-1);
|
|
553 }
|
|
554 } else {
|
|
555 $startlabel=$self->start;
|
|
556 }
|
|
557 if (defined ($pos2)) {
|
|
558 if ($pos2 == 0) { # if position = 0 complain
|
|
559 $self->warn("Position cannot be 0!"); return (-1);
|
|
560 }
|
|
561 undef $length;
|
|
562 if ((defined ($pos1))&&($pos1>$pos2)) {
|
|
563 $self->warn("1st position($pos1) cannot be > 2nd position($pos2)!"); return (-1);
|
|
564 }
|
|
565 $endlabel=$self->label($pos2);
|
|
566 unless ($self->valid($endlabel)) {
|
|
567 $self->warn("End label not valid"); return (-1);
|
|
568 }
|
|
569 if ($endlabel < 1) {
|
|
570 $self->warn("position $pos2 not valid as end of subseq!"); return (-1);
|
|
571 }
|
|
572 } else {
|
|
573 unless (defined ($length)) {
|
|
574 $endlabel=$self->end;
|
|
575 }
|
|
576 }
|
|
577 return ($self->labelsubseq($startlabel,$length,$endlabel,"unsecuremoderequested"));
|
|
578 }
|
|
579
|
|
580 # works only inside the transcript, complains if asked outside
|
|
581 sub old_subseq {
|
|
582 my ($self,$pos1,$pos2,$length) = @_;
|
|
583 my ($str,$startcount,$endcount,$seq,$seqlength);
|
|
584 if (defined ($length)) {
|
|
585 if ($length < 1) {
|
|
586 $self->warn("No sense asking for a subseq of length < 1");
|
|
587 return (-1);
|
|
588 }
|
|
589 }
|
|
590 my $firstlabel=$self->coordinate_start; # this is inside Transcript obj
|
|
591 my $coord_pos=$self->_inside_position($firstlabel); # TESTME old
|
|
592 $seq=$self->seq;
|
|
593 $seqlength=CORE::length($seq);
|
|
594 unless (defined ($pos1)) {
|
|
595 $startcount=1+$coord_pos-1; # i.e. coord_pos
|
|
596 } else {
|
|
597 if ($pos1 == 0) { # if position = 0 complain
|
|
598 $self->warn("Position cannot be 0!"); return (-1);
|
|
599 } elsif ($pos1 < 0) {
|
|
600 $pos1++;
|
|
601 }
|
|
602 if ((defined ($pos2))&&($pos1>$pos2)) {
|
|
603 $self->warn("1st position ($pos1) cannot be > 2nd position ($pos2)!");
|
|
604 return (-1);
|
|
605 }
|
|
606 $startcount=$pos1+$coord_pos-1;
|
|
607 }
|
|
608 unless (defined ($pos2)) {
|
|
609 ;
|
|
610 } else {
|
|
611 if ($pos2 == 0) { # if position = 0 complain
|
|
612 $self->warn("Position cannot be 0!"); return (-1);
|
|
613 } elsif ($pos2 < 0) {
|
|
614 $pos2++;
|
|
615 }
|
|
616 if ((defined ($pos1))&&($pos1>$pos2)) {
|
|
617 $self->warn("1st position ($pos1) cannot be > 2nd position ($pos2)!");
|
|
618 return (-1);
|
|
619 }
|
|
620 $endcount=$pos2+$coord_pos-1;
|
|
621 if ($endcount > $seqlength) {
|
|
622 #print "\n###DEBUG###: pos1 $pos1 pos2 $pos2 coordpos $coord_pos endcount $endcount seqln $seqlength\n";
|
|
623 $self->warn("Cannot access end position after the end of Transcript");
|
|
624 return (-1);
|
|
625 }
|
|
626 $length=$endcount-$startcount+1;
|
|
627 }
|
|
628 #print "\n###DEBUG pos1 $pos1 pos2 $pos2 start $startcount end $endcount length $length coordpos $coord_pos\n";
|
|
629 my $offset=$startcount-1;
|
|
630 if ($offset < 0) {
|
|
631 $self->warn("Cannot access startposition before the beginning of Transcript, returning from start",1); # ignorable
|
|
632 return (substr($seq,0,$length));
|
|
633 } elsif ($offset >= $seqlength) {
|
|
634 $self->warn("Cannot access startposition after the end of Transcript");
|
|
635 return (-1);
|
|
636 } else {
|
|
637 $str=substr($seq,$offset,$length);
|
|
638 if (CORE::length($str) < $length) {
|
|
639 $self->warn("Attention, cannot return the length requested ".
|
|
640 "for subseq",1) if $self->verbose > 0; # ignorable
|
|
641 }
|
|
642 return $str;
|
|
643 }
|
|
644 }
|
|
645
|
|
646 # redefined so that it doesn't require other methods (after deletions) to
|
|
647 # reset it.
|
|
648 sub start {
|
|
649 my $self = shift;
|
|
650 my $exonsref=$self->{'exons'};
|
|
651 my @exons=@{$exonsref};
|
|
652 return ($exons[0]->start);
|
|
653 }
|
|
654
|
|
655 sub end {
|
|
656 my $self = shift;
|
|
657 my $exonsref=$self->{'exons'};
|
|
658 my @exons=@{$exonsref};
|
|
659 return ($exons[-1]->end);
|
|
660 }
|
|
661
|
|
662
|
|
663 # internal methods begin here
|
|
664
|
|
665 # returns: position of label in transcript's all_labels
|
|
666 # with STARTlabel == 1
|
|
667 # errorcode 0 -> label not found
|
|
668 # argument: label
|
|
669 sub _inside_position {
|
|
670 my ($self,$label)=@_;
|
|
671 my ($start,$end,$strand)=($self->start(),$self->end(),$self->strand());
|
|
672 my ($position,$checkme);
|
|
673 my @labels=$self->all_labels;
|
|
674 foreach $checkme (@labels) {
|
|
675 $position++;
|
|
676 if ($label == $checkme) {
|
|
677 return ($position);
|
|
678 }
|
|
679 }
|
|
680 return (0);
|
|
681 }
|
|
682
|
|
683 # returns 1 OK or 0 ERROR
|
|
684 # arguments: reference to array of Exon object references
|
|
685 sub _checkexons {
|
|
686 my ($exon,$thisstart);
|
|
687 my $self=$exon;
|
|
688 my $exonsref=$_[0];
|
|
689 my @exons=@{$exonsref};
|
|
690
|
|
691 my $firstexon = $exons[0];
|
|
692
|
|
693 unless (ref($firstexon) eq "Bio::LiveSeq::Exon") {
|
|
694 $self->warn("Object not of class Exon");
|
|
695 return (0);
|
|
696 }
|
|
697 my $strand = $firstexon->strand;
|
|
698
|
|
699 my $prevend = $firstexon->end;
|
|
700 shift @exons; # skip first one
|
|
701 foreach $exon (@exons) {
|
|
702 unless (ref($exon) eq "Bio::LiveSeq::Exon") { # object class check
|
|
703 $self->warn("Object not of class Exon");
|
|
704 return (0);
|
|
705 }
|
|
706 if ($exon->strand != $strand) { # strand consistency check
|
|
707 $self->warn("Exons' strands not consistent when trying to create Transcript");
|
|
708 return (0);
|
|
709 }
|
|
710 $thisstart = $exon->start;
|
|
711 unless ($exon->{'seq'}->follows($prevend,$thisstart,$strand)) {
|
|
712 $self->warn("Exons not in correct order when trying to create Transcript");
|
|
713 return (0);
|
|
714 }
|
|
715 $prevend = $exon->end;
|
|
716 }
|
|
717 return (1);
|
|
718 }
|
|
719
|
|
720 =head2 get_Translation
|
|
721
|
|
722 Title : valid
|
|
723 Usage : $translation = $obj->get_Translation()
|
|
724 Function: retrieves the reference to the object of class Translation (if any)
|
|
725 attached to a LiveSeq object
|
|
726 Returns : object reference
|
|
727 Args : none
|
|
728
|
|
729 =cut
|
|
730
|
|
731 sub get_Translation {
|
|
732 my $self=shift;
|
|
733 return ($self->{'translation'}); # this is set when Translation->new is called
|
|
734 }
|
|
735
|
|
736 # this checks so that deletion spanning multiple exons is
|
|
737 # handled accordingly and correctly
|
|
738 # arguments: begin and end label of a deletion
|
|
739 # this is called BEFORE any deletion in the chain
|
|
740 sub _deletecheck {
|
|
741 my ($self,$startlabel,$endlabel)=@_;
|
|
742 my $exonsref=$self->{'exons'};
|
|
743 my @exons=@{$exonsref};
|
|
744 my ($startexon,$endexon,$exon);
|
|
745 $startexon=$endexon=0;
|
|
746 foreach $exon (@exons) {
|
|
747 if (($startexon == 0)&&($exon->valid($startlabel))) {
|
|
748 $startexon=$exon; # exon containing start of deletion
|
|
749 }
|
|
750 if (($endexon == 0)&&($exon->valid($endlabel))) {
|
|
751 $endexon=$exon; # exon containing end of deletion
|
|
752 }
|
|
753 if (($startexon)&&($endexon)) {
|
|
754 last; # don't check further
|
|
755 }
|
|
756 }
|
|
757 my $nextend=$self->label(2,$endlabel); # retrieve the next label
|
|
758 my $prevstart=$self->label(-1,$startlabel); # retrieve the prev label
|
|
759
|
|
760 if ($startexon eq $endexon) { # intra-exon deletion
|
|
761 if (($startexon->start eq $startlabel) && ($startexon->end eq $endlabel)) {
|
|
762 # let's delete the entire exon
|
|
763 my @newexons;
|
|
764 foreach $exon (@exons) {
|
|
765 unless ($exon eq $startexon) {
|
|
766 push(@newexons,$exon);
|
|
767 }
|
|
768 }
|
|
769 $self->{'exons'}=\@newexons;
|
|
770 } elsif ($startexon->start eq $startlabel) { # special cases
|
|
771 $startexon->{'start'}=$nextend; # set a new start of exon
|
|
772 } elsif ($startexon->end eq $endlabel) {
|
|
773 $startexon->{'end'}=$prevstart; # set a new end of exon
|
|
774 } else {
|
|
775 return; # no problem
|
|
776 }
|
|
777 } else { # two new exons to be created, inter-exons deletion
|
|
778 my @newexons;
|
|
779 my $exonobj;
|
|
780 my $dna=$self->{'seq'};
|
|
781 my $strand=$self->strand;
|
|
782 my $notmiddle=1; # flag for skipping exons in the middle of deletion
|
|
783 foreach $exon (@exons) {
|
|
784 if ($exon eq $startexon) {
|
|
785 $exonobj=Bio::LiveSeq::Exon->new('-seq'=>$dna,'-start'=>$exon->start,'-end'=>$prevstart,'-strand'=>$strand); # new partial exon
|
|
786 push(@newexons,$exonobj);
|
|
787 $notmiddle=0; # now we enter totally deleted exons
|
|
788 } elsif ($exon eq $endexon) {
|
|
789 $exonobj=Bio::LiveSeq::Exon->new('-seq'=>$dna,'-start'=>$nextend,'-end'=>$exon->end,'-strand'=>$strand); # new partial exon
|
|
790 push(@newexons,$exonobj);
|
|
791 $notmiddle=1; # exiting totally deleted exons
|
|
792 } else {
|
|
793 if ($notmiddle) { # if before or after exons with deletion
|
|
794 push(@newexons,$exon);
|
|
795 }# else skip them
|
|
796 }
|
|
797 }
|
|
798 $self->{'exons'}=\@newexons;
|
|
799 }
|
|
800 }
|
|
801
|
|
802 =head2 translation_table
|
|
803
|
|
804 Title : translation_table
|
|
805 Usage : $name = $obj->translation_table;
|
|
806 : $name = $obj->translation_table(11);
|
|
807 Function: Returns or sets the translation_table used for translating the
|
|
808 transcript.
|
|
809 If it has never been set, it will return undef.
|
|
810 Returns : an integer
|
|
811
|
|
812 =cut
|
|
813
|
|
814 sub translation_table {
|
|
815 my ($self,$value) = @_;
|
|
816 if (defined $value) {
|
|
817 $self->{'translation_table'} = $value;
|
|
818 }
|
|
819 unless (exists $self->{'translation_table'}) {
|
|
820 return (undef);
|
|
821 } else {
|
|
822 return $self->{'translation_table'};
|
|
823 }
|
|
824 }
|
|
825
|
|
826 =head2 frame
|
|
827
|
|
828 Title : frame
|
|
829 Usage : $frame = $transcript->frame($label);
|
|
830 Function: Returns the frame of a particular nucleotide.
|
|
831 Frame can be 0 1 or 2 and means the position in the codon triplet
|
|
832 of the particulat nucleotide. 0 is the first codon_position.
|
|
833 Codon_position (1 2 3) is simply frame+1.
|
|
834 If the label asked for is not inside the Transcript, -1 will be
|
|
835 returned.
|
|
836 Args : a label
|
|
837 Returns : 0 1 or 2
|
|
838 Errorcode -1
|
|
839
|
|
840 =cut
|
|
841
|
|
842 # args: label
|
|
843 # returns: frame of nucleotide (0 1 2)
|
|
844 # errorcode: -1
|
|
845 sub frame {
|
|
846 my ($self,$inputlabel)=@_;
|
|
847 my @labels=$self->all_labels;
|
|
848 my ($label,$frame,$count);
|
|
849 foreach $label (@labels) {
|
|
850 if ($inputlabel == $label) {
|
|
851 return ($count % 3);
|
|
852 }
|
|
853 $count++; # 0 1 2 3 4....
|
|
854 }
|
|
855 return (-1); # label not found amid Transcript labels
|
|
856 }
|
|
857
|
|
858 1;
|