| 0 | 1 # $Id: Transcript.pm,v 1.17 2002/09/25 08:58:23 heikki Exp $ | 
|  | 2 # | 
|  | 3 # bioperl module for Bio::LiveSeq::Transcript | 
|  | 4 # | 
|  | 5 # Cared for by Joseph Insana <insana@ebi.ac.uk> <jinsana@gmx.net> | 
|  | 6 # | 
|  | 7 # Copyright Joseph Insana | 
|  | 8 # | 
|  | 9 # You may distribute this module under the same terms as perl itself | 
|  | 10 # | 
|  | 11 # POD documentation - main docs before the code | 
|  | 12 | 
|  | 13 =head1 NAME | 
|  | 14 | 
|  | 15 Bio::LiveSeq::Transcript - Transcript class for LiveSeq | 
|  | 16 | 
|  | 17 =head1 SYNOPSIS | 
|  | 18 | 
|  | 19   # documentation needed | 
|  | 20 | 
|  | 21 =head1 DESCRIPTION | 
|  | 22 | 
|  | 23 This stores informations about coding sequences (CDS). | 
|  | 24 The implementation is that a Transcript object accesses a collection of | 
|  | 25 Exon objects, inferring from them the nucleotide structure and sequence. | 
|  | 26 | 
|  | 27 =head1 AUTHOR - Joseph A.L. Insana | 
|  | 28 | 
|  | 29 Email:  Insana@ebi.ac.uk, jinsana@gmx.net | 
|  | 30 | 
|  | 31 Address: | 
|  | 32 | 
|  | 33      EMBL Outstation, European Bioinformatics Institute | 
|  | 34      Wellcome Trust Genome Campus, Hinxton | 
|  | 35      Cambs. CB10 1SD, United Kingdom | 
|  | 36 | 
|  | 37 =head1 APPENDIX | 
|  | 38 | 
|  | 39 The rest of the documentation details each of the object | 
|  | 40 methods. Internal methods are usually preceded with a _ | 
|  | 41 | 
|  | 42 =cut | 
|  | 43 | 
|  | 44 # Let the code begin... | 
|  | 45 | 
|  | 46 package Bio::LiveSeq::Transcript; | 
|  | 47 $VERSION=5.2; | 
|  | 48 | 
|  | 49 # Version history: | 
|  | 50 # Tue Mar 21 14:38:02 GMT 2000 v 1.0 begun | 
|  | 51 # Tue Mar 21 17:45:31 GMT 2000 v 1.1 new() created | 
|  | 52 # Wed Mar 22 19:40:13 GMT 2000 v 1.4 all_Exons() seq(), length(), all_labels() | 
|  | 53 # Thu Mar 23 19:08:36 GMT 2000 v 1.5 follows() replaces is_downstream() | 
|  | 54 # Thu Mar 23 20:59:02 GMT 2000 v 2.0 valid _inside_position label position | 
|  | 55 # Fri Mar 24 18:33:18 GMT 2000 v 2.2 rewritten position(), now should work with diverse coordinate_starts | 
|  | 56 # Sat Mar 25 04:08:18 GMT 2000 v 2.21 added firstlabel to position and label so that Translation can exploit it | 
|  | 57 # Sat Mar 25 06:39:27 GMT 2000 v 2.3 started override of subseq, works just internally | 
|  | 58 # Mon Mar 27 19:05:15 BST 2000 v 2.4 subseq finished, it works with coord_start | 
|  | 59 # Fri Mar 31 18:48:07 BST 2000 v 2.5 started downstream_seq() | 
|  | 60 # Mon Apr  3 17:37:34 BST 2000 v 2.52 upstream_seq added | 
|  | 61 # Fri Apr  7 03:29:43 BST 2000 v 2.6 up/downstream now can use Gene information | 
|  | 62 # Sat Apr  8 12:59:58 BST 2000 v 3.0 all_Exons now skips no more valid exons | 
|  | 63 # Sat Apr  8 13:32:08 BST 2000 v 3.1 get_Translation added | 
|  | 64 # Wed Apr 12 12:37:08 BST 2000 v 3.2 all_Exons updates Transcript's start/end | 
|  | 65 # Wed Apr 12 12:41:22 BST 2000 v 3.3 each Exon has "transcript" attribute added | 
|  | 66 # Wed Apr 12 16:35:56 BST 2000 v 3.4 started coding _deletecheck | 
|  | 67 # Wed Apr 12 23:40:19 BST 2000 v 3.5 start and end redefined here, no more checks after deletion to refix start/end attributes. And no need of those. Eliminated hence from new() | 
|  | 68 # Wed Apr 12 23:47:02 BST 2000 v 3.9 finished _deletecheck, debugging starts | 
|  | 69 # Thu Apr 13 00:37:16 BST 2000 v 4.0 debugging done: seems working OK | 
|  | 70 # Thu Apr 27 16:18:55 BST 2000 v 4.1 translation_table added | 
|  | 71 # Tue May 16 17:57:40 BST 2000 v 4.11 corrected bug in docs of downstream_seq | 
|  | 72 # Wed May 17 16:48:34 BST 2000 v 4.2 frame() added | 
|  | 73 # Mon May 22 15:22:12 BST 2000 v 4.21 labelsubseq tweaked for cases where startlabel==endlabel (no useless follow() query!) | 
|  | 74 # Thu Jun 22 20:02:39 BST 2000 v 4.3 valid() moved to SeqI, to be inherited as the general one | 
|  | 75 # Thu Jun 22 20:27:57 BST 2000 v 4.4 optimized labelsubseq coded! | 
|  | 76 # Thu Jun 22 21:17:51 BST 2000 v 4.44 in_which_Exon() added | 
|  | 77 # Sat Jun 24 00:49:55 BST 2000 v 4.5 new subseq() that exploits the new fast labelsubseq | 
|  | 78 # Thu Jun 29 16:31:19 BST 2000 v 5.0 downsream_seq and upstream_seq recoded so that if entry is RNA it will return sequences up to the entry limits -> it should be properly debugged, expecially the upstream_seq one | 
|  | 79 # Wed Jul 12 04:01:53 BST 2000 v 5.1 croak -> carp+return(-1) | 
|  | 80 # Wed Mar 28 15:16:21 BST 2001 v 5.2 carp -> warn,throw (coded methods in SeqI) | 
|  | 81 | 
|  | 82 use strict; | 
|  | 83 # use Carp qw(carp cluck); | 
|  | 84 use vars qw($VERSION @ISA); | 
|  | 85 use Bio::LiveSeq::SeqI 3.2; # uses SeqI, inherits from it | 
|  | 86 use Bio::LiveSeq::Exon 1.0; # uses Exon to create new exon in case of deletion | 
|  | 87 @ISA=qw(Bio::LiveSeq::SeqI); | 
|  | 88 | 
|  | 89 =head2 new | 
|  | 90 | 
|  | 91   Title   : new | 
|  | 92   Usage   : $transcript = Bio::LiveSeq::Transcript->new(-exons => \@obj_refs); | 
|  | 93 | 
|  | 94   Function: generates a new Bio::LiveSeq::Transcript | 
|  | 95   Returns : reference to a new object of class Transcript | 
|  | 96   Errorcode -1 | 
|  | 97   Args    : reference to an array of Exon object references | 
|  | 98 | 
|  | 99 =cut | 
|  | 100 | 
|  | 101 sub new { | 
|  | 102   my ($thing, %args) = @_; | 
|  | 103   my $class = ref($thing) || $thing; | 
|  | 104   my ($obj,%transcript); | 
|  | 105 | 
|  | 106   my @exons=@{$args{-exons}}; | 
|  | 107 | 
|  | 108   $obj = \%transcript; | 
|  | 109   $obj = bless $obj, $class; | 
|  | 110 | 
|  | 111   unless (@exons) { | 
|  | 112     $obj->warn("$class not initialised because exons array empty"); | 
|  | 113     return(-1); | 
|  | 114   } | 
|  | 115 | 
|  | 116   # now useless, after start and end methods have been overridden here | 
|  | 117   my $firstexon = $exons[0]; | 
|  | 118   #my $lastexon = $exons[-1]; | 
|  | 119   #my $start = $firstexon->start; | 
|  | 120   #my $end = $lastexon->end; | 
|  | 121   my $strand = $firstexon->strand; | 
|  | 122   my $seq = $firstexon->{'seq'}; | 
|  | 123   $obj->alphabet('rna'); | 
|  | 124 | 
|  | 125   unless (_checkexons(\@exons)) { | 
|  | 126     $obj->warn("$class not initialised because of problems in the exon structure"); | 
|  | 127     return(-1); | 
|  | 128   } | 
|  | 129   $obj->{'strand'}=$strand; | 
|  | 130   $obj->{'exons'}=\@exons; | 
|  | 131   $obj->{'seq'}=$seq; | 
|  | 132 | 
|  | 133   # set Transcript into each Exon | 
|  | 134   my $exon; | 
|  | 135   foreach $exon (@exons) { | 
|  | 136     $exon->{'transcript'}=$obj; | 
|  | 137   } | 
|  | 138   return $obj; | 
|  | 139 } | 
|  | 140 | 
|  | 141 | 
|  | 142 =head2 all_Exons | 
|  | 143 | 
|  | 144  Title   : all_Exons | 
|  | 145  Usage   : $transcript_obj->all_Exons() | 
|  | 146  Function: returns references to all Exon objects the Transcript is composed of | 
|  | 147  Example : foreach $exon ($transcript->all_Exons()) { do_something } | 
|  | 148  Returns : array of object references | 
|  | 149  Args    : none | 
|  | 150 | 
|  | 151 =cut | 
|  | 152 | 
|  | 153 sub all_Exons { | 
|  | 154   my $self=shift; | 
|  | 155   my $exonsref=$self->{'exons'}; | 
|  | 156   my @exons=@{$exonsref}; | 
|  | 157   my @newexons; | 
|  | 158   my $exon; | 
|  | 159   foreach $exon (@exons) { | 
|  | 160     unless ($exon->obj_valid) { | 
|  | 161       $self->warn("$exon no more valid, start or end label lost, skipping....",1); # ignorable | 
|  | 162     } else { | 
|  | 163       push(@newexons,$exon); | 
|  | 164     } | 
|  | 165   } | 
|  | 166   if ($#exons != $#newexons) { | 
|  | 167     # update exons field | 
|  | 168     $self->{'exons'}=\@newexons; | 
|  | 169   } | 
|  | 170   return (@newexons); | 
|  | 171 } | 
|  | 172 | 
|  | 173 =head2 downstream_seq | 
|  | 174 | 
|  | 175  Title   : downstream_seq | 
|  | 176  Usage   : $transcript_obj->downstream_seq() | 
|  | 177          : $transcript_obj->downstream_seq(64) | 
|  | 178  Function: returns a string of nucleotides downstream of the end of the | 
|  | 179            CDS. If there is some information of the real mRNA, from features in | 
|  | 180            an attached Gene object, it will return up to those boundaries. | 
|  | 181            Otherwise it will return 1000 nucleotides. | 
|  | 182            If an argument is given it will override the default 1000 number | 
|  | 183            and return instead /that/ requested number of nucleotides. | 
|  | 184            But if a Gene object is attached, this argument will be ignored. | 
|  | 185  Returns : string | 
|  | 186  Args    : an optional integer number of nucleotides to be returned instead of | 
|  | 187            the default if no gene attached | 
|  | 188 | 
|  | 189 =cut | 
|  | 190 | 
|  | 191 sub downstream_seq { | 
|  | 192   my ($self,$howmany)=@_; | 
|  | 193   my $str; | 
|  | 194   if (defined ($howmany)) { | 
|  | 195     unless ($howmany > 0) { | 
|  | 196       $self->throw("No sense in asking less than 1 downstream nucleotides!"); | 
|  | 197     } | 
|  | 198   } else { | 
|  | 199     unless ($self->{'seq'}->alphabet eq 'rna') { # if rna retrieve until the end | 
|  | 200       #$str=$DNAobj->labelsubseq($self->end,undef,undef,"unsecuremoderequested"); | 
|  | 201       #return(substr($str,1)); # delete first nucleotide that is the last of Transcript | 
|  | 202       if ($self->gene) { # if there is Gene object attached fetch relevant info | 
|  | 203 	$str=$self->{'seq'}->labelsubseq($self->end,undef,$self->gene->maxtranscript->end); # retrieve from end of this Transcript to end of the maxtranscript | 
|  | 204 	$str=substr($str,1); # delete first nucleotide that is the last of Transcript | 
|  | 205 	if (CORE::length($str) > 0) { | 
|  | 206 	  return($str); | 
|  | 207 	} else { # if there was no downstream through the gene's maxtranscript, go the usual way | 
|  | 208 	  $howmany = 1000; | 
|  | 209 	} | 
|  | 210       } else { | 
|  | 211 	$howmany = 1000; | 
|  | 212       } | 
|  | 213     } | 
|  | 214   } | 
|  | 215   my @exons=$self->all_Exons; | 
|  | 216   my $strand=$self->strand(); | 
|  | 217   my $lastexon=$exons[-1]; | 
|  | 218   my $lastexonlength=$lastexon->length; | 
|  | 219   # $howmany nucs after end of last exon | 
|  | 220   #my $downstream_seq=$lastexon->subseq($lastexonlength+1,undef,$howmany); | 
|  | 221   my $downstream_seq; | 
|  | 222 | 
|  | 223   if ($howmany) { | 
|  | 224       $downstream_seq=substr($lastexon->labelsubseq($self->end,$howmany,undef,"unsecuremoderequested"),1); | 
|  | 225   } else { | 
|  | 226     if ($strand == 1) { | 
|  | 227       $downstream_seq=substr($lastexon->labelsubseq($self->end,undef,$self->{'seq'}->end,"unsecuremoderequested"),1); | 
|  | 228     } else { | 
|  | 229       $downstream_seq=substr($lastexon->labelsubseq($self->end,undef,$self->{'seq'}->start,"unsecuremoderequested"),1); | 
|  | 230     } | 
|  | 231   } | 
|  | 232   return $downstream_seq; | 
|  | 233 } | 
|  | 234 | 
|  | 235 =head2 upstream_seq | 
|  | 236 | 
|  | 237  Title   : upstream_seq | 
|  | 238  Usage   : $transcript_obj->upstream_seq() | 
|  | 239          : $transcript_obj->upstream_seq(64) | 
|  | 240  Function: just like downstream_seq but returns nucleotides before the ATG | 
|  | 241  Note    : the default, if no Gene information present and no nucleotides | 
|  | 242            number given, is to return up to 400 nucleotides. | 
|  | 243 | 
|  | 244 =cut | 
|  | 245 | 
|  | 246 sub upstream_seq { | 
|  | 247   my ($self,$howmany)=@_; | 
|  | 248   if (defined ($howmany)) { | 
|  | 249     unless ($howmany > 0) { | 
|  | 250       $self->throw("No sense in asking less than 1 upstream nucleotides!"); | 
|  | 251     } | 
|  | 252   } else { | 
|  | 253     unless ($self->{'seq'}->alphabet eq 'rna') { # if rna retrieve from the start | 
|  | 254       if ($self->gene) { # if there is Gene object attached fetch relevant info | 
|  | 255 	my $str=$self->{'seq'}->labelsubseq($self->gene->maxtranscript->start,undef,$self->start); # retrieve from start of maxtranscript to start of this Transcript | 
|  | 256 	chop $str; # delete last nucleotide that is the A of starting ATG | 
|  | 257 	if (length($str) > 0) { | 
|  | 258 	  return($str); | 
|  | 259 	} else { # if there was no upstream through the gene's maxtranscript, go the usual way | 
|  | 260 	  $howmany = 400; | 
|  | 261 	} | 
|  | 262       } else { | 
|  | 263 	$howmany = 400; | 
|  | 264       } | 
|  | 265     } | 
|  | 266   } | 
|  | 267   my @exons=$self->all_Exons; | 
|  | 268   my $firstexon=$exons[0]; | 
|  | 269 | 
|  | 270   my $upstream_seq; | 
|  | 271   my $strand=$self->strand(); | 
|  | 272 | 
|  | 273   if ($howmany) {# $howmany nucs before begin of first exon | 
|  | 274     my $labelbefore=$firstexon->label(-$howmany,$firstexon->start); | 
|  | 275     if ($labelbefore < 1) { | 
|  | 276       if ($strand == 1) { | 
|  | 277 	$labelbefore=$self->{'seq'}->start; | 
|  | 278       } else { | 
|  | 279 	$labelbefore=$self->{'seq'}->end; | 
|  | 280       } | 
|  | 281     } | 
|  | 282     $upstream_seq=$firstexon->labelsubseq($labelbefore,undef,$firstexon->start,"unsecuremoderequested"); | 
|  | 283     chop $upstream_seq; | 
|  | 284   } else { | 
|  | 285     if ($strand == 1) { | 
|  | 286       $upstream_seq=$firstexon->labelsubseq($self->{'seq'}->start,undef,$self->start,"unsecuremoderequested"); | 
|  | 287       chop $upstream_seq; # delete last nucleotide that is the A of starting ATG | 
|  | 288     } else { | 
|  | 289       $upstream_seq=$firstexon->labelsubseq($self->{'seq'}->end,undef,$self->start,"unsecuremoderequested"); | 
|  | 290       chop $upstream_seq; # delete last nucleotide that is the A of starting ATG | 
|  | 291     } | 
|  | 292   } | 
|  | 293   return $upstream_seq; | 
|  | 294 } | 
|  | 295 | 
|  | 296 # These get redefined here, overriding the SeqI one because they draw their | 
|  | 297 # information from the Exons a Transcript is built of | 
|  | 298 # optional argument: firstlabel. If not given, it checks coordinate_start | 
|  | 299 #                                This is useful when called by Translation | 
|  | 300 #                                also used by _delete | 
|  | 301 sub label { | 
|  | 302   my ($self,$position,$firstlabel)=@_; | 
|  | 303   unless ($position) {  # if position = 0 complain ? | 
|  | 304     $self->warn("Position not given or position 0"); | 
|  | 305     return (-1); | 
|  | 306   } | 
|  | 307   my ($start,$end,$strand)=($self->start(),$self->end(),$self->strand()); | 
|  | 308   my ($label,@labels,$length,$arraypos); | 
|  | 309   unless (defined ($firstlabel)) { | 
|  | 310     $firstlabel=$self->coordinate_start; # this is inside Transcript obj | 
|  | 311   } | 
|  | 312   my $coord_pos=$self->_inside_position($firstlabel); | 
|  | 313   $length=$self->length; | 
|  | 314   #if ($strand == 1) { | 
|  | 315     if ($position < 1) { | 
|  | 316       $position++; # to account for missing of 0 position | 
|  | 317     } | 
|  | 318     $arraypos=$position+$coord_pos-2; | 
|  | 319     #print "\n=-=-=-=-DEBUG: arraypos $arraypos, pos $position, coordpos: $coord_pos"; | 
|  | 320     if ($arraypos < 0) { | 
|  | 321       $label=$self->{'seq'}->label($arraypos,$start,$strand); #? | 
|  | 322     } elsif ($arraypos >= $length) { | 
|  | 323       $label=$self->{'seq'}->label($arraypos-$length+2,$end,$strand); #? | 
|  | 324     } else { # inside the Transcript | 
|  | 325       @labels=$self->all_labels; | 
|  | 326       $label=$labels[$arraypos]; | 
|  | 327     } | 
|  | 328   #} | 
|  | 329 } | 
|  | 330 | 
|  | 331 # argument: label | 
|  | 332 # returns: position of label according to coord_start | 
|  | 333 # errorcode: 0 label not found | 
|  | 334 # optional argument: firstlabel. If not given, it checks coordinate_start | 
|  | 335 #                                This is useful when called by Translation | 
|  | 336 sub position { | 
|  | 337   my ($self,$label,$firstlabel)=@_; | 
|  | 338   unless ($self->{'seq'}->valid($label)) { | 
|  | 339     $self->warn("label is not valid"); | 
|  | 340     return (0); | 
|  | 341   } | 
|  | 342   unless (defined ($firstlabel)) { | 
|  | 343     $firstlabel=$self->coordinate_start; # this is inside Transcript obj | 
|  | 344   } | 
|  | 345   if ($label == $firstlabel) { | 
|  | 346     return (1); | 
|  | 347   } | 
|  | 348   my ($start,$end,$strand)=($self->start(),$self->end(),$self->strand()); | 
|  | 349   my ($position,$in_pos,$out_pos,$coord_pos); | 
|  | 350   my $length=$self->length; | 
|  | 351   $coord_pos=$self->_inside_position($firstlabel); | 
|  | 352   if ($self->valid($label)) { # if label is inside the Transcript | 
|  | 353     $in_pos=$self->_inside_position($label); | 
|  | 354     $position=$in_pos-$coord_pos+1; | 
|  | 355     if ($position <= 0) { | 
|  | 356       return ($position-1); # accounts for the missing of the 0 position | 
|  | 357     } | 
|  | 358   } else { | 
|  | 359     if ($self->follows($end,$label)) { # label after end of transcript | 
|  | 360       $out_pos=$self->{'seq'}->position($label,$end,$strand); | 
|  | 361       #print "\n+++++++++DEBUG label $label FOLLOWS end $end outpos $out_pos coordpos $coord_pos"; | 
|  | 362       $position=$out_pos+$length-$coord_pos; | 
|  | 363     } elsif ($self->follows($label,$start)) { # label before begin of transcript | 
|  | 364       #print "\n+++++++++DEBUG label $label BEFORE start $start outpos $out_pos coordpos $coord_pos"; | 
|  | 365       $out_pos=$self->{'seq'}->position($label,$start,$strand); | 
|  | 366       $position=$out_pos-$coord_pos+1; | 
|  | 367     } else { # label is in intron (not valid, not after, not before)! | 
|  | 368       $self->warn("Cannot give position of label pointing to intron according to CDS numbering!",1); | 
|  | 369       return (0); | 
|  | 370     } | 
|  | 371   } | 
|  | 372   return ($position); | 
|  | 373 } | 
|  | 374 | 
|  | 375 sub seq { | 
|  | 376   my $self=shift; | 
|  | 377   my ($exon,$str); | 
|  | 378   my @exons=$self->all_Exons(); | 
|  | 379   foreach $exon (@exons) { | 
|  | 380     $str .= $exon->seq(); | 
|  | 381   } | 
|  | 382   return $str; | 
|  | 383 } | 
|  | 384 | 
|  | 385 sub length { | 
|  | 386   my $self=shift; | 
|  | 387   my ($exon,$length); | 
|  | 388   my @exons=$self->all_Exons(); | 
|  | 389   foreach $exon (@exons) { | 
|  | 390     $length += $exon->length(); | 
|  | 391   } | 
|  | 392   return $length; | 
|  | 393 } | 
|  | 394 | 
|  | 395 sub all_labels { | 
|  | 396   my $self=shift; | 
|  | 397   my ($exon,@labels); | 
|  | 398   my @exons=$self->all_Exons(); | 
|  | 399   foreach $exon (@exons) { | 
|  | 400     push (@labels,$exon->all_labels()); | 
|  | 401   } | 
|  | 402   return @labels; | 
|  | 403 } | 
|  | 404 | 
|  | 405 # redefined here so that it will retrieve effective subseq without introns | 
|  | 406 # otherwise it would have retrieved an underlying DNA (possibly with introns) | 
|  | 407 # subsequence | 
|  | 408 # Drawback: this is really bulky, label->position and then a call to | 
|  | 409 # subseq that will do the opposite position-> label | 
|  | 410 # | 
|  | 411 # one day this can be rewritten as the main one so that the normal subseq | 
|  | 412 # will rely on this one and hence avoid this double (useless and lengthy) | 
|  | 413 # conversion between labels and positions | 
|  | 414 sub old_labelsubseq { | 
|  | 415   my ($self,$start,$length,$end)=@_; | 
|  | 416   my ($pos1,$pos2); | 
|  | 417   if ($start) { | 
|  | 418     unless ($self->valid($start)) { | 
|  | 419       $self->warn("Start label not valid"); return (-1); | 
|  | 420     } | 
|  | 421     $pos1=$self->position($start); | 
|  | 422   } | 
|  | 423   if ($end) { | 
|  | 424     if ($end == $start) { | 
|  | 425       $length=1; | 
|  | 426     } else { | 
|  | 427       unless ($self->valid($end)) { | 
|  | 428 	$self->warn("End label not valid"); return (-1); | 
|  | 429       } | 
|  | 430       unless ($self->follows($start,$end) == 1) { | 
|  | 431 	$self->warn("End label does not follow Start label!"); return (-1); | 
|  | 432       } | 
|  | 433       $pos2=$self->position($end); | 
|  | 434       undef $length; | 
|  | 435     } | 
|  | 436   } | 
|  | 437   return ($self->subseq($pos1,$pos2,$length)); | 
|  | 438 } | 
|  | 439 | 
|  | 440 # rewritten, eventually | 
|  | 441 | 
|  | 442 sub labelsubseq { | 
|  | 443   my ($self,$start,$length,$end,$unsecuremode)=@_; | 
|  | 444   unless (defined $unsecuremode && | 
|  | 445 	  $unsecuremode eq "unsecuremoderequested") | 
|  | 446   { # to skip security checks (faster) | 
|  | 447     if ($start) { | 
|  | 448       unless ($self->valid($start)) { | 
|  | 449 	$self->warn("Start label not valid"); return (-1); | 
|  | 450       } | 
|  | 451     } else { | 
|  | 452       $start=$self->start; | 
|  | 453     } | 
|  | 454     if ($end) { | 
|  | 455       if ($end == $start) { | 
|  | 456 	$length=1; | 
|  | 457 	undef $end; | 
|  | 458       } else { | 
|  | 459 	undef $length; # end argument overrides length argument | 
|  | 460 	unless ($self->valid($end)) { | 
|  | 461 	  $self->warn("End label not valid"); return (-1); | 
|  | 462 	} | 
|  | 463 	unless ($self->follows($start,$end) == 1) { | 
|  | 464 	  $self->warn("End label does not follow Start label!"); return (-1); | 
|  | 465 	} | 
|  | 466       } | 
|  | 467     } else { | 
|  | 468       $end=$self->end; | 
|  | 469     } | 
|  | 470   } | 
|  | 471   my ($seq,$exon,$startexon,$endexon); my @exonlabels; | 
|  | 472   my @exons=$self->all_Exons; | 
|  | 473   EXONCHECK: | 
|  | 474   foreach $exon (@exons) { | 
|  | 475     if ((!(defined($startexon)))&&($exon->valid($start))) { # checks only if not yet found | 
|  | 476       $startexon=$exon; | 
|  | 477     } | 
|  | 478     if ($exon->valid($end)) { | 
|  | 479       $endexon=$exon; | 
|  | 480     } | 
|  | 481     if ((!(defined($seq)) && (defined($startexon)))) { # initializes only once | 
|  | 482       if ((defined($endexon)) && ($endexon eq $startexon)) { # then perfect, we are finished | 
|  | 483 	if ($length) { | 
|  | 484 	  $seq = $startexon->labelsubseq($start,$length,undef,"unsecuremoderequested"); | 
|  | 485 | 
|  | 486 | 
|  | 487 	  last EXONCHECK; | 
|  | 488 	} else { | 
|  | 489 	  $seq = $startexon->labelsubseq($start,undef,$end,"unsecuremoderequested"); | 
|  | 490 	} | 
|  | 491 	last EXONCHECK; | 
|  | 492       } else { # get up to the end of the exon | 
|  | 493 	$seq = $startexon->labelsubseq($start,undef,undef,"unsecuremoderequested"); | 
|  | 494       } | 
|  | 495     } | 
|  | 496     if (($startexon)&&($exon ne $startexon)) { | 
|  | 497       if (defined($endexon)) { # we arrived to the last exon | 
|  | 498 	$seq .= $endexon->labelsubseq(undef,undef,$end,"unsecuremoderequested"); # get from the start of the exon | 
|  | 499 	last EXONCHECK; | 
|  | 500 | 
|  | 501       } elsif (defined($startexon)) { # we are in a whole-exon-in-the-middle case | 
|  | 502 	  $seq .= $exon->seq; # we add it completely to the seq | 
|  | 503       } # else, we still have to reach the start point, exon useless, we move on | 
|  | 504       if ($length) { # if length argument specified | 
|  | 505 	if (($seq && (CORE::length($seq) >= $length))) { | 
|  | 506 	  last EXONCHECK; | 
|  | 507 	} | 
|  | 508       } | 
|  | 509     } | 
|  | 510   } | 
|  | 511   if ($length) { | 
|  | 512     return (substr($seq,0,$length)); | 
|  | 513   } else { | 
|  | 514     return ($seq); | 
|  | 515   } | 
|  | 516 } | 
|  | 517 | 
|  | 518 | 
|  | 519 # argument: label | 
|  | 520 # returns: the objref and progressive number of the Exon containing that label | 
|  | 521 # errorcode: -1 | 
|  | 522 sub in_which_Exon { | 
|  | 523   my ($self,$label)=@_; | 
|  | 524   my ($count,$exon); | 
|  | 525   my @exons=$self->all_Exons; | 
|  | 526   foreach $exon (@exons) { | 
|  | 527     $count++; # 1st exon is numbered "1" | 
|  | 528     if ($exon->valid($label)) { | 
|  | 529       return ($exon,$count) | 
|  | 530     } | 
|  | 531   } | 
|  | 532   return (-1); # if nothing found | 
|  | 533 } | 
|  | 534 | 
|  | 535 # recoded to exploit the new fast labelsubseq() | 
|  | 536 # valid only inside Transcript | 
|  | 537 sub subseq { | 
|  | 538   my ($self,$pos1,$pos2,$length) = @_; | 
|  | 539   my ($str,$startlabel,$endlabel); | 
|  | 540   if (defined ($pos1)) { | 
|  | 541     if ($pos1 == 0) {  # if position = 0 complain | 
|  | 542       $self->warn("Position cannot be 0!"); return (-1); | 
|  | 543     } | 
|  | 544     if ((defined ($pos2))&&($pos1>$pos2)) { | 
|  | 545       $self->warn("1st position($pos1) cannot be > 2nd position($pos2)!"); return (-1); | 
|  | 546     } | 
|  | 547     $startlabel=$self->label($pos1); | 
|  | 548     unless ($self->valid($startlabel)) { | 
|  | 549       $self->warn("Start label not valid"); return (-1); | 
|  | 550     } | 
|  | 551     if ($startlabel < 1) { | 
|  | 552       $self->warn("position $pos1 not valid as start of subseq!"); return (-1); | 
|  | 553     } | 
|  | 554   } else { | 
|  | 555     $startlabel=$self->start; | 
|  | 556   } | 
|  | 557   if (defined ($pos2)) { | 
|  | 558     if ($pos2 == 0) {  # if position = 0 complain | 
|  | 559       $self->warn("Position cannot be 0!"); return (-1); | 
|  | 560     } | 
|  | 561     undef $length; | 
|  | 562     if ((defined ($pos1))&&($pos1>$pos2)) { | 
|  | 563       $self->warn("1st position($pos1) cannot be > 2nd position($pos2)!"); return (-1); | 
|  | 564     } | 
|  | 565     $endlabel=$self->label($pos2); | 
|  | 566     unless ($self->valid($endlabel)) { | 
|  | 567       $self->warn("End label not valid"); return (-1); | 
|  | 568     } | 
|  | 569     if ($endlabel < 1) { | 
|  | 570       $self->warn("position $pos2 not valid as end of subseq!"); return (-1); | 
|  | 571     } | 
|  | 572   } else { | 
|  | 573     unless (defined ($length)) { | 
|  | 574       $endlabel=$self->end; | 
|  | 575     } | 
|  | 576   } | 
|  | 577   return ($self->labelsubseq($startlabel,$length,$endlabel,"unsecuremoderequested")); | 
|  | 578 } | 
|  | 579 | 
|  | 580 # works only inside the transcript, complains if asked outside | 
|  | 581 sub old_subseq { | 
|  | 582   my ($self,$pos1,$pos2,$length) = @_; | 
|  | 583   my ($str,$startcount,$endcount,$seq,$seqlength); | 
|  | 584   if (defined ($length)) { | 
|  | 585     if ($length < 1) { | 
|  | 586       $self->warn("No sense asking for a subseq of length < 1"); | 
|  | 587       return (-1); | 
|  | 588     } | 
|  | 589   } | 
|  | 590   my $firstlabel=$self->coordinate_start; # this is inside Transcript obj | 
|  | 591   my $coord_pos=$self->_inside_position($firstlabel); # TESTME old | 
|  | 592   $seq=$self->seq; | 
|  | 593   $seqlength=CORE::length($seq); | 
|  | 594   unless (defined ($pos1)) { | 
|  | 595     $startcount=1+$coord_pos-1; # i.e. coord_pos | 
|  | 596   } else { | 
|  | 597     if ($pos1 == 0) {  # if position = 0 complain | 
|  | 598       $self->warn("Position cannot be 0!"); return (-1); | 
|  | 599     } elsif ($pos1 < 0) { | 
|  | 600       $pos1++; | 
|  | 601     } | 
|  | 602     if ((defined ($pos2))&&($pos1>$pos2)) { | 
|  | 603       $self->warn("1st position ($pos1) cannot be > 2nd position ($pos2)!"); | 
|  | 604       return (-1); | 
|  | 605     } | 
|  | 606     $startcount=$pos1+$coord_pos-1; | 
|  | 607   } | 
|  | 608   unless (defined ($pos2)) { | 
|  | 609   ; | 
|  | 610   } else { | 
|  | 611     if ($pos2 == 0) {  # if position = 0 complain | 
|  | 612       $self->warn("Position cannot be 0!"); return (-1); | 
|  | 613     } elsif ($pos2 < 0) { | 
|  | 614       $pos2++; | 
|  | 615     } | 
|  | 616     if ((defined ($pos1))&&($pos1>$pos2)) { | 
|  | 617       $self->warn("1st position ($pos1) cannot be > 2nd position ($pos2)!"); | 
|  | 618       return (-1); | 
|  | 619     } | 
|  | 620     $endcount=$pos2+$coord_pos-1; | 
|  | 621     if ($endcount > $seqlength) { | 
|  | 622       #print "\n###DEBUG###: pos1 $pos1 pos2 $pos2 coordpos $coord_pos endcount $endcount seqln $seqlength\n"; | 
|  | 623       $self->warn("Cannot access end position after the end of Transcript"); | 
|  | 624       return (-1); | 
|  | 625     } | 
|  | 626     $length=$endcount-$startcount+1; | 
|  | 627   } | 
|  | 628   #print "\n###DEBUG pos1 $pos1 pos2 $pos2 start $startcount end $endcount length $length coordpos $coord_pos\n"; | 
|  | 629   my $offset=$startcount-1; | 
|  | 630   if ($offset < 0) { | 
|  | 631     $self->warn("Cannot access startposition before the beginning of Transcript, returning from start",1); # ignorable | 
|  | 632     return (substr($seq,0,$length)); | 
|  | 633   } elsif ($offset >= $seqlength) { | 
|  | 634     $self->warn("Cannot access startposition after the end of Transcript"); | 
|  | 635     return (-1); | 
|  | 636   } else { | 
|  | 637     $str=substr($seq,$offset,$length); | 
|  | 638     if (CORE::length($str) < $length) { | 
|  | 639       $self->warn("Attention, cannot return the length requested ". | 
|  | 640 		  "for subseq",1) if $self->verbose > 0; # ignorable | 
|  | 641     } | 
|  | 642     return $str; | 
|  | 643   } | 
|  | 644 } | 
|  | 645 | 
|  | 646 # redefined so that it doesn't require other methods (after deletions) to | 
|  | 647 # reset it. | 
|  | 648 sub start { | 
|  | 649   my $self = shift; | 
|  | 650   my $exonsref=$self->{'exons'}; | 
|  | 651   my @exons=@{$exonsref}; | 
|  | 652   return ($exons[0]->start); | 
|  | 653 } | 
|  | 654 | 
|  | 655 sub end { | 
|  | 656   my $self = shift; | 
|  | 657   my $exonsref=$self->{'exons'}; | 
|  | 658   my @exons=@{$exonsref}; | 
|  | 659   return ($exons[-1]->end); | 
|  | 660 } | 
|  | 661 | 
|  | 662 | 
|  | 663 # internal methods begin here | 
|  | 664 | 
|  | 665 # returns: position of label in transcript's all_labels | 
|  | 666 #          with STARTlabel == 1 | 
|  | 667 # errorcode 0 -> label not found | 
|  | 668 # argument: label | 
|  | 669 sub _inside_position { | 
|  | 670   my ($self,$label)=@_; | 
|  | 671   my ($start,$end,$strand)=($self->start(),$self->end(),$self->strand()); | 
|  | 672   my ($position,$checkme); | 
|  | 673   my @labels=$self->all_labels; | 
|  | 674   foreach $checkme (@labels) { | 
|  | 675     $position++; | 
|  | 676     if ($label == $checkme) { | 
|  | 677       return ($position); | 
|  | 678     } | 
|  | 679   } | 
|  | 680   return (0); | 
|  | 681 } | 
|  | 682 | 
|  | 683 # returns 1 OK or 0 ERROR | 
|  | 684 # arguments: reference to array of Exon object references | 
|  | 685 sub _checkexons { | 
|  | 686   my ($exon,$thisstart); | 
|  | 687   my $self=$exon; | 
|  | 688   my $exonsref=$_[0]; | 
|  | 689   my @exons=@{$exonsref}; | 
|  | 690 | 
|  | 691   my $firstexon = $exons[0]; | 
|  | 692 | 
|  | 693   unless (ref($firstexon) eq "Bio::LiveSeq::Exon") { | 
|  | 694     $self->warn("Object not of class Exon"); | 
|  | 695     return (0); | 
|  | 696   } | 
|  | 697   my $strand = $firstexon->strand; | 
|  | 698 | 
|  | 699   my $prevend = $firstexon->end; | 
|  | 700   shift @exons; # skip first one | 
|  | 701   foreach $exon (@exons) { | 
|  | 702     unless (ref($exon) eq "Bio::LiveSeq::Exon") { # object class check | 
|  | 703       $self->warn("Object not of class Exon"); | 
|  | 704       return (0); | 
|  | 705     } | 
|  | 706     if ($exon->strand != $strand) { # strand consistency check | 
|  | 707       $self->warn("Exons' strands not consistent when trying to create Transcript"); | 
|  | 708       return (0); | 
|  | 709     } | 
|  | 710     $thisstart = $exon->start; | 
|  | 711     unless ($exon->{'seq'}->follows($prevend,$thisstart,$strand)) { | 
|  | 712       $self->warn("Exons not in correct order when trying to create Transcript"); | 
|  | 713       return (0); | 
|  | 714     } | 
|  | 715     $prevend = $exon->end; | 
|  | 716   } | 
|  | 717   return (1); | 
|  | 718 } | 
|  | 719 | 
|  | 720 =head2 get_Translation | 
|  | 721 | 
|  | 722   Title   : valid | 
|  | 723   Usage   : $translation = $obj->get_Translation() | 
|  | 724   Function: retrieves the reference to the object of class Translation (if any) | 
|  | 725             attached to a LiveSeq object | 
|  | 726   Returns : object reference | 
|  | 727   Args    : none | 
|  | 728 | 
|  | 729 =cut | 
|  | 730 | 
|  | 731 sub get_Translation { | 
|  | 732   my $self=shift; | 
|  | 733   return ($self->{'translation'}); # this is set when Translation->new is called | 
|  | 734 } | 
|  | 735 | 
|  | 736 # this checks so that deletion spanning multiple exons is | 
|  | 737 # handled accordingly and correctly | 
|  | 738 # arguments: begin and end label of a deletion | 
|  | 739 # this is called BEFORE any deletion in the chain | 
|  | 740 sub _deletecheck { | 
|  | 741   my ($self,$startlabel,$endlabel)=@_; | 
|  | 742   my $exonsref=$self->{'exons'}; | 
|  | 743   my @exons=@{$exonsref}; | 
|  | 744   my ($startexon,$endexon,$exon); | 
|  | 745   $startexon=$endexon=0; | 
|  | 746   foreach $exon (@exons) { | 
|  | 747     if (($startexon == 0)&&($exon->valid($startlabel))) { | 
|  | 748       $startexon=$exon; # exon containing start of deletion | 
|  | 749     } | 
|  | 750     if (($endexon == 0)&&($exon->valid($endlabel))) { | 
|  | 751       $endexon=$exon; # exon containing end of deletion | 
|  | 752     } | 
|  | 753     if (($startexon)&&($endexon)) { | 
|  | 754       last; # don't check further | 
|  | 755     } | 
|  | 756   } | 
|  | 757   my $nextend=$self->label(2,$endlabel); # retrieve the next label | 
|  | 758   my $prevstart=$self->label(-1,$startlabel); # retrieve the prev label | 
|  | 759 | 
|  | 760   if ($startexon eq $endexon) { # intra-exon deletion | 
|  | 761     if (($startexon->start eq $startlabel) && ($startexon->end eq $endlabel)) { | 
|  | 762       # let's delete the entire exon | 
|  | 763       my @newexons; | 
|  | 764       foreach $exon (@exons) { | 
|  | 765 	unless ($exon eq $startexon) { | 
|  | 766 	  push(@newexons,$exon); | 
|  | 767 	} | 
|  | 768       } | 
|  | 769       $self->{'exons'}=\@newexons; | 
|  | 770     } elsif ($startexon->start eq $startlabel) { # special cases | 
|  | 771       $startexon->{'start'}=$nextend; # set a new start of exon | 
|  | 772     } elsif ($startexon->end eq $endlabel) { | 
|  | 773       $startexon->{'end'}=$prevstart; # set a new end of exon | 
|  | 774     } else { | 
|  | 775       return; # no problem | 
|  | 776     } | 
|  | 777   } else { # two new exons to be created, inter-exons deletion | 
|  | 778     my @newexons; | 
|  | 779     my $exonobj; | 
|  | 780     my $dna=$self->{'seq'}; | 
|  | 781     my $strand=$self->strand; | 
|  | 782     my $notmiddle=1; # flag for skipping exons in the middle of deletion | 
|  | 783     foreach $exon (@exons) { | 
|  | 784       if ($exon eq $startexon) { | 
|  | 785 	$exonobj=Bio::LiveSeq::Exon->new('-seq'=>$dna,'-start'=>$exon->start,'-end'=>$prevstart,'-strand'=>$strand); # new partial exon | 
|  | 786 	push(@newexons,$exonobj); | 
|  | 787 	$notmiddle=0; # now we enter totally deleted exons | 
|  | 788       } elsif ($exon eq $endexon) { | 
|  | 789 	$exonobj=Bio::LiveSeq::Exon->new('-seq'=>$dna,'-start'=>$nextend,'-end'=>$exon->end,'-strand'=>$strand); # new partial exon | 
|  | 790 	push(@newexons,$exonobj); | 
|  | 791 	$notmiddle=1; # exiting totally deleted exons | 
|  | 792       } else { | 
|  | 793 	if ($notmiddle) { # if before or after exons with deletion | 
|  | 794 	  push(@newexons,$exon); | 
|  | 795 	}# else skip them | 
|  | 796       } | 
|  | 797     } | 
|  | 798     $self->{'exons'}=\@newexons; | 
|  | 799   } | 
|  | 800 } | 
|  | 801 | 
|  | 802 =head2 translation_table | 
|  | 803 | 
|  | 804  Title   : translation_table | 
|  | 805  Usage   : $name = $obj->translation_table; | 
|  | 806          : $name = $obj->translation_table(11); | 
|  | 807  Function: Returns or sets the translation_table used for translating the | 
|  | 808            transcript. | 
|  | 809            If it has never been set, it will return undef. | 
|  | 810  Returns : an integer | 
|  | 811 | 
|  | 812 =cut | 
|  | 813 | 
|  | 814 sub translation_table { | 
|  | 815   my ($self,$value) = @_; | 
|  | 816   if (defined $value) { | 
|  | 817     $self->{'translation_table'} = $value; | 
|  | 818   } | 
|  | 819   unless (exists $self->{'translation_table'}) { | 
|  | 820     return (undef); | 
|  | 821   } else { | 
|  | 822     return $self->{'translation_table'}; | 
|  | 823   } | 
|  | 824 } | 
|  | 825 | 
|  | 826 =head2 frame | 
|  | 827 | 
|  | 828  Title   : frame | 
|  | 829  Usage   : $frame = $transcript->frame($label); | 
|  | 830  Function: Returns the frame of a particular nucleotide. | 
|  | 831            Frame can be 0 1 or 2 and means the position in the codon triplet | 
|  | 832            of the particulat nucleotide. 0 is the first codon_position. | 
|  | 833            Codon_position (1 2 3) is simply frame+1. | 
|  | 834            If the label asked for is not inside the Transcript, -1 will be | 
|  | 835            returned. | 
|  | 836  Args    : a label | 
|  | 837  Returns : 0 1 or 2 | 
|  | 838  Errorcode -1 | 
|  | 839 | 
|  | 840 =cut | 
|  | 841 | 
|  | 842 # args: label | 
|  | 843 # returns: frame of nucleotide (0 1 2) | 
|  | 844 # errorcode: -1 | 
|  | 845 sub frame { | 
|  | 846   my ($self,$inputlabel)=@_; | 
|  | 847   my @labels=$self->all_labels; | 
|  | 848   my ($label,$frame,$count); | 
|  | 849   foreach $label (@labels) { | 
|  | 850     if ($inputlabel == $label) { | 
|  | 851       return ($count % 3); | 
|  | 852     } | 
|  | 853     $count++; # 0 1 2 3 4.... | 
|  | 854   } | 
|  | 855   return (-1); # label not found amid Transcript labels | 
|  | 856 } | 
|  | 857 | 
|  | 858 1; |