Mercurial > repos > willmclaren > ensembl_vep
comparison variant_effect_predictor/Bio/LiveSeq/Transcript.pm @ 0:21066c0abaf5 draft
Uploaded
| author | willmclaren |
|---|---|
| date | Fri, 03 Aug 2012 10:04:48 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:21066c0abaf5 |
|---|---|
| 1 # $Id: Transcript.pm,v 1.17 2002/09/25 08:58:23 heikki Exp $ | |
| 2 # | |
| 3 # bioperl module for Bio::LiveSeq::Transcript | |
| 4 # | |
| 5 # Cared for by Joseph Insana <insana@ebi.ac.uk> <jinsana@gmx.net> | |
| 6 # | |
| 7 # Copyright Joseph Insana | |
| 8 # | |
| 9 # You may distribute this module under the same terms as perl itself | |
| 10 # | |
| 11 # POD documentation - main docs before the code | |
| 12 | |
| 13 =head1 NAME | |
| 14 | |
| 15 Bio::LiveSeq::Transcript - Transcript class for LiveSeq | |
| 16 | |
| 17 =head1 SYNOPSIS | |
| 18 | |
| 19 # documentation needed | |
| 20 | |
| 21 =head1 DESCRIPTION | |
| 22 | |
| 23 This stores informations about coding sequences (CDS). | |
| 24 The implementation is that a Transcript object accesses a collection of | |
| 25 Exon objects, inferring from them the nucleotide structure and sequence. | |
| 26 | |
| 27 =head1 AUTHOR - Joseph A.L. Insana | |
| 28 | |
| 29 Email: Insana@ebi.ac.uk, jinsana@gmx.net | |
| 30 | |
| 31 Address: | |
| 32 | |
| 33 EMBL Outstation, European Bioinformatics Institute | |
| 34 Wellcome Trust Genome Campus, Hinxton | |
| 35 Cambs. CB10 1SD, United Kingdom | |
| 36 | |
| 37 =head1 APPENDIX | |
| 38 | |
| 39 The rest of the documentation details each of the object | |
| 40 methods. Internal methods are usually preceded with a _ | |
| 41 | |
| 42 =cut | |
| 43 | |
| 44 # Let the code begin... | |
| 45 | |
| 46 package Bio::LiveSeq::Transcript; | |
| 47 $VERSION=5.2; | |
| 48 | |
| 49 # Version history: | |
| 50 # Tue Mar 21 14:38:02 GMT 2000 v 1.0 begun | |
| 51 # Tue Mar 21 17:45:31 GMT 2000 v 1.1 new() created | |
| 52 # Wed Mar 22 19:40:13 GMT 2000 v 1.4 all_Exons() seq(), length(), all_labels() | |
| 53 # Thu Mar 23 19:08:36 GMT 2000 v 1.5 follows() replaces is_downstream() | |
| 54 # Thu Mar 23 20:59:02 GMT 2000 v 2.0 valid _inside_position label position | |
| 55 # Fri Mar 24 18:33:18 GMT 2000 v 2.2 rewritten position(), now should work with diverse coordinate_starts | |
| 56 # Sat Mar 25 04:08:18 GMT 2000 v 2.21 added firstlabel to position and label so that Translation can exploit it | |
| 57 # Sat Mar 25 06:39:27 GMT 2000 v 2.3 started override of subseq, works just internally | |
| 58 # Mon Mar 27 19:05:15 BST 2000 v 2.4 subseq finished, it works with coord_start | |
| 59 # Fri Mar 31 18:48:07 BST 2000 v 2.5 started downstream_seq() | |
| 60 # Mon Apr 3 17:37:34 BST 2000 v 2.52 upstream_seq added | |
| 61 # Fri Apr 7 03:29:43 BST 2000 v 2.6 up/downstream now can use Gene information | |
| 62 # Sat Apr 8 12:59:58 BST 2000 v 3.0 all_Exons now skips no more valid exons | |
| 63 # Sat Apr 8 13:32:08 BST 2000 v 3.1 get_Translation added | |
| 64 # Wed Apr 12 12:37:08 BST 2000 v 3.2 all_Exons updates Transcript's start/end | |
| 65 # Wed Apr 12 12:41:22 BST 2000 v 3.3 each Exon has "transcript" attribute added | |
| 66 # Wed Apr 12 16:35:56 BST 2000 v 3.4 started coding _deletecheck | |
| 67 # Wed Apr 12 23:40:19 BST 2000 v 3.5 start and end redefined here, no more checks after deletion to refix start/end attributes. And no need of those. Eliminated hence from new() | |
| 68 # Wed Apr 12 23:47:02 BST 2000 v 3.9 finished _deletecheck, debugging starts | |
| 69 # Thu Apr 13 00:37:16 BST 2000 v 4.0 debugging done: seems working OK | |
| 70 # Thu Apr 27 16:18:55 BST 2000 v 4.1 translation_table added | |
| 71 # Tue May 16 17:57:40 BST 2000 v 4.11 corrected bug in docs of downstream_seq | |
| 72 # Wed May 17 16:48:34 BST 2000 v 4.2 frame() added | |
| 73 # Mon May 22 15:22:12 BST 2000 v 4.21 labelsubseq tweaked for cases where startlabel==endlabel (no useless follow() query!) | |
| 74 # Thu Jun 22 20:02:39 BST 2000 v 4.3 valid() moved to SeqI, to be inherited as the general one | |
| 75 # Thu Jun 22 20:27:57 BST 2000 v 4.4 optimized labelsubseq coded! | |
| 76 # Thu Jun 22 21:17:51 BST 2000 v 4.44 in_which_Exon() added | |
| 77 # Sat Jun 24 00:49:55 BST 2000 v 4.5 new subseq() that exploits the new fast labelsubseq | |
| 78 # Thu Jun 29 16:31:19 BST 2000 v 5.0 downsream_seq and upstream_seq recoded so that if entry is RNA it will return sequences up to the entry limits -> it should be properly debugged, expecially the upstream_seq one | |
| 79 # Wed Jul 12 04:01:53 BST 2000 v 5.1 croak -> carp+return(-1) | |
| 80 # Wed Mar 28 15:16:21 BST 2001 v 5.2 carp -> warn,throw (coded methods in SeqI) | |
| 81 | |
| 82 use strict; | |
| 83 # use Carp qw(carp cluck); | |
| 84 use vars qw($VERSION @ISA); | |
| 85 use Bio::LiveSeq::SeqI 3.2; # uses SeqI, inherits from it | |
| 86 use Bio::LiveSeq::Exon 1.0; # uses Exon to create new exon in case of deletion | |
| 87 @ISA=qw(Bio::LiveSeq::SeqI); | |
| 88 | |
| 89 =head2 new | |
| 90 | |
| 91 Title : new | |
| 92 Usage : $transcript = Bio::LiveSeq::Transcript->new(-exons => \@obj_refs); | |
| 93 | |
| 94 Function: generates a new Bio::LiveSeq::Transcript | |
| 95 Returns : reference to a new object of class Transcript | |
| 96 Errorcode -1 | |
| 97 Args : reference to an array of Exon object references | |
| 98 | |
| 99 =cut | |
| 100 | |
| 101 sub new { | |
| 102 my ($thing, %args) = @_; | |
| 103 my $class = ref($thing) || $thing; | |
| 104 my ($obj,%transcript); | |
| 105 | |
| 106 my @exons=@{$args{-exons}}; | |
| 107 | |
| 108 $obj = \%transcript; | |
| 109 $obj = bless $obj, $class; | |
| 110 | |
| 111 unless (@exons) { | |
| 112 $obj->warn("$class not initialised because exons array empty"); | |
| 113 return(-1); | |
| 114 } | |
| 115 | |
| 116 # now useless, after start and end methods have been overridden here | |
| 117 my $firstexon = $exons[0]; | |
| 118 #my $lastexon = $exons[-1]; | |
| 119 #my $start = $firstexon->start; | |
| 120 #my $end = $lastexon->end; | |
| 121 my $strand = $firstexon->strand; | |
| 122 my $seq = $firstexon->{'seq'}; | |
| 123 $obj->alphabet('rna'); | |
| 124 | |
| 125 unless (_checkexons(\@exons)) { | |
| 126 $obj->warn("$class not initialised because of problems in the exon structure"); | |
| 127 return(-1); | |
| 128 } | |
| 129 $obj->{'strand'}=$strand; | |
| 130 $obj->{'exons'}=\@exons; | |
| 131 $obj->{'seq'}=$seq; | |
| 132 | |
| 133 # set Transcript into each Exon | |
| 134 my $exon; | |
| 135 foreach $exon (@exons) { | |
| 136 $exon->{'transcript'}=$obj; | |
| 137 } | |
| 138 return $obj; | |
| 139 } | |
| 140 | |
| 141 | |
| 142 =head2 all_Exons | |
| 143 | |
| 144 Title : all_Exons | |
| 145 Usage : $transcript_obj->all_Exons() | |
| 146 Function: returns references to all Exon objects the Transcript is composed of | |
| 147 Example : foreach $exon ($transcript->all_Exons()) { do_something } | |
| 148 Returns : array of object references | |
| 149 Args : none | |
| 150 | |
| 151 =cut | |
| 152 | |
| 153 sub all_Exons { | |
| 154 my $self=shift; | |
| 155 my $exonsref=$self->{'exons'}; | |
| 156 my @exons=@{$exonsref}; | |
| 157 my @newexons; | |
| 158 my $exon; | |
| 159 foreach $exon (@exons) { | |
| 160 unless ($exon->obj_valid) { | |
| 161 $self->warn("$exon no more valid, start or end label lost, skipping....",1); # ignorable | |
| 162 } else { | |
| 163 push(@newexons,$exon); | |
| 164 } | |
| 165 } | |
| 166 if ($#exons != $#newexons) { | |
| 167 # update exons field | |
| 168 $self->{'exons'}=\@newexons; | |
| 169 } | |
| 170 return (@newexons); | |
| 171 } | |
| 172 | |
| 173 =head2 downstream_seq | |
| 174 | |
| 175 Title : downstream_seq | |
| 176 Usage : $transcript_obj->downstream_seq() | |
| 177 : $transcript_obj->downstream_seq(64) | |
| 178 Function: returns a string of nucleotides downstream of the end of the | |
| 179 CDS. If there is some information of the real mRNA, from features in | |
| 180 an attached Gene object, it will return up to those boundaries. | |
| 181 Otherwise it will return 1000 nucleotides. | |
| 182 If an argument is given it will override the default 1000 number | |
| 183 and return instead /that/ requested number of nucleotides. | |
| 184 But if a Gene object is attached, this argument will be ignored. | |
| 185 Returns : string | |
| 186 Args : an optional integer number of nucleotides to be returned instead of | |
| 187 the default if no gene attached | |
| 188 | |
| 189 =cut | |
| 190 | |
| 191 sub downstream_seq { | |
| 192 my ($self,$howmany)=@_; | |
| 193 my $str; | |
| 194 if (defined ($howmany)) { | |
| 195 unless ($howmany > 0) { | |
| 196 $self->throw("No sense in asking less than 1 downstream nucleotides!"); | |
| 197 } | |
| 198 } else { | |
| 199 unless ($self->{'seq'}->alphabet eq 'rna') { # if rna retrieve until the end | |
| 200 #$str=$DNAobj->labelsubseq($self->end,undef,undef,"unsecuremoderequested"); | |
| 201 #return(substr($str,1)); # delete first nucleotide that is the last of Transcript | |
| 202 if ($self->gene) { # if there is Gene object attached fetch relevant info | |
| 203 $str=$self->{'seq'}->labelsubseq($self->end,undef,$self->gene->maxtranscript->end); # retrieve from end of this Transcript to end of the maxtranscript | |
| 204 $str=substr($str,1); # delete first nucleotide that is the last of Transcript | |
| 205 if (CORE::length($str) > 0) { | |
| 206 return($str); | |
| 207 } else { # if there was no downstream through the gene's maxtranscript, go the usual way | |
| 208 $howmany = 1000; | |
| 209 } | |
| 210 } else { | |
| 211 $howmany = 1000; | |
| 212 } | |
| 213 } | |
| 214 } | |
| 215 my @exons=$self->all_Exons; | |
| 216 my $strand=$self->strand(); | |
| 217 my $lastexon=$exons[-1]; | |
| 218 my $lastexonlength=$lastexon->length; | |
| 219 # $howmany nucs after end of last exon | |
| 220 #my $downstream_seq=$lastexon->subseq($lastexonlength+1,undef,$howmany); | |
| 221 my $downstream_seq; | |
| 222 | |
| 223 if ($howmany) { | |
| 224 $downstream_seq=substr($lastexon->labelsubseq($self->end,$howmany,undef,"unsecuremoderequested"),1); | |
| 225 } else { | |
| 226 if ($strand == 1) { | |
| 227 $downstream_seq=substr($lastexon->labelsubseq($self->end,undef,$self->{'seq'}->end,"unsecuremoderequested"),1); | |
| 228 } else { | |
| 229 $downstream_seq=substr($lastexon->labelsubseq($self->end,undef,$self->{'seq'}->start,"unsecuremoderequested"),1); | |
| 230 } | |
| 231 } | |
| 232 return $downstream_seq; | |
| 233 } | |
| 234 | |
| 235 =head2 upstream_seq | |
| 236 | |
| 237 Title : upstream_seq | |
| 238 Usage : $transcript_obj->upstream_seq() | |
| 239 : $transcript_obj->upstream_seq(64) | |
| 240 Function: just like downstream_seq but returns nucleotides before the ATG | |
| 241 Note : the default, if no Gene information present and no nucleotides | |
| 242 number given, is to return up to 400 nucleotides. | |
| 243 | |
| 244 =cut | |
| 245 | |
| 246 sub upstream_seq { | |
| 247 my ($self,$howmany)=@_; | |
| 248 if (defined ($howmany)) { | |
| 249 unless ($howmany > 0) { | |
| 250 $self->throw("No sense in asking less than 1 upstream nucleotides!"); | |
| 251 } | |
| 252 } else { | |
| 253 unless ($self->{'seq'}->alphabet eq 'rna') { # if rna retrieve from the start | |
| 254 if ($self->gene) { # if there is Gene object attached fetch relevant info | |
| 255 my $str=$self->{'seq'}->labelsubseq($self->gene->maxtranscript->start,undef,$self->start); # retrieve from start of maxtranscript to start of this Transcript | |
| 256 chop $str; # delete last nucleotide that is the A of starting ATG | |
| 257 if (length($str) > 0) { | |
| 258 return($str); | |
| 259 } else { # if there was no upstream through the gene's maxtranscript, go the usual way | |
| 260 $howmany = 400; | |
| 261 } | |
| 262 } else { | |
| 263 $howmany = 400; | |
| 264 } | |
| 265 } | |
| 266 } | |
| 267 my @exons=$self->all_Exons; | |
| 268 my $firstexon=$exons[0]; | |
| 269 | |
| 270 my $upstream_seq; | |
| 271 my $strand=$self->strand(); | |
| 272 | |
| 273 if ($howmany) {# $howmany nucs before begin of first exon | |
| 274 my $labelbefore=$firstexon->label(-$howmany,$firstexon->start); | |
| 275 if ($labelbefore < 1) { | |
| 276 if ($strand == 1) { | |
| 277 $labelbefore=$self->{'seq'}->start; | |
| 278 } else { | |
| 279 $labelbefore=$self->{'seq'}->end; | |
| 280 } | |
| 281 } | |
| 282 $upstream_seq=$firstexon->labelsubseq($labelbefore,undef,$firstexon->start,"unsecuremoderequested"); | |
| 283 chop $upstream_seq; | |
| 284 } else { | |
| 285 if ($strand == 1) { | |
| 286 $upstream_seq=$firstexon->labelsubseq($self->{'seq'}->start,undef,$self->start,"unsecuremoderequested"); | |
| 287 chop $upstream_seq; # delete last nucleotide that is the A of starting ATG | |
| 288 } else { | |
| 289 $upstream_seq=$firstexon->labelsubseq($self->{'seq'}->end,undef,$self->start,"unsecuremoderequested"); | |
| 290 chop $upstream_seq; # delete last nucleotide that is the A of starting ATG | |
| 291 } | |
| 292 } | |
| 293 return $upstream_seq; | |
| 294 } | |
| 295 | |
| 296 # These get redefined here, overriding the SeqI one because they draw their | |
| 297 # information from the Exons a Transcript is built of | |
| 298 # optional argument: firstlabel. If not given, it checks coordinate_start | |
| 299 # This is useful when called by Translation | |
| 300 # also used by _delete | |
| 301 sub label { | |
| 302 my ($self,$position,$firstlabel)=@_; | |
| 303 unless ($position) { # if position = 0 complain ? | |
| 304 $self->warn("Position not given or position 0"); | |
| 305 return (-1); | |
| 306 } | |
| 307 my ($start,$end,$strand)=($self->start(),$self->end(),$self->strand()); | |
| 308 my ($label,@labels,$length,$arraypos); | |
| 309 unless (defined ($firstlabel)) { | |
| 310 $firstlabel=$self->coordinate_start; # this is inside Transcript obj | |
| 311 } | |
| 312 my $coord_pos=$self->_inside_position($firstlabel); | |
| 313 $length=$self->length; | |
| 314 #if ($strand == 1) { | |
| 315 if ($position < 1) { | |
| 316 $position++; # to account for missing of 0 position | |
| 317 } | |
| 318 $arraypos=$position+$coord_pos-2; | |
| 319 #print "\n=-=-=-=-DEBUG: arraypos $arraypos, pos $position, coordpos: $coord_pos"; | |
| 320 if ($arraypos < 0) { | |
| 321 $label=$self->{'seq'}->label($arraypos,$start,$strand); #? | |
| 322 } elsif ($arraypos >= $length) { | |
| 323 $label=$self->{'seq'}->label($arraypos-$length+2,$end,$strand); #? | |
| 324 } else { # inside the Transcript | |
| 325 @labels=$self->all_labels; | |
| 326 $label=$labels[$arraypos]; | |
| 327 } | |
| 328 #} | |
| 329 } | |
| 330 | |
| 331 # argument: label | |
| 332 # returns: position of label according to coord_start | |
| 333 # errorcode: 0 label not found | |
| 334 # optional argument: firstlabel. If not given, it checks coordinate_start | |
| 335 # This is useful when called by Translation | |
| 336 sub position { | |
| 337 my ($self,$label,$firstlabel)=@_; | |
| 338 unless ($self->{'seq'}->valid($label)) { | |
| 339 $self->warn("label is not valid"); | |
| 340 return (0); | |
| 341 } | |
| 342 unless (defined ($firstlabel)) { | |
| 343 $firstlabel=$self->coordinate_start; # this is inside Transcript obj | |
| 344 } | |
| 345 if ($label == $firstlabel) { | |
| 346 return (1); | |
| 347 } | |
| 348 my ($start,$end,$strand)=($self->start(),$self->end(),$self->strand()); | |
| 349 my ($position,$in_pos,$out_pos,$coord_pos); | |
| 350 my $length=$self->length; | |
| 351 $coord_pos=$self->_inside_position($firstlabel); | |
| 352 if ($self->valid($label)) { # if label is inside the Transcript | |
| 353 $in_pos=$self->_inside_position($label); | |
| 354 $position=$in_pos-$coord_pos+1; | |
| 355 if ($position <= 0) { | |
| 356 return ($position-1); # accounts for the missing of the 0 position | |
| 357 } | |
| 358 } else { | |
| 359 if ($self->follows($end,$label)) { # label after end of transcript | |
| 360 $out_pos=$self->{'seq'}->position($label,$end,$strand); | |
| 361 #print "\n+++++++++DEBUG label $label FOLLOWS end $end outpos $out_pos coordpos $coord_pos"; | |
| 362 $position=$out_pos+$length-$coord_pos; | |
| 363 } elsif ($self->follows($label,$start)) { # label before begin of transcript | |
| 364 #print "\n+++++++++DEBUG label $label BEFORE start $start outpos $out_pos coordpos $coord_pos"; | |
| 365 $out_pos=$self->{'seq'}->position($label,$start,$strand); | |
| 366 $position=$out_pos-$coord_pos+1; | |
| 367 } else { # label is in intron (not valid, not after, not before)! | |
| 368 $self->warn("Cannot give position of label pointing to intron according to CDS numbering!",1); | |
| 369 return (0); | |
| 370 } | |
| 371 } | |
| 372 return ($position); | |
| 373 } | |
| 374 | |
| 375 sub seq { | |
| 376 my $self=shift; | |
| 377 my ($exon,$str); | |
| 378 my @exons=$self->all_Exons(); | |
| 379 foreach $exon (@exons) { | |
| 380 $str .= $exon->seq(); | |
| 381 } | |
| 382 return $str; | |
| 383 } | |
| 384 | |
| 385 sub length { | |
| 386 my $self=shift; | |
| 387 my ($exon,$length); | |
| 388 my @exons=$self->all_Exons(); | |
| 389 foreach $exon (@exons) { | |
| 390 $length += $exon->length(); | |
| 391 } | |
| 392 return $length; | |
| 393 } | |
| 394 | |
| 395 sub all_labels { | |
| 396 my $self=shift; | |
| 397 my ($exon,@labels); | |
| 398 my @exons=$self->all_Exons(); | |
| 399 foreach $exon (@exons) { | |
| 400 push (@labels,$exon->all_labels()); | |
| 401 } | |
| 402 return @labels; | |
| 403 } | |
| 404 | |
| 405 # redefined here so that it will retrieve effective subseq without introns | |
| 406 # otherwise it would have retrieved an underlying DNA (possibly with introns) | |
| 407 # subsequence | |
| 408 # Drawback: this is really bulky, label->position and then a call to | |
| 409 # subseq that will do the opposite position-> label | |
| 410 # | |
| 411 # one day this can be rewritten as the main one so that the normal subseq | |
| 412 # will rely on this one and hence avoid this double (useless and lengthy) | |
| 413 # conversion between labels and positions | |
| 414 sub old_labelsubseq { | |
| 415 my ($self,$start,$length,$end)=@_; | |
| 416 my ($pos1,$pos2); | |
| 417 if ($start) { | |
| 418 unless ($self->valid($start)) { | |
| 419 $self->warn("Start label not valid"); return (-1); | |
| 420 } | |
| 421 $pos1=$self->position($start); | |
| 422 } | |
| 423 if ($end) { | |
| 424 if ($end == $start) { | |
| 425 $length=1; | |
| 426 } else { | |
| 427 unless ($self->valid($end)) { | |
| 428 $self->warn("End label not valid"); return (-1); | |
| 429 } | |
| 430 unless ($self->follows($start,$end) == 1) { | |
| 431 $self->warn("End label does not follow Start label!"); return (-1); | |
| 432 } | |
| 433 $pos2=$self->position($end); | |
| 434 undef $length; | |
| 435 } | |
| 436 } | |
| 437 return ($self->subseq($pos1,$pos2,$length)); | |
| 438 } | |
| 439 | |
| 440 # rewritten, eventually | |
| 441 | |
| 442 sub labelsubseq { | |
| 443 my ($self,$start,$length,$end,$unsecuremode)=@_; | |
| 444 unless (defined $unsecuremode && | |
| 445 $unsecuremode eq "unsecuremoderequested") | |
| 446 { # to skip security checks (faster) | |
| 447 if ($start) { | |
| 448 unless ($self->valid($start)) { | |
| 449 $self->warn("Start label not valid"); return (-1); | |
| 450 } | |
| 451 } else { | |
| 452 $start=$self->start; | |
| 453 } | |
| 454 if ($end) { | |
| 455 if ($end == $start) { | |
| 456 $length=1; | |
| 457 undef $end; | |
| 458 } else { | |
| 459 undef $length; # end argument overrides length argument | |
| 460 unless ($self->valid($end)) { | |
| 461 $self->warn("End label not valid"); return (-1); | |
| 462 } | |
| 463 unless ($self->follows($start,$end) == 1) { | |
| 464 $self->warn("End label does not follow Start label!"); return (-1); | |
| 465 } | |
| 466 } | |
| 467 } else { | |
| 468 $end=$self->end; | |
| 469 } | |
| 470 } | |
| 471 my ($seq,$exon,$startexon,$endexon); my @exonlabels; | |
| 472 my @exons=$self->all_Exons; | |
| 473 EXONCHECK: | |
| 474 foreach $exon (@exons) { | |
| 475 if ((!(defined($startexon)))&&($exon->valid($start))) { # checks only if not yet found | |
| 476 $startexon=$exon; | |
| 477 } | |
| 478 if ($exon->valid($end)) { | |
| 479 $endexon=$exon; | |
| 480 } | |
| 481 if ((!(defined($seq)) && (defined($startexon)))) { # initializes only once | |
| 482 if ((defined($endexon)) && ($endexon eq $startexon)) { # then perfect, we are finished | |
| 483 if ($length) { | |
| 484 $seq = $startexon->labelsubseq($start,$length,undef,"unsecuremoderequested"); | |
| 485 | |
| 486 | |
| 487 last EXONCHECK; | |
| 488 } else { | |
| 489 $seq = $startexon->labelsubseq($start,undef,$end,"unsecuremoderequested"); | |
| 490 } | |
| 491 last EXONCHECK; | |
| 492 } else { # get up to the end of the exon | |
| 493 $seq = $startexon->labelsubseq($start,undef,undef,"unsecuremoderequested"); | |
| 494 } | |
| 495 } | |
| 496 if (($startexon)&&($exon ne $startexon)) { | |
| 497 if (defined($endexon)) { # we arrived to the last exon | |
| 498 $seq .= $endexon->labelsubseq(undef,undef,$end,"unsecuremoderequested"); # get from the start of the exon | |
| 499 last EXONCHECK; | |
| 500 | |
| 501 } elsif (defined($startexon)) { # we are in a whole-exon-in-the-middle case | |
| 502 $seq .= $exon->seq; # we add it completely to the seq | |
| 503 } # else, we still have to reach the start point, exon useless, we move on | |
| 504 if ($length) { # if length argument specified | |
| 505 if (($seq && (CORE::length($seq) >= $length))) { | |
| 506 last EXONCHECK; | |
| 507 } | |
| 508 } | |
| 509 } | |
| 510 } | |
| 511 if ($length) { | |
| 512 return (substr($seq,0,$length)); | |
| 513 } else { | |
| 514 return ($seq); | |
| 515 } | |
| 516 } | |
| 517 | |
| 518 | |
| 519 # argument: label | |
| 520 # returns: the objref and progressive number of the Exon containing that label | |
| 521 # errorcode: -1 | |
| 522 sub in_which_Exon { | |
| 523 my ($self,$label)=@_; | |
| 524 my ($count,$exon); | |
| 525 my @exons=$self->all_Exons; | |
| 526 foreach $exon (@exons) { | |
| 527 $count++; # 1st exon is numbered "1" | |
| 528 if ($exon->valid($label)) { | |
| 529 return ($exon,$count) | |
| 530 } | |
| 531 } | |
| 532 return (-1); # if nothing found | |
| 533 } | |
| 534 | |
| 535 # recoded to exploit the new fast labelsubseq() | |
| 536 # valid only inside Transcript | |
| 537 sub subseq { | |
| 538 my ($self,$pos1,$pos2,$length) = @_; | |
| 539 my ($str,$startlabel,$endlabel); | |
| 540 if (defined ($pos1)) { | |
| 541 if ($pos1 == 0) { # if position = 0 complain | |
| 542 $self->warn("Position cannot be 0!"); return (-1); | |
| 543 } | |
| 544 if ((defined ($pos2))&&($pos1>$pos2)) { | |
| 545 $self->warn("1st position($pos1) cannot be > 2nd position($pos2)!"); return (-1); | |
| 546 } | |
| 547 $startlabel=$self->label($pos1); | |
| 548 unless ($self->valid($startlabel)) { | |
| 549 $self->warn("Start label not valid"); return (-1); | |
| 550 } | |
| 551 if ($startlabel < 1) { | |
| 552 $self->warn("position $pos1 not valid as start of subseq!"); return (-1); | |
| 553 } | |
| 554 } else { | |
| 555 $startlabel=$self->start; | |
| 556 } | |
| 557 if (defined ($pos2)) { | |
| 558 if ($pos2 == 0) { # if position = 0 complain | |
| 559 $self->warn("Position cannot be 0!"); return (-1); | |
| 560 } | |
| 561 undef $length; | |
| 562 if ((defined ($pos1))&&($pos1>$pos2)) { | |
| 563 $self->warn("1st position($pos1) cannot be > 2nd position($pos2)!"); return (-1); | |
| 564 } | |
| 565 $endlabel=$self->label($pos2); | |
| 566 unless ($self->valid($endlabel)) { | |
| 567 $self->warn("End label not valid"); return (-1); | |
| 568 } | |
| 569 if ($endlabel < 1) { | |
| 570 $self->warn("position $pos2 not valid as end of subseq!"); return (-1); | |
| 571 } | |
| 572 } else { | |
| 573 unless (defined ($length)) { | |
| 574 $endlabel=$self->end; | |
| 575 } | |
| 576 } | |
| 577 return ($self->labelsubseq($startlabel,$length,$endlabel,"unsecuremoderequested")); | |
| 578 } | |
| 579 | |
| 580 # works only inside the transcript, complains if asked outside | |
| 581 sub old_subseq { | |
| 582 my ($self,$pos1,$pos2,$length) = @_; | |
| 583 my ($str,$startcount,$endcount,$seq,$seqlength); | |
| 584 if (defined ($length)) { | |
| 585 if ($length < 1) { | |
| 586 $self->warn("No sense asking for a subseq of length < 1"); | |
| 587 return (-1); | |
| 588 } | |
| 589 } | |
| 590 my $firstlabel=$self->coordinate_start; # this is inside Transcript obj | |
| 591 my $coord_pos=$self->_inside_position($firstlabel); # TESTME old | |
| 592 $seq=$self->seq; | |
| 593 $seqlength=CORE::length($seq); | |
| 594 unless (defined ($pos1)) { | |
| 595 $startcount=1+$coord_pos-1; # i.e. coord_pos | |
| 596 } else { | |
| 597 if ($pos1 == 0) { # if position = 0 complain | |
| 598 $self->warn("Position cannot be 0!"); return (-1); | |
| 599 } elsif ($pos1 < 0) { | |
| 600 $pos1++; | |
| 601 } | |
| 602 if ((defined ($pos2))&&($pos1>$pos2)) { | |
| 603 $self->warn("1st position ($pos1) cannot be > 2nd position ($pos2)!"); | |
| 604 return (-1); | |
| 605 } | |
| 606 $startcount=$pos1+$coord_pos-1; | |
| 607 } | |
| 608 unless (defined ($pos2)) { | |
| 609 ; | |
| 610 } else { | |
| 611 if ($pos2 == 0) { # if position = 0 complain | |
| 612 $self->warn("Position cannot be 0!"); return (-1); | |
| 613 } elsif ($pos2 < 0) { | |
| 614 $pos2++; | |
| 615 } | |
| 616 if ((defined ($pos1))&&($pos1>$pos2)) { | |
| 617 $self->warn("1st position ($pos1) cannot be > 2nd position ($pos2)!"); | |
| 618 return (-1); | |
| 619 } | |
| 620 $endcount=$pos2+$coord_pos-1; | |
| 621 if ($endcount > $seqlength) { | |
| 622 #print "\n###DEBUG###: pos1 $pos1 pos2 $pos2 coordpos $coord_pos endcount $endcount seqln $seqlength\n"; | |
| 623 $self->warn("Cannot access end position after the end of Transcript"); | |
| 624 return (-1); | |
| 625 } | |
| 626 $length=$endcount-$startcount+1; | |
| 627 } | |
| 628 #print "\n###DEBUG pos1 $pos1 pos2 $pos2 start $startcount end $endcount length $length coordpos $coord_pos\n"; | |
| 629 my $offset=$startcount-1; | |
| 630 if ($offset < 0) { | |
| 631 $self->warn("Cannot access startposition before the beginning of Transcript, returning from start",1); # ignorable | |
| 632 return (substr($seq,0,$length)); | |
| 633 } elsif ($offset >= $seqlength) { | |
| 634 $self->warn("Cannot access startposition after the end of Transcript"); | |
| 635 return (-1); | |
| 636 } else { | |
| 637 $str=substr($seq,$offset,$length); | |
| 638 if (CORE::length($str) < $length) { | |
| 639 $self->warn("Attention, cannot return the length requested ". | |
| 640 "for subseq",1) if $self->verbose > 0; # ignorable | |
| 641 } | |
| 642 return $str; | |
| 643 } | |
| 644 } | |
| 645 | |
| 646 # redefined so that it doesn't require other methods (after deletions) to | |
| 647 # reset it. | |
| 648 sub start { | |
| 649 my $self = shift; | |
| 650 my $exonsref=$self->{'exons'}; | |
| 651 my @exons=@{$exonsref}; | |
| 652 return ($exons[0]->start); | |
| 653 } | |
| 654 | |
| 655 sub end { | |
| 656 my $self = shift; | |
| 657 my $exonsref=$self->{'exons'}; | |
| 658 my @exons=@{$exonsref}; | |
| 659 return ($exons[-1]->end); | |
| 660 } | |
| 661 | |
| 662 | |
| 663 # internal methods begin here | |
| 664 | |
| 665 # returns: position of label in transcript's all_labels | |
| 666 # with STARTlabel == 1 | |
| 667 # errorcode 0 -> label not found | |
| 668 # argument: label | |
| 669 sub _inside_position { | |
| 670 my ($self,$label)=@_; | |
| 671 my ($start,$end,$strand)=($self->start(),$self->end(),$self->strand()); | |
| 672 my ($position,$checkme); | |
| 673 my @labels=$self->all_labels; | |
| 674 foreach $checkme (@labels) { | |
| 675 $position++; | |
| 676 if ($label == $checkme) { | |
| 677 return ($position); | |
| 678 } | |
| 679 } | |
| 680 return (0); | |
| 681 } | |
| 682 | |
| 683 # returns 1 OK or 0 ERROR | |
| 684 # arguments: reference to array of Exon object references | |
| 685 sub _checkexons { | |
| 686 my ($exon,$thisstart); | |
| 687 my $self=$exon; | |
| 688 my $exonsref=$_[0]; | |
| 689 my @exons=@{$exonsref}; | |
| 690 | |
| 691 my $firstexon = $exons[0]; | |
| 692 | |
| 693 unless (ref($firstexon) eq "Bio::LiveSeq::Exon") { | |
| 694 $self->warn("Object not of class Exon"); | |
| 695 return (0); | |
| 696 } | |
| 697 my $strand = $firstexon->strand; | |
| 698 | |
| 699 my $prevend = $firstexon->end; | |
| 700 shift @exons; # skip first one | |
| 701 foreach $exon (@exons) { | |
| 702 unless (ref($exon) eq "Bio::LiveSeq::Exon") { # object class check | |
| 703 $self->warn("Object not of class Exon"); | |
| 704 return (0); | |
| 705 } | |
| 706 if ($exon->strand != $strand) { # strand consistency check | |
| 707 $self->warn("Exons' strands not consistent when trying to create Transcript"); | |
| 708 return (0); | |
| 709 } | |
| 710 $thisstart = $exon->start; | |
| 711 unless ($exon->{'seq'}->follows($prevend,$thisstart,$strand)) { | |
| 712 $self->warn("Exons not in correct order when trying to create Transcript"); | |
| 713 return (0); | |
| 714 } | |
| 715 $prevend = $exon->end; | |
| 716 } | |
| 717 return (1); | |
| 718 } | |
| 719 | |
| 720 =head2 get_Translation | |
| 721 | |
| 722 Title : valid | |
| 723 Usage : $translation = $obj->get_Translation() | |
| 724 Function: retrieves the reference to the object of class Translation (if any) | |
| 725 attached to a LiveSeq object | |
| 726 Returns : object reference | |
| 727 Args : none | |
| 728 | |
| 729 =cut | |
| 730 | |
| 731 sub get_Translation { | |
| 732 my $self=shift; | |
| 733 return ($self->{'translation'}); # this is set when Translation->new is called | |
| 734 } | |
| 735 | |
| 736 # this checks so that deletion spanning multiple exons is | |
| 737 # handled accordingly and correctly | |
| 738 # arguments: begin and end label of a deletion | |
| 739 # this is called BEFORE any deletion in the chain | |
| 740 sub _deletecheck { | |
| 741 my ($self,$startlabel,$endlabel)=@_; | |
| 742 my $exonsref=$self->{'exons'}; | |
| 743 my @exons=@{$exonsref}; | |
| 744 my ($startexon,$endexon,$exon); | |
| 745 $startexon=$endexon=0; | |
| 746 foreach $exon (@exons) { | |
| 747 if (($startexon == 0)&&($exon->valid($startlabel))) { | |
| 748 $startexon=$exon; # exon containing start of deletion | |
| 749 } | |
| 750 if (($endexon == 0)&&($exon->valid($endlabel))) { | |
| 751 $endexon=$exon; # exon containing end of deletion | |
| 752 } | |
| 753 if (($startexon)&&($endexon)) { | |
| 754 last; # don't check further | |
| 755 } | |
| 756 } | |
| 757 my $nextend=$self->label(2,$endlabel); # retrieve the next label | |
| 758 my $prevstart=$self->label(-1,$startlabel); # retrieve the prev label | |
| 759 | |
| 760 if ($startexon eq $endexon) { # intra-exon deletion | |
| 761 if (($startexon->start eq $startlabel) && ($startexon->end eq $endlabel)) { | |
| 762 # let's delete the entire exon | |
| 763 my @newexons; | |
| 764 foreach $exon (@exons) { | |
| 765 unless ($exon eq $startexon) { | |
| 766 push(@newexons,$exon); | |
| 767 } | |
| 768 } | |
| 769 $self->{'exons'}=\@newexons; | |
| 770 } elsif ($startexon->start eq $startlabel) { # special cases | |
| 771 $startexon->{'start'}=$nextend; # set a new start of exon | |
| 772 } elsif ($startexon->end eq $endlabel) { | |
| 773 $startexon->{'end'}=$prevstart; # set a new end of exon | |
| 774 } else { | |
| 775 return; # no problem | |
| 776 } | |
| 777 } else { # two new exons to be created, inter-exons deletion | |
| 778 my @newexons; | |
| 779 my $exonobj; | |
| 780 my $dna=$self->{'seq'}; | |
| 781 my $strand=$self->strand; | |
| 782 my $notmiddle=1; # flag for skipping exons in the middle of deletion | |
| 783 foreach $exon (@exons) { | |
| 784 if ($exon eq $startexon) { | |
| 785 $exonobj=Bio::LiveSeq::Exon->new('-seq'=>$dna,'-start'=>$exon->start,'-end'=>$prevstart,'-strand'=>$strand); # new partial exon | |
| 786 push(@newexons,$exonobj); | |
| 787 $notmiddle=0; # now we enter totally deleted exons | |
| 788 } elsif ($exon eq $endexon) { | |
| 789 $exonobj=Bio::LiveSeq::Exon->new('-seq'=>$dna,'-start'=>$nextend,'-end'=>$exon->end,'-strand'=>$strand); # new partial exon | |
| 790 push(@newexons,$exonobj); | |
| 791 $notmiddle=1; # exiting totally deleted exons | |
| 792 } else { | |
| 793 if ($notmiddle) { # if before or after exons with deletion | |
| 794 push(@newexons,$exon); | |
| 795 }# else skip them | |
| 796 } | |
| 797 } | |
| 798 $self->{'exons'}=\@newexons; | |
| 799 } | |
| 800 } | |
| 801 | |
| 802 =head2 translation_table | |
| 803 | |
| 804 Title : translation_table | |
| 805 Usage : $name = $obj->translation_table; | |
| 806 : $name = $obj->translation_table(11); | |
| 807 Function: Returns or sets the translation_table used for translating the | |
| 808 transcript. | |
| 809 If it has never been set, it will return undef. | |
| 810 Returns : an integer | |
| 811 | |
| 812 =cut | |
| 813 | |
| 814 sub translation_table { | |
| 815 my ($self,$value) = @_; | |
| 816 if (defined $value) { | |
| 817 $self->{'translation_table'} = $value; | |
| 818 } | |
| 819 unless (exists $self->{'translation_table'}) { | |
| 820 return (undef); | |
| 821 } else { | |
| 822 return $self->{'translation_table'}; | |
| 823 } | |
| 824 } | |
| 825 | |
| 826 =head2 frame | |
| 827 | |
| 828 Title : frame | |
| 829 Usage : $frame = $transcript->frame($label); | |
| 830 Function: Returns the frame of a particular nucleotide. | |
| 831 Frame can be 0 1 or 2 and means the position in the codon triplet | |
| 832 of the particulat nucleotide. 0 is the first codon_position. | |
| 833 Codon_position (1 2 3) is simply frame+1. | |
| 834 If the label asked for is not inside the Transcript, -1 will be | |
| 835 returned. | |
| 836 Args : a label | |
| 837 Returns : 0 1 or 2 | |
| 838 Errorcode -1 | |
| 839 | |
| 840 =cut | |
| 841 | |
| 842 # args: label | |
| 843 # returns: frame of nucleotide (0 1 2) | |
| 844 # errorcode: -1 | |
| 845 sub frame { | |
| 846 my ($self,$inputlabel)=@_; | |
| 847 my @labels=$self->all_labels; | |
| 848 my ($label,$frame,$count); | |
| 849 foreach $label (@labels) { | |
| 850 if ($inputlabel == $label) { | |
| 851 return ($count % 3); | |
| 852 } | |
| 853 $count++; # 0 1 2 3 4.... | |
| 854 } | |
| 855 return (-1); # label not found amid Transcript labels | |
| 856 } | |
| 857 | |
| 858 1; |
