Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/Seq/EncodedSeq.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 # $Id: EncodedSeq.pm,v 1.5.2.1 2003/04/28 12:11:57 jason Exp $ | |
| 2 # | |
| 3 # BioPerl module for Bio::Seq::EncodedSeq | |
| 4 # | |
| 5 # Cared for by Aaron Mackey | |
| 6 # | |
| 7 # Copyright Aaron Mackey | |
| 8 # | |
| 9 # You may distribute this module under the same terms as perl itself | |
| 10 | |
| 11 # POD documentation - main docs before the code | |
| 12 | |
| 13 =head1 NAME | |
| 14 | |
| 15 Bio::Seq::EncodedSeq - subtype of L<Bio::LocatableSeq|Bio::LocatableSeq> to store DNA that encodes a protein | |
| 16 | |
| 17 =head1 SYNOPSIS | |
| 18 | |
| 19 $obj = new Bio::Seq::EncodedSeq(-seq => $dna, | |
| 20 -encoding => "CCCCCCCIIIIICCCCC", | |
| 21 -start => 1, | |
| 22 -strand => 1, | |
| 23 -length => 17); | |
| 24 | |
| 25 # splice out (and possibly revcomp) the coding sequence | |
| 26 $cds = obj->cds; | |
| 27 | |
| 28 # obtain the protein translation of the sequence | |
| 29 $prot = $obj->translate; | |
| 30 | |
| 31 # other access/inspection routines as with Bio::LocatableSeq and | |
| 32 # Bio::SeqI; note that coordinates are relative only to the DNA | |
| 33 # sequence, not it's implicit encoded protein sequence. | |
| 34 | |
| 35 =head1 DESCRIPTION | |
| 36 | |
| 37 Bio::Seq::EncodedSeq is a L<Bio::LocatableSeq|Bio::LocatableSeq> | |
| 38 object that holds a DNA sequence as well as information about the | |
| 39 coding potential of that DNA sequence. It is meant to be useful in an | |
| 40 alignment context, where the DNA may contain frameshifts, gaps and/or | |
| 41 introns, or in describing the transcript of a gene. An EncodedSeq | |
| 42 provides the ability to access the "spliced" coding sequence, meaning | |
| 43 that all introns and gaps are removed, and any frameshifts are | |
| 44 adjusted to provide a "clean" CDS. | |
| 45 | |
| 46 In order to make simultaneous use of either the DNA or the implicit | |
| 47 encoded protein sequence coordinates, please see | |
| 48 L<Bio::Coordinate::EncodingPair>. | |
| 49 | |
| 50 =head1 ENCODING | |
| 51 | |
| 52 We use the term "encoding" here to refer to the series of symbols that | |
| 53 we use to identify which residues of a DNA sequence are protein-coding | |
| 54 (i.e. part of a codon), intronic, part of a 5' or 3', frameshift | |
| 55 "mutations", etc. From this information, a Bio::Seq::EncodedSeq is | |
| 56 able to "figure out" its translational CDS. There are two sets of | |
| 57 coding characters, one termed "implicit" and one termed "explicit". | |
| 58 | |
| 59 The "implict" encoding is a bit simpler than the "explicit" encoding: | |
| 60 'C' is used for any nucleotide that's part of a codon, 'U' for any | |
| 61 UTR, etc. The full list is shown below: | |
| 62 | |
| 63 Code Meaning | |
| 64 ---- ------- | |
| 65 C coding | |
| 66 I intronic | |
| 67 U untranslated | |
| 68 G gapped (for use in alignments) | |
| 69 F a "forward", +1 frameshift | |
| 70 B a "backward", -1 frameshift | |
| 71 | |
| 72 The "explicit" encoding is just an expansion of the "implicit" | |
| 73 encoding, to denote phase: | |
| 74 | |
| 75 Code Meaning | |
| 76 ---- ------- | |
| 77 C coding, 1st codon position | |
| 78 D coding, 2nd codon position | |
| 79 E coding, 3rd codon position | |
| 80 | |
| 81 I intronic, phase 0 (relative to intron begin) | |
| 82 J intronic, phase 1 | |
| 83 K intronic, phase 2 | |
| 84 | |
| 85 U untranslated 3'UTR | |
| 86 V untranslated 5'UTR | |
| 87 | |
| 88 G gapped (for use in alignments) | |
| 89 F a "forward", +1 frameshift | |
| 90 B a "backward", -1 frameshift | |
| 91 | |
| 92 Note that the explicit coding is meant to provide easy access to | |
| 93 position/phase specific nucleotides: | |
| 94 | |
| 95 $obj = new Bio::Seq::EncodedSeq (-seq => "ACAATCAGACTACG...", | |
| 96 -encoding => "CCCCCCIII..." | |
| 97 ); | |
| 98 | |
| 99 # fetch arrays of nucleotides at each codon position: | |
| 100 my @pos1 = $obj->dnaseq(encoding => 'C', explicit => 1); | |
| 101 my @pos2 = $obj->dnaseq(encoding => 'D'); | |
| 102 my @pos3 = $obj->dnaseq(encoding => 'E'); | |
| 103 | |
| 104 # fetch arrays of "3-1" codon dinucleotides, useful for genomic | |
| 105 # signature analyses without compounding influences of codon bias: | |
| 106 my @pairs = $obj->dnaseq(encoding => 'EC'); | |
| 107 | |
| 108 =head1 FEEDBACK | |
| 109 | |
| 110 =head2 Mailing Lists | |
| 111 | |
| 112 User feedback is an integral part of the evolution of this | |
| 113 and other Bioperl modules. Send your comments and suggestions preferably | |
| 114 to one of the Bioperl mailing lists. | |
| 115 Your participation is much appreciated. | |
| 116 | |
| 117 bioperl-l@bioperl.org - General discussion | |
| 118 http://www.bioperl.org/MailList.html - About the mailing lists | |
| 119 | |
| 120 =head2 Reporting Bugs | |
| 121 | |
| 122 Report bugs to the Bioperl bug tracking system to help us keep track | |
| 123 the bugs and their resolution. | |
| 124 Bug reports can be submitted via email or the web: | |
| 125 | |
| 126 bioperl-bugs@bio.perl.org | |
| 127 http://bugzilla.bioperl.org/ | |
| 128 | |
| 129 =head1 AUTHOR - Aaron Mackey | |
| 130 | |
| 131 Email amackey@virginia.edu | |
| 132 | |
| 133 =head1 APPENDIX | |
| 134 | |
| 135 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ | |
| 136 | |
| 137 =cut | |
| 138 | |
| 139 | |
| 140 # Let the code begin... | |
| 141 | |
| 142 package Bio::Seq::EncodedSeq; | |
| 143 | |
| 144 use strict; | |
| 145 use vars qw(@ISA); | |
| 146 | |
| 147 use Bio::LocatableSeq; | |
| 148 | |
| 149 @ISA = qw(Bio::LocatableSeq); | |
| 150 | |
| 151 =head2 new | |
| 152 | |
| 153 Title : new | |
| 154 Usage : $obj = Bio::Seq::EncodedSeq->new(-seq => "AGTACGTGTCATG", | |
| 155 -encoding => "CCCCCCFCCCCCC", | |
| 156 -id => "myseq", | |
| 157 -start => 1, | |
| 158 -end => 13, | |
| 159 -strand => 1 | |
| 160 ); | |
| 161 Function: creates a new Bio::Seq::EncodedSeq object from a supplied DNA | |
| 162 sequence | |
| 163 Returns : a new Bio::Seq::EncodedSeq object | |
| 164 | |
| 165 Args : seq - primary nucleotide sequence used to encode the | |
| 166 protein; note that any positions involved in a | |
| 167 gap ('G') or backward frameshift ('B') should | |
| 168 have one or more gap characters; if the encoding | |
| 169 specifies G or B, but no (or not enough) gap | |
| 170 characters exist, *they'll be added*; similary, | |
| 171 if there are gap characters without a | |
| 172 corresponding G or B encoding, G's will be | |
| 173 inserted into the encoding. This allows some | |
| 174 flexibility in specifying your sequence and | |
| 175 coding without having to calculate a lot of the | |
| 176 encoding for yourself. | |
| 177 | |
| 178 encoding - a string of characters (see Encoding Table) | |
| 179 describing backwards frameshifts implied by the | |
| 180 encoding but not present in the sequence will be | |
| 181 added (as '-'s) to the sequence. If not | |
| 182 supplied, it will be assumed that all positions | |
| 183 are coding (C). Encoding may include either | |
| 184 implicit phase encoding characters (i.e. "CCC") | |
| 185 and/or explicit encoding characters (i.e. "CDE"). | |
| 186 Additionally, prefixed numbers may be used to | |
| 187 denote repetition (i.e. "27C3I28C"). | |
| 188 | |
| 189 Alternatively, encoding may be a hashref | |
| 190 datastructure, with encoding characters as keys | |
| 191 and Bio::LocationI objects (or arrayrefs of | |
| 192 Bio::LocationI objects) as values, e.g.: | |
| 193 | |
| 194 { C => [ Bio::Location::Simple->new(1,9), | |
| 195 Bio::Location::Simple->new(11,13) ], | |
| 196 F => Bio::Location::Simple->new(10,10), | |
| 197 } # same as "CCCCCCCCCFCCC" | |
| 198 | |
| 199 Note that if the location ranges overlap, the | |
| 200 behavior of the encoding will be undefined | |
| 201 (well, it will be defined, but only according to | |
| 202 the order in which the hash keys are read, which | |
| 203 is basically undefined ... just don't do that). | |
| 204 | |
| 205 id, start, end, strand - as with Bio::LocatableSeq; note | |
| 206 that the coordinates are relative to the | |
| 207 encoding DNA sequence, not the implicit protein | |
| 208 sequence. If strand is reversed, then the | |
| 209 encoding is assumed to be relative to the | |
| 210 reverse strand as well. | |
| 211 | |
| 212 =cut | |
| 213 | |
| 214 #' | |
| 215 | |
| 216 sub new { | |
| 217 | |
| 218 my ($self, @args) = @_; | |
| 219 $self = $self->SUPER::new(@args, -alphabet => 'dna'); | |
| 220 my ($enc) = $self->_rearrange([qw(ENCODING)], @args); | |
| 221 # set the real encoding: | |
| 222 if ($enc) { | |
| 223 $self->encoding($enc); | |
| 224 } else { | |
| 225 $self->_recheck_encoding; | |
| 226 } | |
| 227 return $self; | |
| 228 } | |
| 229 | |
| 230 =head2 encoding | |
| 231 | |
| 232 Title : encoding | |
| 233 Usage : $obj->encoding("CCCCCC"); | |
| 234 $obj->encoding( -encoding => { I => $location } ); | |
| 235 $enc = $obj->encoding(-explicit => 1); | |
| 236 $enc = $obj->encoding("CCCCCC", -explicit => 1); | |
| 237 $enc = $obj->encoding(-location => $location, | |
| 238 -explicit => 1, | |
| 239 -absolute => 1 ); | |
| 240 Function: get/set the objects encoding, either globally or by location(s). | |
| 241 Returns : the (possibly new) encoding string. | |
| 242 Args : encoding - see the encoding argument to the new() function. | |
| 243 | |
| 244 explicit - whether or not to return explicit phase | |
| 245 information in the coding (i.e. "CCC" becomes | |
| 246 "CDE", "III" becomes "IJK", etc); defaults to 0. | |
| 247 | |
| 248 location - optional; location to get/set the encoding. | |
| 249 Defaults to the entire sequence. | |
| 250 | |
| 251 absolute - whether or not the locational elements (either | |
| 252 in the encoding hashref or the location | |
| 253 argument) are relative to the absolute start/end | |
| 254 of the Bio::LocatableSeq, or to the internal, | |
| 255 relative coordinate system (beginning at 1); | |
| 256 defaults to 0 (i.e. relative) | |
| 257 | |
| 258 =cut | |
| 259 | |
| 260 sub encoding { | |
| 261 | |
| 262 my ($self, @args) = @_; | |
| 263 my ($enc, $loc, $exp, $abs) = $self->_rearrange([qw(ENCODING LOCATION EXPLICIT ABSOLUTE)], @args); | |
| 264 | |
| 265 if (!$enc) { | |
| 266 # do nothing; _recheck_encoding will fix for us, if necessary | |
| 267 } elsif (ref $enc eq 'HASH') { | |
| 268 $self->throw( -class => 'Bio::Root::NotImplemented', | |
| 269 -text => "Hashref functionality not yet implemented;\nplease email me if you really need this."); | |
| 270 #TODO: finish all this | |
| 271 while (my ($char, $locs) = each %$enc) { | |
| 272 if (ref $locs eq 'ARRAY') { | |
| 273 } elsif (UNIVERSAL::isa($locs, "Bio::LocationI")) { | |
| 274 } else { | |
| 275 $self->throw("Only a scalar or a ref to a hash; not a ref to a @{{lc ref $enc}}"); | |
| 276 } | |
| 277 } | |
| 278 } elsif (! ref $enc) { | |
| 279 $enc = uc $enc; | |
| 280 $exp = 1 if (!defined $exp && $enc =~ m/[DEJKV]/o); | |
| 281 | |
| 282 if ($enc =~ m/\d/o) { # numerically "enhanced" encoding | |
| 283 my $numenc = $enc; | |
| 284 $enc = ''; | |
| 285 while ($numenc =~ m/\G(\d*)([CDEIJKUVGFB])/g) { | |
| 286 my ($num, $char) = ($1, $2); | |
| 287 $num = 1 unless $num; | |
| 288 $enc .= $char x $num; | |
| 289 } | |
| 290 } | |
| 291 | |
| 292 if (defined $exp && $exp == 0 && $enc =~ m/([^CIUGFB])/) { | |
| 293 $self->throw("Unrecognized character '$1' in implicit encoding"); | |
| 294 } elsif ($enc =~ m/[^CDEIJKUVGFB]/) { | |
| 295 $self->throw("Unrecognized character '$1' in explicit encoding"); | |
| 296 } | |
| 297 | |
| 298 if ($loc) { # a global location, over which to apply the specified encoding. | |
| 299 | |
| 300 # balk if too many non-gap characters present in encoding: | |
| 301 my ($ct) = $enc =~ tr/GB/GB/; | |
| 302 $ct = length($enc) - $ct; | |
| 303 $self->throw("Location length doesn't match number of coding chars in encoding!") | |
| 304 if ($loc->location_type eq 'EXACT' && $loc->length != $ct); | |
| 305 | |
| 306 my $start = $loc->start; | |
| 307 my $end = $loc->end; | |
| 308 | |
| 309 # strip any encoding that hangs off the ends of the sequence: | |
| 310 if ($start < $self->start) { | |
| 311 my $diff = $self->start - $start; | |
| 312 $start = $self->start; | |
| 313 $enc = substr($enc, $diff); | |
| 314 } | |
| 315 if ($end > $self->end) { | |
| 316 my $diff = $end - $self->end; | |
| 317 $end = $self->end; | |
| 318 $enc = substr($enc, -$diff); | |
| 319 } | |
| 320 | |
| 321 my $currenc = $self->{_encoding}; | |
| 322 my $currseq = $self->seq; | |
| 323 | |
| 324 my ($spanstart, $spanend) = ($self->column_from_residue_number($start), | |
| 325 $self->column_from_residue_number($end) ); | |
| 326 | |
| 327 if ($currseq) { | |
| 328 # strip any gaps in sequence spanned by this location: | |
| 329 my ($before, $in, $after) = $currseq =~ m/(.{@{[ $spanstart - ($loc->location_type eq 'IN-BETWEEN' ? 0 : 1) ]}}) | |
| 330 (.{@{[ $spanend - $spanstart + ($loc->location_type eq 'IN-BETWEEN' ? -1 : 1) ]}}) | |
| 331 (.*) | |
| 332 /x; | |
| 333 $in =~ s/[\.\-]+//g; | |
| 334 $currseq = $before . $in . $after; | |
| 335 # change seq without changing the alphabet | |
| 336 $self->seq($currseq,$self->alphabet()); | |
| 337 } | |
| 338 | |
| 339 $currenc = reverse $currenc if $self->strand < 0; | |
| 340 substr($currenc, $spanstart, $spanend - $spanstart + ($loc->location_type eq 'IN-BETWEEN' ? -1 : 1), | |
| 341 $self->strand >= 0 ? $enc : reverse $enc); | |
| 342 $currenc = reverse $currenc if $self->strand < 0; | |
| 343 | |
| 344 $self->{_encoding} = $currenc; | |
| 345 $self->_recheck_encoding; | |
| 346 | |
| 347 $currenc = $self->{_encoding}; | |
| 348 $currenc = reverse $currenc if $self->strand < 0; | |
| 349 $enc = substr($currenc, $spanstart, length $enc); | |
| 350 $enc = reverse $enc if $self->strand < 0; | |
| 351 | |
| 352 return $exp ? $enc: $self->_convert2implicit($enc); | |
| 353 | |
| 354 } else { | |
| 355 # presume a global redefinition; strip any current gap | |
| 356 # characters in the sequence so they don't corrupt the | |
| 357 # encoding | |
| 358 my $dna = $self->seq; | |
| 359 $dna =~ s/[\.\-]//g; | |
| 360 $self->seq($dna, 'dna'); | |
| 361 $self->{_encoding} = $enc; | |
| 362 } | |
| 363 } else { | |
| 364 $self->throw("Only a scalar or a ref to a hash; not a ref to a @{{lc ref $enc}}"); | |
| 365 } | |
| 366 | |
| 367 $self->_recheck_encoding(); | |
| 368 | |
| 369 return $exp ? $self->{_encoding} : $self->_convert2implicit($self->{_encoding}); | |
| 370 } | |
| 371 | |
| 372 sub _convert2implicit { | |
| 373 | |
| 374 my ($self, $enc) = @_; | |
| 375 | |
| 376 $enc =~ s/[DE]/C/g; | |
| 377 $enc =~ s/[JK]/I/g; | |
| 378 $enc =~ s/V/U/g; | |
| 379 | |
| 380 return $enc; | |
| 381 } | |
| 382 | |
| 383 sub _recheck_encoding { | |
| 384 | |
| 385 my $self = shift; | |
| 386 | |
| 387 my @enc = split //, ($self->{_encoding} || ''); | |
| 388 | |
| 389 my @nt = split(//, $self->SUPER::seq); | |
| 390 @nt = reverse @nt if $self->strand && $self->strand < 0; | |
| 391 | |
| 392 # make sure an encoding exists! | |
| 393 @enc = ('C') x scalar grep { !/[\.\-]/o } @nt | |
| 394 unless @enc; | |
| 395 | |
| 396 # check for gaps to be truly present in the sequence | |
| 397 # and vice versa | |
| 398 my $i; | |
| 399 for ($i = 0 ; $i < @nt && $i < @enc ; $i++) { | |
| 400 if ($nt[$i] =~ /[\.\-]/o && $enc[$i] !~ m/[GB]/o) { | |
| 401 splice(@enc, $i, 0, 'G'); | |
| 402 } elsif ($nt[$i] !~ /[\.\-]/o && $enc[$i] =~ m/[GB]/o) { | |
| 403 splice(@nt, $i, 0, '-'); | |
| 404 } | |
| 405 } | |
| 406 if ($i < @enc) { | |
| 407 # extra encoding; presumably all gaps? | |
| 408 for ( ; $i < @enc ; $i++) { | |
| 409 if ($enc[$i] =~ m/[GB]/o) { | |
| 410 push @nt, '-'; | |
| 411 } else { | |
| 412 $self->throw("Extraneous encoding info: " . join('', @enc[$i..$#enc])); | |
| 413 } | |
| 414 } | |
| 415 } elsif ($i < @nt) { | |
| 416 for ( ; $i < @nt ; $i++) { | |
| 417 if ($nt[$i] =~ m/[\.\-]/o) { | |
| 418 push @enc, 'G'; | |
| 419 } else { | |
| 420 push @enc, 'C'; | |
| 421 } | |
| 422 } | |
| 423 } | |
| 424 | |
| 425 my @cde_array = qw(C D E); | |
| 426 my @ijk_array = qw(I J K); | |
| 427 # convert any leftover implicit coding into explicit coding | |
| 428 my ($Cct, $Ict, $Uct, $Vct, $Vwarned) = (0, 0, 0, 0); | |
| 429 for ($i = 0 ; $i < @enc ; $i++) { | |
| 430 if ($enc[$i] =~ m/[CDE]/o) { | |
| 431 my $temp_index = $Cct %3; | |
| 432 $enc[$i] = $cde_array[$temp_index]; | |
| 433 $Cct++; $Ict = 0; $Uct = 1; | |
| 434 $self->warn("3' untranslated encoding (V) seen prior to other coding symbols") | |
| 435 if ($Vct && !$Vwarned++); | |
| 436 } elsif ($enc[$i] =~ m/[IJK]/o) { | |
| 437 $enc[$i] = $ijk_array[$Ict % 3]; | |
| 438 $Ict++; $Uct = 1; | |
| 439 $self->warn("3' untranslated encoding (V) seen before other coding symbols") | |
| 440 if ($Vct && !$Vwarned++); | |
| 441 } elsif ($enc[$i] =~ m/[UV]/o) { | |
| 442 if ($Uct == 1) { | |
| 443 $enc[$i] = 'V'; | |
| 444 $Vct = 1; | |
| 445 } | |
| 446 } elsif ($enc[$i] eq 'B') { | |
| 447 $Cct++; $Ict++ | |
| 448 } elsif ($enc[$i] eq 'G') { | |
| 449 # gap; leave alone | |
| 450 } | |
| 451 } | |
| 452 | |
| 453 @nt = reverse @nt if $self->strand && $self->strand < 0; | |
| 454 | |
| 455 $self->{'seq'} = join('', @nt); | |
| 456 # $self->seq(join('', @nt), 'dna'); | |
| 457 $self->{_encoding} = join '', @enc; | |
| 458 } | |
| 459 | |
| 460 =head2 cds | |
| 461 | |
| 462 Title : cds | |
| 463 Usage : $cds = $obj->cds(-nogaps => 1); | |
| 464 Function: obtain the "spliced" DNA sequence, by removing any | |
| 465 nucleotides that participate in an UTR, forward frameshift | |
| 466 or intron, and replacing any unknown nucleotide implied by | |
| 467 a backward frameshift or gap with N's. | |
| 468 Returns : a Bio::Seq::EncodedSeq object, with an encoding consisting only | |
| 469 of "CCCC..". | |
| 470 Args : nogaps - strip any gap characters (resulting from 'G' or 'B' | |
| 471 encodings), rather than replacing them with N's. | |
| 472 | |
| 473 =cut | |
| 474 | |
| 475 sub cds { | |
| 476 | |
| 477 my ($self, @args) = @_; | |
| 478 | |
| 479 my ($nogaps, $loc) = $self->_rearrange([qw(NOGAPS LOCATION)], @args); | |
| 480 $nogaps = 0 unless defined $nogaps; | |
| 481 | |
| 482 my @nt = split //, $self->strand < 0 ? $self->revcom->seq : $self->seq; | |
| 483 my @enc = split //, $self->_convert2implicit($self->{_encoding}); | |
| 484 | |
| 485 my ($start, $end) = (0, scalar @nt); | |
| 486 | |
| 487 if ($loc) { | |
| 488 $start = $loc->start; | |
| 489 $start++ if $loc->location_type eq 'IN-BETWEEN'; | |
| 490 $start = $self->column_from_residue_number($start); | |
| 491 $start--; | |
| 492 | |
| 493 $end = $loc->end; | |
| 494 $end = $self->column_from_residue_number($end); | |
| 495 | |
| 496 ($start, $end) = ($end, $start) if $self->strand < 0; | |
| 497 $start--; | |
| 498 } | |
| 499 | |
| 500 for (my $i = $start ; $i < $end ; $i++) { | |
| 501 if ($enc[$i] eq 'I' || $enc[$i] eq 'U' || $enc[$i] eq 'F') { | |
| 502 # remove introns, untranslated and forward frameshift nucleotides | |
| 503 $nt[$i] = undef; | |
| 504 } elsif ($enc[$i] eq 'G' || $enc[$i] eq 'B') { | |
| 505 # replace gaps and backward frameshifts with N's, unless asked not to. | |
| 506 $nt[$i] = $nogaps ? undef : 'N'; | |
| 507 } | |
| 508 } | |
| 509 | |
| 510 return ($self->can_call_new ? ref($self) : __PACKAGE__)->new | |
| 511 (-seq => join('', grep { defined } @nt[$start..--$end]), | |
| 512 -start => $self->start, | |
| 513 -end => $self->end, | |
| 514 -strand => 1, -alphabet => 'dna'); | |
| 515 } | |
| 516 | |
| 517 =head2 translate | |
| 518 | |
| 519 Title : translate | |
| 520 Usage : $prot = $obj->translate(@args); | |
| 521 Function: obtain the protein sequence encoded by the underlying DNA | |
| 522 sequence; same as $obj->cds()->translate(@args). | |
| 523 Returns : a Bio::PrimarySeq object. | |
| 524 Args : same as the translate() function of Bio::PrimarySeqI | |
| 525 | |
| 526 =cut | |
| 527 | |
| 528 sub translate { shift->cds(-nogaps => 1, @_)->SUPER::translate(@_) }; | |
| 529 | |
| 530 =head2 protseq | |
| 531 | |
| 532 Title : seq | |
| 533 Usage : $protseq = $obj->protseq(); | |
| 534 Function: obtain the raw protein sequence encoded by the underlying | |
| 535 DNA sequence; This is the same as calling | |
| 536 $obj->translate()->seq(); | |
| 537 Returns : a string of single-letter amino acid codes | |
| 538 Args : same as the seq() function of Bio::PrimarySeq; note that this | |
| 539 function may not be used to set the protein sequence; see | |
| 540 the dnaseq() function for that. | |
| 541 | |
| 542 =cut | |
| 543 | |
| 544 sub protseq { shift->cds(-nogaps => 1, @_)->SUPER::translate(@_)->seq }; | |
| 545 | |
| 546 =head2 dnaseq | |
| 547 | |
| 548 Title : dnaseq | |
| 549 Usage : $dnaseq = $obj->dnaseq(); | |
| 550 $obj->dnaseq("ACGTGTCGT", "CCCCCCCCC"); | |
| 551 $obj->dnaseq(-seq => "ATG", | |
| 552 -encoding => "CCC", | |
| 553 -location => $loc ); | |
| 554 @introns = $obj->$dnaseq(-encoding => 'I') | |
| 555 Function: get/set the underlying DNA sequence; will overwrite any | |
| 556 current DNA and/or encoding information present. | |
| 557 Returns : a string of single-letter nucleotide codes, including any | |
| 558 gaps implied by the encoding. | |
| 559 Args : seq - the DNA sequence to be used as a replacement | |
| 560 encoding - the encoding of the DNA sequence (see the new() | |
| 561 constructor); defaults to all 'C' if setting a | |
| 562 new DNA sequence. If no new DNA sequence is | |
| 563 being provided, then the encoding is used as a | |
| 564 "filter" for which to return fragments of | |
| 565 non-overlapping DNA that match the encoding. | |
| 566 location - optional, the location of the DNA sequence to | |
| 567 get/set; defaults to the entire sequence. | |
| 568 | |
| 569 =cut | |
| 570 | |
| 571 sub dnaseq { | |
| 572 | |
| 573 my ($self, @args) = @_; | |
| 574 my ($seq, $enc, $loc) = $self->_rearrange([qw(DNASEQ ENCODING LOCATION)], @args); | |
| 575 | |
| 576 $self | |
| 577 | |
| 578 } | |
| 579 | |
| 580 # need to overload this so that we truncate both the seq and the encoding! | |
| 581 sub trunc { | |
| 582 | |
| 583 my ($self, $start, $end) = @_; | |
| 584 my $new = $self->SUPER::trunc($start, $end); | |
| 585 $start--; | |
| 586 my $enc = $self->{_encoding}; | |
| 587 $enc = reverse $enc if $self->strand < 0; | |
| 588 $enc = substr($enc, $start, $end - $start); | |
| 589 $enc = reverse $enc if $self->strand < 0; | |
| 590 $new->encoding($enc); | |
| 591 return $new; | |
| 592 } |
