0
|
1 # $Id: EncodedSeq.pm,v 1.5.2.1 2003/04/28 12:11:57 jason Exp $
|
|
2 #
|
|
3 # BioPerl module for Bio::Seq::EncodedSeq
|
|
4 #
|
|
5 # Cared for by Aaron Mackey
|
|
6 #
|
|
7 # Copyright Aaron Mackey
|
|
8 #
|
|
9 # You may distribute this module under the same terms as perl itself
|
|
10
|
|
11 # POD documentation - main docs before the code
|
|
12
|
|
13 =head1 NAME
|
|
14
|
|
15 Bio::Seq::EncodedSeq - subtype of L<Bio::LocatableSeq|Bio::LocatableSeq> to store DNA that encodes a protein
|
|
16
|
|
17 =head1 SYNOPSIS
|
|
18
|
|
19 $obj = new Bio::Seq::EncodedSeq(-seq => $dna,
|
|
20 -encoding => "CCCCCCCIIIIICCCCC",
|
|
21 -start => 1,
|
|
22 -strand => 1,
|
|
23 -length => 17);
|
|
24
|
|
25 # splice out (and possibly revcomp) the coding sequence
|
|
26 $cds = obj->cds;
|
|
27
|
|
28 # obtain the protein translation of the sequence
|
|
29 $prot = $obj->translate;
|
|
30
|
|
31 # other access/inspection routines as with Bio::LocatableSeq and
|
|
32 # Bio::SeqI; note that coordinates are relative only to the DNA
|
|
33 # sequence, not it's implicit encoded protein sequence.
|
|
34
|
|
35 =head1 DESCRIPTION
|
|
36
|
|
37 Bio::Seq::EncodedSeq is a L<Bio::LocatableSeq|Bio::LocatableSeq>
|
|
38 object that holds a DNA sequence as well as information about the
|
|
39 coding potential of that DNA sequence. It is meant to be useful in an
|
|
40 alignment context, where the DNA may contain frameshifts, gaps and/or
|
|
41 introns, or in describing the transcript of a gene. An EncodedSeq
|
|
42 provides the ability to access the "spliced" coding sequence, meaning
|
|
43 that all introns and gaps are removed, and any frameshifts are
|
|
44 adjusted to provide a "clean" CDS.
|
|
45
|
|
46 In order to make simultaneous use of either the DNA or the implicit
|
|
47 encoded protein sequence coordinates, please see
|
|
48 L<Bio::Coordinate::EncodingPair>.
|
|
49
|
|
50 =head1 ENCODING
|
|
51
|
|
52 We use the term "encoding" here to refer to the series of symbols that
|
|
53 we use to identify which residues of a DNA sequence are protein-coding
|
|
54 (i.e. part of a codon), intronic, part of a 5' or 3', frameshift
|
|
55 "mutations", etc. From this information, a Bio::Seq::EncodedSeq is
|
|
56 able to "figure out" its translational CDS. There are two sets of
|
|
57 coding characters, one termed "implicit" and one termed "explicit".
|
|
58
|
|
59 The "implict" encoding is a bit simpler than the "explicit" encoding:
|
|
60 'C' is used for any nucleotide that's part of a codon, 'U' for any
|
|
61 UTR, etc. The full list is shown below:
|
|
62
|
|
63 Code Meaning
|
|
64 ---- -------
|
|
65 C coding
|
|
66 I intronic
|
|
67 U untranslated
|
|
68 G gapped (for use in alignments)
|
|
69 F a "forward", +1 frameshift
|
|
70 B a "backward", -1 frameshift
|
|
71
|
|
72 The "explicit" encoding is just an expansion of the "implicit"
|
|
73 encoding, to denote phase:
|
|
74
|
|
75 Code Meaning
|
|
76 ---- -------
|
|
77 C coding, 1st codon position
|
|
78 D coding, 2nd codon position
|
|
79 E coding, 3rd codon position
|
|
80
|
|
81 I intronic, phase 0 (relative to intron begin)
|
|
82 J intronic, phase 1
|
|
83 K intronic, phase 2
|
|
84
|
|
85 U untranslated 3'UTR
|
|
86 V untranslated 5'UTR
|
|
87
|
|
88 G gapped (for use in alignments)
|
|
89 F a "forward", +1 frameshift
|
|
90 B a "backward", -1 frameshift
|
|
91
|
|
92 Note that the explicit coding is meant to provide easy access to
|
|
93 position/phase specific nucleotides:
|
|
94
|
|
95 $obj = new Bio::Seq::EncodedSeq (-seq => "ACAATCAGACTACG...",
|
|
96 -encoding => "CCCCCCIII..."
|
|
97 );
|
|
98
|
|
99 # fetch arrays of nucleotides at each codon position:
|
|
100 my @pos1 = $obj->dnaseq(encoding => 'C', explicit => 1);
|
|
101 my @pos2 = $obj->dnaseq(encoding => 'D');
|
|
102 my @pos3 = $obj->dnaseq(encoding => 'E');
|
|
103
|
|
104 # fetch arrays of "3-1" codon dinucleotides, useful for genomic
|
|
105 # signature analyses without compounding influences of codon bias:
|
|
106 my @pairs = $obj->dnaseq(encoding => 'EC');
|
|
107
|
|
108 =head1 FEEDBACK
|
|
109
|
|
110 =head2 Mailing Lists
|
|
111
|
|
112 User feedback is an integral part of the evolution of this
|
|
113 and other Bioperl modules. Send your comments and suggestions preferably
|
|
114 to one of the Bioperl mailing lists.
|
|
115 Your participation is much appreciated.
|
|
116
|
|
117 bioperl-l@bioperl.org - General discussion
|
|
118 http://www.bioperl.org/MailList.html - About the mailing lists
|
|
119
|
|
120 =head2 Reporting Bugs
|
|
121
|
|
122 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
123 the bugs and their resolution.
|
|
124 Bug reports can be submitted via email or the web:
|
|
125
|
|
126 bioperl-bugs@bio.perl.org
|
|
127 http://bugzilla.bioperl.org/
|
|
128
|
|
129 =head1 AUTHOR - Aaron Mackey
|
|
130
|
|
131 Email amackey@virginia.edu
|
|
132
|
|
133 =head1 APPENDIX
|
|
134
|
|
135 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
|
|
136
|
|
137 =cut
|
|
138
|
|
139
|
|
140 # Let the code begin...
|
|
141
|
|
142 package Bio::Seq::EncodedSeq;
|
|
143
|
|
144 use strict;
|
|
145 use vars qw(@ISA);
|
|
146
|
|
147 use Bio::LocatableSeq;
|
|
148
|
|
149 @ISA = qw(Bio::LocatableSeq);
|
|
150
|
|
151 =head2 new
|
|
152
|
|
153 Title : new
|
|
154 Usage : $obj = Bio::Seq::EncodedSeq->new(-seq => "AGTACGTGTCATG",
|
|
155 -encoding => "CCCCCCFCCCCCC",
|
|
156 -id => "myseq",
|
|
157 -start => 1,
|
|
158 -end => 13,
|
|
159 -strand => 1
|
|
160 );
|
|
161 Function: creates a new Bio::Seq::EncodedSeq object from a supplied DNA
|
|
162 sequence
|
|
163 Returns : a new Bio::Seq::EncodedSeq object
|
|
164
|
|
165 Args : seq - primary nucleotide sequence used to encode the
|
|
166 protein; note that any positions involved in a
|
|
167 gap ('G') or backward frameshift ('B') should
|
|
168 have one or more gap characters; if the encoding
|
|
169 specifies G or B, but no (or not enough) gap
|
|
170 characters exist, *they'll be added*; similary,
|
|
171 if there are gap characters without a
|
|
172 corresponding G or B encoding, G's will be
|
|
173 inserted into the encoding. This allows some
|
|
174 flexibility in specifying your sequence and
|
|
175 coding without having to calculate a lot of the
|
|
176 encoding for yourself.
|
|
177
|
|
178 encoding - a string of characters (see Encoding Table)
|
|
179 describing backwards frameshifts implied by the
|
|
180 encoding but not present in the sequence will be
|
|
181 added (as '-'s) to the sequence. If not
|
|
182 supplied, it will be assumed that all positions
|
|
183 are coding (C). Encoding may include either
|
|
184 implicit phase encoding characters (i.e. "CCC")
|
|
185 and/or explicit encoding characters (i.e. "CDE").
|
|
186 Additionally, prefixed numbers may be used to
|
|
187 denote repetition (i.e. "27C3I28C").
|
|
188
|
|
189 Alternatively, encoding may be a hashref
|
|
190 datastructure, with encoding characters as keys
|
|
191 and Bio::LocationI objects (or arrayrefs of
|
|
192 Bio::LocationI objects) as values, e.g.:
|
|
193
|
|
194 { C => [ Bio::Location::Simple->new(1,9),
|
|
195 Bio::Location::Simple->new(11,13) ],
|
|
196 F => Bio::Location::Simple->new(10,10),
|
|
197 } # same as "CCCCCCCCCFCCC"
|
|
198
|
|
199 Note that if the location ranges overlap, the
|
|
200 behavior of the encoding will be undefined
|
|
201 (well, it will be defined, but only according to
|
|
202 the order in which the hash keys are read, which
|
|
203 is basically undefined ... just don't do that).
|
|
204
|
|
205 id, start, end, strand - as with Bio::LocatableSeq; note
|
|
206 that the coordinates are relative to the
|
|
207 encoding DNA sequence, not the implicit protein
|
|
208 sequence. If strand is reversed, then the
|
|
209 encoding is assumed to be relative to the
|
|
210 reverse strand as well.
|
|
211
|
|
212 =cut
|
|
213
|
|
214 #'
|
|
215
|
|
216 sub new {
|
|
217
|
|
218 my ($self, @args) = @_;
|
|
219 $self = $self->SUPER::new(@args, -alphabet => 'dna');
|
|
220 my ($enc) = $self->_rearrange([qw(ENCODING)], @args);
|
|
221 # set the real encoding:
|
|
222 if ($enc) {
|
|
223 $self->encoding($enc);
|
|
224 } else {
|
|
225 $self->_recheck_encoding;
|
|
226 }
|
|
227 return $self;
|
|
228 }
|
|
229
|
|
230 =head2 encoding
|
|
231
|
|
232 Title : encoding
|
|
233 Usage : $obj->encoding("CCCCCC");
|
|
234 $obj->encoding( -encoding => { I => $location } );
|
|
235 $enc = $obj->encoding(-explicit => 1);
|
|
236 $enc = $obj->encoding("CCCCCC", -explicit => 1);
|
|
237 $enc = $obj->encoding(-location => $location,
|
|
238 -explicit => 1,
|
|
239 -absolute => 1 );
|
|
240 Function: get/set the objects encoding, either globally or by location(s).
|
|
241 Returns : the (possibly new) encoding string.
|
|
242 Args : encoding - see the encoding argument to the new() function.
|
|
243
|
|
244 explicit - whether or not to return explicit phase
|
|
245 information in the coding (i.e. "CCC" becomes
|
|
246 "CDE", "III" becomes "IJK", etc); defaults to 0.
|
|
247
|
|
248 location - optional; location to get/set the encoding.
|
|
249 Defaults to the entire sequence.
|
|
250
|
|
251 absolute - whether or not the locational elements (either
|
|
252 in the encoding hashref or the location
|
|
253 argument) are relative to the absolute start/end
|
|
254 of the Bio::LocatableSeq, or to the internal,
|
|
255 relative coordinate system (beginning at 1);
|
|
256 defaults to 0 (i.e. relative)
|
|
257
|
|
258 =cut
|
|
259
|
|
260 sub encoding {
|
|
261
|
|
262 my ($self, @args) = @_;
|
|
263 my ($enc, $loc, $exp, $abs) = $self->_rearrange([qw(ENCODING LOCATION EXPLICIT ABSOLUTE)], @args);
|
|
264
|
|
265 if (!$enc) {
|
|
266 # do nothing; _recheck_encoding will fix for us, if necessary
|
|
267 } elsif (ref $enc eq 'HASH') {
|
|
268 $self->throw( -class => 'Bio::Root::NotImplemented',
|
|
269 -text => "Hashref functionality not yet implemented;\nplease email me if you really need this.");
|
|
270 #TODO: finish all this
|
|
271 while (my ($char, $locs) = each %$enc) {
|
|
272 if (ref $locs eq 'ARRAY') {
|
|
273 } elsif (UNIVERSAL::isa($locs, "Bio::LocationI")) {
|
|
274 } else {
|
|
275 $self->throw("Only a scalar or a ref to a hash; not a ref to a @{{lc ref $enc}}");
|
|
276 }
|
|
277 }
|
|
278 } elsif (! ref $enc) {
|
|
279 $enc = uc $enc;
|
|
280 $exp = 1 if (!defined $exp && $enc =~ m/[DEJKV]/o);
|
|
281
|
|
282 if ($enc =~ m/\d/o) { # numerically "enhanced" encoding
|
|
283 my $numenc = $enc;
|
|
284 $enc = '';
|
|
285 while ($numenc =~ m/\G(\d*)([CDEIJKUVGFB])/g) {
|
|
286 my ($num, $char) = ($1, $2);
|
|
287 $num = 1 unless $num;
|
|
288 $enc .= $char x $num;
|
|
289 }
|
|
290 }
|
|
291
|
|
292 if (defined $exp && $exp == 0 && $enc =~ m/([^CIUGFB])/) {
|
|
293 $self->throw("Unrecognized character '$1' in implicit encoding");
|
|
294 } elsif ($enc =~ m/[^CDEIJKUVGFB]/) {
|
|
295 $self->throw("Unrecognized character '$1' in explicit encoding");
|
|
296 }
|
|
297
|
|
298 if ($loc) { # a global location, over which to apply the specified encoding.
|
|
299
|
|
300 # balk if too many non-gap characters present in encoding:
|
|
301 my ($ct) = $enc =~ tr/GB/GB/;
|
|
302 $ct = length($enc) - $ct;
|
|
303 $self->throw("Location length doesn't match number of coding chars in encoding!")
|
|
304 if ($loc->location_type eq 'EXACT' && $loc->length != $ct);
|
|
305
|
|
306 my $start = $loc->start;
|
|
307 my $end = $loc->end;
|
|
308
|
|
309 # strip any encoding that hangs off the ends of the sequence:
|
|
310 if ($start < $self->start) {
|
|
311 my $diff = $self->start - $start;
|
|
312 $start = $self->start;
|
|
313 $enc = substr($enc, $diff);
|
|
314 }
|
|
315 if ($end > $self->end) {
|
|
316 my $diff = $end - $self->end;
|
|
317 $end = $self->end;
|
|
318 $enc = substr($enc, -$diff);
|
|
319 }
|
|
320
|
|
321 my $currenc = $self->{_encoding};
|
|
322 my $currseq = $self->seq;
|
|
323
|
|
324 my ($spanstart, $spanend) = ($self->column_from_residue_number($start),
|
|
325 $self->column_from_residue_number($end) );
|
|
326
|
|
327 if ($currseq) {
|
|
328 # strip any gaps in sequence spanned by this location:
|
|
329 my ($before, $in, $after) = $currseq =~ m/(.{@{[ $spanstart - ($loc->location_type eq 'IN-BETWEEN' ? 0 : 1) ]}})
|
|
330 (.{@{[ $spanend - $spanstart + ($loc->location_type eq 'IN-BETWEEN' ? -1 : 1) ]}})
|
|
331 (.*)
|
|
332 /x;
|
|
333 $in =~ s/[\.\-]+//g;
|
|
334 $currseq = $before . $in . $after;
|
|
335 # change seq without changing the alphabet
|
|
336 $self->seq($currseq,$self->alphabet());
|
|
337 }
|
|
338
|
|
339 $currenc = reverse $currenc if $self->strand < 0;
|
|
340 substr($currenc, $spanstart, $spanend - $spanstart + ($loc->location_type eq 'IN-BETWEEN' ? -1 : 1),
|
|
341 $self->strand >= 0 ? $enc : reverse $enc);
|
|
342 $currenc = reverse $currenc if $self->strand < 0;
|
|
343
|
|
344 $self->{_encoding} = $currenc;
|
|
345 $self->_recheck_encoding;
|
|
346
|
|
347 $currenc = $self->{_encoding};
|
|
348 $currenc = reverse $currenc if $self->strand < 0;
|
|
349 $enc = substr($currenc, $spanstart, length $enc);
|
|
350 $enc = reverse $enc if $self->strand < 0;
|
|
351
|
|
352 return $exp ? $enc: $self->_convert2implicit($enc);
|
|
353
|
|
354 } else {
|
|
355 # presume a global redefinition; strip any current gap
|
|
356 # characters in the sequence so they don't corrupt the
|
|
357 # encoding
|
|
358 my $dna = $self->seq;
|
|
359 $dna =~ s/[\.\-]//g;
|
|
360 $self->seq($dna, 'dna');
|
|
361 $self->{_encoding} = $enc;
|
|
362 }
|
|
363 } else {
|
|
364 $self->throw("Only a scalar or a ref to a hash; not a ref to a @{{lc ref $enc}}");
|
|
365 }
|
|
366
|
|
367 $self->_recheck_encoding();
|
|
368
|
|
369 return $exp ? $self->{_encoding} : $self->_convert2implicit($self->{_encoding});
|
|
370 }
|
|
371
|
|
372 sub _convert2implicit {
|
|
373
|
|
374 my ($self, $enc) = @_;
|
|
375
|
|
376 $enc =~ s/[DE]/C/g;
|
|
377 $enc =~ s/[JK]/I/g;
|
|
378 $enc =~ s/V/U/g;
|
|
379
|
|
380 return $enc;
|
|
381 }
|
|
382
|
|
383 sub _recheck_encoding {
|
|
384
|
|
385 my $self = shift;
|
|
386
|
|
387 my @enc = split //, ($self->{_encoding} || '');
|
|
388
|
|
389 my @nt = split(//, $self->SUPER::seq);
|
|
390 @nt = reverse @nt if $self->strand && $self->strand < 0;
|
|
391
|
|
392 # make sure an encoding exists!
|
|
393 @enc = ('C') x scalar grep { !/[\.\-]/o } @nt
|
|
394 unless @enc;
|
|
395
|
|
396 # check for gaps to be truly present in the sequence
|
|
397 # and vice versa
|
|
398 my $i;
|
|
399 for ($i = 0 ; $i < @nt && $i < @enc ; $i++) {
|
|
400 if ($nt[$i] =~ /[\.\-]/o && $enc[$i] !~ m/[GB]/o) {
|
|
401 splice(@enc, $i, 0, 'G');
|
|
402 } elsif ($nt[$i] !~ /[\.\-]/o && $enc[$i] =~ m/[GB]/o) {
|
|
403 splice(@nt, $i, 0, '-');
|
|
404 }
|
|
405 }
|
|
406 if ($i < @enc) {
|
|
407 # extra encoding; presumably all gaps?
|
|
408 for ( ; $i < @enc ; $i++) {
|
|
409 if ($enc[$i] =~ m/[GB]/o) {
|
|
410 push @nt, '-';
|
|
411 } else {
|
|
412 $self->throw("Extraneous encoding info: " . join('', @enc[$i..$#enc]));
|
|
413 }
|
|
414 }
|
|
415 } elsif ($i < @nt) {
|
|
416 for ( ; $i < @nt ; $i++) {
|
|
417 if ($nt[$i] =~ m/[\.\-]/o) {
|
|
418 push @enc, 'G';
|
|
419 } else {
|
|
420 push @enc, 'C';
|
|
421 }
|
|
422 }
|
|
423 }
|
|
424
|
|
425 my @cde_array = qw(C D E);
|
|
426 my @ijk_array = qw(I J K);
|
|
427 # convert any leftover implicit coding into explicit coding
|
|
428 my ($Cct, $Ict, $Uct, $Vct, $Vwarned) = (0, 0, 0, 0);
|
|
429 for ($i = 0 ; $i < @enc ; $i++) {
|
|
430 if ($enc[$i] =~ m/[CDE]/o) {
|
|
431 my $temp_index = $Cct %3;
|
|
432 $enc[$i] = $cde_array[$temp_index];
|
|
433 $Cct++; $Ict = 0; $Uct = 1;
|
|
434 $self->warn("3' untranslated encoding (V) seen prior to other coding symbols")
|
|
435 if ($Vct && !$Vwarned++);
|
|
436 } elsif ($enc[$i] =~ m/[IJK]/o) {
|
|
437 $enc[$i] = $ijk_array[$Ict % 3];
|
|
438 $Ict++; $Uct = 1;
|
|
439 $self->warn("3' untranslated encoding (V) seen before other coding symbols")
|
|
440 if ($Vct && !$Vwarned++);
|
|
441 } elsif ($enc[$i] =~ m/[UV]/o) {
|
|
442 if ($Uct == 1) {
|
|
443 $enc[$i] = 'V';
|
|
444 $Vct = 1;
|
|
445 }
|
|
446 } elsif ($enc[$i] eq 'B') {
|
|
447 $Cct++; $Ict++
|
|
448 } elsif ($enc[$i] eq 'G') {
|
|
449 # gap; leave alone
|
|
450 }
|
|
451 }
|
|
452
|
|
453 @nt = reverse @nt if $self->strand && $self->strand < 0;
|
|
454
|
|
455 $self->{'seq'} = join('', @nt);
|
|
456 # $self->seq(join('', @nt), 'dna');
|
|
457 $self->{_encoding} = join '', @enc;
|
|
458 }
|
|
459
|
|
460 =head2 cds
|
|
461
|
|
462 Title : cds
|
|
463 Usage : $cds = $obj->cds(-nogaps => 1);
|
|
464 Function: obtain the "spliced" DNA sequence, by removing any
|
|
465 nucleotides that participate in an UTR, forward frameshift
|
|
466 or intron, and replacing any unknown nucleotide implied by
|
|
467 a backward frameshift or gap with N's.
|
|
468 Returns : a Bio::Seq::EncodedSeq object, with an encoding consisting only
|
|
469 of "CCCC..".
|
|
470 Args : nogaps - strip any gap characters (resulting from 'G' or 'B'
|
|
471 encodings), rather than replacing them with N's.
|
|
472
|
|
473 =cut
|
|
474
|
|
475 sub cds {
|
|
476
|
|
477 my ($self, @args) = @_;
|
|
478
|
|
479 my ($nogaps, $loc) = $self->_rearrange([qw(NOGAPS LOCATION)], @args);
|
|
480 $nogaps = 0 unless defined $nogaps;
|
|
481
|
|
482 my @nt = split //, $self->strand < 0 ? $self->revcom->seq : $self->seq;
|
|
483 my @enc = split //, $self->_convert2implicit($self->{_encoding});
|
|
484
|
|
485 my ($start, $end) = (0, scalar @nt);
|
|
486
|
|
487 if ($loc) {
|
|
488 $start = $loc->start;
|
|
489 $start++ if $loc->location_type eq 'IN-BETWEEN';
|
|
490 $start = $self->column_from_residue_number($start);
|
|
491 $start--;
|
|
492
|
|
493 $end = $loc->end;
|
|
494 $end = $self->column_from_residue_number($end);
|
|
495
|
|
496 ($start, $end) = ($end, $start) if $self->strand < 0;
|
|
497 $start--;
|
|
498 }
|
|
499
|
|
500 for (my $i = $start ; $i < $end ; $i++) {
|
|
501 if ($enc[$i] eq 'I' || $enc[$i] eq 'U' || $enc[$i] eq 'F') {
|
|
502 # remove introns, untranslated and forward frameshift nucleotides
|
|
503 $nt[$i] = undef;
|
|
504 } elsif ($enc[$i] eq 'G' || $enc[$i] eq 'B') {
|
|
505 # replace gaps and backward frameshifts with N's, unless asked not to.
|
|
506 $nt[$i] = $nogaps ? undef : 'N';
|
|
507 }
|
|
508 }
|
|
509
|
|
510 return ($self->can_call_new ? ref($self) : __PACKAGE__)->new
|
|
511 (-seq => join('', grep { defined } @nt[$start..--$end]),
|
|
512 -start => $self->start,
|
|
513 -end => $self->end,
|
|
514 -strand => 1, -alphabet => 'dna');
|
|
515 }
|
|
516
|
|
517 =head2 translate
|
|
518
|
|
519 Title : translate
|
|
520 Usage : $prot = $obj->translate(@args);
|
|
521 Function: obtain the protein sequence encoded by the underlying DNA
|
|
522 sequence; same as $obj->cds()->translate(@args).
|
|
523 Returns : a Bio::PrimarySeq object.
|
|
524 Args : same as the translate() function of Bio::PrimarySeqI
|
|
525
|
|
526 =cut
|
|
527
|
|
528 sub translate { shift->cds(-nogaps => 1, @_)->SUPER::translate(@_) };
|
|
529
|
|
530 =head2 protseq
|
|
531
|
|
532 Title : seq
|
|
533 Usage : $protseq = $obj->protseq();
|
|
534 Function: obtain the raw protein sequence encoded by the underlying
|
|
535 DNA sequence; This is the same as calling
|
|
536 $obj->translate()->seq();
|
|
537 Returns : a string of single-letter amino acid codes
|
|
538 Args : same as the seq() function of Bio::PrimarySeq; note that this
|
|
539 function may not be used to set the protein sequence; see
|
|
540 the dnaseq() function for that.
|
|
541
|
|
542 =cut
|
|
543
|
|
544 sub protseq { shift->cds(-nogaps => 1, @_)->SUPER::translate(@_)->seq };
|
|
545
|
|
546 =head2 dnaseq
|
|
547
|
|
548 Title : dnaseq
|
|
549 Usage : $dnaseq = $obj->dnaseq();
|
|
550 $obj->dnaseq("ACGTGTCGT", "CCCCCCCCC");
|
|
551 $obj->dnaseq(-seq => "ATG",
|
|
552 -encoding => "CCC",
|
|
553 -location => $loc );
|
|
554 @introns = $obj->$dnaseq(-encoding => 'I')
|
|
555 Function: get/set the underlying DNA sequence; will overwrite any
|
|
556 current DNA and/or encoding information present.
|
|
557 Returns : a string of single-letter nucleotide codes, including any
|
|
558 gaps implied by the encoding.
|
|
559 Args : seq - the DNA sequence to be used as a replacement
|
|
560 encoding - the encoding of the DNA sequence (see the new()
|
|
561 constructor); defaults to all 'C' if setting a
|
|
562 new DNA sequence. If no new DNA sequence is
|
|
563 being provided, then the encoding is used as a
|
|
564 "filter" for which to return fragments of
|
|
565 non-overlapping DNA that match the encoding.
|
|
566 location - optional, the location of the DNA sequence to
|
|
567 get/set; defaults to the entire sequence.
|
|
568
|
|
569 =cut
|
|
570
|
|
571 sub dnaseq {
|
|
572
|
|
573 my ($self, @args) = @_;
|
|
574 my ($seq, $enc, $loc) = $self->_rearrange([qw(DNASEQ ENCODING LOCATION)], @args);
|
|
575
|
|
576 $self
|
|
577
|
|
578 }
|
|
579
|
|
580 # need to overload this so that we truncate both the seq and the encoding!
|
|
581 sub trunc {
|
|
582
|
|
583 my ($self, $start, $end) = @_;
|
|
584 my $new = $self->SUPER::trunc($start, $end);
|
|
585 $start--;
|
|
586 my $enc = $self->{_encoding};
|
|
587 $enc = reverse $enc if $self->strand < 0;
|
|
588 $enc = substr($enc, $start, $end - $start);
|
|
589 $enc = reverse $enc if $self->strand < 0;
|
|
590 $new->encoding($enc);
|
|
591 return $new;
|
|
592 }
|