comparison variant_effect_predictor/Bio/Tools/SeqStats.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 # $Id: SeqStats.pm,v 1.16.2.1 2003/02/28 13:17:06 heikki Exp $
2 #
3 # BioPerl module for Bio::Tools::SeqStats
4 #
5 # Cared for by
6 #
7 # Copyright Peter Schattner
8 #
9 # You may distribute this module under the same terms as perl itself
10
11 # POD documentation - main docs before the code
12
13 =head1 NAME
14
15 Bio::Tools::SeqStats - Object holding statistics for one particular sequence
16
17 =head1 SYNOPSIS
18
19 # build a primary nucleic acid or protein sequence object somehow
20 # then build a statistics object from the sequence object
21
22 $seqobj = Bio::PrimarySeq->new(-seq=>'ACTGTGGCGTCAACTG',
23 -alphabet=>'dna',
24 -id=>'test');
25 $seq_stats = Bio::Tools::SeqStats->new(-seq=>$seqobj);
26
27 # obtain a hash of counts of each type of monomer
28 # (ie amino or nucleic acid)
29 print "\nMonomer counts using statistics object\n";
30 $seq_stats = Bio::Tools::SeqStats->new(-seq=>$seqobj);
31 $hash_ref = $seq_stats->count_monomers(); # eg for DNA sequence
32 foreach $base (sort keys %$hash_ref) {
33 print "Number of bases of type ", $base, "= ", %$hash_ref->{$base},"\n";
34 }
35
36 # or obtain the count directly without creating a new statistics object
37 print "\nMonomer counts without statistics object\n";
38 $hash_ref = Bio::Tools::SeqStats->count_monomers($seqobj);
39 foreach $base (sort keys %$hash_ref) {
40 print "Number of bases of type ", $base, "= ", %$hash_ref->{$base},"\n";
41 }
42
43
44 # obtain hash of counts of each type of codon in a nucleic acid sequence
45 print "\nCodon counts using statistics object\n";
46 $hash_ref = $seq_stats-> count_codons(); # for nucleic acid sequence
47 foreach $base (sort keys %$hash_ref) {
48 print "Number of codons of type ", $base, "= ", %$hash_ref->{$base},"\n";
49 }
50
51 # or
52 print "\nCodon counts without statistics object\n";
53 $hash_ref = Bio::Tools::SeqStats->count_codons($seqobj);
54 foreach $base (sort keys %$hash_ref) {
55 print "Number of codons of type ", $base, "= ", %$hash_ref->{$base},"\n";
56 }
57
58 # Obtain the molecular weight of a sequence. Since the sequence may contain
59 # ambiguous monomers, the molecular weight is returned as a (reference to) a
60 # two element array containing greatest lower bound (GLB) and least upper bound
61 # (LUB) of the molecular weight
62 $weight = $seq_stats->get_mol_wt();
63 print "\nMolecular weight (using statistics object) of sequence ", $seqobj->id(),
64 " is between ", $$weight[0], " and " ,
65 $$weight[1], "\n";
66
67 # or
68 $weight = Bio::Tools::SeqStats->get_mol_wt($seqobj);
69 print "\nMolecular weight (without statistics object) of sequence ", $seqobj->id(),
70 " is between ", $$weight[0], " and " ,
71 $$weight[1], "\n";
72
73
74 =head1 DESCRIPTION
75
76 Bio::Tools::SeqStats is a lightweight object for the calculation of
77 simple statistical and numerical properties of a sequence. By
78 "lightweight" I mean that only "primary" sequences are handled by the
79 object. The calling script needs to create the appropriate primary
80 sequence to be passed to SeqStats if statistics on a sequence feature
81 are required. Similarly if a codon count is desired for a
82 frame-shifted sequence and/or a negative strand sequence, the calling
83 script needs to create that sequence and pass it to the SeqStats
84 object.
85
86 Nota that nucleotide sequences in bioperl do not strictly separate RNA
87 and DNA sequences. By convension, sequences from RNA molecules are
88 shown as is they were DNA. Objects are supposed to make the
89 distinction when needed. This class is one of the few where this
90 distinctions needs to be made. Internally, it changes all Ts into Us
91 before weight and monomer count.
92
93
94 SeqStats can be called in two distinct manners. If only a single
95 computation is required on a given sequence object, the method can be
96 called easily using the SeqStats object directly:
97
98 $weight = Bio::Tools::SeqStats->get_mol_wt($seqobj);
99
100 Alternately, if several computations will be required on a given
101 sequence object, an "instance" statistics object can be constructed
102 and used for the method calls:
103
104 $seq_stats = Bio::Tools::SeqStats->new($seqobj);
105 $monomers = $seq_stats->count_monomers();
106 $codons = $seq_stats->count_codons();
107 $weight = $seq_stats->get_mol_wt();
108
109 As currently implemented the object can return the following values
110 from a sequence:
111
112 =over 3
113
114 =item *
115
116 The molecular weight of the sequence: get_mol_wt()
117
118 =item *
119
120 The number of each type of monomer present: count_monomers()
121
122 =item *
123
124 The number of each codon present in a nucleic acid sequence:
125 count_codons()
126
127 =back
128
129 For dna (and rna) sequences, single-stranded weights are returned. The
130 molecular weights are calculated for neutral - ie not ionized -
131 nucleic acids. The returned weight is the sum of the
132 base-sugar-phosphate residues of the chain plus one weight of water to
133 to account for the additional OH on the phosphate of the 5' residue
134 and the additional H on the sugar ring of the 3' residue. Note that
135 this leads to a difference of 18 in calculated molecular weights
136 compared to some other available programs (eg Informax VectorNTI).
137
138 Note that since sequences may contain ambiguous monomers (eg "M"
139 meaning "A" or "C" in a nucleic acid sequence), the method get_mol_wt
140 returns a two-element array containing the greatest lower bound and
141 least upper bound of the molecule. (For a sequence with no ambiguous
142 monomers, the two elements of the returned array will be equal.) The
143 method count_codons() handles ambiguous bases by simply counting all
144 ambiguous codons together and issuing a warning to that effect.
145
146
147 =head1 DEVELOPERS NOTES
148
149 Ewan moved it from Bio::SeqStats to Bio::Tools::SeqStats
150
151 =head1 FEEDBACK
152
153 =head2 Mailing Lists
154
155 User feedback is an integral part of the evolution of this and other
156 Bioperl modules. Send your comments and suggestions preferably to one
157 of the Bioperl mailing lists. Your participation is much appreciated.
158
159 bioperl-l@bioperl.org - General discussion
160 http://bio.perl.org/MailList.html - About the mailing lists
161
162 =head2 Reporting Bugs
163
164 Report bugs to the Bioperl bug tracking system to help us keep track
165 the bugs and their resolution.
166 Bug reports can be submitted via email or the web:
167
168 bioperl-bugs@bio.perl.org
169 http://bugzilla.bioperl.org/
170
171 =head1 AUTHOR - Peter Schattner
172
173 Email schattner@alum.mit.edu
174
175 =head1 APPENDIX
176
177 The rest of the documentation details each of the object
178 methods. Internal methods are usually preceded with a _
179
180 =cut
181
182
183 package Bio::Tools::SeqStats;
184 use strict;
185 use vars qw(@ISA %Alphabets %Alphabets_strict $amino_weights
186 $rna_weights $dna_weights %Weights );
187 use Bio::Seq;
188 use Bio::Root::Root;
189 @ISA = qw(Bio::Root::Root);
190
191 BEGIN {
192 %Alphabets = (
193 'dna' => [ qw(A C G T R Y M K S W H B V D X N) ],
194 'rna' => [ qw(A C G U R Y M K S W H B V D X N) ],
195 'protein' => [ qw(A R N D C Q E G H I L K M F
196 P S T W X Y V B Z *) ], # sac: added B, Z
197 );
198
199 # SAC: new strict alphabet: doesn't allow any ambiguity characters.
200 %Alphabets_strict = (
201 'dna' => [ qw( A C G T ) ],
202 'rna' => [ qw( A C G U ) ],
203 'protein' => [ qw(A R N D C Q E G H I L K M F
204 P S T W Y V) ],
205 );
206
207
208 # IUPAC-IUB SYMBOLS FOR NUCLEOTIDE NOMENCLATURE:
209 # Cornish-Bowden (1985) Nucl. Acids Res. 13: 3021-3030.
210
211 # Amino Acid alphabet
212
213 # ------------------------------------------
214 # Symbol Meaning
215 # ------------------------------------------
216
217 my $amino_A_wt = 89.09;
218 my $amino_C_wt = 121.15;
219 my $amino_D_wt = 133.1;
220 my $amino_E_wt = 147.13;
221 my $amino_F_wt = 165.19;
222 my $amino_G_wt = 75.07;
223 my $amino_H_wt = 155.16;
224 my $amino_I_wt = 131.18;
225 my $amino_K_wt = 146.19;
226 my $amino_L_wt = 131.18;
227 my $amino_M_wt = 149.22;
228 my $amino_N_wt = 132.12;
229 my $amino_P_wt = 115.13;
230 my $amino_Q_wt = 146.15;
231 my $amino_R_wt = 174.21;
232 my $amino_S_wt = 105.09;
233 my $amino_T_wt = 119.12;
234 my $amino_V_wt = 117.15;
235 my $amino_W_wt = 204.22;
236 my $amino_Y_wt = 181.19;
237
238 $amino_weights = {
239 'A' => [$amino_A_wt, $amino_A_wt], # Alanine
240 'B' => [$amino_N_wt, $amino_D_wt], # Aspartic Acid, Asparagine
241 'C' => [$amino_C_wt, $amino_C_wt], # Cystine
242 'D' => [$amino_D_wt, $amino_D_wt], # Aspartic Acid
243 'E' => [$amino_E_wt, $amino_E_wt], # Glutamic Acid
244 'F' => [$amino_F_wt, $amino_F_wt], # Phenylalanine
245 'G' => [$amino_G_wt, $amino_G_wt], # Glycine
246 'H' => [$amino_H_wt, $amino_H_wt], # Histidine
247 'I' => [$amino_I_wt, $amino_I_wt], # Isoleucine
248 'K' => [$amino_K_wt, $amino_K_wt], # Lysine
249 'L' => [$amino_L_wt, $amino_L_wt], # Leucine
250 'M' => [$amino_M_wt, $amino_M_wt], # Methionine
251 'N' => [$amino_N_wt, $amino_N_wt], # Asparagine
252 'P' => [$amino_P_wt, $amino_P_wt], # Proline
253 'Q' => [$amino_Q_wt, $amino_Q_wt], # Glutamine
254 'R' => [$amino_R_wt, $amino_R_wt], # Arginine
255 'S' => [$amino_S_wt, $amino_S_wt], # Serine
256 'T' => [$amino_T_wt, $amino_T_wt], # Threonine
257 'V' => [$amino_V_wt, $amino_V_wt], # Valine
258 'W' => [$amino_W_wt, $amino_W_wt], # Tryptophan
259 'X' => [$amino_G_wt, $amino_W_wt], # Unknown
260 'Y' => [$amino_Y_wt, $amino_Y_wt], # Tyrosine
261 'Z' => [$amino_Q_wt, $amino_E_wt], # Glutamic Acid, Glutamine
262 };
263
264 # Extended Dna / Rna alphabet
265 use vars ( qw($C $O $N $H $P $water) );
266 use vars ( qw($adenine $guanine $cytosine $thymine $uracil));
267 use vars ( qw($ribose_phosphate $deoxyribose_phosphate $ppi));
268 use vars ( qw($dna_A_wt $dna_C_wt $dna_G_wt $dna_T_wt
269 $rna_A_wt $rna_C_wt $rna_G_wt $rna_U_wt));
270 use vars ( qw($dna_weights $rna_weights %Weights));
271
272 $C = 12.01;
273 $O = 16.00;
274 $N = 14.01;
275 $H = 1.01;
276 $P = 30.97;
277 $water = 18.015;
278
279 $adenine = 5 * $C + 5 * $N + 5 * $H;
280 $guanine = 5 * $C + 5 * $N + 1 * $O + 5 * $H;
281 $cytosine = 4 * $C + 3 * $N + 1 * $O + 5 * $H;
282 $thymine = 5 * $C + 2 * $N + 2 * $O + 6 * $H;
283 $uracil = 4 * $C + 2 * $N + 2 * $O + 4 * $H;
284
285 $ribose_phosphate = 5 * $C + 7 * $O + 9 * $H + 1 * $P; #neutral (unionized) form
286 $deoxyribose_phosphate = 5 * $C + 6 * $O + 9 * $H + 1 * $P;
287
288 # the following are single strand molecular weights / base
289 $dna_A_wt = $adenine + $deoxyribose_phosphate - $water;
290 $dna_C_wt = $cytosine + $deoxyribose_phosphate - $water;
291 $dna_G_wt = $guanine + $deoxyribose_phosphate - $water;
292 $dna_T_wt = $thymine + $deoxyribose_phosphate - $water;
293
294 $rna_A_wt = $adenine + $ribose_phosphate - $water;
295 $rna_C_wt = $cytosine + $ribose_phosphate - $water;
296 $rna_G_wt = $guanine + $ribose_phosphate - $water;
297 $rna_U_wt = $uracil + $ribose_phosphate - $water;
298
299 $dna_weights = {
300 'A' => [$dna_A_wt,$dna_A_wt], # Adenine
301 'C' => [$dna_C_wt,$dna_C_wt], # Cytosine
302 'G' => [$dna_G_wt,$dna_G_wt], # Guanine
303 'T' => [$dna_T_wt,$dna_T_wt], # Thymine
304 'M' => [$dna_C_wt,$dna_A_wt], # A or C
305 'R' => [$dna_A_wt,$dna_G_wt], # A or G
306 'W' => [$dna_T_wt,$dna_A_wt], # A or T
307 'S' => [$dna_C_wt,$dna_G_wt], # C or G
308 'Y' => [$dna_C_wt,$dna_T_wt], # C or T
309 'K' => [$dna_T_wt,$dna_G_wt], # G or T
310 'V' => [$dna_C_wt,$dna_G_wt], # A or C or G
311 'H' => [$dna_C_wt,$dna_A_wt], # A or C or T
312 'D' => [$dna_T_wt,$dna_G_wt], # A or G or T
313 'B' => [$dna_C_wt,$dna_G_wt], # C or G or T
314 'X' => [$dna_C_wt,$dna_G_wt], # G or A or T or C
315 'N' => [$dna_C_wt,$dna_G_wt], # G or A or T or C
316 };
317
318 $rna_weights = {
319 'A' => [$rna_A_wt,$rna_A_wt], # Adenine
320 'C' => [$rna_C_wt,$rna_C_wt], # Cytosine
321 'G' => [$rna_G_wt,$rna_G_wt], # Guanine
322 'U' => [$rna_U_wt,$rna_U_wt], # Uracil
323 'M' => [$rna_C_wt,$rna_A_wt], # A or C
324 'R' => [$rna_A_wt,$rna_G_wt], # A or G
325 'W' => [$rna_U_wt,$rna_A_wt], # A or U
326 'S' => [$rna_C_wt,$rna_G_wt], # C or G
327 'Y' => [$rna_C_wt,$rna_U_wt], # C or U
328 'K' => [$rna_U_wt,$rna_G_wt], # G or U
329 'V' => [$rna_C_wt,$rna_G_wt], # A or C or G
330 'H' => [$rna_C_wt,$rna_A_wt], # A or C or U
331 'D' => [$rna_U_wt,$rna_G_wt], # A or G or U
332 'B' => [$rna_C_wt,$rna_G_wt], # C or G or U
333 'X' => [$rna_C_wt,$rna_G_wt], # G or A or U or C
334 'N' => [$rna_C_wt,$rna_G_wt], # G or A or U or C
335 };
336
337 %Weights = (
338 'dna' => $dna_weights,
339 'rna' => $rna_weights,
340 'protein' => $amino_weights,
341 );
342 }
343
344 sub new {
345 my($class,@args) = @_;
346 my $self = $class->SUPER::new(@args);
347
348 my ($seqobj) = $self->_rearrange([qw(SEQ)],@args);
349 unless ($seqobj->isa("Bio::PrimarySeqI")) {
350 $self->throw(" SeqStats works only on PrimarySeqI objects \n");
351 }
352 if ( !defined $seqobj->alphabet || ! defined $Alphabets{$seqobj->alphabet}) {
353 $self->throw("Must have a valid alphabet defined for seq (".
354 join(",",keys %Alphabets));
355 }
356 $self->{'_seqref'} = $seqobj;
357 # check the letters in the sequence
358 $self->{'_is_strict'} = _is_alphabet_strict($seqobj);
359 return $self;
360 }
361
362 =head2 count_monomers
363
364 Title : count_monomers
365 Usage : $rcount = $seq_stats->count_monomers();
366 or $rcount = $seq_stats->Bio::Tools::SeqStats->($seqobj);
367 Function: Counts the number of each type of monomer (amino acid or
368 base) in the sequence.
369 Ts are counted as Us in RNA sequences.
370 Example :
371 Returns : Reference to a hash in which keys are letters of the
372 genetic alphabet used and values are number of occurrences
373 of the letter in the sequence.
374 Args : None or reference to sequence object
375 Throws : Throws an exception if type of sequence is unknown (ie amino
376 or nucleic)or if unknown letter in alphabet. Ambiguous
377 elements are allowed.
378
379 =cut
380
381 sub count_monomers{
382 my %count = ();
383 my $seqobj;
384 my $_is_strict;
385 my $element = '';
386 my $_is_instance = 1 ;
387 my $self = shift @_;
388 my $object_argument = shift @_;
389
390 # First we need to determine if the present object is an instance
391 # object or if the sequence object has been passed as an argument
392
393 if (defined $object_argument) {
394 $_is_instance = 0;
395 }
396
397 # If we are using an instance object...
398 if ($_is_instance) {
399 if ($self->{'_monomer_count'}) {
400 return $self->{'_monomer_count'}; # return count if previously calculated
401 }
402 $_is_strict = $self->{'_is_strict'}; # retrieve "strictness"
403 $seqobj = $self->{'_seqref'};
404 } else {
405 # otherwise...
406 $seqobj = $object_argument;
407
408 # Following two lines lead to error in "throw" routine
409 $seqobj->isa("Bio::PrimarySeqI") ||
410 $self->throw(" SeqStats works only on PrimarySeqI objects \n");
411 # is alphabet OK? Is it strict?
412 $_is_strict = _is_alphabet_strict($seqobj);
413 }
414
415 my $alphabet = $_is_strict ? $Alphabets_strict{$seqobj->alphabet} :
416 $Alphabets{$seqobj->alphabet} ; # get array of allowed letters
417
418 # convert everything to upper case to be safe
419 my $seqstring = uc $seqobj->seq();
420
421 # Since T is used in RichSeq RNA sequences, do conversion locally
422 $seqstring =~ s/T/U/g if $seqobj->alphabet eq 'rna';
423
424 # For each letter, count the number of times it appears in
425 # the sequence
426 LETTER:
427 foreach $element (@$alphabet) {
428 # skip terminator symbol which may confuse regex
429 next LETTER if $element eq '*';
430 $count{$element} = ( $seqstring =~ s/$element/$element/g);
431 }
432
433 if ($_is_instance) {
434 $self->{'_monomer_count'} = \%count; # Save in case called again later
435 }
436
437 return \%count;
438 }
439
440 =head2 get_mol_wt
441
442 Title : get_mol_wt
443 Usage : $wt = $seqobj->get_mol_wt() or
444 $wt = Bio::Tools::SeqStats ->get_mol_wt($seqobj);
445 Function: Calculate molecular weight of sequence
446 Ts are counted as Us in RNA sequences.
447 Example :
448
449 Returns : Reference to two element array containing lower and upper
450 bounds of molecule molecular weight. (For dna (and rna)
451 sequences, single-stranded weights are returned.) If
452 sequence contains no ambiguous elements, both entries in
453 array are equal to molecular weight of molecule.
454 Args : None or reference to sequence object
455 Throws : Exception if type of sequence is unknown (ie not amino or
456 nucleic) or if unknown letter in alphabet. Ambiguous
457 elements are allowed.
458
459 =cut
460
461 sub get_mol_wt {
462
463 my $seqobj;
464 my $_is_strict;
465 my $element = '';
466 my $_is_instance = 1 ;
467 my $self = shift @_;
468 my $object_argument = shift @_;
469 my ($weight_array, $rcount);
470
471 if (defined $object_argument) {
472 $_is_instance = 0;
473 }
474
475 if ($_is_instance) {
476 if ($weight_array = $self->{'_mol_wt'}) {
477 # return mol. weight if previously calculated
478 return $weight_array;
479 }
480 $seqobj = $self->{'_seqref'};
481 $rcount = $self->count_monomers();
482 } else {
483 $seqobj = $object_argument;
484 $seqobj->isa("Bio::PrimarySeqI") ||
485 die("Error: SeqStats works only on PrimarySeqI objects \n");
486 $_is_strict = _is_alphabet_strict($seqobj); # is alphabet OK?
487 $rcount = $self->count_monomers($seqobj);
488 }
489
490 # We will also need to know what type of monomer we are dealing with
491 my $moltype = $seqobj->alphabet();
492
493 # In general,the molecular weight is bounded below by the sum of the
494 # weights of lower bounds of each alphabet symbol times the number of
495 # occurrences of the symbol in the sequence. A similar upper bound on
496 # the weight is also calculated.
497
498 # Note that for "strict" (ie unambiguous) sequences there is an
499 # inefficiency since the upper bound = the lower bound (and is
500 # calculated twice). However, this decrease in performance will be
501 # minor and leads to (IMO) significantly more readable code.
502
503 my $weight_lower_bound = 0;
504 my $weight_upper_bound = 0;
505 my $weight_table = $Weights{$moltype};
506
507
508 # compute weight of all the residues
509 foreach $element (keys %$rcount) {
510 $weight_lower_bound += $$rcount{$element} * $$weight_table{$element}->[0];
511 $weight_upper_bound += $$rcount{$element} * $$weight_table{$element}->[1];
512 }
513 if ($moltype =~ /protein/) {
514 # remove of H2O during peptide bond formation.
515 $weight_lower_bound -= $water * ($seqobj->length - 1);
516 $weight_upper_bound -= $water * ($seqobj->length - 1);
517 } else {
518 # Correction because phosphate of 5' residue has additional OH and
519 # sugar ring of 3' residue has additional H
520 $weight_lower_bound += $water;
521 $weight_upper_bound += $water;
522 }
523
524 $weight_lower_bound = sprintf("%.0f", $weight_lower_bound);
525 $weight_upper_bound = sprintf("%.0f", $weight_upper_bound);
526
527 $weight_array = [$weight_lower_bound, $weight_upper_bound];
528
529 if ($_is_instance) {
530 $self->{'_mol_wt'} = $weight_array; # Save in case called again later
531 }
532 return $weight_array;
533 }
534
535
536 =head2 count_codons
537
538 Title : count_codons
539 Usage : $rcount = $seqstats->count_codons (); or
540 $rcount = Bio::Tools::SeqStats->count_codons($seqobj);
541
542 Function: Counts the number of each type of codons in a given frame
543 for a dna or rna sequence.
544 Example :
545 Returns : Reference to a hash in which keys are codons of the genetic
546 alphabet used and values are number of occurrences of the
547 codons in the sequence. All codons with "ambiguous" bases
548 are counted together.
549 Args : None or reference to sequence object
550
551 Throws : an exception if type of sequence is unknown or protein.
552
553 =cut
554
555 sub count_codons {
556 my $rcount = {};
557 my $codon ;
558 my $seqobj;
559 my $_is_strict;
560 my $element = '';
561 my $_is_instance = 1 ;
562 my $self = shift @_;
563 my $object_argument = shift @_;
564
565 if (defined $object_argument) {
566 $_is_instance = 0;
567 }
568
569 if ($_is_instance) {
570 if ($rcount = $self->{'_codon_count'}) {
571 return $rcount; # return count if previously calculated
572 }
573 $_is_strict = $self->{'_is_strict'}; # retrieve "strictness"
574 $seqobj = $self->{'_seqref'};
575 } else {
576 $seqobj = $object_argument;
577 $seqobj->isa("Bio::PrimarySeqI") ||
578 die(" Error: SeqStats works only on PrimarySeqI objects \n");
579 $_is_strict = _is_alphabet_strict($seqobj);
580 }
581
582 # Codon counts only make sense for nucleic acid sequences
583 my $alphabet = $seqobj->alphabet();
584
585 unless ($alphabet =~ /[dr]na/) {
586 $seqobj->throw(" Codon counts only meaningful for dna or rna, ".
587 "not for $alphabet sequences. \n");
588 }
589
590 # If sequence contains ambiguous bases, warn that codons
591 # containing them will all be lumped together in the count.
592
593 if (!$_is_strict ) {
594 $seqobj->warn(" Sequence $seqobj contains ambiguous bases. \n".
595 " All codons with ambiguous bases will be added together in count. \n");
596 }
597
598 my $seq = $seqobj->seq();
599
600 # Now step through the string by threes and count the codons
601
602 CODON:
603 while (length($seq) > 2) {
604 $codon = substr($seq,0,3);
605 $seq = substr($seq,3);
606 if ($codon =~ /[^ACTGU]/) {
607 $$rcount{'ambiguous'}++; #lump together ambiguous codons
608 next CODON;
609 }
610 if (!defined $$rcount{$codon}) {
611 $$rcount{$codon}= 1 ;
612 next CODON;
613 }
614 $$rcount{$codon}++; # default
615 }
616
617
618 if ($_is_instance) {
619 $self->{'_codon_count'} = $rcount; # Save in case called again later
620 }
621
622 return $rcount;
623 }
624
625
626 =head2 _is_alphabet_strict
627
628 Title : _is_alphabet_strict
629 Usage :
630 Function: internal function to determine whether there are
631 any ambiguous elements in the current sequence
632 Example :
633 Returns : 1 if strict alphabet is being used,
634 0 if ambiguous elements are present
635 Args :
636
637 Throws : an exception if type of sequence is unknown (ie amino or
638 nucleic) or if unknown letter in alphabet. Ambiguous
639 monomers are allowed.
640
641 =cut
642
643 sub _is_alphabet_strict {
644
645 my ($seqobj) = @_;
646 my $moltype = $seqobj->alphabet();
647
648 # convert everything to upper case to be safe
649 my $seqstring = uc $seqobj->seq();
650
651 # Since T is used in RichSeq RNA sequences, do conversion locally
652 $seqstring =~ s/T/U/g if $seqobj->alphabet eq 'rna';
653
654 # First we check if only the 'strict' letters are present in the
655 # sequence string If not, we check whether the remaining letters
656 # are ambiguous monomers or whether there are illegal letters in
657 # the string
658
659 # $alpha_array is a ref to an array of the 'strictly' allowed letters
660 my $alpha_array = $Alphabets_strict{$moltype} ;
661
662 # $alphabet contains the allowed letters in string form
663 my $alphabet = join ('', @$alpha_array) ;
664 unless ($seqstring =~ /[^$alphabet]/) {
665 return 1 ;
666 }
667
668 # Next try to match with the alphabet's ambiguous letters
669 $alpha_array = $Alphabets{$moltype} ;
670 $alphabet = join ('', @$alpha_array) ;
671
672 unless ($seqstring =~ /[^$alphabet]/) {
673 return 0 ;
674 }
675
676 # If we got here there is an illegal letter in the sequence
677 $seqobj->throw(" Alphabet not OK for $seqobj \n");
678
679 }
680
681 =head2 _print_data
682
683 Title : _print_data
684 Usage : $seqobj->_print_data() or Bio::Tools::SeqStats->_print_data();
685 Function: Displays dna / rna parameters (used for debugging)
686 Returns : 1
687 Args : None
688
689 Used for debugging.
690
691 =cut
692
693 sub _print_data {
694
695 print "\n adenine = : $adenine \n";
696 print "\n guanine = : $guanine \n";
697 print "\n cytosine = : $cytosine \n";
698 print "\n thymine = : $thymine \n";
699 print "\n uracil = : $uracil \n";
700
701 print "\n dna_A_wt = : $dna_A_wt \n";
702 print "\n dna_C_wt = : $dna_C_wt \n";
703 print "\n dna_G_wt = : $dna_G_wt \n";
704 print "\n dna_T_wt = : $dna_T_wt \n";
705
706 print "\n rna_A_wt = : $rna_A_wt \n";
707 print "\n rna_C_wt = : $rna_C_wt \n";
708 print "\n rna_G_wt = : $rna_G_wt \n";
709 print "\n rna_U_wt = : $rna_U_wt \n";
710
711 return 1;
712 }