Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/Tools/SeqStats.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/SeqStats.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,712 @@ +# $Id: SeqStats.pm,v 1.16.2.1 2003/02/28 13:17:06 heikki Exp $ +# +# BioPerl module for Bio::Tools::SeqStats +# +# Cared for by +# +# Copyright Peter Schattner +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::SeqStats - Object holding statistics for one particular sequence + +=head1 SYNOPSIS + + # build a primary nucleic acid or protein sequence object somehow + # then build a statistics object from the sequence object + + $seqobj = Bio::PrimarySeq->new(-seq=>'ACTGTGGCGTCAACTG', + -alphabet=>'dna', + -id=>'test'); + $seq_stats = Bio::Tools::SeqStats->new(-seq=>$seqobj); + + # obtain a hash of counts of each type of monomer + # (ie amino or nucleic acid) + print "\nMonomer counts using statistics object\n"; + $seq_stats = Bio::Tools::SeqStats->new(-seq=>$seqobj); + $hash_ref = $seq_stats->count_monomers(); # eg for DNA sequence + foreach $base (sort keys %$hash_ref) { + print "Number of bases of type ", $base, "= ", %$hash_ref->{$base},"\n"; + } + + # or obtain the count directly without creating a new statistics object + print "\nMonomer counts without statistics object\n"; + $hash_ref = Bio::Tools::SeqStats->count_monomers($seqobj); + foreach $base (sort keys %$hash_ref) { + print "Number of bases of type ", $base, "= ", %$hash_ref->{$base},"\n"; + } + + + # obtain hash of counts of each type of codon in a nucleic acid sequence + print "\nCodon counts using statistics object\n"; + $hash_ref = $seq_stats-> count_codons(); # for nucleic acid sequence + foreach $base (sort keys %$hash_ref) { + print "Number of codons of type ", $base, "= ", %$hash_ref->{$base},"\n"; + } + + # or + print "\nCodon counts without statistics object\n"; + $hash_ref = Bio::Tools::SeqStats->count_codons($seqobj); + foreach $base (sort keys %$hash_ref) { + print "Number of codons of type ", $base, "= ", %$hash_ref->{$base},"\n"; + } + + # Obtain the molecular weight of a sequence. Since the sequence may contain + # ambiguous monomers, the molecular weight is returned as a (reference to) a + # two element array containing greatest lower bound (GLB) and least upper bound + # (LUB) of the molecular weight + $weight = $seq_stats->get_mol_wt(); + print "\nMolecular weight (using statistics object) of sequence ", $seqobj->id(), + " is between ", $$weight[0], " and " , + $$weight[1], "\n"; + + # or + $weight = Bio::Tools::SeqStats->get_mol_wt($seqobj); + print "\nMolecular weight (without statistics object) of sequence ", $seqobj->id(), + " is between ", $$weight[0], " and " , + $$weight[1], "\n"; + + +=head1 DESCRIPTION + +Bio::Tools::SeqStats is a lightweight object for the calculation of +simple statistical and numerical properties of a sequence. By +"lightweight" I mean that only "primary" sequences are handled by the +object. The calling script needs to create the appropriate primary +sequence to be passed to SeqStats if statistics on a sequence feature +are required. Similarly if a codon count is desired for a +frame-shifted sequence and/or a negative strand sequence, the calling +script needs to create that sequence and pass it to the SeqStats +object. + +Nota that nucleotide sequences in bioperl do not strictly separate RNA +and DNA sequences. By convension, sequences from RNA molecules are +shown as is they were DNA. Objects are supposed to make the +distinction when needed. This class is one of the few where this +distinctions needs to be made. Internally, it changes all Ts into Us +before weight and monomer count. + + +SeqStats can be called in two distinct manners. If only a single +computation is required on a given sequence object, the method can be +called easily using the SeqStats object directly: + + $weight = Bio::Tools::SeqStats->get_mol_wt($seqobj); + +Alternately, if several computations will be required on a given +sequence object, an "instance" statistics object can be constructed +and used for the method calls: + + $seq_stats = Bio::Tools::SeqStats->new($seqobj); + $monomers = $seq_stats->count_monomers(); + $codons = $seq_stats->count_codons(); + $weight = $seq_stats->get_mol_wt(); + +As currently implemented the object can return the following values +from a sequence: + +=over 3 + +=item * + +The molecular weight of the sequence: get_mol_wt() + +=item * + +The number of each type of monomer present: count_monomers() + +=item * + +The number of each codon present in a nucleic acid sequence: +count_codons() + +=back + +For dna (and rna) sequences, single-stranded weights are returned. The +molecular weights are calculated for neutral - ie not ionized - +nucleic acids. The returned weight is the sum of the +base-sugar-phosphate residues of the chain plus one weight of water to +to account for the additional OH on the phosphate of the 5' residue +and the additional H on the sugar ring of the 3' residue. Note that +this leads to a difference of 18 in calculated molecular weights +compared to some other available programs (eg Informax VectorNTI). + +Note that since sequences may contain ambiguous monomers (eg "M" +meaning "A" or "C" in a nucleic acid sequence), the method get_mol_wt +returns a two-element array containing the greatest lower bound and +least upper bound of the molecule. (For a sequence with no ambiguous +monomers, the two elements of the returned array will be equal.) The +method count_codons() handles ambiguous bases by simply counting all +ambiguous codons together and issuing a warning to that effect. + + +=head1 DEVELOPERS NOTES + +Ewan moved it from Bio::SeqStats to Bio::Tools::SeqStats + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Peter Schattner + +Email schattner@alum.mit.edu + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +package Bio::Tools::SeqStats; +use strict; +use vars qw(@ISA %Alphabets %Alphabets_strict $amino_weights + $rna_weights $dna_weights %Weights ); +use Bio::Seq; +use Bio::Root::Root; +@ISA = qw(Bio::Root::Root); + +BEGIN { + %Alphabets = ( + 'dna' => [ qw(A C G T R Y M K S W H B V D X N) ], + 'rna' => [ qw(A C G U R Y M K S W H B V D X N) ], + 'protein' => [ qw(A R N D C Q E G H I L K M F + P S T W X Y V B Z *) ], # sac: added B, Z + ); + +# SAC: new strict alphabet: doesn't allow any ambiguity characters. + %Alphabets_strict = ( + 'dna' => [ qw( A C G T ) ], + 'rna' => [ qw( A C G U ) ], + 'protein' => [ qw(A R N D C Q E G H I L K M F + P S T W Y V) ], + ); + + +# IUPAC-IUB SYMBOLS FOR NUCLEOTIDE NOMENCLATURE: +# Cornish-Bowden (1985) Nucl. Acids Res. 13: 3021-3030. + +# Amino Acid alphabet + +# ------------------------------------------ +# Symbol Meaning +# ------------------------------------------ + + my $amino_A_wt = 89.09; + my $amino_C_wt = 121.15; + my $amino_D_wt = 133.1; + my $amino_E_wt = 147.13; + my $amino_F_wt = 165.19; + my $amino_G_wt = 75.07; + my $amino_H_wt = 155.16; + my $amino_I_wt = 131.18; + my $amino_K_wt = 146.19; + my $amino_L_wt = 131.18; + my $amino_M_wt = 149.22; + my $amino_N_wt = 132.12; + my $amino_P_wt = 115.13; + my $amino_Q_wt = 146.15; + my $amino_R_wt = 174.21; + my $amino_S_wt = 105.09; + my $amino_T_wt = 119.12; + my $amino_V_wt = 117.15; + my $amino_W_wt = 204.22; + my $amino_Y_wt = 181.19; + + $amino_weights = { + 'A' => [$amino_A_wt, $amino_A_wt], # Alanine + 'B' => [$amino_N_wt, $amino_D_wt], # Aspartic Acid, Asparagine + 'C' => [$amino_C_wt, $amino_C_wt], # Cystine + 'D' => [$amino_D_wt, $amino_D_wt], # Aspartic Acid + 'E' => [$amino_E_wt, $amino_E_wt], # Glutamic Acid + 'F' => [$amino_F_wt, $amino_F_wt], # Phenylalanine + 'G' => [$amino_G_wt, $amino_G_wt], # Glycine + 'H' => [$amino_H_wt, $amino_H_wt], # Histidine + 'I' => [$amino_I_wt, $amino_I_wt], # Isoleucine + 'K' => [$amino_K_wt, $amino_K_wt], # Lysine + 'L' => [$amino_L_wt, $amino_L_wt], # Leucine + 'M' => [$amino_M_wt, $amino_M_wt], # Methionine + 'N' => [$amino_N_wt, $amino_N_wt], # Asparagine + 'P' => [$amino_P_wt, $amino_P_wt], # Proline + 'Q' => [$amino_Q_wt, $amino_Q_wt], # Glutamine + 'R' => [$amino_R_wt, $amino_R_wt], # Arginine + 'S' => [$amino_S_wt, $amino_S_wt], # Serine + 'T' => [$amino_T_wt, $amino_T_wt], # Threonine + 'V' => [$amino_V_wt, $amino_V_wt], # Valine + 'W' => [$amino_W_wt, $amino_W_wt], # Tryptophan + 'X' => [$amino_G_wt, $amino_W_wt], # Unknown + 'Y' => [$amino_Y_wt, $amino_Y_wt], # Tyrosine + 'Z' => [$amino_Q_wt, $amino_E_wt], # Glutamic Acid, Glutamine + }; + + # Extended Dna / Rna alphabet + use vars ( qw($C $O $N $H $P $water) ); + use vars ( qw($adenine $guanine $cytosine $thymine $uracil)); + use vars ( qw($ribose_phosphate $deoxyribose_phosphate $ppi)); + use vars ( qw($dna_A_wt $dna_C_wt $dna_G_wt $dna_T_wt + $rna_A_wt $rna_C_wt $rna_G_wt $rna_U_wt)); + use vars ( qw($dna_weights $rna_weights %Weights)); + + $C = 12.01; + $O = 16.00; + $N = 14.01; + $H = 1.01; + $P = 30.97; + $water = 18.015; + + $adenine = 5 * $C + 5 * $N + 5 * $H; + $guanine = 5 * $C + 5 * $N + 1 * $O + 5 * $H; + $cytosine = 4 * $C + 3 * $N + 1 * $O + 5 * $H; + $thymine = 5 * $C + 2 * $N + 2 * $O + 6 * $H; + $uracil = 4 * $C + 2 * $N + 2 * $O + 4 * $H; + + $ribose_phosphate = 5 * $C + 7 * $O + 9 * $H + 1 * $P; #neutral (unionized) form + $deoxyribose_phosphate = 5 * $C + 6 * $O + 9 * $H + 1 * $P; + + # the following are single strand molecular weights / base + $dna_A_wt = $adenine + $deoxyribose_phosphate - $water; + $dna_C_wt = $cytosine + $deoxyribose_phosphate - $water; + $dna_G_wt = $guanine + $deoxyribose_phosphate - $water; + $dna_T_wt = $thymine + $deoxyribose_phosphate - $water; + + $rna_A_wt = $adenine + $ribose_phosphate - $water; + $rna_C_wt = $cytosine + $ribose_phosphate - $water; + $rna_G_wt = $guanine + $ribose_phosphate - $water; + $rna_U_wt = $uracil + $ribose_phosphate - $water; + + $dna_weights = { + 'A' => [$dna_A_wt,$dna_A_wt], # Adenine + 'C' => [$dna_C_wt,$dna_C_wt], # Cytosine + 'G' => [$dna_G_wt,$dna_G_wt], # Guanine + 'T' => [$dna_T_wt,$dna_T_wt], # Thymine + 'M' => [$dna_C_wt,$dna_A_wt], # A or C + 'R' => [$dna_A_wt,$dna_G_wt], # A or G + 'W' => [$dna_T_wt,$dna_A_wt], # A or T + 'S' => [$dna_C_wt,$dna_G_wt], # C or G + 'Y' => [$dna_C_wt,$dna_T_wt], # C or T + 'K' => [$dna_T_wt,$dna_G_wt], # G or T + 'V' => [$dna_C_wt,$dna_G_wt], # A or C or G + 'H' => [$dna_C_wt,$dna_A_wt], # A or C or T + 'D' => [$dna_T_wt,$dna_G_wt], # A or G or T + 'B' => [$dna_C_wt,$dna_G_wt], # C or G or T + 'X' => [$dna_C_wt,$dna_G_wt], # G or A or T or C + 'N' => [$dna_C_wt,$dna_G_wt], # G or A or T or C + }; + + $rna_weights = { + 'A' => [$rna_A_wt,$rna_A_wt], # Adenine + 'C' => [$rna_C_wt,$rna_C_wt], # Cytosine + 'G' => [$rna_G_wt,$rna_G_wt], # Guanine + 'U' => [$rna_U_wt,$rna_U_wt], # Uracil + 'M' => [$rna_C_wt,$rna_A_wt], # A or C + 'R' => [$rna_A_wt,$rna_G_wt], # A or G + 'W' => [$rna_U_wt,$rna_A_wt], # A or U + 'S' => [$rna_C_wt,$rna_G_wt], # C or G + 'Y' => [$rna_C_wt,$rna_U_wt], # C or U + 'K' => [$rna_U_wt,$rna_G_wt], # G or U + 'V' => [$rna_C_wt,$rna_G_wt], # A or C or G + 'H' => [$rna_C_wt,$rna_A_wt], # A or C or U + 'D' => [$rna_U_wt,$rna_G_wt], # A or G or U + 'B' => [$rna_C_wt,$rna_G_wt], # C or G or U + 'X' => [$rna_C_wt,$rna_G_wt], # G or A or U or C + 'N' => [$rna_C_wt,$rna_G_wt], # G or A or U or C + }; + + %Weights = ( + 'dna' => $dna_weights, + 'rna' => $rna_weights, + 'protein' => $amino_weights, + ); +} + +sub new { + my($class,@args) = @_; + my $self = $class->SUPER::new(@args); + + my ($seqobj) = $self->_rearrange([qw(SEQ)],@args); + unless ($seqobj->isa("Bio::PrimarySeqI")) { + $self->throw(" SeqStats works only on PrimarySeqI objects \n"); + } + if ( !defined $seqobj->alphabet || ! defined $Alphabets{$seqobj->alphabet}) { + $self->throw("Must have a valid alphabet defined for seq (". + join(",",keys %Alphabets)); + } + $self->{'_seqref'} = $seqobj; + # check the letters in the sequence + $self->{'_is_strict'} = _is_alphabet_strict($seqobj); + return $self; +} + +=head2 count_monomers + + Title : count_monomers + Usage : $rcount = $seq_stats->count_monomers(); + or $rcount = $seq_stats->Bio::Tools::SeqStats->($seqobj); + Function: Counts the number of each type of monomer (amino acid or + base) in the sequence. + Ts are counted as Us in RNA sequences. + Example : + Returns : Reference to a hash in which keys are letters of the + genetic alphabet used and values are number of occurrences + of the letter in the sequence. + Args : None or reference to sequence object + Throws : Throws an exception if type of sequence is unknown (ie amino + or nucleic)or if unknown letter in alphabet. Ambiguous + elements are allowed. + +=cut + +sub count_monomers{ + my %count = (); + my $seqobj; + my $_is_strict; + my $element = ''; + my $_is_instance = 1 ; + my $self = shift @_; + my $object_argument = shift @_; + + # First we need to determine if the present object is an instance + # object or if the sequence object has been passed as an argument + + if (defined $object_argument) { + $_is_instance = 0; + } + + # If we are using an instance object... + if ($_is_instance) { + if ($self->{'_monomer_count'}) { + return $self->{'_monomer_count'}; # return count if previously calculated + } + $_is_strict = $self->{'_is_strict'}; # retrieve "strictness" + $seqobj = $self->{'_seqref'}; + } else { + # otherwise... + $seqobj = $object_argument; + + # Following two lines lead to error in "throw" routine + $seqobj->isa("Bio::PrimarySeqI") || + $self->throw(" SeqStats works only on PrimarySeqI objects \n"); + # is alphabet OK? Is it strict? + $_is_strict = _is_alphabet_strict($seqobj); + } + + my $alphabet = $_is_strict ? $Alphabets_strict{$seqobj->alphabet} : + $Alphabets{$seqobj->alphabet} ; # get array of allowed letters + + # convert everything to upper case to be safe + my $seqstring = uc $seqobj->seq(); + + # Since T is used in RichSeq RNA sequences, do conversion locally + $seqstring =~ s/T/U/g if $seqobj->alphabet eq 'rna'; + + # For each letter, count the number of times it appears in + # the sequence + LETTER: + foreach $element (@$alphabet) { + # skip terminator symbol which may confuse regex + next LETTER if $element eq '*'; + $count{$element} = ( $seqstring =~ s/$element/$element/g); + } + + if ($_is_instance) { + $self->{'_monomer_count'} = \%count; # Save in case called again later + } + + return \%count; +} + +=head2 get_mol_wt + + Title : get_mol_wt + Usage : $wt = $seqobj->get_mol_wt() or + $wt = Bio::Tools::SeqStats ->get_mol_wt($seqobj); + Function: Calculate molecular weight of sequence + Ts are counted as Us in RNA sequences. + Example : + + Returns : Reference to two element array containing lower and upper + bounds of molecule molecular weight. (For dna (and rna) + sequences, single-stranded weights are returned.) If + sequence contains no ambiguous elements, both entries in + array are equal to molecular weight of molecule. + Args : None or reference to sequence object + Throws : Exception if type of sequence is unknown (ie not amino or + nucleic) or if unknown letter in alphabet. Ambiguous + elements are allowed. + +=cut + +sub get_mol_wt { + + my $seqobj; + my $_is_strict; + my $element = ''; + my $_is_instance = 1 ; + my $self = shift @_; + my $object_argument = shift @_; + my ($weight_array, $rcount); + + if (defined $object_argument) { + $_is_instance = 0; + } + + if ($_is_instance) { + if ($weight_array = $self->{'_mol_wt'}) { + # return mol. weight if previously calculated + return $weight_array; + } + $seqobj = $self->{'_seqref'}; + $rcount = $self->count_monomers(); + } else { + $seqobj = $object_argument; + $seqobj->isa("Bio::PrimarySeqI") || + die("Error: SeqStats works only on PrimarySeqI objects \n"); + $_is_strict = _is_alphabet_strict($seqobj); # is alphabet OK? + $rcount = $self->count_monomers($seqobj); + } + + # We will also need to know what type of monomer we are dealing with + my $moltype = $seqobj->alphabet(); + + # In general,the molecular weight is bounded below by the sum of the + # weights of lower bounds of each alphabet symbol times the number of + # occurrences of the symbol in the sequence. A similar upper bound on + # the weight is also calculated. + + # Note that for "strict" (ie unambiguous) sequences there is an + # inefficiency since the upper bound = the lower bound (and is + # calculated twice). However, this decrease in performance will be + # minor and leads to (IMO) significantly more readable code. + + my $weight_lower_bound = 0; + my $weight_upper_bound = 0; + my $weight_table = $Weights{$moltype}; + + + # compute weight of all the residues + foreach $element (keys %$rcount) { + $weight_lower_bound += $$rcount{$element} * $$weight_table{$element}->[0]; + $weight_upper_bound += $$rcount{$element} * $$weight_table{$element}->[1]; + } + if ($moltype =~ /protein/) { + # remove of H2O during peptide bond formation. + $weight_lower_bound -= $water * ($seqobj->length - 1); + $weight_upper_bound -= $water * ($seqobj->length - 1); + } else { + # Correction because phosphate of 5' residue has additional OH and + # sugar ring of 3' residue has additional H + $weight_lower_bound += $water; + $weight_upper_bound += $water; + } + + $weight_lower_bound = sprintf("%.0f", $weight_lower_bound); + $weight_upper_bound = sprintf("%.0f", $weight_upper_bound); + + $weight_array = [$weight_lower_bound, $weight_upper_bound]; + + if ($_is_instance) { + $self->{'_mol_wt'} = $weight_array; # Save in case called again later + } + return $weight_array; +} + + +=head2 count_codons + + Title : count_codons + Usage : $rcount = $seqstats->count_codons (); or + $rcount = Bio::Tools::SeqStats->count_codons($seqobj); + + Function: Counts the number of each type of codons in a given frame + for a dna or rna sequence. + Example : + Returns : Reference to a hash in which keys are codons of the genetic + alphabet used and values are number of occurrences of the + codons in the sequence. All codons with "ambiguous" bases + are counted together. + Args : None or reference to sequence object + + Throws : an exception if type of sequence is unknown or protein. + +=cut + +sub count_codons { + my $rcount = {}; + my $codon ; + my $seqobj; + my $_is_strict; + my $element = ''; + my $_is_instance = 1 ; + my $self = shift @_; + my $object_argument = shift @_; + + if (defined $object_argument) { + $_is_instance = 0; + } + + if ($_is_instance) { + if ($rcount = $self->{'_codon_count'}) { + return $rcount; # return count if previously calculated + } + $_is_strict = $self->{'_is_strict'}; # retrieve "strictness" + $seqobj = $self->{'_seqref'}; + } else { + $seqobj = $object_argument; + $seqobj->isa("Bio::PrimarySeqI") || + die(" Error: SeqStats works only on PrimarySeqI objects \n"); + $_is_strict = _is_alphabet_strict($seqobj); + } + + # Codon counts only make sense for nucleic acid sequences + my $alphabet = $seqobj->alphabet(); + + unless ($alphabet =~ /[dr]na/) { + $seqobj->throw(" Codon counts only meaningful for dna or rna, ". + "not for $alphabet sequences. \n"); + } + + # If sequence contains ambiguous bases, warn that codons + # containing them will all be lumped together in the count. + + if (!$_is_strict ) { + $seqobj->warn(" Sequence $seqobj contains ambiguous bases. \n". + " All codons with ambiguous bases will be added together in count. \n"); + } + + my $seq = $seqobj->seq(); + + # Now step through the string by threes and count the codons + + CODON: + while (length($seq) > 2) { + $codon = substr($seq,0,3); + $seq = substr($seq,3); + if ($codon =~ /[^ACTGU]/) { + $$rcount{'ambiguous'}++; #lump together ambiguous codons + next CODON; + } + if (!defined $$rcount{$codon}) { + $$rcount{$codon}= 1 ; + next CODON; + } + $$rcount{$codon}++; # default + } + + + if ($_is_instance) { + $self->{'_codon_count'} = $rcount; # Save in case called again later + } + + return $rcount; +} + + +=head2 _is_alphabet_strict + + Title : _is_alphabet_strict + Usage : + Function: internal function to determine whether there are + any ambiguous elements in the current sequence + Example : + Returns : 1 if strict alphabet is being used, + 0 if ambiguous elements are present + Args : + + Throws : an exception if type of sequence is unknown (ie amino or + nucleic) or if unknown letter in alphabet. Ambiguous + monomers are allowed. + +=cut + +sub _is_alphabet_strict { + + my ($seqobj) = @_; + my $moltype = $seqobj->alphabet(); + + # convert everything to upper case to be safe + my $seqstring = uc $seqobj->seq(); + + # Since T is used in RichSeq RNA sequences, do conversion locally + $seqstring =~ s/T/U/g if $seqobj->alphabet eq 'rna'; + + # First we check if only the 'strict' letters are present in the + # sequence string If not, we check whether the remaining letters + # are ambiguous monomers or whether there are illegal letters in + # the string + + # $alpha_array is a ref to an array of the 'strictly' allowed letters + my $alpha_array = $Alphabets_strict{$moltype} ; + + # $alphabet contains the allowed letters in string form + my $alphabet = join ('', @$alpha_array) ; + unless ($seqstring =~ /[^$alphabet]/) { + return 1 ; + } + + # Next try to match with the alphabet's ambiguous letters + $alpha_array = $Alphabets{$moltype} ; + $alphabet = join ('', @$alpha_array) ; + + unless ($seqstring =~ /[^$alphabet]/) { + return 0 ; + } + + # If we got here there is an illegal letter in the sequence + $seqobj->throw(" Alphabet not OK for $seqobj \n"); + +} + +=head2 _print_data + + Title : _print_data + Usage : $seqobj->_print_data() or Bio::Tools::SeqStats->_print_data(); + Function: Displays dna / rna parameters (used for debugging) + Returns : 1 + Args : None + +Used for debugging. + +=cut + +sub _print_data { + + print "\n adenine = : $adenine \n"; + print "\n guanine = : $guanine \n"; + print "\n cytosine = : $cytosine \n"; + print "\n thymine = : $thymine \n"; + print "\n uracil = : $uracil \n"; + + print "\n dna_A_wt = : $dna_A_wt \n"; + print "\n dna_C_wt = : $dna_C_wt \n"; + print "\n dna_G_wt = : $dna_G_wt \n"; + print "\n dna_T_wt = : $dna_T_wt \n"; + + print "\n rna_A_wt = : $rna_A_wt \n"; + print "\n rna_C_wt = : $rna_C_wt \n"; + print "\n rna_G_wt = : $rna_G_wt \n"; + print "\n rna_U_wt = : $rna_U_wt \n"; + + return 1; +}