Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/Tools/SeqStats.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 # $Id: SeqStats.pm,v 1.16.2.1 2003/02/28 13:17:06 heikki Exp $ | |
| 2 # | |
| 3 # BioPerl module for Bio::Tools::SeqStats | |
| 4 # | |
| 5 # Cared for by | |
| 6 # | |
| 7 # Copyright Peter Schattner | |
| 8 # | |
| 9 # You may distribute this module under the same terms as perl itself | |
| 10 | |
| 11 # POD documentation - main docs before the code | |
| 12 | |
| 13 =head1 NAME | |
| 14 | |
| 15 Bio::Tools::SeqStats - Object holding statistics for one particular sequence | |
| 16 | |
| 17 =head1 SYNOPSIS | |
| 18 | |
| 19 # build a primary nucleic acid or protein sequence object somehow | |
| 20 # then build a statistics object from the sequence object | |
| 21 | |
| 22 $seqobj = Bio::PrimarySeq->new(-seq=>'ACTGTGGCGTCAACTG', | |
| 23 -alphabet=>'dna', | |
| 24 -id=>'test'); | |
| 25 $seq_stats = Bio::Tools::SeqStats->new(-seq=>$seqobj); | |
| 26 | |
| 27 # obtain a hash of counts of each type of monomer | |
| 28 # (ie amino or nucleic acid) | |
| 29 print "\nMonomer counts using statistics object\n"; | |
| 30 $seq_stats = Bio::Tools::SeqStats->new(-seq=>$seqobj); | |
| 31 $hash_ref = $seq_stats->count_monomers(); # eg for DNA sequence | |
| 32 foreach $base (sort keys %$hash_ref) { | |
| 33 print "Number of bases of type ", $base, "= ", %$hash_ref->{$base},"\n"; | |
| 34 } | |
| 35 | |
| 36 # or obtain the count directly without creating a new statistics object | |
| 37 print "\nMonomer counts without statistics object\n"; | |
| 38 $hash_ref = Bio::Tools::SeqStats->count_monomers($seqobj); | |
| 39 foreach $base (sort keys %$hash_ref) { | |
| 40 print "Number of bases of type ", $base, "= ", %$hash_ref->{$base},"\n"; | |
| 41 } | |
| 42 | |
| 43 | |
| 44 # obtain hash of counts of each type of codon in a nucleic acid sequence | |
| 45 print "\nCodon counts using statistics object\n"; | |
| 46 $hash_ref = $seq_stats-> count_codons(); # for nucleic acid sequence | |
| 47 foreach $base (sort keys %$hash_ref) { | |
| 48 print "Number of codons of type ", $base, "= ", %$hash_ref->{$base},"\n"; | |
| 49 } | |
| 50 | |
| 51 # or | |
| 52 print "\nCodon counts without statistics object\n"; | |
| 53 $hash_ref = Bio::Tools::SeqStats->count_codons($seqobj); | |
| 54 foreach $base (sort keys %$hash_ref) { | |
| 55 print "Number of codons of type ", $base, "= ", %$hash_ref->{$base},"\n"; | |
| 56 } | |
| 57 | |
| 58 # Obtain the molecular weight of a sequence. Since the sequence may contain | |
| 59 # ambiguous monomers, the molecular weight is returned as a (reference to) a | |
| 60 # two element array containing greatest lower bound (GLB) and least upper bound | |
| 61 # (LUB) of the molecular weight | |
| 62 $weight = $seq_stats->get_mol_wt(); | |
| 63 print "\nMolecular weight (using statistics object) of sequence ", $seqobj->id(), | |
| 64 " is between ", $$weight[0], " and " , | |
| 65 $$weight[1], "\n"; | |
| 66 | |
| 67 # or | |
| 68 $weight = Bio::Tools::SeqStats->get_mol_wt($seqobj); | |
| 69 print "\nMolecular weight (without statistics object) of sequence ", $seqobj->id(), | |
| 70 " is between ", $$weight[0], " and " , | |
| 71 $$weight[1], "\n"; | |
| 72 | |
| 73 | |
| 74 =head1 DESCRIPTION | |
| 75 | |
| 76 Bio::Tools::SeqStats is a lightweight object for the calculation of | |
| 77 simple statistical and numerical properties of a sequence. By | |
| 78 "lightweight" I mean that only "primary" sequences are handled by the | |
| 79 object. The calling script needs to create the appropriate primary | |
| 80 sequence to be passed to SeqStats if statistics on a sequence feature | |
| 81 are required. Similarly if a codon count is desired for a | |
| 82 frame-shifted sequence and/or a negative strand sequence, the calling | |
| 83 script needs to create that sequence and pass it to the SeqStats | |
| 84 object. | |
| 85 | |
| 86 Nota that nucleotide sequences in bioperl do not strictly separate RNA | |
| 87 and DNA sequences. By convension, sequences from RNA molecules are | |
| 88 shown as is they were DNA. Objects are supposed to make the | |
| 89 distinction when needed. This class is one of the few where this | |
| 90 distinctions needs to be made. Internally, it changes all Ts into Us | |
| 91 before weight and monomer count. | |
| 92 | |
| 93 | |
| 94 SeqStats can be called in two distinct manners. If only a single | |
| 95 computation is required on a given sequence object, the method can be | |
| 96 called easily using the SeqStats object directly: | |
| 97 | |
| 98 $weight = Bio::Tools::SeqStats->get_mol_wt($seqobj); | |
| 99 | |
| 100 Alternately, if several computations will be required on a given | |
| 101 sequence object, an "instance" statistics object can be constructed | |
| 102 and used for the method calls: | |
| 103 | |
| 104 $seq_stats = Bio::Tools::SeqStats->new($seqobj); | |
| 105 $monomers = $seq_stats->count_monomers(); | |
| 106 $codons = $seq_stats->count_codons(); | |
| 107 $weight = $seq_stats->get_mol_wt(); | |
| 108 | |
| 109 As currently implemented the object can return the following values | |
| 110 from a sequence: | |
| 111 | |
| 112 =over 3 | |
| 113 | |
| 114 =item * | |
| 115 | |
| 116 The molecular weight of the sequence: get_mol_wt() | |
| 117 | |
| 118 =item * | |
| 119 | |
| 120 The number of each type of monomer present: count_monomers() | |
| 121 | |
| 122 =item * | |
| 123 | |
| 124 The number of each codon present in a nucleic acid sequence: | |
| 125 count_codons() | |
| 126 | |
| 127 =back | |
| 128 | |
| 129 For dna (and rna) sequences, single-stranded weights are returned. The | |
| 130 molecular weights are calculated for neutral - ie not ionized - | |
| 131 nucleic acids. The returned weight is the sum of the | |
| 132 base-sugar-phosphate residues of the chain plus one weight of water to | |
| 133 to account for the additional OH on the phosphate of the 5' residue | |
| 134 and the additional H on the sugar ring of the 3' residue. Note that | |
| 135 this leads to a difference of 18 in calculated molecular weights | |
| 136 compared to some other available programs (eg Informax VectorNTI). | |
| 137 | |
| 138 Note that since sequences may contain ambiguous monomers (eg "M" | |
| 139 meaning "A" or "C" in a nucleic acid sequence), the method get_mol_wt | |
| 140 returns a two-element array containing the greatest lower bound and | |
| 141 least upper bound of the molecule. (For a sequence with no ambiguous | |
| 142 monomers, the two elements of the returned array will be equal.) The | |
| 143 method count_codons() handles ambiguous bases by simply counting all | |
| 144 ambiguous codons together and issuing a warning to that effect. | |
| 145 | |
| 146 | |
| 147 =head1 DEVELOPERS NOTES | |
| 148 | |
| 149 Ewan moved it from Bio::SeqStats to Bio::Tools::SeqStats | |
| 150 | |
| 151 =head1 FEEDBACK | |
| 152 | |
| 153 =head2 Mailing Lists | |
| 154 | |
| 155 User feedback is an integral part of the evolution of this and other | |
| 156 Bioperl modules. Send your comments and suggestions preferably to one | |
| 157 of the Bioperl mailing lists. Your participation is much appreciated. | |
| 158 | |
| 159 bioperl-l@bioperl.org - General discussion | |
| 160 http://bio.perl.org/MailList.html - About the mailing lists | |
| 161 | |
| 162 =head2 Reporting Bugs | |
| 163 | |
| 164 Report bugs to the Bioperl bug tracking system to help us keep track | |
| 165 the bugs and their resolution. | |
| 166 Bug reports can be submitted via email or the web: | |
| 167 | |
| 168 bioperl-bugs@bio.perl.org | |
| 169 http://bugzilla.bioperl.org/ | |
| 170 | |
| 171 =head1 AUTHOR - Peter Schattner | |
| 172 | |
| 173 Email schattner@alum.mit.edu | |
| 174 | |
| 175 =head1 APPENDIX | |
| 176 | |
| 177 The rest of the documentation details each of the object | |
| 178 methods. Internal methods are usually preceded with a _ | |
| 179 | |
| 180 =cut | |
| 181 | |
| 182 | |
| 183 package Bio::Tools::SeqStats; | |
| 184 use strict; | |
| 185 use vars qw(@ISA %Alphabets %Alphabets_strict $amino_weights | |
| 186 $rna_weights $dna_weights %Weights ); | |
| 187 use Bio::Seq; | |
| 188 use Bio::Root::Root; | |
| 189 @ISA = qw(Bio::Root::Root); | |
| 190 | |
| 191 BEGIN { | |
| 192 %Alphabets = ( | |
| 193 'dna' => [ qw(A C G T R Y M K S W H B V D X N) ], | |
| 194 'rna' => [ qw(A C G U R Y M K S W H B V D X N) ], | |
| 195 'protein' => [ qw(A R N D C Q E G H I L K M F | |
| 196 P S T W X Y V B Z *) ], # sac: added B, Z | |
| 197 ); | |
| 198 | |
| 199 # SAC: new strict alphabet: doesn't allow any ambiguity characters. | |
| 200 %Alphabets_strict = ( | |
| 201 'dna' => [ qw( A C G T ) ], | |
| 202 'rna' => [ qw( A C G U ) ], | |
| 203 'protein' => [ qw(A R N D C Q E G H I L K M F | |
| 204 P S T W Y V) ], | |
| 205 ); | |
| 206 | |
| 207 | |
| 208 # IUPAC-IUB SYMBOLS FOR NUCLEOTIDE NOMENCLATURE: | |
| 209 # Cornish-Bowden (1985) Nucl. Acids Res. 13: 3021-3030. | |
| 210 | |
| 211 # Amino Acid alphabet | |
| 212 | |
| 213 # ------------------------------------------ | |
| 214 # Symbol Meaning | |
| 215 # ------------------------------------------ | |
| 216 | |
| 217 my $amino_A_wt = 89.09; | |
| 218 my $amino_C_wt = 121.15; | |
| 219 my $amino_D_wt = 133.1; | |
| 220 my $amino_E_wt = 147.13; | |
| 221 my $amino_F_wt = 165.19; | |
| 222 my $amino_G_wt = 75.07; | |
| 223 my $amino_H_wt = 155.16; | |
| 224 my $amino_I_wt = 131.18; | |
| 225 my $amino_K_wt = 146.19; | |
| 226 my $amino_L_wt = 131.18; | |
| 227 my $amino_M_wt = 149.22; | |
| 228 my $amino_N_wt = 132.12; | |
| 229 my $amino_P_wt = 115.13; | |
| 230 my $amino_Q_wt = 146.15; | |
| 231 my $amino_R_wt = 174.21; | |
| 232 my $amino_S_wt = 105.09; | |
| 233 my $amino_T_wt = 119.12; | |
| 234 my $amino_V_wt = 117.15; | |
| 235 my $amino_W_wt = 204.22; | |
| 236 my $amino_Y_wt = 181.19; | |
| 237 | |
| 238 $amino_weights = { | |
| 239 'A' => [$amino_A_wt, $amino_A_wt], # Alanine | |
| 240 'B' => [$amino_N_wt, $amino_D_wt], # Aspartic Acid, Asparagine | |
| 241 'C' => [$amino_C_wt, $amino_C_wt], # Cystine | |
| 242 'D' => [$amino_D_wt, $amino_D_wt], # Aspartic Acid | |
| 243 'E' => [$amino_E_wt, $amino_E_wt], # Glutamic Acid | |
| 244 'F' => [$amino_F_wt, $amino_F_wt], # Phenylalanine | |
| 245 'G' => [$amino_G_wt, $amino_G_wt], # Glycine | |
| 246 'H' => [$amino_H_wt, $amino_H_wt], # Histidine | |
| 247 'I' => [$amino_I_wt, $amino_I_wt], # Isoleucine | |
| 248 'K' => [$amino_K_wt, $amino_K_wt], # Lysine | |
| 249 'L' => [$amino_L_wt, $amino_L_wt], # Leucine | |
| 250 'M' => [$amino_M_wt, $amino_M_wt], # Methionine | |
| 251 'N' => [$amino_N_wt, $amino_N_wt], # Asparagine | |
| 252 'P' => [$amino_P_wt, $amino_P_wt], # Proline | |
| 253 'Q' => [$amino_Q_wt, $amino_Q_wt], # Glutamine | |
| 254 'R' => [$amino_R_wt, $amino_R_wt], # Arginine | |
| 255 'S' => [$amino_S_wt, $amino_S_wt], # Serine | |
| 256 'T' => [$amino_T_wt, $amino_T_wt], # Threonine | |
| 257 'V' => [$amino_V_wt, $amino_V_wt], # Valine | |
| 258 'W' => [$amino_W_wt, $amino_W_wt], # Tryptophan | |
| 259 'X' => [$amino_G_wt, $amino_W_wt], # Unknown | |
| 260 'Y' => [$amino_Y_wt, $amino_Y_wt], # Tyrosine | |
| 261 'Z' => [$amino_Q_wt, $amino_E_wt], # Glutamic Acid, Glutamine | |
| 262 }; | |
| 263 | |
| 264 # Extended Dna / Rna alphabet | |
| 265 use vars ( qw($C $O $N $H $P $water) ); | |
| 266 use vars ( qw($adenine $guanine $cytosine $thymine $uracil)); | |
| 267 use vars ( qw($ribose_phosphate $deoxyribose_phosphate $ppi)); | |
| 268 use vars ( qw($dna_A_wt $dna_C_wt $dna_G_wt $dna_T_wt | |
| 269 $rna_A_wt $rna_C_wt $rna_G_wt $rna_U_wt)); | |
| 270 use vars ( qw($dna_weights $rna_weights %Weights)); | |
| 271 | |
| 272 $C = 12.01; | |
| 273 $O = 16.00; | |
| 274 $N = 14.01; | |
| 275 $H = 1.01; | |
| 276 $P = 30.97; | |
| 277 $water = 18.015; | |
| 278 | |
| 279 $adenine = 5 * $C + 5 * $N + 5 * $H; | |
| 280 $guanine = 5 * $C + 5 * $N + 1 * $O + 5 * $H; | |
| 281 $cytosine = 4 * $C + 3 * $N + 1 * $O + 5 * $H; | |
| 282 $thymine = 5 * $C + 2 * $N + 2 * $O + 6 * $H; | |
| 283 $uracil = 4 * $C + 2 * $N + 2 * $O + 4 * $H; | |
| 284 | |
| 285 $ribose_phosphate = 5 * $C + 7 * $O + 9 * $H + 1 * $P; #neutral (unionized) form | |
| 286 $deoxyribose_phosphate = 5 * $C + 6 * $O + 9 * $H + 1 * $P; | |
| 287 | |
| 288 # the following are single strand molecular weights / base | |
| 289 $dna_A_wt = $adenine + $deoxyribose_phosphate - $water; | |
| 290 $dna_C_wt = $cytosine + $deoxyribose_phosphate - $water; | |
| 291 $dna_G_wt = $guanine + $deoxyribose_phosphate - $water; | |
| 292 $dna_T_wt = $thymine + $deoxyribose_phosphate - $water; | |
| 293 | |
| 294 $rna_A_wt = $adenine + $ribose_phosphate - $water; | |
| 295 $rna_C_wt = $cytosine + $ribose_phosphate - $water; | |
| 296 $rna_G_wt = $guanine + $ribose_phosphate - $water; | |
| 297 $rna_U_wt = $uracil + $ribose_phosphate - $water; | |
| 298 | |
| 299 $dna_weights = { | |
| 300 'A' => [$dna_A_wt,$dna_A_wt], # Adenine | |
| 301 'C' => [$dna_C_wt,$dna_C_wt], # Cytosine | |
| 302 'G' => [$dna_G_wt,$dna_G_wt], # Guanine | |
| 303 'T' => [$dna_T_wt,$dna_T_wt], # Thymine | |
| 304 'M' => [$dna_C_wt,$dna_A_wt], # A or C | |
| 305 'R' => [$dna_A_wt,$dna_G_wt], # A or G | |
| 306 'W' => [$dna_T_wt,$dna_A_wt], # A or T | |
| 307 'S' => [$dna_C_wt,$dna_G_wt], # C or G | |
| 308 'Y' => [$dna_C_wt,$dna_T_wt], # C or T | |
| 309 'K' => [$dna_T_wt,$dna_G_wt], # G or T | |
| 310 'V' => [$dna_C_wt,$dna_G_wt], # A or C or G | |
| 311 'H' => [$dna_C_wt,$dna_A_wt], # A or C or T | |
| 312 'D' => [$dna_T_wt,$dna_G_wt], # A or G or T | |
| 313 'B' => [$dna_C_wt,$dna_G_wt], # C or G or T | |
| 314 'X' => [$dna_C_wt,$dna_G_wt], # G or A or T or C | |
| 315 'N' => [$dna_C_wt,$dna_G_wt], # G or A or T or C | |
| 316 }; | |
| 317 | |
| 318 $rna_weights = { | |
| 319 'A' => [$rna_A_wt,$rna_A_wt], # Adenine | |
| 320 'C' => [$rna_C_wt,$rna_C_wt], # Cytosine | |
| 321 'G' => [$rna_G_wt,$rna_G_wt], # Guanine | |
| 322 'U' => [$rna_U_wt,$rna_U_wt], # Uracil | |
| 323 'M' => [$rna_C_wt,$rna_A_wt], # A or C | |
| 324 'R' => [$rna_A_wt,$rna_G_wt], # A or G | |
| 325 'W' => [$rna_U_wt,$rna_A_wt], # A or U | |
| 326 'S' => [$rna_C_wt,$rna_G_wt], # C or G | |
| 327 'Y' => [$rna_C_wt,$rna_U_wt], # C or U | |
| 328 'K' => [$rna_U_wt,$rna_G_wt], # G or U | |
| 329 'V' => [$rna_C_wt,$rna_G_wt], # A or C or G | |
| 330 'H' => [$rna_C_wt,$rna_A_wt], # A or C or U | |
| 331 'D' => [$rna_U_wt,$rna_G_wt], # A or G or U | |
| 332 'B' => [$rna_C_wt,$rna_G_wt], # C or G or U | |
| 333 'X' => [$rna_C_wt,$rna_G_wt], # G or A or U or C | |
| 334 'N' => [$rna_C_wt,$rna_G_wt], # G or A or U or C | |
| 335 }; | |
| 336 | |
| 337 %Weights = ( | |
| 338 'dna' => $dna_weights, | |
| 339 'rna' => $rna_weights, | |
| 340 'protein' => $amino_weights, | |
| 341 ); | |
| 342 } | |
| 343 | |
| 344 sub new { | |
| 345 my($class,@args) = @_; | |
| 346 my $self = $class->SUPER::new(@args); | |
| 347 | |
| 348 my ($seqobj) = $self->_rearrange([qw(SEQ)],@args); | |
| 349 unless ($seqobj->isa("Bio::PrimarySeqI")) { | |
| 350 $self->throw(" SeqStats works only on PrimarySeqI objects \n"); | |
| 351 } | |
| 352 if ( !defined $seqobj->alphabet || ! defined $Alphabets{$seqobj->alphabet}) { | |
| 353 $self->throw("Must have a valid alphabet defined for seq (". | |
| 354 join(",",keys %Alphabets)); | |
| 355 } | |
| 356 $self->{'_seqref'} = $seqobj; | |
| 357 # check the letters in the sequence | |
| 358 $self->{'_is_strict'} = _is_alphabet_strict($seqobj); | |
| 359 return $self; | |
| 360 } | |
| 361 | |
| 362 =head2 count_monomers | |
| 363 | |
| 364 Title : count_monomers | |
| 365 Usage : $rcount = $seq_stats->count_monomers(); | |
| 366 or $rcount = $seq_stats->Bio::Tools::SeqStats->($seqobj); | |
| 367 Function: Counts the number of each type of monomer (amino acid or | |
| 368 base) in the sequence. | |
| 369 Ts are counted as Us in RNA sequences. | |
| 370 Example : | |
| 371 Returns : Reference to a hash in which keys are letters of the | |
| 372 genetic alphabet used and values are number of occurrences | |
| 373 of the letter in the sequence. | |
| 374 Args : None or reference to sequence object | |
| 375 Throws : Throws an exception if type of sequence is unknown (ie amino | |
| 376 or nucleic)or if unknown letter in alphabet. Ambiguous | |
| 377 elements are allowed. | |
| 378 | |
| 379 =cut | |
| 380 | |
| 381 sub count_monomers{ | |
| 382 my %count = (); | |
| 383 my $seqobj; | |
| 384 my $_is_strict; | |
| 385 my $element = ''; | |
| 386 my $_is_instance = 1 ; | |
| 387 my $self = shift @_; | |
| 388 my $object_argument = shift @_; | |
| 389 | |
| 390 # First we need to determine if the present object is an instance | |
| 391 # object or if the sequence object has been passed as an argument | |
| 392 | |
| 393 if (defined $object_argument) { | |
| 394 $_is_instance = 0; | |
| 395 } | |
| 396 | |
| 397 # If we are using an instance object... | |
| 398 if ($_is_instance) { | |
| 399 if ($self->{'_monomer_count'}) { | |
| 400 return $self->{'_monomer_count'}; # return count if previously calculated | |
| 401 } | |
| 402 $_is_strict = $self->{'_is_strict'}; # retrieve "strictness" | |
| 403 $seqobj = $self->{'_seqref'}; | |
| 404 } else { | |
| 405 # otherwise... | |
| 406 $seqobj = $object_argument; | |
| 407 | |
| 408 # Following two lines lead to error in "throw" routine | |
| 409 $seqobj->isa("Bio::PrimarySeqI") || | |
| 410 $self->throw(" SeqStats works only on PrimarySeqI objects \n"); | |
| 411 # is alphabet OK? Is it strict? | |
| 412 $_is_strict = _is_alphabet_strict($seqobj); | |
| 413 } | |
| 414 | |
| 415 my $alphabet = $_is_strict ? $Alphabets_strict{$seqobj->alphabet} : | |
| 416 $Alphabets{$seqobj->alphabet} ; # get array of allowed letters | |
| 417 | |
| 418 # convert everything to upper case to be safe | |
| 419 my $seqstring = uc $seqobj->seq(); | |
| 420 | |
| 421 # Since T is used in RichSeq RNA sequences, do conversion locally | |
| 422 $seqstring =~ s/T/U/g if $seqobj->alphabet eq 'rna'; | |
| 423 | |
| 424 # For each letter, count the number of times it appears in | |
| 425 # the sequence | |
| 426 LETTER: | |
| 427 foreach $element (@$alphabet) { | |
| 428 # skip terminator symbol which may confuse regex | |
| 429 next LETTER if $element eq '*'; | |
| 430 $count{$element} = ( $seqstring =~ s/$element/$element/g); | |
| 431 } | |
| 432 | |
| 433 if ($_is_instance) { | |
| 434 $self->{'_monomer_count'} = \%count; # Save in case called again later | |
| 435 } | |
| 436 | |
| 437 return \%count; | |
| 438 } | |
| 439 | |
| 440 =head2 get_mol_wt | |
| 441 | |
| 442 Title : get_mol_wt | |
| 443 Usage : $wt = $seqobj->get_mol_wt() or | |
| 444 $wt = Bio::Tools::SeqStats ->get_mol_wt($seqobj); | |
| 445 Function: Calculate molecular weight of sequence | |
| 446 Ts are counted as Us in RNA sequences. | |
| 447 Example : | |
| 448 | |
| 449 Returns : Reference to two element array containing lower and upper | |
| 450 bounds of molecule molecular weight. (For dna (and rna) | |
| 451 sequences, single-stranded weights are returned.) If | |
| 452 sequence contains no ambiguous elements, both entries in | |
| 453 array are equal to molecular weight of molecule. | |
| 454 Args : None or reference to sequence object | |
| 455 Throws : Exception if type of sequence is unknown (ie not amino or | |
| 456 nucleic) or if unknown letter in alphabet. Ambiguous | |
| 457 elements are allowed. | |
| 458 | |
| 459 =cut | |
| 460 | |
| 461 sub get_mol_wt { | |
| 462 | |
| 463 my $seqobj; | |
| 464 my $_is_strict; | |
| 465 my $element = ''; | |
| 466 my $_is_instance = 1 ; | |
| 467 my $self = shift @_; | |
| 468 my $object_argument = shift @_; | |
| 469 my ($weight_array, $rcount); | |
| 470 | |
| 471 if (defined $object_argument) { | |
| 472 $_is_instance = 0; | |
| 473 } | |
| 474 | |
| 475 if ($_is_instance) { | |
| 476 if ($weight_array = $self->{'_mol_wt'}) { | |
| 477 # return mol. weight if previously calculated | |
| 478 return $weight_array; | |
| 479 } | |
| 480 $seqobj = $self->{'_seqref'}; | |
| 481 $rcount = $self->count_monomers(); | |
| 482 } else { | |
| 483 $seqobj = $object_argument; | |
| 484 $seqobj->isa("Bio::PrimarySeqI") || | |
| 485 die("Error: SeqStats works only on PrimarySeqI objects \n"); | |
| 486 $_is_strict = _is_alphabet_strict($seqobj); # is alphabet OK? | |
| 487 $rcount = $self->count_monomers($seqobj); | |
| 488 } | |
| 489 | |
| 490 # We will also need to know what type of monomer we are dealing with | |
| 491 my $moltype = $seqobj->alphabet(); | |
| 492 | |
| 493 # In general,the molecular weight is bounded below by the sum of the | |
| 494 # weights of lower bounds of each alphabet symbol times the number of | |
| 495 # occurrences of the symbol in the sequence. A similar upper bound on | |
| 496 # the weight is also calculated. | |
| 497 | |
| 498 # Note that for "strict" (ie unambiguous) sequences there is an | |
| 499 # inefficiency since the upper bound = the lower bound (and is | |
| 500 # calculated twice). However, this decrease in performance will be | |
| 501 # minor and leads to (IMO) significantly more readable code. | |
| 502 | |
| 503 my $weight_lower_bound = 0; | |
| 504 my $weight_upper_bound = 0; | |
| 505 my $weight_table = $Weights{$moltype}; | |
| 506 | |
| 507 | |
| 508 # compute weight of all the residues | |
| 509 foreach $element (keys %$rcount) { | |
| 510 $weight_lower_bound += $$rcount{$element} * $$weight_table{$element}->[0]; | |
| 511 $weight_upper_bound += $$rcount{$element} * $$weight_table{$element}->[1]; | |
| 512 } | |
| 513 if ($moltype =~ /protein/) { | |
| 514 # remove of H2O during peptide bond formation. | |
| 515 $weight_lower_bound -= $water * ($seqobj->length - 1); | |
| 516 $weight_upper_bound -= $water * ($seqobj->length - 1); | |
| 517 } else { | |
| 518 # Correction because phosphate of 5' residue has additional OH and | |
| 519 # sugar ring of 3' residue has additional H | |
| 520 $weight_lower_bound += $water; | |
| 521 $weight_upper_bound += $water; | |
| 522 } | |
| 523 | |
| 524 $weight_lower_bound = sprintf("%.0f", $weight_lower_bound); | |
| 525 $weight_upper_bound = sprintf("%.0f", $weight_upper_bound); | |
| 526 | |
| 527 $weight_array = [$weight_lower_bound, $weight_upper_bound]; | |
| 528 | |
| 529 if ($_is_instance) { | |
| 530 $self->{'_mol_wt'} = $weight_array; # Save in case called again later | |
| 531 } | |
| 532 return $weight_array; | |
| 533 } | |
| 534 | |
| 535 | |
| 536 =head2 count_codons | |
| 537 | |
| 538 Title : count_codons | |
| 539 Usage : $rcount = $seqstats->count_codons (); or | |
| 540 $rcount = Bio::Tools::SeqStats->count_codons($seqobj); | |
| 541 | |
| 542 Function: Counts the number of each type of codons in a given frame | |
| 543 for a dna or rna sequence. | |
| 544 Example : | |
| 545 Returns : Reference to a hash in which keys are codons of the genetic | |
| 546 alphabet used and values are number of occurrences of the | |
| 547 codons in the sequence. All codons with "ambiguous" bases | |
| 548 are counted together. | |
| 549 Args : None or reference to sequence object | |
| 550 | |
| 551 Throws : an exception if type of sequence is unknown or protein. | |
| 552 | |
| 553 =cut | |
| 554 | |
| 555 sub count_codons { | |
| 556 my $rcount = {}; | |
| 557 my $codon ; | |
| 558 my $seqobj; | |
| 559 my $_is_strict; | |
| 560 my $element = ''; | |
| 561 my $_is_instance = 1 ; | |
| 562 my $self = shift @_; | |
| 563 my $object_argument = shift @_; | |
| 564 | |
| 565 if (defined $object_argument) { | |
| 566 $_is_instance = 0; | |
| 567 } | |
| 568 | |
| 569 if ($_is_instance) { | |
| 570 if ($rcount = $self->{'_codon_count'}) { | |
| 571 return $rcount; # return count if previously calculated | |
| 572 } | |
| 573 $_is_strict = $self->{'_is_strict'}; # retrieve "strictness" | |
| 574 $seqobj = $self->{'_seqref'}; | |
| 575 } else { | |
| 576 $seqobj = $object_argument; | |
| 577 $seqobj->isa("Bio::PrimarySeqI") || | |
| 578 die(" Error: SeqStats works only on PrimarySeqI objects \n"); | |
| 579 $_is_strict = _is_alphabet_strict($seqobj); | |
| 580 } | |
| 581 | |
| 582 # Codon counts only make sense for nucleic acid sequences | |
| 583 my $alphabet = $seqobj->alphabet(); | |
| 584 | |
| 585 unless ($alphabet =~ /[dr]na/) { | |
| 586 $seqobj->throw(" Codon counts only meaningful for dna or rna, ". | |
| 587 "not for $alphabet sequences. \n"); | |
| 588 } | |
| 589 | |
| 590 # If sequence contains ambiguous bases, warn that codons | |
| 591 # containing them will all be lumped together in the count. | |
| 592 | |
| 593 if (!$_is_strict ) { | |
| 594 $seqobj->warn(" Sequence $seqobj contains ambiguous bases. \n". | |
| 595 " All codons with ambiguous bases will be added together in count. \n"); | |
| 596 } | |
| 597 | |
| 598 my $seq = $seqobj->seq(); | |
| 599 | |
| 600 # Now step through the string by threes and count the codons | |
| 601 | |
| 602 CODON: | |
| 603 while (length($seq) > 2) { | |
| 604 $codon = substr($seq,0,3); | |
| 605 $seq = substr($seq,3); | |
| 606 if ($codon =~ /[^ACTGU]/) { | |
| 607 $$rcount{'ambiguous'}++; #lump together ambiguous codons | |
| 608 next CODON; | |
| 609 } | |
| 610 if (!defined $$rcount{$codon}) { | |
| 611 $$rcount{$codon}= 1 ; | |
| 612 next CODON; | |
| 613 } | |
| 614 $$rcount{$codon}++; # default | |
| 615 } | |
| 616 | |
| 617 | |
| 618 if ($_is_instance) { | |
| 619 $self->{'_codon_count'} = $rcount; # Save in case called again later | |
| 620 } | |
| 621 | |
| 622 return $rcount; | |
| 623 } | |
| 624 | |
| 625 | |
| 626 =head2 _is_alphabet_strict | |
| 627 | |
| 628 Title : _is_alphabet_strict | |
| 629 Usage : | |
| 630 Function: internal function to determine whether there are | |
| 631 any ambiguous elements in the current sequence | |
| 632 Example : | |
| 633 Returns : 1 if strict alphabet is being used, | |
| 634 0 if ambiguous elements are present | |
| 635 Args : | |
| 636 | |
| 637 Throws : an exception if type of sequence is unknown (ie amino or | |
| 638 nucleic) or if unknown letter in alphabet. Ambiguous | |
| 639 monomers are allowed. | |
| 640 | |
| 641 =cut | |
| 642 | |
| 643 sub _is_alphabet_strict { | |
| 644 | |
| 645 my ($seqobj) = @_; | |
| 646 my $moltype = $seqobj->alphabet(); | |
| 647 | |
| 648 # convert everything to upper case to be safe | |
| 649 my $seqstring = uc $seqobj->seq(); | |
| 650 | |
| 651 # Since T is used in RichSeq RNA sequences, do conversion locally | |
| 652 $seqstring =~ s/T/U/g if $seqobj->alphabet eq 'rna'; | |
| 653 | |
| 654 # First we check if only the 'strict' letters are present in the | |
| 655 # sequence string If not, we check whether the remaining letters | |
| 656 # are ambiguous monomers or whether there are illegal letters in | |
| 657 # the string | |
| 658 | |
| 659 # $alpha_array is a ref to an array of the 'strictly' allowed letters | |
| 660 my $alpha_array = $Alphabets_strict{$moltype} ; | |
| 661 | |
| 662 # $alphabet contains the allowed letters in string form | |
| 663 my $alphabet = join ('', @$alpha_array) ; | |
| 664 unless ($seqstring =~ /[^$alphabet]/) { | |
| 665 return 1 ; | |
| 666 } | |
| 667 | |
| 668 # Next try to match with the alphabet's ambiguous letters | |
| 669 $alpha_array = $Alphabets{$moltype} ; | |
| 670 $alphabet = join ('', @$alpha_array) ; | |
| 671 | |
| 672 unless ($seqstring =~ /[^$alphabet]/) { | |
| 673 return 0 ; | |
| 674 } | |
| 675 | |
| 676 # If we got here there is an illegal letter in the sequence | |
| 677 $seqobj->throw(" Alphabet not OK for $seqobj \n"); | |
| 678 | |
| 679 } | |
| 680 | |
| 681 =head2 _print_data | |
| 682 | |
| 683 Title : _print_data | |
| 684 Usage : $seqobj->_print_data() or Bio::Tools::SeqStats->_print_data(); | |
| 685 Function: Displays dna / rna parameters (used for debugging) | |
| 686 Returns : 1 | |
| 687 Args : None | |
| 688 | |
| 689 Used for debugging. | |
| 690 | |
| 691 =cut | |
| 692 | |
| 693 sub _print_data { | |
| 694 | |
| 695 print "\n adenine = : $adenine \n"; | |
| 696 print "\n guanine = : $guanine \n"; | |
| 697 print "\n cytosine = : $cytosine \n"; | |
| 698 print "\n thymine = : $thymine \n"; | |
| 699 print "\n uracil = : $uracil \n"; | |
| 700 | |
| 701 print "\n dna_A_wt = : $dna_A_wt \n"; | |
| 702 print "\n dna_C_wt = : $dna_C_wt \n"; | |
| 703 print "\n dna_G_wt = : $dna_G_wt \n"; | |
| 704 print "\n dna_T_wt = : $dna_T_wt \n"; | |
| 705 | |
| 706 print "\n rna_A_wt = : $rna_A_wt \n"; | |
| 707 print "\n rna_C_wt = : $rna_C_wt \n"; | |
| 708 print "\n rna_G_wt = : $rna_G_wt \n"; | |
| 709 print "\n rna_U_wt = : $rna_U_wt \n"; | |
| 710 | |
| 711 return 1; | |
| 712 } |
