Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/Tools/SeqStats.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 # $Id: SeqStats.pm,v 1.16.2.1 2003/02/28 13:17:06 heikki Exp $ | |
2 # | |
3 # BioPerl module for Bio::Tools::SeqStats | |
4 # | |
5 # Cared for by | |
6 # | |
7 # Copyright Peter Schattner | |
8 # | |
9 # You may distribute this module under the same terms as perl itself | |
10 | |
11 # POD documentation - main docs before the code | |
12 | |
13 =head1 NAME | |
14 | |
15 Bio::Tools::SeqStats - Object holding statistics for one particular sequence | |
16 | |
17 =head1 SYNOPSIS | |
18 | |
19 # build a primary nucleic acid or protein sequence object somehow | |
20 # then build a statistics object from the sequence object | |
21 | |
22 $seqobj = Bio::PrimarySeq->new(-seq=>'ACTGTGGCGTCAACTG', | |
23 -alphabet=>'dna', | |
24 -id=>'test'); | |
25 $seq_stats = Bio::Tools::SeqStats->new(-seq=>$seqobj); | |
26 | |
27 # obtain a hash of counts of each type of monomer | |
28 # (ie amino or nucleic acid) | |
29 print "\nMonomer counts using statistics object\n"; | |
30 $seq_stats = Bio::Tools::SeqStats->new(-seq=>$seqobj); | |
31 $hash_ref = $seq_stats->count_monomers(); # eg for DNA sequence | |
32 foreach $base (sort keys %$hash_ref) { | |
33 print "Number of bases of type ", $base, "= ", %$hash_ref->{$base},"\n"; | |
34 } | |
35 | |
36 # or obtain the count directly without creating a new statistics object | |
37 print "\nMonomer counts without statistics object\n"; | |
38 $hash_ref = Bio::Tools::SeqStats->count_monomers($seqobj); | |
39 foreach $base (sort keys %$hash_ref) { | |
40 print "Number of bases of type ", $base, "= ", %$hash_ref->{$base},"\n"; | |
41 } | |
42 | |
43 | |
44 # obtain hash of counts of each type of codon in a nucleic acid sequence | |
45 print "\nCodon counts using statistics object\n"; | |
46 $hash_ref = $seq_stats-> count_codons(); # for nucleic acid sequence | |
47 foreach $base (sort keys %$hash_ref) { | |
48 print "Number of codons of type ", $base, "= ", %$hash_ref->{$base},"\n"; | |
49 } | |
50 | |
51 # or | |
52 print "\nCodon counts without statistics object\n"; | |
53 $hash_ref = Bio::Tools::SeqStats->count_codons($seqobj); | |
54 foreach $base (sort keys %$hash_ref) { | |
55 print "Number of codons of type ", $base, "= ", %$hash_ref->{$base},"\n"; | |
56 } | |
57 | |
58 # Obtain the molecular weight of a sequence. Since the sequence may contain | |
59 # ambiguous monomers, the molecular weight is returned as a (reference to) a | |
60 # two element array containing greatest lower bound (GLB) and least upper bound | |
61 # (LUB) of the molecular weight | |
62 $weight = $seq_stats->get_mol_wt(); | |
63 print "\nMolecular weight (using statistics object) of sequence ", $seqobj->id(), | |
64 " is between ", $$weight[0], " and " , | |
65 $$weight[1], "\n"; | |
66 | |
67 # or | |
68 $weight = Bio::Tools::SeqStats->get_mol_wt($seqobj); | |
69 print "\nMolecular weight (without statistics object) of sequence ", $seqobj->id(), | |
70 " is between ", $$weight[0], " and " , | |
71 $$weight[1], "\n"; | |
72 | |
73 | |
74 =head1 DESCRIPTION | |
75 | |
76 Bio::Tools::SeqStats is a lightweight object for the calculation of | |
77 simple statistical and numerical properties of a sequence. By | |
78 "lightweight" I mean that only "primary" sequences are handled by the | |
79 object. The calling script needs to create the appropriate primary | |
80 sequence to be passed to SeqStats if statistics on a sequence feature | |
81 are required. Similarly if a codon count is desired for a | |
82 frame-shifted sequence and/or a negative strand sequence, the calling | |
83 script needs to create that sequence and pass it to the SeqStats | |
84 object. | |
85 | |
86 Nota that nucleotide sequences in bioperl do not strictly separate RNA | |
87 and DNA sequences. By convension, sequences from RNA molecules are | |
88 shown as is they were DNA. Objects are supposed to make the | |
89 distinction when needed. This class is one of the few where this | |
90 distinctions needs to be made. Internally, it changes all Ts into Us | |
91 before weight and monomer count. | |
92 | |
93 | |
94 SeqStats can be called in two distinct manners. If only a single | |
95 computation is required on a given sequence object, the method can be | |
96 called easily using the SeqStats object directly: | |
97 | |
98 $weight = Bio::Tools::SeqStats->get_mol_wt($seqobj); | |
99 | |
100 Alternately, if several computations will be required on a given | |
101 sequence object, an "instance" statistics object can be constructed | |
102 and used for the method calls: | |
103 | |
104 $seq_stats = Bio::Tools::SeqStats->new($seqobj); | |
105 $monomers = $seq_stats->count_monomers(); | |
106 $codons = $seq_stats->count_codons(); | |
107 $weight = $seq_stats->get_mol_wt(); | |
108 | |
109 As currently implemented the object can return the following values | |
110 from a sequence: | |
111 | |
112 =over 3 | |
113 | |
114 =item * | |
115 | |
116 The molecular weight of the sequence: get_mol_wt() | |
117 | |
118 =item * | |
119 | |
120 The number of each type of monomer present: count_monomers() | |
121 | |
122 =item * | |
123 | |
124 The number of each codon present in a nucleic acid sequence: | |
125 count_codons() | |
126 | |
127 =back | |
128 | |
129 For dna (and rna) sequences, single-stranded weights are returned. The | |
130 molecular weights are calculated for neutral - ie not ionized - | |
131 nucleic acids. The returned weight is the sum of the | |
132 base-sugar-phosphate residues of the chain plus one weight of water to | |
133 to account for the additional OH on the phosphate of the 5' residue | |
134 and the additional H on the sugar ring of the 3' residue. Note that | |
135 this leads to a difference of 18 in calculated molecular weights | |
136 compared to some other available programs (eg Informax VectorNTI). | |
137 | |
138 Note that since sequences may contain ambiguous monomers (eg "M" | |
139 meaning "A" or "C" in a nucleic acid sequence), the method get_mol_wt | |
140 returns a two-element array containing the greatest lower bound and | |
141 least upper bound of the molecule. (For a sequence with no ambiguous | |
142 monomers, the two elements of the returned array will be equal.) The | |
143 method count_codons() handles ambiguous bases by simply counting all | |
144 ambiguous codons together and issuing a warning to that effect. | |
145 | |
146 | |
147 =head1 DEVELOPERS NOTES | |
148 | |
149 Ewan moved it from Bio::SeqStats to Bio::Tools::SeqStats | |
150 | |
151 =head1 FEEDBACK | |
152 | |
153 =head2 Mailing Lists | |
154 | |
155 User feedback is an integral part of the evolution of this and other | |
156 Bioperl modules. Send your comments and suggestions preferably to one | |
157 of the Bioperl mailing lists. Your participation is much appreciated. | |
158 | |
159 bioperl-l@bioperl.org - General discussion | |
160 http://bio.perl.org/MailList.html - About the mailing lists | |
161 | |
162 =head2 Reporting Bugs | |
163 | |
164 Report bugs to the Bioperl bug tracking system to help us keep track | |
165 the bugs and their resolution. | |
166 Bug reports can be submitted via email or the web: | |
167 | |
168 bioperl-bugs@bio.perl.org | |
169 http://bugzilla.bioperl.org/ | |
170 | |
171 =head1 AUTHOR - Peter Schattner | |
172 | |
173 Email schattner@alum.mit.edu | |
174 | |
175 =head1 APPENDIX | |
176 | |
177 The rest of the documentation details each of the object | |
178 methods. Internal methods are usually preceded with a _ | |
179 | |
180 =cut | |
181 | |
182 | |
183 package Bio::Tools::SeqStats; | |
184 use strict; | |
185 use vars qw(@ISA %Alphabets %Alphabets_strict $amino_weights | |
186 $rna_weights $dna_weights %Weights ); | |
187 use Bio::Seq; | |
188 use Bio::Root::Root; | |
189 @ISA = qw(Bio::Root::Root); | |
190 | |
191 BEGIN { | |
192 %Alphabets = ( | |
193 'dna' => [ qw(A C G T R Y M K S W H B V D X N) ], | |
194 'rna' => [ qw(A C G U R Y M K S W H B V D X N) ], | |
195 'protein' => [ qw(A R N D C Q E G H I L K M F | |
196 P S T W X Y V B Z *) ], # sac: added B, Z | |
197 ); | |
198 | |
199 # SAC: new strict alphabet: doesn't allow any ambiguity characters. | |
200 %Alphabets_strict = ( | |
201 'dna' => [ qw( A C G T ) ], | |
202 'rna' => [ qw( A C G U ) ], | |
203 'protein' => [ qw(A R N D C Q E G H I L K M F | |
204 P S T W Y V) ], | |
205 ); | |
206 | |
207 | |
208 # IUPAC-IUB SYMBOLS FOR NUCLEOTIDE NOMENCLATURE: | |
209 # Cornish-Bowden (1985) Nucl. Acids Res. 13: 3021-3030. | |
210 | |
211 # Amino Acid alphabet | |
212 | |
213 # ------------------------------------------ | |
214 # Symbol Meaning | |
215 # ------------------------------------------ | |
216 | |
217 my $amino_A_wt = 89.09; | |
218 my $amino_C_wt = 121.15; | |
219 my $amino_D_wt = 133.1; | |
220 my $amino_E_wt = 147.13; | |
221 my $amino_F_wt = 165.19; | |
222 my $amino_G_wt = 75.07; | |
223 my $amino_H_wt = 155.16; | |
224 my $amino_I_wt = 131.18; | |
225 my $amino_K_wt = 146.19; | |
226 my $amino_L_wt = 131.18; | |
227 my $amino_M_wt = 149.22; | |
228 my $amino_N_wt = 132.12; | |
229 my $amino_P_wt = 115.13; | |
230 my $amino_Q_wt = 146.15; | |
231 my $amino_R_wt = 174.21; | |
232 my $amino_S_wt = 105.09; | |
233 my $amino_T_wt = 119.12; | |
234 my $amino_V_wt = 117.15; | |
235 my $amino_W_wt = 204.22; | |
236 my $amino_Y_wt = 181.19; | |
237 | |
238 $amino_weights = { | |
239 'A' => [$amino_A_wt, $amino_A_wt], # Alanine | |
240 'B' => [$amino_N_wt, $amino_D_wt], # Aspartic Acid, Asparagine | |
241 'C' => [$amino_C_wt, $amino_C_wt], # Cystine | |
242 'D' => [$amino_D_wt, $amino_D_wt], # Aspartic Acid | |
243 'E' => [$amino_E_wt, $amino_E_wt], # Glutamic Acid | |
244 'F' => [$amino_F_wt, $amino_F_wt], # Phenylalanine | |
245 'G' => [$amino_G_wt, $amino_G_wt], # Glycine | |
246 'H' => [$amino_H_wt, $amino_H_wt], # Histidine | |
247 'I' => [$amino_I_wt, $amino_I_wt], # Isoleucine | |
248 'K' => [$amino_K_wt, $amino_K_wt], # Lysine | |
249 'L' => [$amino_L_wt, $amino_L_wt], # Leucine | |
250 'M' => [$amino_M_wt, $amino_M_wt], # Methionine | |
251 'N' => [$amino_N_wt, $amino_N_wt], # Asparagine | |
252 'P' => [$amino_P_wt, $amino_P_wt], # Proline | |
253 'Q' => [$amino_Q_wt, $amino_Q_wt], # Glutamine | |
254 'R' => [$amino_R_wt, $amino_R_wt], # Arginine | |
255 'S' => [$amino_S_wt, $amino_S_wt], # Serine | |
256 'T' => [$amino_T_wt, $amino_T_wt], # Threonine | |
257 'V' => [$amino_V_wt, $amino_V_wt], # Valine | |
258 'W' => [$amino_W_wt, $amino_W_wt], # Tryptophan | |
259 'X' => [$amino_G_wt, $amino_W_wt], # Unknown | |
260 'Y' => [$amino_Y_wt, $amino_Y_wt], # Tyrosine | |
261 'Z' => [$amino_Q_wt, $amino_E_wt], # Glutamic Acid, Glutamine | |
262 }; | |
263 | |
264 # Extended Dna / Rna alphabet | |
265 use vars ( qw($C $O $N $H $P $water) ); | |
266 use vars ( qw($adenine $guanine $cytosine $thymine $uracil)); | |
267 use vars ( qw($ribose_phosphate $deoxyribose_phosphate $ppi)); | |
268 use vars ( qw($dna_A_wt $dna_C_wt $dna_G_wt $dna_T_wt | |
269 $rna_A_wt $rna_C_wt $rna_G_wt $rna_U_wt)); | |
270 use vars ( qw($dna_weights $rna_weights %Weights)); | |
271 | |
272 $C = 12.01; | |
273 $O = 16.00; | |
274 $N = 14.01; | |
275 $H = 1.01; | |
276 $P = 30.97; | |
277 $water = 18.015; | |
278 | |
279 $adenine = 5 * $C + 5 * $N + 5 * $H; | |
280 $guanine = 5 * $C + 5 * $N + 1 * $O + 5 * $H; | |
281 $cytosine = 4 * $C + 3 * $N + 1 * $O + 5 * $H; | |
282 $thymine = 5 * $C + 2 * $N + 2 * $O + 6 * $H; | |
283 $uracil = 4 * $C + 2 * $N + 2 * $O + 4 * $H; | |
284 | |
285 $ribose_phosphate = 5 * $C + 7 * $O + 9 * $H + 1 * $P; #neutral (unionized) form | |
286 $deoxyribose_phosphate = 5 * $C + 6 * $O + 9 * $H + 1 * $P; | |
287 | |
288 # the following are single strand molecular weights / base | |
289 $dna_A_wt = $adenine + $deoxyribose_phosphate - $water; | |
290 $dna_C_wt = $cytosine + $deoxyribose_phosphate - $water; | |
291 $dna_G_wt = $guanine + $deoxyribose_phosphate - $water; | |
292 $dna_T_wt = $thymine + $deoxyribose_phosphate - $water; | |
293 | |
294 $rna_A_wt = $adenine + $ribose_phosphate - $water; | |
295 $rna_C_wt = $cytosine + $ribose_phosphate - $water; | |
296 $rna_G_wt = $guanine + $ribose_phosphate - $water; | |
297 $rna_U_wt = $uracil + $ribose_phosphate - $water; | |
298 | |
299 $dna_weights = { | |
300 'A' => [$dna_A_wt,$dna_A_wt], # Adenine | |
301 'C' => [$dna_C_wt,$dna_C_wt], # Cytosine | |
302 'G' => [$dna_G_wt,$dna_G_wt], # Guanine | |
303 'T' => [$dna_T_wt,$dna_T_wt], # Thymine | |
304 'M' => [$dna_C_wt,$dna_A_wt], # A or C | |
305 'R' => [$dna_A_wt,$dna_G_wt], # A or G | |
306 'W' => [$dna_T_wt,$dna_A_wt], # A or T | |
307 'S' => [$dna_C_wt,$dna_G_wt], # C or G | |
308 'Y' => [$dna_C_wt,$dna_T_wt], # C or T | |
309 'K' => [$dna_T_wt,$dna_G_wt], # G or T | |
310 'V' => [$dna_C_wt,$dna_G_wt], # A or C or G | |
311 'H' => [$dna_C_wt,$dna_A_wt], # A or C or T | |
312 'D' => [$dna_T_wt,$dna_G_wt], # A or G or T | |
313 'B' => [$dna_C_wt,$dna_G_wt], # C or G or T | |
314 'X' => [$dna_C_wt,$dna_G_wt], # G or A or T or C | |
315 'N' => [$dna_C_wt,$dna_G_wt], # G or A or T or C | |
316 }; | |
317 | |
318 $rna_weights = { | |
319 'A' => [$rna_A_wt,$rna_A_wt], # Adenine | |
320 'C' => [$rna_C_wt,$rna_C_wt], # Cytosine | |
321 'G' => [$rna_G_wt,$rna_G_wt], # Guanine | |
322 'U' => [$rna_U_wt,$rna_U_wt], # Uracil | |
323 'M' => [$rna_C_wt,$rna_A_wt], # A or C | |
324 'R' => [$rna_A_wt,$rna_G_wt], # A or G | |
325 'W' => [$rna_U_wt,$rna_A_wt], # A or U | |
326 'S' => [$rna_C_wt,$rna_G_wt], # C or G | |
327 'Y' => [$rna_C_wt,$rna_U_wt], # C or U | |
328 'K' => [$rna_U_wt,$rna_G_wt], # G or U | |
329 'V' => [$rna_C_wt,$rna_G_wt], # A or C or G | |
330 'H' => [$rna_C_wt,$rna_A_wt], # A or C or U | |
331 'D' => [$rna_U_wt,$rna_G_wt], # A or G or U | |
332 'B' => [$rna_C_wt,$rna_G_wt], # C or G or U | |
333 'X' => [$rna_C_wt,$rna_G_wt], # G or A or U or C | |
334 'N' => [$rna_C_wt,$rna_G_wt], # G or A or U or C | |
335 }; | |
336 | |
337 %Weights = ( | |
338 'dna' => $dna_weights, | |
339 'rna' => $rna_weights, | |
340 'protein' => $amino_weights, | |
341 ); | |
342 } | |
343 | |
344 sub new { | |
345 my($class,@args) = @_; | |
346 my $self = $class->SUPER::new(@args); | |
347 | |
348 my ($seqobj) = $self->_rearrange([qw(SEQ)],@args); | |
349 unless ($seqobj->isa("Bio::PrimarySeqI")) { | |
350 $self->throw(" SeqStats works only on PrimarySeqI objects \n"); | |
351 } | |
352 if ( !defined $seqobj->alphabet || ! defined $Alphabets{$seqobj->alphabet}) { | |
353 $self->throw("Must have a valid alphabet defined for seq (". | |
354 join(",",keys %Alphabets)); | |
355 } | |
356 $self->{'_seqref'} = $seqobj; | |
357 # check the letters in the sequence | |
358 $self->{'_is_strict'} = _is_alphabet_strict($seqobj); | |
359 return $self; | |
360 } | |
361 | |
362 =head2 count_monomers | |
363 | |
364 Title : count_monomers | |
365 Usage : $rcount = $seq_stats->count_monomers(); | |
366 or $rcount = $seq_stats->Bio::Tools::SeqStats->($seqobj); | |
367 Function: Counts the number of each type of monomer (amino acid or | |
368 base) in the sequence. | |
369 Ts are counted as Us in RNA sequences. | |
370 Example : | |
371 Returns : Reference to a hash in which keys are letters of the | |
372 genetic alphabet used and values are number of occurrences | |
373 of the letter in the sequence. | |
374 Args : None or reference to sequence object | |
375 Throws : Throws an exception if type of sequence is unknown (ie amino | |
376 or nucleic)or if unknown letter in alphabet. Ambiguous | |
377 elements are allowed. | |
378 | |
379 =cut | |
380 | |
381 sub count_monomers{ | |
382 my %count = (); | |
383 my $seqobj; | |
384 my $_is_strict; | |
385 my $element = ''; | |
386 my $_is_instance = 1 ; | |
387 my $self = shift @_; | |
388 my $object_argument = shift @_; | |
389 | |
390 # First we need to determine if the present object is an instance | |
391 # object or if the sequence object has been passed as an argument | |
392 | |
393 if (defined $object_argument) { | |
394 $_is_instance = 0; | |
395 } | |
396 | |
397 # If we are using an instance object... | |
398 if ($_is_instance) { | |
399 if ($self->{'_monomer_count'}) { | |
400 return $self->{'_monomer_count'}; # return count if previously calculated | |
401 } | |
402 $_is_strict = $self->{'_is_strict'}; # retrieve "strictness" | |
403 $seqobj = $self->{'_seqref'}; | |
404 } else { | |
405 # otherwise... | |
406 $seqobj = $object_argument; | |
407 | |
408 # Following two lines lead to error in "throw" routine | |
409 $seqobj->isa("Bio::PrimarySeqI") || | |
410 $self->throw(" SeqStats works only on PrimarySeqI objects \n"); | |
411 # is alphabet OK? Is it strict? | |
412 $_is_strict = _is_alphabet_strict($seqobj); | |
413 } | |
414 | |
415 my $alphabet = $_is_strict ? $Alphabets_strict{$seqobj->alphabet} : | |
416 $Alphabets{$seqobj->alphabet} ; # get array of allowed letters | |
417 | |
418 # convert everything to upper case to be safe | |
419 my $seqstring = uc $seqobj->seq(); | |
420 | |
421 # Since T is used in RichSeq RNA sequences, do conversion locally | |
422 $seqstring =~ s/T/U/g if $seqobj->alphabet eq 'rna'; | |
423 | |
424 # For each letter, count the number of times it appears in | |
425 # the sequence | |
426 LETTER: | |
427 foreach $element (@$alphabet) { | |
428 # skip terminator symbol which may confuse regex | |
429 next LETTER if $element eq '*'; | |
430 $count{$element} = ( $seqstring =~ s/$element/$element/g); | |
431 } | |
432 | |
433 if ($_is_instance) { | |
434 $self->{'_monomer_count'} = \%count; # Save in case called again later | |
435 } | |
436 | |
437 return \%count; | |
438 } | |
439 | |
440 =head2 get_mol_wt | |
441 | |
442 Title : get_mol_wt | |
443 Usage : $wt = $seqobj->get_mol_wt() or | |
444 $wt = Bio::Tools::SeqStats ->get_mol_wt($seqobj); | |
445 Function: Calculate molecular weight of sequence | |
446 Ts are counted as Us in RNA sequences. | |
447 Example : | |
448 | |
449 Returns : Reference to two element array containing lower and upper | |
450 bounds of molecule molecular weight. (For dna (and rna) | |
451 sequences, single-stranded weights are returned.) If | |
452 sequence contains no ambiguous elements, both entries in | |
453 array are equal to molecular weight of molecule. | |
454 Args : None or reference to sequence object | |
455 Throws : Exception if type of sequence is unknown (ie not amino or | |
456 nucleic) or if unknown letter in alphabet. Ambiguous | |
457 elements are allowed. | |
458 | |
459 =cut | |
460 | |
461 sub get_mol_wt { | |
462 | |
463 my $seqobj; | |
464 my $_is_strict; | |
465 my $element = ''; | |
466 my $_is_instance = 1 ; | |
467 my $self = shift @_; | |
468 my $object_argument = shift @_; | |
469 my ($weight_array, $rcount); | |
470 | |
471 if (defined $object_argument) { | |
472 $_is_instance = 0; | |
473 } | |
474 | |
475 if ($_is_instance) { | |
476 if ($weight_array = $self->{'_mol_wt'}) { | |
477 # return mol. weight if previously calculated | |
478 return $weight_array; | |
479 } | |
480 $seqobj = $self->{'_seqref'}; | |
481 $rcount = $self->count_monomers(); | |
482 } else { | |
483 $seqobj = $object_argument; | |
484 $seqobj->isa("Bio::PrimarySeqI") || | |
485 die("Error: SeqStats works only on PrimarySeqI objects \n"); | |
486 $_is_strict = _is_alphabet_strict($seqobj); # is alphabet OK? | |
487 $rcount = $self->count_monomers($seqobj); | |
488 } | |
489 | |
490 # We will also need to know what type of monomer we are dealing with | |
491 my $moltype = $seqobj->alphabet(); | |
492 | |
493 # In general,the molecular weight is bounded below by the sum of the | |
494 # weights of lower bounds of each alphabet symbol times the number of | |
495 # occurrences of the symbol in the sequence. A similar upper bound on | |
496 # the weight is also calculated. | |
497 | |
498 # Note that for "strict" (ie unambiguous) sequences there is an | |
499 # inefficiency since the upper bound = the lower bound (and is | |
500 # calculated twice). However, this decrease in performance will be | |
501 # minor and leads to (IMO) significantly more readable code. | |
502 | |
503 my $weight_lower_bound = 0; | |
504 my $weight_upper_bound = 0; | |
505 my $weight_table = $Weights{$moltype}; | |
506 | |
507 | |
508 # compute weight of all the residues | |
509 foreach $element (keys %$rcount) { | |
510 $weight_lower_bound += $$rcount{$element} * $$weight_table{$element}->[0]; | |
511 $weight_upper_bound += $$rcount{$element} * $$weight_table{$element}->[1]; | |
512 } | |
513 if ($moltype =~ /protein/) { | |
514 # remove of H2O during peptide bond formation. | |
515 $weight_lower_bound -= $water * ($seqobj->length - 1); | |
516 $weight_upper_bound -= $water * ($seqobj->length - 1); | |
517 } else { | |
518 # Correction because phosphate of 5' residue has additional OH and | |
519 # sugar ring of 3' residue has additional H | |
520 $weight_lower_bound += $water; | |
521 $weight_upper_bound += $water; | |
522 } | |
523 | |
524 $weight_lower_bound = sprintf("%.0f", $weight_lower_bound); | |
525 $weight_upper_bound = sprintf("%.0f", $weight_upper_bound); | |
526 | |
527 $weight_array = [$weight_lower_bound, $weight_upper_bound]; | |
528 | |
529 if ($_is_instance) { | |
530 $self->{'_mol_wt'} = $weight_array; # Save in case called again later | |
531 } | |
532 return $weight_array; | |
533 } | |
534 | |
535 | |
536 =head2 count_codons | |
537 | |
538 Title : count_codons | |
539 Usage : $rcount = $seqstats->count_codons (); or | |
540 $rcount = Bio::Tools::SeqStats->count_codons($seqobj); | |
541 | |
542 Function: Counts the number of each type of codons in a given frame | |
543 for a dna or rna sequence. | |
544 Example : | |
545 Returns : Reference to a hash in which keys are codons of the genetic | |
546 alphabet used and values are number of occurrences of the | |
547 codons in the sequence. All codons with "ambiguous" bases | |
548 are counted together. | |
549 Args : None or reference to sequence object | |
550 | |
551 Throws : an exception if type of sequence is unknown or protein. | |
552 | |
553 =cut | |
554 | |
555 sub count_codons { | |
556 my $rcount = {}; | |
557 my $codon ; | |
558 my $seqobj; | |
559 my $_is_strict; | |
560 my $element = ''; | |
561 my $_is_instance = 1 ; | |
562 my $self = shift @_; | |
563 my $object_argument = shift @_; | |
564 | |
565 if (defined $object_argument) { | |
566 $_is_instance = 0; | |
567 } | |
568 | |
569 if ($_is_instance) { | |
570 if ($rcount = $self->{'_codon_count'}) { | |
571 return $rcount; # return count if previously calculated | |
572 } | |
573 $_is_strict = $self->{'_is_strict'}; # retrieve "strictness" | |
574 $seqobj = $self->{'_seqref'}; | |
575 } else { | |
576 $seqobj = $object_argument; | |
577 $seqobj->isa("Bio::PrimarySeqI") || | |
578 die(" Error: SeqStats works only on PrimarySeqI objects \n"); | |
579 $_is_strict = _is_alphabet_strict($seqobj); | |
580 } | |
581 | |
582 # Codon counts only make sense for nucleic acid sequences | |
583 my $alphabet = $seqobj->alphabet(); | |
584 | |
585 unless ($alphabet =~ /[dr]na/) { | |
586 $seqobj->throw(" Codon counts only meaningful for dna or rna, ". | |
587 "not for $alphabet sequences. \n"); | |
588 } | |
589 | |
590 # If sequence contains ambiguous bases, warn that codons | |
591 # containing them will all be lumped together in the count. | |
592 | |
593 if (!$_is_strict ) { | |
594 $seqobj->warn(" Sequence $seqobj contains ambiguous bases. \n". | |
595 " All codons with ambiguous bases will be added together in count. \n"); | |
596 } | |
597 | |
598 my $seq = $seqobj->seq(); | |
599 | |
600 # Now step through the string by threes and count the codons | |
601 | |
602 CODON: | |
603 while (length($seq) > 2) { | |
604 $codon = substr($seq,0,3); | |
605 $seq = substr($seq,3); | |
606 if ($codon =~ /[^ACTGU]/) { | |
607 $$rcount{'ambiguous'}++; #lump together ambiguous codons | |
608 next CODON; | |
609 } | |
610 if (!defined $$rcount{$codon}) { | |
611 $$rcount{$codon}= 1 ; | |
612 next CODON; | |
613 } | |
614 $$rcount{$codon}++; # default | |
615 } | |
616 | |
617 | |
618 if ($_is_instance) { | |
619 $self->{'_codon_count'} = $rcount; # Save in case called again later | |
620 } | |
621 | |
622 return $rcount; | |
623 } | |
624 | |
625 | |
626 =head2 _is_alphabet_strict | |
627 | |
628 Title : _is_alphabet_strict | |
629 Usage : | |
630 Function: internal function to determine whether there are | |
631 any ambiguous elements in the current sequence | |
632 Example : | |
633 Returns : 1 if strict alphabet is being used, | |
634 0 if ambiguous elements are present | |
635 Args : | |
636 | |
637 Throws : an exception if type of sequence is unknown (ie amino or | |
638 nucleic) or if unknown letter in alphabet. Ambiguous | |
639 monomers are allowed. | |
640 | |
641 =cut | |
642 | |
643 sub _is_alphabet_strict { | |
644 | |
645 my ($seqobj) = @_; | |
646 my $moltype = $seqobj->alphabet(); | |
647 | |
648 # convert everything to upper case to be safe | |
649 my $seqstring = uc $seqobj->seq(); | |
650 | |
651 # Since T is used in RichSeq RNA sequences, do conversion locally | |
652 $seqstring =~ s/T/U/g if $seqobj->alphabet eq 'rna'; | |
653 | |
654 # First we check if only the 'strict' letters are present in the | |
655 # sequence string If not, we check whether the remaining letters | |
656 # are ambiguous monomers or whether there are illegal letters in | |
657 # the string | |
658 | |
659 # $alpha_array is a ref to an array of the 'strictly' allowed letters | |
660 my $alpha_array = $Alphabets_strict{$moltype} ; | |
661 | |
662 # $alphabet contains the allowed letters in string form | |
663 my $alphabet = join ('', @$alpha_array) ; | |
664 unless ($seqstring =~ /[^$alphabet]/) { | |
665 return 1 ; | |
666 } | |
667 | |
668 # Next try to match with the alphabet's ambiguous letters | |
669 $alpha_array = $Alphabets{$moltype} ; | |
670 $alphabet = join ('', @$alpha_array) ; | |
671 | |
672 unless ($seqstring =~ /[^$alphabet]/) { | |
673 return 0 ; | |
674 } | |
675 | |
676 # If we got here there is an illegal letter in the sequence | |
677 $seqobj->throw(" Alphabet not OK for $seqobj \n"); | |
678 | |
679 } | |
680 | |
681 =head2 _print_data | |
682 | |
683 Title : _print_data | |
684 Usage : $seqobj->_print_data() or Bio::Tools::SeqStats->_print_data(); | |
685 Function: Displays dna / rna parameters (used for debugging) | |
686 Returns : 1 | |
687 Args : None | |
688 | |
689 Used for debugging. | |
690 | |
691 =cut | |
692 | |
693 sub _print_data { | |
694 | |
695 print "\n adenine = : $adenine \n"; | |
696 print "\n guanine = : $guanine \n"; | |
697 print "\n cytosine = : $cytosine \n"; | |
698 print "\n thymine = : $thymine \n"; | |
699 print "\n uracil = : $uracil \n"; | |
700 | |
701 print "\n dna_A_wt = : $dna_A_wt \n"; | |
702 print "\n dna_C_wt = : $dna_C_wt \n"; | |
703 print "\n dna_G_wt = : $dna_G_wt \n"; | |
704 print "\n dna_T_wt = : $dna_T_wt \n"; | |
705 | |
706 print "\n rna_A_wt = : $rna_A_wt \n"; | |
707 print "\n rna_C_wt = : $rna_C_wt \n"; | |
708 print "\n rna_G_wt = : $rna_G_wt \n"; | |
709 print "\n rna_U_wt = : $rna_U_wt \n"; | |
710 | |
711 return 1; | |
712 } |