annotate variant_effect_predictor/Bio/Tools/SeqStats.pm @ 0:2bc9b66ada89 draft default tip

Uploaded
author mahtabm
date Thu, 11 Apr 2013 06:29:17 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
1 # $Id: SeqStats.pm,v 1.16.2.1 2003/02/28 13:17:06 heikki Exp $
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
2 #
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
3 # BioPerl module for Bio::Tools::SeqStats
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
4 #
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
5 # Cared for by
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
6 #
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
7 # Copyright Peter Schattner
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
8 #
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
9 # You may distribute this module under the same terms as perl itself
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
10
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
11 # POD documentation - main docs before the code
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
12
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
13 =head1 NAME
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
14
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
15 Bio::Tools::SeqStats - Object holding statistics for one particular sequence
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
16
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
17 =head1 SYNOPSIS
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
18
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
19 # build a primary nucleic acid or protein sequence object somehow
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
20 # then build a statistics object from the sequence object
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
21
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
22 $seqobj = Bio::PrimarySeq->new(-seq=>'ACTGTGGCGTCAACTG',
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
23 -alphabet=>'dna',
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
24 -id=>'test');
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
25 $seq_stats = Bio::Tools::SeqStats->new(-seq=>$seqobj);
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
26
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
27 # obtain a hash of counts of each type of monomer
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
28 # (ie amino or nucleic acid)
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
29 print "\nMonomer counts using statistics object\n";
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
30 $seq_stats = Bio::Tools::SeqStats->new(-seq=>$seqobj);
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
31 $hash_ref = $seq_stats->count_monomers(); # eg for DNA sequence
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
32 foreach $base (sort keys %$hash_ref) {
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
33 print "Number of bases of type ", $base, "= ", $hash_ref->{$base},"\n";
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
34 }
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
35
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
36 # or obtain the count directly without creating a new statistics object
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
37 print "\nMonomer counts without statistics object\n";
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
38 $hash_ref = Bio::Tools::SeqStats->count_monomers($seqobj);
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
39 foreach $base (sort keys %$hash_ref) {
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
40 print "Number of bases of type ", $base, "= ", $hash_ref->{$base},"\n";
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
41 }
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
42
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
43
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
44 # obtain hash of counts of each type of codon in a nucleic acid sequence
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
45 print "\nCodon counts using statistics object\n";
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
46 $hash_ref = $seq_stats-> count_codons(); # for nucleic acid sequence
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
47 foreach $base (sort keys %$hash_ref) {
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
48 print "Number of codons of type ", $base, "= ", $hash_ref->{$base},"\n";
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
49 }
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
50
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
51 # or
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
52 print "\nCodon counts without statistics object\n";
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
53 $hash_ref = Bio::Tools::SeqStats->count_codons($seqobj);
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
54 foreach $base (sort keys %$hash_ref) {
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
55 print "Number of codons of type ", $base, "= ", $hash_ref->{$base},"\n";
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
56 }
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
57
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
58 # Obtain the molecular weight of a sequence. Since the sequence may contain
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
59 # ambiguous monomers, the molecular weight is returned as a (reference to) a
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
60 # two element array containing greatest lower bound (GLB) and least upper bound
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
61 # (LUB) of the molecular weight
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
62 $weight = $seq_stats->get_mol_wt();
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
63 print "\nMolecular weight (using statistics object) of sequence ", $seqobj->id(),
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
64 " is between ", $$weight[0], " and " ,
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
65 $$weight[1], "\n";
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
66
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
67 # or
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
68 $weight = Bio::Tools::SeqStats->get_mol_wt($seqobj);
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
69 print "\nMolecular weight (without statistics object) of sequence ", $seqobj->id(),
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
70 " is between ", $$weight[0], " and " ,
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
71 $$weight[1], "\n";
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
72
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
73
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
74 =head1 DESCRIPTION
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
75
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
76 Bio::Tools::SeqStats is a lightweight object for the calculation of
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
77 simple statistical and numerical properties of a sequence. By
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
78 "lightweight" I mean that only "primary" sequences are handled by the
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
79 object. The calling script needs to create the appropriate primary
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
80 sequence to be passed to SeqStats if statistics on a sequence feature
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
81 are required. Similarly if a codon count is desired for a
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
82 frame-shifted sequence and/or a negative strand sequence, the calling
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
83 script needs to create that sequence and pass it to the SeqStats
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
84 object.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
85
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
86 Note that nucleotide sequences in bioperl do not strictly separate RNA
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
87 and DNA sequences. By convention, sequences from RNA molecules are
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
88 shown as if they were DNA. Objects are supposed to make the
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
89 distinction when needed. This class is one of the few where this
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
90 distinction needs to be made. Internally, it changes all Ts into Us
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
91 before weight and monomer count.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
92
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
93
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
94 SeqStats can be called in two distinct manners. If only a single
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
95 computation is required on a given sequence object, the method can be
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
96 called easily using the SeqStats object directly:
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
97
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
98 $weight = Bio::Tools::SeqStats->get_mol_wt($seqobj);
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
99
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
100 Alternately, if several computations will be required on a given
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
101 sequence object, an "instance" statistics object can be constructed
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
102 and used for the method calls:
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
103
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
104 $seq_stats = Bio::Tools::SeqStats->new($seqobj);
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
105 $monomers = $seq_stats->count_monomers();
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
106 $codons = $seq_stats->count_codons();
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
107 $weight = $seq_stats->get_mol_wt();
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
108
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
109 As currently implemented the object can return the following values
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
110 from a sequence:
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
111
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
112 =over 3
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
113
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
114 =item *
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
115
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
116 The molecular weight of the sequence: get_mol_wt()
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
117
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
118 =item *
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
119
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
120 The number of each type of monomer present: count_monomers()
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
121
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
122 =item *
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
123
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
124 The number of each codon present in a nucleic acid sequence:
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
125 count_codons()
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
126
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
127 =back
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
128
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
129 For dna (and rna) sequences, single-stranded weights are returned. The
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
130 molecular weights are calculated for neutral - ie not ionized -
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
131 nucleic acids. The returned weight is the sum of the
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
132 base-sugar-phosphate residues of the chain plus one weight of water to
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
133 account for the additional OH on the phosphate of the 5' residue
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
134 and the additional H on the sugar ring of the 3' residue. Note that
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
135 this leads to a difference of 18 in calculated molecular weights
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
136 compared to some other available programs (eg Informax VectorNTI).
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
137
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
138 Note that since sequences may contain ambiguous monomers (eg "M"
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
139 meaning "A" or "C" in a nucleic acid sequence), the method get_mol_wt
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
140 returns a two-element array containing the greatest lower bound and
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
141 least upper bound of the molecule. (For a sequence with no ambiguous
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
142 monomers, the two elements of the returned array will be equal.) The
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
143 method count_codons() handles ambiguous bases by simply counting all
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
144 ambiguous codons together and issuing a warning to that effect.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
145
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
146
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
147 =head1 DEVELOPERS NOTES
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
148
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
149 Ewan moved it from Bio::SeqStats to Bio::Tools::SeqStats
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
150
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
151 =head1 FEEDBACK
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
152
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
153 =head2 Mailing Lists
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
154
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
155 User feedback is an integral part of the evolution of this and other
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
156 Bioperl modules. Send your comments and suggestions preferably to one
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
157 of the Bioperl mailing lists. Your participation is much appreciated.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
158
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
159 bioperl-l@bioperl.org - General discussion
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
160 http://bio.perl.org/MailList.html - About the mailing lists
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
161
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
162 =head2 Reporting Bugs
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
163
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
164 Report bugs to the Bioperl bug tracking system to help us keep track
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
165 of the bugs and their resolution.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
166 Bug reports can be submitted via email or the web:
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
167
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
168 bioperl-bugs@bio.perl.org
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
169 http://bugzilla.bioperl.org/
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
170
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
171 =head1 AUTHOR - Peter Schattner
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
172
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
173 Email schattner@alum.mit.edu
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
174
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
175 =head1 APPENDIX
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
176
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
177 The rest of the documentation details each of the object
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
178 methods. Internal methods are usually preceded with a _
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
179
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
180 =cut
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
181
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
182
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
183 package Bio::Tools::SeqStats;
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
184 use strict;
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
185 use vars qw(@ISA %Alphabets %Alphabets_strict $amino_weights
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
186 $rna_weights $dna_weights %Weights );
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
187 use Bio::Seq;
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
188 use Bio::Root::Root;
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
189 @ISA = qw(Bio::Root::Root);
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
190
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
# Build the package-wide lookup tables once, at compile time:
#   %Alphabets        - symbols accepted for each alphabet, ambiguity codes included
#   %Alphabets_strict - unambiguous symbols only
#   %Weights          - alphabet name => { symbol => [lower bound, upper bound] }
#                       molecular weight; both bounds are equal for an
#                       unambiguous symbol
BEGIN {
    %Alphabets = (
        'dna'     => [ qw(A C G T R Y M K S W H B V D X N) ],
        'rna'     => [ qw(A C G U R Y M K S W H B V D X N) ],
        'protein' => [ qw(A R N D C Q E G H I L K M F
                          P S T W X Y V B Z *) ],   # sac: added B, Z
    );

    # SAC: new strict alphabet: doesn't allow any ambiguity characters.
    %Alphabets_strict = (
        'dna'     => [ qw( A C G T ) ],
        'rna'     => [ qw( A C G U ) ],
        'protein' => [ qw(A R N D C Q E G H I L K M F
                          P S T W Y V) ],
    );

    # IUPAC-IUB SYMBOLS FOR NUCLEOTIDE NOMENCLATURE:
    #   Cornish-Bowden (1985) Nucl. Acids Res. 13: 3021-3030.

    # ---- Amino acid alphabet -------------------------------------------
    # Molecular weight of each unambiguous amino acid.
    my $amino_A_wt =  89.09;
    my $amino_C_wt = 121.15;
    my $amino_D_wt = 133.1;
    my $amino_E_wt = 147.13;
    my $amino_F_wt = 165.19;
    my $amino_G_wt =  75.07;
    my $amino_H_wt = 155.16;
    my $amino_I_wt = 131.18;
    my $amino_K_wt = 146.19;
    my $amino_L_wt = 131.18;
    my $amino_M_wt = 149.22;
    my $amino_N_wt = 132.12;
    my $amino_P_wt = 115.13;
    my $amino_Q_wt = 146.15;
    my $amino_R_wt = 174.21;
    my $amino_S_wt = 105.09;
    my $amino_T_wt = 119.12;
    my $amino_V_wt = 117.15;
    my $amino_W_wt = 204.22;
    my $amino_Y_wt = 181.19;

    # Ambiguity codes (B, Z, X) span the lightest and heaviest residue
    # they can stand for.
    # NOTE(review): '*' is listed in the protein alphabet but has no entry
    # here - confirm how the weight calculation treats it.
    $amino_weights = {
        'A' => [$amino_A_wt, $amino_A_wt],   # Alanine
        'B' => [$amino_N_wt, $amino_D_wt],   # Aspartic Acid, Asparagine
        'C' => [$amino_C_wt, $amino_C_wt],   # Cysteine
        'D' => [$amino_D_wt, $amino_D_wt],   # Aspartic Acid
        'E' => [$amino_E_wt, $amino_E_wt],   # Glutamic Acid
        'F' => [$amino_F_wt, $amino_F_wt],   # Phenylalanine
        'G' => [$amino_G_wt, $amino_G_wt],   # Glycine
        'H' => [$amino_H_wt, $amino_H_wt],   # Histidine
        'I' => [$amino_I_wt, $amino_I_wt],   # Isoleucine
        'K' => [$amino_K_wt, $amino_K_wt],   # Lysine
        'L' => [$amino_L_wt, $amino_L_wt],   # Leucine
        'M' => [$amino_M_wt, $amino_M_wt],   # Methionine
        'N' => [$amino_N_wt, $amino_N_wt],   # Asparagine
        'P' => [$amino_P_wt, $amino_P_wt],   # Proline
        'Q' => [$amino_Q_wt, $amino_Q_wt],   # Glutamine
        'R' => [$amino_R_wt, $amino_R_wt],   # Arginine
        'S' => [$amino_S_wt, $amino_S_wt],   # Serine
        'T' => [$amino_T_wt, $amino_T_wt],   # Threonine
        'V' => [$amino_V_wt, $amino_V_wt],   # Valine
        'W' => [$amino_W_wt, $amino_W_wt],   # Tryptophan
        'X' => [$amino_G_wt, $amino_W_wt],   # Unknown
        'Y' => [$amino_Y_wt, $amino_Y_wt],   # Tyrosine
        'Z' => [$amino_Q_wt, $amino_E_wt],   # Glutamic Acid, Glutamine
    };

    # ---- Extended Dna / Rna alphabet -----------------------------------
    # $dna_weights, $rna_weights and %Weights are already declared by the
    # file-level "use vars", so only the names local to this table are
    # declared here.  ($ppi is declared but not assigned in this block.)
    use vars ( qw($C $O $N $H $P $water) );
    use vars ( qw($adenine $guanine $cytosine $thymine $uracil) );
    use vars ( qw($ribose_phosphate $deoxyribose_phosphate $ppi) );
    use vars ( qw($dna_A_wt $dna_C_wt $dna_G_wt $dna_T_wt
                  $rna_A_wt $rna_C_wt $rna_G_wt $rna_U_wt) );

    # Atomic weights and the weight of one water molecule.
    $C     = 12.01;
    $O     = 16.00;
    $N     = 14.01;
    $H     = 1.01;
    $P     = 30.97;
    $water = 18.015;

    # Base weights summed from their atom counts.
    $adenine  = 5 * $C + 5 * $N + 5 * $H;
    $guanine  = 5 * $C + 5 * $N + 1 * $O + 5 * $H;
    $cytosine = 4 * $C + 3 * $N + 1 * $O + 5 * $H;
    $thymine  = 5 * $C + 2 * $N + 2 * $O + 6 * $H;
    $uracil   = 4 * $C + 2 * $N + 2 * $O + 4 * $H;

    $ribose_phosphate      = 5 * $C + 7 * $O + 9 * $H + 1 * $P;   # neutral (unionized) form
    $deoxyribose_phosphate = 5 * $C + 6 * $O + 9 * $H + 1 * $P;

    # the following are single strand molecular weights / base
    # (base + sugar-phosphate, minus one water)
    $dna_A_wt = $adenine  + $deoxyribose_phosphate - $water;
    $dna_C_wt = $cytosine + $deoxyribose_phosphate - $water;
    $dna_G_wt = $guanine  + $deoxyribose_phosphate - $water;
    $dna_T_wt = $thymine  + $deoxyribose_phosphate - $water;

    $rna_A_wt = $adenine  + $ribose_phosphate - $water;
    $rna_C_wt = $cytosine + $ribose_phosphate - $water;
    $rna_G_wt = $guanine  + $ribose_phosphate - $water;
    $rna_U_wt = $uracil   + $ribose_phosphate - $water;

    # For an ambiguity code the pair holds the lightest and the heaviest
    # of the bases the code can stand for.
    $dna_weights = {
        'A' => [$dna_A_wt, $dna_A_wt],   # Adenine
        'C' => [$dna_C_wt, $dna_C_wt],   # Cytosine
        'G' => [$dna_G_wt, $dna_G_wt],   # Guanine
        'T' => [$dna_T_wt, $dna_T_wt],   # Thymine
        'M' => [$dna_C_wt, $dna_A_wt],   # A or C
        'R' => [$dna_A_wt, $dna_G_wt],   # A or G
        'W' => [$dna_T_wt, $dna_A_wt],   # A or T
        'S' => [$dna_C_wt, $dna_G_wt],   # C or G
        'Y' => [$dna_C_wt, $dna_T_wt],   # C or T
        'K' => [$dna_T_wt, $dna_G_wt],   # G or T
        'V' => [$dna_C_wt, $dna_G_wt],   # A or C or G
        'H' => [$dna_C_wt, $dna_A_wt],   # A or C or T
        'D' => [$dna_T_wt, $dna_G_wt],   # A or G or T
        'B' => [$dna_C_wt, $dna_G_wt],   # C or G or T
        'X' => [$dna_C_wt, $dna_G_wt],   # G or A or T or C
        'N' => [$dna_C_wt, $dna_G_wt],   # G or A or T or C
    };

    $rna_weights = {
        'A' => [$rna_A_wt, $rna_A_wt],   # Adenine
        'C' => [$rna_C_wt, $rna_C_wt],   # Cytosine
        'G' => [$rna_G_wt, $rna_G_wt],   # Guanine
        'U' => [$rna_U_wt, $rna_U_wt],   # Uracil
        'M' => [$rna_C_wt, $rna_A_wt],   # A or C
        'R' => [$rna_A_wt, $rna_G_wt],   # A or G
        'W' => [$rna_U_wt, $rna_A_wt],   # A or U
        'S' => [$rna_C_wt, $rna_G_wt],   # C or G
        'Y' => [$rna_C_wt, $rna_U_wt],   # C or U
        'K' => [$rna_U_wt, $rna_G_wt],   # G or U
        'V' => [$rna_C_wt, $rna_G_wt],   # A or C or G
        'H' => [$rna_C_wt, $rna_A_wt],   # A or C or U
        'D' => [$rna_U_wt, $rna_G_wt],   # A or G or U
        'B' => [$rna_C_wt, $rna_G_wt],   # C or G or U
        'X' => [$rna_C_wt, $rna_G_wt],   # G or A or U or C
        'N' => [$rna_C_wt, $rna_G_wt],   # G or A or U or C
    };

    # Weight table lookup keyed by alphabet name.
    %Weights = (
        'dna'     => $dna_weights,
        'rna'     => $rna_weights,
        'protein' => $amino_weights,
    );
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
343
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
=head2 new

 Title   : new
 Usage   : $seq_stats = Bio::Tools::SeqStats->new(-seq => $seqobj);
 Function: Creates a statistics object bound to a single sequence object.
 Returns : A Bio::Tools::SeqStats object
 Args    : -seq => an object implementing Bio::PrimarySeqI whose
           alphabet() is one of the keys of %Alphabets
           ('dna', 'rna' or 'protein')
 Throws  : if -seq is missing or is not a Bio::PrimarySeqI object, or
           if its alphabet is undefined or not a known alphabet

=cut

sub new {
    my ($class, @args) = @_;
    my $self = $class->SUPER::new(@args);

    my ($seqobj) = $self->_rearrange([qw(SEQ)], @args);

    # Check blessed() first: calling ->isa on undef (no -seq given) or on
    # an unblessed reference would die with a raw Perl error instead of
    # the intended exception below.
    require Scalar::Util;
    unless (Scalar::Util::blessed($seqobj)
            && $seqobj->isa("Bio::PrimarySeqI")) {
        $self->throw(" SeqStats works only on PrimarySeqI objects \n");
    }

    my $alphabet = $seqobj->alphabet;
    if (!defined $alphabet || !defined $Alphabets{$alphabet}) {
        # Keys are sorted so the message is stable between runs.
        $self->throw("Must have a valid alphabet defined for seq (".
                     join(",", sort keys %Alphabets) . ")");
    }

    $self->{'_seqref'} = $seqobj;
    # check the letters in the sequence; the result is cached so later
    # calls need not re-scan the sequence
    $self->{'_is_strict'} = _is_alphabet_strict($seqobj);
    return $self;
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
361
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
362 =head2 count_monomers
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
363
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
364 Title : count_monomers
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
365 Usage : $rcount = $seq_stats->count_monomers();
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
366 or $rcount = Bio::Tools::SeqStats->count_monomers($seqobj);
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
367 Function: Counts the number of each type of monomer (amino acid or
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
368 base) in the sequence.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
369 Ts are counted as Us in RNA sequences.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
370 Example :
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
371 Returns : Reference to a hash in which keys are letters of the
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
372 genetic alphabet used and values are number of occurrences
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
373 of the letter in the sequence.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
374 Args : None or reference to sequence object
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
375 Throws : Throws an exception if type of sequence is unknown (ie amino
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
376 or nucleic) or if unknown letter in alphabet. Ambiguous
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
377 elements are allowed.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
378
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
379 =cut
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
380
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
sub count_monomers {
    # Count how often each letter of the applicable alphabet occurs.
    #   Args   : none (use the sequence stored in this object), or a
    #            Bio::PrimarySeqI object to analyse instead.
    #   Returns: hash ref, letter => number of occurrences.  Letters that
    #            do not occur are reported as 0.  The terminator symbol
    #            '*' is never counted.
    #   Notes  : the sequence is upper-cased first, and Ts are counted as
    #            Us for 'rna' sequences.  Results are cached on the object
    #            only when no sequence argument was given.
    my ($self, $object_argument) = @_;

    # A sequence passed as argument means we must not touch the cache
    # of the invocant.
    my $is_instance = !defined $object_argument;

    my ($seqobj, $is_strict);
    if ($is_instance) {
        # return count if previously calculated
        return $self->{'_monomer_count'} if $self->{'_monomer_count'};

        $is_strict = $self->{'_is_strict'};    # retrieve "strictness"
        $seqobj    = $self->{'_seqref'};
    }
    else {
        $seqobj = $object_argument;
        $seqobj->isa("Bio::PrimarySeqI") ||
            $self->throw(" SeqStats works only on PrimarySeqI objects \n");
        # is alphabet OK? Is it strict?
        $is_strict = _is_alphabet_strict($seqobj);
    }

    my $moltype = $seqobj->alphabet;

    # array of allowed letters
    my $alphabet = $is_strict ? $Alphabets_strict{$moltype}
                              : $Alphabets{$moltype};

    # convert everything to upper case to be safe
    my $seqstring = uc $seqobj->seq();

    # Since T is used in RichSeq RNA sequences, do conversion locally
    $seqstring =~ s/T/U/g if $moltype eq 'rna';

    my %count;
  LETTER:
    foreach my $letter (@$alphabet) {
        # the terminator symbol is not a monomer
        next LETTER if $letter eq '*';

        # s///g yields the number of substitutions, but the empty string
        # when nothing matched; normalise that to a numeric 0 so callers
        # can do arithmetic without warnings.  \Q..\E keeps any
        # non-letter symbol from being read as a regex metacharacter.
        $count{$letter} = ( $seqstring =~ s/\Q$letter\E/$letter/g ) || 0;
    }

    # Save in case called again later
    $self->{'_monomer_count'} = \%count if $is_instance;

    return \%count;
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
439
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
440 =head2 get_mol_wt
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
441
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
442 Title : get_mol_wt
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
443 Usage : $wt = $seqobj->get_mol_wt() or
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
444 $wt = Bio::Tools::SeqStats ->get_mol_wt($seqobj);
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
445 Function: Calculate molecular weight of sequence
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
446 Ts are counted as Us in RNA sequences.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
447 Example :
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
448
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
449 Returns : Reference to two element array containing lower and upper
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
450 bounds of molecule molecular weight. (For dna (and rna)
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
451 sequences, single-stranded weights are returned.) If
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
452 sequence contains no ambiguous elements, both entries in
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
453 array are equal to molecular weight of molecule.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
454 Args : None or reference to sequence object
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
455 Throws : Exception if type of sequence is unknown (ie not amino or
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
456 nucleic) or if unknown letter in alphabet. Ambiguous
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
457 elements are allowed.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
458
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
459 =cut
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
460
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
sub get_mol_wt {
    # Compute lower and upper bounds of the molecular weight.
    #   Args   : none (use the sequence stored in this object), or a
    #            Bio::PrimarySeqI object to analyse instead.
    #   Returns: array ref [lower_bound, upper_bound], each rounded to
    #            the nearest integer via sprintf("%.0f").
    #   Notes  : the result is cached on the object only when no sequence
    #            argument was given.
    my ($self, $object_argument) = @_;

    my $is_instance = !defined $object_argument;
    my ($seqobj, $rcount);

    if ($is_instance) {
        # return mol. weight if previously calculated
        return $self->{'_mol_wt'} if $self->{'_mol_wt'};

        $seqobj = $self->{'_seqref'};
        $rcount = $self->count_monomers();
    }
    else {
        $seqobj = $object_argument;
        $seqobj->isa("Bio::PrimarySeqI") ||
            die("Error: SeqStats works only on PrimarySeqI objects \n");
        # Called only for its validation side effect (it throws on
        # illegal letters); the strictness flag itself is not needed here.
        _is_alphabet_strict($seqobj);
        $rcount = $self->count_monomers($seqobj);
    }

    # We will also need to know what type of monomer we are dealing with
    my $moltype      = $seqobj->alphabet();
    my $weight_table = $Weights{$moltype};

    # The molecular weight is bounded below by the sum, over all symbols,
    # of (lower weight of symbol) x (occurrences of symbol); the upper
    # bound is computed the same way.  For unambiguous sequences both
    # bounds coincide.
    my $weight_lower_bound = 0;
    my $weight_upper_bound = 0;

    # compute weight of all the residues
    foreach my $element (keys %$rcount) {
        # Absent letters may be recorded as '' or 0.  Skipping them
        # avoids numeric warnings and avoids autovivifying entries in
        # the shared weight table.
        my $occurrences = $rcount->{$element} or next;
        my $bounds      = $weight_table->{$element};
        $weight_lower_bound += $occurrences * $bounds->[0];
        $weight_upper_bound += $occurrences * $bounds->[1];
    }

    my $correction;
    if ($moltype =~ /protein/) {
        # removal of H2O during peptide bond formation
        $correction = -$water * ($seqobj->length - 1);
    }
    else {
        # Correction because phosphate of 5' residue has additional OH and
        # sugar ring of 3' residue has additional H
        $correction = $water;
    }
    $weight_lower_bound += $correction;
    $weight_upper_bound += $correction;

    my $weight_array = [
        sprintf("%.0f", $weight_lower_bound),
        sprintf("%.0f", $weight_upper_bound),
    ];

    # Save in case called again later
    $self->{'_mol_wt'} = $weight_array if $is_instance;

    return $weight_array;
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
534
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
535
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
536 =head2 count_codons
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
537
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
538 Title : count_codons
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
539 Usage : $rcount = $seqstats->count_codons (); or
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
540 $rcount = Bio::Tools::SeqStats->count_codons($seqobj);
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
541
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
542 Function: Counts the number of each type of codons in a given frame
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
543 for a dna or rna sequence.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
544 Example :
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
545 Returns : Reference to a hash in which keys are codons of the genetic
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
546 alphabet used and values are number of occurrences of the
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
547 codons in the sequence. All codons with "ambiguous" bases
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
548 are counted together.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
549 Args : None or reference to sequence object
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
550
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
551 Throws : an exception if type of sequence is unknown or protein.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
552
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
553 =cut
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
554
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
sub count_codons {
    # Count codons in reading frame 1 of a dna or rna sequence.
    #   Args   : none (use the sequence stored in this object), or a
    #            Bio::PrimarySeqI object to analyse instead.
    #   Returns: hash ref, codon => number of occurrences.  Codons are
    #            reported in upper case; every codon containing a letter
    #            other than A, C, G, T or U is counted under the single
    #            key 'ambiguous'.  Always a hash ref, even when the
    #            sequence is shorter than one codon.
    #   Throws : if the sequence alphabet is neither dna nor rna.
    #   Notes  : the result is cached on the object only when no sequence
    #            argument was given.
    my ($self, $object_argument) = @_;

    my $is_instance = !defined $object_argument;
    my ($seqobj, $is_strict);

    if ($is_instance) {
        # return count if previously calculated
        return $self->{'_codon_count'} if $self->{'_codon_count'};

        $is_strict = $self->{'_is_strict'};    # retrieve "strictness"
        $seqobj    = $self->{'_seqref'};
    }
    else {
        $seqobj = $object_argument;
        $seqobj->isa("Bio::PrimarySeqI") ||
            die(" Error: SeqStats works only on PrimarySeqI objects \n");
        $is_strict = _is_alphabet_strict($seqobj);
    }

    # Codon counts only make sense for nucleic acid sequences
    my $alphabet = $seqobj->alphabet();

    unless ($alphabet =~ /[dr]na/) {
        $seqobj->throw(" Codon counts only meaningful for dna or rna, ".
                       "not for $alphabet sequences. \n");
    }

    # If sequence contains ambiguous bases, warn that codons
    # containing them will all be lumped together in the count.
    if (!$is_strict) {
        $seqobj->warn(" Sequence $seqobj contains ambiguous bases. \n".
                      " All codons with ambiguous bases will be added together in count. \n");
    }

    # Upper-case first: the ambiguity test below only knows upper-case
    # bases, so lower-case input would otherwise be counted as ambiguous.
    my $seq = uc $seqobj->seq();

    # Step through the string by threes.  Indexing into the string avoids
    # re-copying the remaining sequence for every codon.
    my %count;
    my $last_start = length($seq) - 3;
    for (my $pos = 0; $pos <= $last_start; $pos += 3) {
        my $codon = substr($seq, $pos, 3);
        # lump together ambiguous codons
        my $key = $codon =~ /[^ACTGU]/ ? 'ambiguous' : $codon;
        $count{$key}++;
    }

    my $rcount = \%count;

    # Save in case called again later
    $self->{'_codon_count'} = $rcount if $is_instance;

    return $rcount;
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
624
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
625
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
626 =head2 _is_alphabet_strict
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
627
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
628 Title : _is_alphabet_strict
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
629 Usage :
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
630 Function: internal function to determine whether there are
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
631 any ambiguous elements in the current sequence
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
632 Example :
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
633 Returns : 1 if strict alphabet is being used,
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
634 0 if ambiguous elements are present
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
635 Args :
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
636
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
637 Throws : an exception if type of sequence is unknown (ie amino or
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
638 nucleic) or if unknown letter in alphabet. Ambiguous
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
639 monomers are allowed.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
640
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
641 =cut
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
642
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
sub _is_alphabet_strict {
    # Internal helper: decide whether a sequence uses only the strict
    # (unambiguous) letters of its alphabet.
    #   Args   : a sequence object providing alphabet(), seq() and throw().
    #   Returns: 1 if only strict letters are present,
    #            0 if ambiguous (but legal) letters are present.
    #   Throws : if the alphabet is unknown, or if the sequence contains
    #            a letter that is not part of the alphabet at all.
    #   Notes  : the sequence is upper-cased first, and Ts are treated as
    #            Us for 'rna' sequences.
    my ($seqobj) = @_;
    my $moltype = $seqobj->alphabet();

    # Look up both letter sets up front so that an unknown alphabet gives
    # a clear exception rather than a deref or regex-compilation error.
    my $strict_letters = defined $moltype ? $Alphabets_strict{$moltype} : undef;
    my $all_letters    = defined $moltype ? $Alphabets{$moltype}        : undef;
    unless (   $strict_letters && @$strict_letters
            && $all_letters    && @$all_letters ) {
        my $shown = defined $moltype ? $moltype : 'undef';
        $seqobj->throw(" Unknown alphabet '$shown' for $seqobj \n");
    }

    # convert everything to upper case to be safe
    my $seqstring = uc $seqobj->seq();

    # Since T is used in RichSeq RNA sequences, do conversion locally
    $seqstring =~ s/T/U/g if $moltype eq 'rna';

    # First check whether only the 'strict' letters are present.
    # quotemeta keeps symbols such as '*' literal inside the class.
    my $strict_class = quotemeta join('', @$strict_letters);
    return 1 unless $seqstring =~ /[^$strict_class]/;

    # Next try to match with the alphabet's ambiguous letters
    my $full_class = quotemeta join('', @$all_letters);
    return 0 unless $seqstring =~ /[^$full_class]/;

    # If we got here there is an illegal letter in the sequence
    $seqobj->throw(" Alphabet not OK for $seqobj \n");
}
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
680
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
681 =head2 _print_data
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
682
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
683 Title : _print_data
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
684 Usage : $seqobj->_print_data() or Bio::Tools::SeqStats->_print_data();
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
685 Function: Displays dna / rna parameters (used for debugging)
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
686 Returns : 1
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
687 Args : None
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
688
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
689 Used for debugging.
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
690
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
691 =cut
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
692
2bc9b66ada89 Uploaded
mahtabm
parents:
diff changeset
sub _print_data {
    # Debugging aid: print the base weights and the per-nucleotide
    # dna / rna weights to the currently selected output handle.
    # Takes no arguments and always returns 1.
    my @rows = (
        [ 'adenine',  $adenine  ],
        [ 'guanine',  $guanine  ],
        [ 'cytosine', $cytosine ],
        [ 'thymine',  $thymine  ],
        [ 'uracil',   $uracil   ],

        [ 'dna_A_wt', $dna_A_wt ],
        [ 'dna_C_wt', $dna_C_wt ],
        [ 'dna_G_wt', $dna_G_wt ],
        [ 'dna_T_wt', $dna_T_wt ],

        [ 'rna_A_wt', $rna_A_wt ],
        [ 'rna_C_wt', $rna_C_wt ],
        [ 'rna_G_wt', $rna_G_wt ],
        [ 'rna_U_wt', $rna_U_wt ],
    );

    # One print call per row, each with a single string argument.
    foreach my $row (@rows) {
        my ($label, $value) = @$row;
        print "\n $label = : $value \n";
    }

    return 1;
}