annotate variant_effect_predictor/Bio/Tools/SeqWords.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
1 # $Id: SeqWords.pm,v 1.7.2.1 2003/03/05 19:06:15 jason Exp $
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
2
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
3 #---------------------------------------------------------------------------
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
4 # PACKAGE : SeqWords.pm
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
5 # PURPOSE : To count n-mers in any sequence of characters
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
6 # AUTHOR : Derek Gatherer (D.Gatherer@organon.nhe.akzonobel.nl)
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
7 # SOURCE :
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
8 # CREATED : 21st March 2000
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
9 # MODIFIED :
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
10 # DISCLAIMER : I am employed in the pharmaceutical industry but my
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
11 # : employers do not endorse or sponsor this module
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
12 # : in any way whatsoever. The above email address is
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
13 # : given purely for the purpose of easy communication
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
14 # : with the author, and does not imply any connection
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
15 # : between my employers and anything written below.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
16 # LICENCE : You may distribute this module under the same terms
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
17 # : as the rest of BioPerl.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
18 #---------------------------------------------------------------------------
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
19
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
20 =head1 NAME
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
21
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
22 Bio::Tools::SeqWords - Object holding n-mer statistics for one sequence
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
23
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
24 =head1 SYNOPSIS
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
25
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
26 Takes a sequence object from, e.g., an input stream, and creates an object
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
27 for the purposes of holding n-mer word statistics about that sequence.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
28 The sequence can be nucleic acid or protein, but the module is
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
29 probably most relevant for DNA. The words are counted in a
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
30 non-overlapping manner, ie. in the style of a codon table, but with
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
31 any word length. For overlapping word counts, a sequence can be
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
32 'shifted' to remove the first character and then the count repeated.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
33 For counts on opposite strand (DNA/RNA), a reverse complement method
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
34 should be performed, and then the count repeated.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
35
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
36 Creating the SeqWords object, eg:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
37
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
38 my $inputstream = Bio::SeqIO->new( -file => "seqfile",
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
39 -format => 'Fasta');
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
40 my $seqobj = $inputstream->next_seq();
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
41 my $seq_word = Bio::Tools::SeqWords->new(-seq => $seqobj);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
42
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
43 or:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
44
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
45 my $seqobj = Bio::PrimarySeq->new(-seq=>'[cut and paste a sequence here]',
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
46 -alphabet => 'dna',
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
47 -id => 'test');
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
48 my $seq_word = Bio::Tools::SeqWords->new(-seq => $seqobj);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
49
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
50 obtain a hash of word counts, eg:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
51
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
52 my $hash_ref = $seq_word->count_words($word_length);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
53
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
54 display hash table, eg:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
55
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
56 my %hash = %$hash_ref;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
57 foreach my $key(sort keys %hash)
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
58 {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
59 print "\n$key\t$hash{$key}";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
60 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
61
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
62 or
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
63
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
64 my $hash_ref = Bio::Tools::SeqWords->count_words($seqobj,$word_length);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
65
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
66
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
67 =head1 DESCRIPTION
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
68
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
69 Bio::Tools::SeqWords is a featherweight object for the calculation of n-mer
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
70 word occurrences in a single sequence. It is envisaged that the
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
71 object will be useful for construction of scripts which use n-mer word
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
72 tables as the raw material for statistical calculations; for instance,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
73 hexamer frequency for the calculation of coding potential, or the
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
74 calculation of periodicity in repetitive DNA. Triplet frequency is
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
75 already handled by Bio::SeqStats.pm (author: Peter Schattner). There
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
76 are a few possible applications for protein, eg: hypothesised amino
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
77 acid 7-mers in heat shock proteins, or proteins with multiple simple
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
78 motifs. Sometimes these protein periodicities are best seen when the
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
79 amino acid alphabet is truncated, eg Shulman alphabet. Since there
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
80 are quite a few of these shortened alphabets, this module does not
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
81 specify any particular alphabet.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
82
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
83 See Synopsis above for object creation code.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
84
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
85 =head1 FEEDBACK
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
86
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
87 =head2 Mailing Lists
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
88
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
89 User feedback is an integral part of the evolution of this
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
90 and other Bioperl modules. Send your comments and suggestions preferably
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
91 to one of the Bioperl mailing lists.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
92 Your participation is much appreciated.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
93
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
94 bioperl-l@bioperl.org - General discussion
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
95 http://bio.perl.org/MailList.html - About the mailing lists
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
96
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
97 =head2 Reporting Bugs
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
98
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
99 Report bugs to the Bioperl bug tracking system to help us keep track
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
100 of the bugs and their resolution.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
101 Bug reports can be submitted via the web:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
102
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
103 http://bugzilla.bioperl.org/
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
104
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
105 =head1 AUTHOR
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
106
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
107 Derek Gatherer, in the loosest sense of the word 'author'. The
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
108 general shape of the module is lifted directly from Peter Schattner's
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
109 SeqStats.pm module. The central subroutine to count the words is
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
110 adapted from original code provided by Dave Shivak, in response to a
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
111 query on the bioperl mailing list. At least 2 other people provided
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
112 alternative means (equally good but not used in the end) of performing
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
113 the same calculation. Thanks to all for your assistance.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
114
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
115 =head1 CONTRIBUTORS
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
116
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
117 Jason Stajich, jason-at-bioperl.org
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
118
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
119 =head1 APPENDIX
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
120
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
121 The rest of the documentation details each of the object methods.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
122 Internal methods are usually preceded with a _
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
123
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
124 =cut
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
125
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
126 package Bio::Tools::SeqWords;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
127 use vars qw(@ISA);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
128 use strict;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
129
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
130 use Bio::Root::Root;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
131
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
132 @ISA = qw(Bio::Root::Root);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
133
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
134 sub new {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
135 my($class,@args) = @_;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
136 # our new standard way of instantiation
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
137 my $self = $class->SUPER::new(@args);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
138
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
139 my ($seqobj) = $self->_rearrange([qw(SEQ)],@args);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
140 if((! defined($seqobj)) && @args && ref($args[0])) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
141 # parameter not passed as named parameter?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
142 $seqobj = $args[0];
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
143 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
144
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
145 if(! $seqobj->isa("Bio::PrimarySeqI")) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
146 $self->throw(ref($self) . " works only on PrimarySeqI objects\n");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
147 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
148
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
149 $self->{'_seqref'} = $seqobj;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
150 return $self;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
151 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
152
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
153
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
154 =head2 count_words
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
155
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
156 Title : count_words
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
157 Usage : $word_count = $seq_stats->count_words($word_length);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
158 or : $word_count = Bio::Tools::SeqWords->count_words($seqobj,$word_length);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
159 Function: Counts non-overlapping words within a string
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
160 : any alphabet is used
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
161 Example : a sequence ACCGTCCGT, counted at word length 4,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
162 : will give the hash
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
163 : ACCG 1, TCCG 1
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
164 Returns : Reference to a hash in which keys are words (any length) of the
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
165 : alphabet used and values are number of occurrences of the word
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
166 : in the sequence.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
167 Args : Word length as scalar, and a reference to a sequence object if
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
168 : required
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
169
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
170 Throws an exception if word length is not a positive integer
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
171 or if word length is longer than the sequence.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
172
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
173 =cut
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
174
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
175 sub count_words
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
176 {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
177 my ($self,$seqobj,$word_length) = @_;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
178
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
179 # check how we were called, and if necessary rearrange arguments
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
180 if(ref($seqobj)) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
181 # call as SeqWords->count_words($seq, $wordlen)
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
182 if(! $seqobj->isa("Bio::PrimarySeqI")) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
183 $self->throw("SeqWords works only on PrimarySeqI objects\n");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
184 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
185 } else {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
186 # call as $obj->count_words($wordlen)
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
187 $word_length = $seqobj;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
188 $seqobj = undef;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
189 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
190
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
191 if($word_length eq "" || $word_length =~ /[a-z]/i)
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
192 {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
193 $self->throw("SeqWords cannot accept non-numeric characters".
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
194 " or a null value in the \$word_length variable\n");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
195 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
196 elsif ($word_length <1 || ($word_length - int($word_length)) >0)
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
197 {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
198 $self->throw("SeqWords requires the word length to be a ".
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
199 "positive integer\n");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
200 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
201
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
202 if(! defined($seqobj)) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
203 $seqobj = $self->{'_seqref'};
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
204 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
205 my $seqstring = uc $seqobj->seq();
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
206
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
207 if($word_length > length($seqstring))
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
208 {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
209 $self->throw("die in count words, \$word_length is bigger ".
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
210 "than sequence length\n");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
211 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
212
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
213 my %codon = ();
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
214
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
215 # now the real business
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
216 # JS - remove DNA assumption
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
217 while($seqstring =~ /((\w){$word_length})/gim) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
218 $codon{uc($1)}++;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
219 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
220 return \%codon;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
221
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
222 # and that's it
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
223 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
224
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
225 1;