0
|
1 # $Id: SeqWords.pm,v 1.7.2.1 2003/03/05 19:06:15 jason Exp $
|
|
2
|
|
3 #---------------------------------------------------------------------------
|
|
4 # PACKAGE : SeqWords.pm
|
|
5 # PURPOSE : To count n-mers in any sequence of characters
|
|
6 # AUTHOR : Derek Gatherer (D.Gatherer@organon.nhe.akzonobel.nl)
|
|
7 # SOURCE :
|
|
8 # CREATED : 21st March 2000
|
|
9 # MODIFIED :
|
|
10 # DISCLAIMER : I am employed in the pharmaceutical industry but my
|
|
11 # : employers do not endorse or sponsor this module
|
|
12 # : in any way whatsoever. The above email address is
|
|
13 # : given purely for the purpose of easy communication
|
|
14 # : with the author, and does not imply any connection
|
|
15 # : between my employers and anything written below.
|
|
16 # LICENCE : You may distribute this module under the same terms
|
|
17 # : as the rest of BioPerl.
|
|
18 #---------------------------------------------------------------------------
|
|
19
|
|
20 =head1 NAME
|
|
21
|
|
22 Bio::Tools::SeqWords - Object holding n-mer statistics for one sequence
|
|
23
|
|
24 =head1 SYNOPSIS
|
|
25
|
|
26 Take a sequence object from eg, an inputstream, and creates an object
|
|
27 for the purposes of holding n-mer word statistics about that sequence.
|
|
28 The sequence can be nucleic acid or protein, but the module is
|
|
29 probably most relevant for DNA. The words are counted in a
|
|
30 non-overlapping manner, ie. in the style of a codon table, but with
|
|
31 any word length. For overlapping word counts, a sequence can be
|
|
32 'shifted' to remove the first character and then the count repeated.
|
|
33 For counts on opposite strand (DNA/RNA), a reverse complement method
|
|
34 should be performed, and then the count repeated.
|
|
35
|
|
36 Creating the SeqWords object, eg:
|
|
37
|
|
38 my $inputstream = Bio::SeqIO->new( -file => "seqfile",
|
|
39 -format => 'Fasta');
|
|
40 my $seqobj = $inputstream->next_seq();
|
|
41 my $seq_word = Bio::Tools::SeqWords->new(-seq => $seqobj);
|
|
42
|
|
43 or:
|
|
44
|
|
45 my $seqobj = Bio::PrimarySeq->new(-seq=>'[cut and paste a sequence here]',
|
|
46 -alphabet => 'dna',
|
|
47 -id => 'test');
|
|
48 my $seq_word = Bio::Tools::SeqWords->new(-seq => $seqobj);
|
|
49
|
|
50 obtain a hash of word counts, eg:
|
|
51
|
|
52 my $hash_ref = $seq_stats->count_words($word_length);
|
|
53
|
|
54 display hash table, eg:
|
|
55
|
|
56 my %hash = %$hash_ref;
|
|
57 foreach my $key(sort keys %hash)
|
|
58 {
|
|
59 print "\n$key\t$hash{$key}";
|
|
60 }
|
|
61
|
|
62 or
|
|
63
|
|
64 my $hash_ref = Bio::SeqWords->count_words($seqobj,$word_length);
|
|
65
|
|
66
|
|
67 =head1 DESCRIPTION
|
|
68
|
|
69 Bio:SeqWords is a featherweight object for the calculation of n-mer
|
|
70 word occurrences in a single sequence. It is envisaged that the
|
|
71 object will be useful for construction of scripts which use n-mer word
|
|
72 tables as the raw material for statistical calculations; for instance,
|
|
73 hexamer frequency for the calculation of coding protential, or the
|
|
74 calculation of periodicity in repetitive DNA. Triplet frequency is
|
|
75 already handled by Bio::SeqStats.pm (author: Peter Schattner). There
|
|
76 are a few possible applications for protein, eg: hypothesised amino
|
|
77 acid 7-mers in heat shock proteins, or proteins with multiple simple
|
|
78 motifs. Sometimes these protein periodicities are best seen when the
|
|
79 amino acid alphabet is truncated, eg Shulman alphabet. Since there
|
|
80 are quite a few of these shortened alphabets, this module does not
|
|
81 specify any particular alphabet.
|
|
82
|
|
83 See Synopsis above for object creation code.
|
|
84
|
|
85 =head1 FEEDBACK
|
|
86
|
|
87 =head2 Mailing Lists
|
|
88
|
|
89 User feedback is an integral part of the evolution of this
|
|
90 and other Bioperl modules. Send your comments and suggestions preferably
|
|
91 to one of the Bioperl mailing lists.
|
|
92 Your participation is much appreciated.
|
|
93
|
|
94 bioperl-l@bioperl.org - General discussion
|
|
95 http://bio.perl.org/MailList.html - About the mailing lists
|
|
96
|
|
97 =head2 Reporting Bugs
|
|
98
|
|
99 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
100 the bugs and their resolution.
|
|
101 Bug reports can be submitted via the web:
|
|
102
|
|
103 http://bugzilla.bioperl.org/
|
|
104
|
|
105 =head1 AUTHOR
|
|
106
|
|
107 Derek Gatherer, in the loosest sense of the word 'author'. The
|
|
108 general shape of the module is lifted directly from Peter Schattner's
|
|
109 SeqStats.pm module. The central subroutine to count the words is
|
|
110 adapted from original code provided by Dave Shivak, in response to a
|
|
111 query on the bioperl mailing list. At least 2 other people provided
|
|
112 alternative means (equally good but not used in the end) of performing
|
|
113 the same calculation. Thanks to all for your assistance.
|
|
114
|
|
115 =head1 CONTRUBITORS
|
|
116
|
|
117 Jason Stajich, jason-at-bioperl.org
|
|
118
|
|
119 =head1 APPENDIX
|
|
120
|
|
121 The rest of the documentation details each of the object methods.
|
|
122 Internal methods are usually preceded with a _
|
|
123
|
|
124 =cut
|
|
125
|
|
126 package Bio::Tools::SeqWords;
|
|
127 use vars qw(@ISA);
|
|
128 use strict;
|
|
129
|
|
130 use Bio::Root::Root;
|
|
131
|
|
132 @ISA = qw(Bio::Root::Root);
|
|
133
|
|
134 sub new {
|
|
135 my($class,@args) = @_;
|
|
136 # our new standard way of instantiation
|
|
137 my $self = $class->SUPER::new(@args);
|
|
138
|
|
139 my ($seqobj) = $self->_rearrange([qw(SEQ)],@args);
|
|
140 if((! defined($seqobj)) && @args && ref($args[0])) {
|
|
141 # parameter not passed as named parameter?
|
|
142 $seqobj = $args[0];
|
|
143 }
|
|
144
|
|
145 if(! $seqobj->isa("Bio::PrimarySeqI")) {
|
|
146 $self->throw(ref($self) . " works only on PrimarySeqI objects\n");
|
|
147 }
|
|
148
|
|
149 $self->{'_seqref'} = $seqobj;
|
|
150 return $self;
|
|
151 }
|
|
152
|
|
153
|
|
154 =head2 count_words
|
|
155
|
|
156 Title : count_words
|
|
157 Usage : $word_count = $seq_stats->count_words($word_length);
|
|
158 or : $word_count = $seq_stats->Bio::SeqWords->($seqobj,$word_length);
|
|
159 Function: Counts non-overlapping words within a string
|
|
160 : any alphabet is used
|
|
161 Example : a sequence ACCGTCCGT, counted at word length 4,
|
|
162 : will give the hash
|
|
163 : ACCG 1, TCCG 1
|
|
164 Returns : Reference to a hash in which keys are words (any length) of the
|
|
165 : alphabet used and values are number of occurrences of the word
|
|
166 : in the sequence.
|
|
167 Args : Word length as scalar and, reference to sequence object if
|
|
168 : required
|
|
169
|
|
170 Throws an exception word length is not a positive integer
|
|
171 or if word length is longer than the sequence.
|
|
172
|
|
173 =cut
|
|
174
|
|
175 sub count_words
|
|
176 {
|
|
177 my ($self,$seqobj,$word_length) = @_;
|
|
178
|
|
179 # check how we were called, and if necessary rearrange arguments
|
|
180 if(ref($seqobj)) {
|
|
181 # call as SeqWords->count_words($seq, $wordlen)
|
|
182 if(! $seqobj->isa("Bio::PrimarySeqI")) {
|
|
183 $self->throw("SeqWords works only on PrimarySeqI objects\n");
|
|
184 }
|
|
185 } else {
|
|
186 # call as $obj->count_words($wordlen)
|
|
187 $word_length = $seqobj;
|
|
188 $seqobj = undef;
|
|
189 }
|
|
190
|
|
191 if($word_length eq "" || $word_length =~ /[a-z]/i)
|
|
192 {
|
|
193 $self->throw("SeqWords cannot accept non-numeric characters".
|
|
194 " or a null value in the \$word_length variable\n");
|
|
195 }
|
|
196 elsif ($word_length <1 || ($word_length - int($word_length)) >0)
|
|
197 {
|
|
198 $self->throw("SeqWords requires the word length to be a ".
|
|
199 "positive integer\n");
|
|
200 }
|
|
201
|
|
202 if(! defined($seqobj)) {
|
|
203 $seqobj = $self->{'_seqref'};
|
|
204 }
|
|
205 my $seqstring = uc $seqobj->seq();
|
|
206
|
|
207 if($word_length > length($seqstring))
|
|
208 {
|
|
209 $self->throw("die in count words, \$word_length is bigger ".
|
|
210 "than sequence length\n");
|
|
211 }
|
|
212
|
|
213 my %codon = ();
|
|
214
|
|
215 # now the real business
|
|
216 # JS - remove DNA assumption
|
|
217 while($seqstring =~ /((\w){$word_length})/gim) {
|
|
218 $codon{uc($1)}++;
|
|
219 }
|
|
220 return \%codon;
|
|
221
|
|
222 # and that's it
|
|
223 }
|
|
224
|
|
225 1;
|