comparison variant_effect_predictor/Bio/Tools/SeqWords.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 # $Id: SeqWords.pm,v 1.7.2.1 2003/03/05 19:06:15 jason Exp $
2
3 #---------------------------------------------------------------------------
4 # PACKAGE : SeqWords.pm
5 # PURPOSE : To count n-mers in any sequence of characters
6 # AUTHOR : Derek Gatherer (D.Gatherer@organon.nhe.akzonobel.nl)
7 # SOURCE :
8 # CREATED : 21st March 2000
9 # MODIFIED :
10 # DISCLAIMER : I am employed in the pharmaceutical industry but my
11 # : employers do not endorse or sponsor this module
12 # : in any way whatsoever. The above email address is
13 # : given purely for the purpose of easy communication
14 # : with the author, and does not imply any connection
15 # : between my employers and anything written below.
16 # LICENCE : You may distribute this module under the same terms
17 # : as the rest of BioPerl.
18 #---------------------------------------------------------------------------
19
20 =head1 NAME
21
22 Bio::Tools::SeqWords - Object holding n-mer statistics for one sequence
23
24 =head1 SYNOPSIS
25
26 Take a sequence object from eg, an inputstream, and creates an object
27 for the purposes of holding n-mer word statistics about that sequence.
28 The sequence can be nucleic acid or protein, but the module is
29 probably most relevant for DNA. The words are counted in a
30 non-overlapping manner, ie. in the style of a codon table, but with
31 any word length. For overlapping word counts, a sequence can be
32 'shifted' to remove the first character and then the count repeated.
33 For counts on opposite strand (DNA/RNA), a reverse complement method
34 should be performed, and then the count repeated.
35
36 Creating the SeqWords object, eg:
37
38 my $inputstream = Bio::SeqIO->new( -file => "seqfile",
39 -format => 'Fasta');
40 my $seqobj = $inputstream->next_seq();
41 my $seq_word = Bio::Tools::SeqWords->new(-seq => $seqobj);
42
43 or:
44
45 my $seqobj = Bio::PrimarySeq->new(-seq=>'[cut and paste a sequence here]',
46 -alphabet => 'dna',
47 -id => 'test');
48 my $seq_word = Bio::Tools::SeqWords->new(-seq => $seqobj);
49
50 obtain a hash of word counts, eg:
51
52 my $hash_ref = $seq_stats->count_words($word_length);
53
54 display hash table, eg:
55
56 my %hash = %$hash_ref;
57 foreach my $key(sort keys %hash)
58 {
59 print "\n$key\t$hash{$key}";
60 }
61
62 or
63
64 my $hash_ref = Bio::SeqWords->count_words($seqobj,$word_length);
65
66
67 =head1 DESCRIPTION
68
69 Bio:SeqWords is a featherweight object for the calculation of n-mer
70 word occurrences in a single sequence. It is envisaged that the
71 object will be useful for construction of scripts which use n-mer word
72 tables as the raw material for statistical calculations; for instance,
73 hexamer frequency for the calculation of coding protential, or the
74 calculation of periodicity in repetitive DNA. Triplet frequency is
75 already handled by Bio::SeqStats.pm (author: Peter Schattner). There
76 are a few possible applications for protein, eg: hypothesised amino
77 acid 7-mers in heat shock proteins, or proteins with multiple simple
78 motifs. Sometimes these protein periodicities are best seen when the
79 amino acid alphabet is truncated, eg Shulman alphabet. Since there
80 are quite a few of these shortened alphabets, this module does not
81 specify any particular alphabet.
82
83 See Synopsis above for object creation code.
84
85 =head1 FEEDBACK
86
87 =head2 Mailing Lists
88
89 User feedback is an integral part of the evolution of this
90 and other Bioperl modules. Send your comments and suggestions preferably
91 to one of the Bioperl mailing lists.
92 Your participation is much appreciated.
93
94 bioperl-l@bioperl.org - General discussion
95 http://bio.perl.org/MailList.html - About the mailing lists
96
97 =head2 Reporting Bugs
98
99 Report bugs to the Bioperl bug tracking system to help us keep track
100 the bugs and their resolution.
101 Bug reports can be submitted via the web:
102
103 http://bugzilla.bioperl.org/
104
105 =head1 AUTHOR
106
107 Derek Gatherer, in the loosest sense of the word 'author'. The
108 general shape of the module is lifted directly from Peter Schattner's
109 SeqStats.pm module. The central subroutine to count the words is
110 adapted from original code provided by Dave Shivak, in response to a
111 query on the bioperl mailing list. At least 2 other people provided
112 alternative means (equally good but not used in the end) of performing
113 the same calculation. Thanks to all for your assistance.
114
115 =head1 CONTRUBITORS
116
117 Jason Stajich, jason-at-bioperl.org
118
119 =head1 APPENDIX
120
121 The rest of the documentation details each of the object methods.
122 Internal methods are usually preceded with a _
123
124 =cut
125
126 package Bio::Tools::SeqWords;
127 use vars qw(@ISA);
128 use strict;
129
130 use Bio::Root::Root;
131
132 @ISA = qw(Bio::Root::Root);
133
134 sub new {
135 my($class,@args) = @_;
136 # our new standard way of instantiation
137 my $self = $class->SUPER::new(@args);
138
139 my ($seqobj) = $self->_rearrange([qw(SEQ)],@args);
140 if((! defined($seqobj)) && @args && ref($args[0])) {
141 # parameter not passed as named parameter?
142 $seqobj = $args[0];
143 }
144
145 if(! $seqobj->isa("Bio::PrimarySeqI")) {
146 $self->throw(ref($self) . " works only on PrimarySeqI objects\n");
147 }
148
149 $self->{'_seqref'} = $seqobj;
150 return $self;
151 }
152
153
154 =head2 count_words
155
156 Title : count_words
157 Usage : $word_count = $seq_stats->count_words($word_length);
158 or : $word_count = $seq_stats->Bio::SeqWords->($seqobj,$word_length);
159 Function: Counts non-overlapping words within a string
160 : any alphabet is used
161 Example : a sequence ACCGTCCGT, counted at word length 4,
162 : will give the hash
163 : ACCG 1, TCCG 1
164 Returns : Reference to a hash in which keys are words (any length) of the
165 : alphabet used and values are number of occurrences of the word
166 : in the sequence.
167 Args : Word length as scalar and, reference to sequence object if
168 : required
169
170 Throws an exception word length is not a positive integer
171 or if word length is longer than the sequence.
172
173 =cut
174
175 sub count_words
176 {
177 my ($self,$seqobj,$word_length) = @_;
178
179 # check how we were called, and if necessary rearrange arguments
180 if(ref($seqobj)) {
181 # call as SeqWords->count_words($seq, $wordlen)
182 if(! $seqobj->isa("Bio::PrimarySeqI")) {
183 $self->throw("SeqWords works only on PrimarySeqI objects\n");
184 }
185 } else {
186 # call as $obj->count_words($wordlen)
187 $word_length = $seqobj;
188 $seqobj = undef;
189 }
190
191 if($word_length eq "" || $word_length =~ /[a-z]/i)
192 {
193 $self->throw("SeqWords cannot accept non-numeric characters".
194 " or a null value in the \$word_length variable\n");
195 }
196 elsif ($word_length <1 || ($word_length - int($word_length)) >0)
197 {
198 $self->throw("SeqWords requires the word length to be a ".
199 "positive integer\n");
200 }
201
202 if(! defined($seqobj)) {
203 $seqobj = $self->{'_seqref'};
204 }
205 my $seqstring = uc $seqobj->seq();
206
207 if($word_length > length($seqstring))
208 {
209 $self->throw("die in count words, \$word_length is bigger ".
210 "than sequence length\n");
211 }
212
213 my %codon = ();
214
215 # now the real business
216 # JS - remove DNA assumption
217 while($seqstring =~ /((\w){$word_length})/gim) {
218 $codon{uc($1)}++;
219 }
220 return \%codon;
221
222 # and that's it
223 }
224
225 1;