Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/Tools/SeqWords.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 # $Id: SeqWords.pm,v 1.7.2.1 2003/03/05 19:06:15 jason Exp $ | |
2 | |
3 #--------------------------------------------------------------------------- | |
4 # PACKAGE : SeqWords.pm | |
5 # PURPOSE : To count n-mers in any sequence of characters | |
6 # AUTHOR : Derek Gatherer (D.Gatherer@organon.nhe.akzonobel.nl) | |
7 # SOURCE : | |
8 # CREATED : 21st March 2000 | |
9 # MODIFIED : | |
10 # DISCLAIMER : I am employed in the pharmaceutical industry but my | |
11 # : employers do not endorse or sponsor this module | |
12 # : in any way whatsoever. The above email address is | |
13 # : given purely for the purpose of easy communication | |
14 # : with the author, and does not imply any connection | |
15 # : between my employers and anything written below. | |
16 # LICENCE : You may distribute this module under the same terms | |
17 # : as the rest of BioPerl. | |
18 #--------------------------------------------------------------------------- | |
19 | |
20 =head1 NAME | |
21 | |
22 Bio::Tools::SeqWords - Object holding n-mer statistics for one sequence | |
23 | |
24 =head1 SYNOPSIS | |
25 | |
26 Take a sequence object from eg, an inputstream, and creates an object | |
27 for the purposes of holding n-mer word statistics about that sequence. | |
28 The sequence can be nucleic acid or protein, but the module is | |
29 probably most relevant for DNA. The words are counted in a | |
30 non-overlapping manner, ie. in the style of a codon table, but with | |
31 any word length. For overlapping word counts, a sequence can be | |
32 'shifted' to remove the first character and then the count repeated. | |
33 For counts on opposite strand (DNA/RNA), a reverse complement method | |
34 should be performed, and then the count repeated. | |
35 | |
36 Creating the SeqWords object, eg: | |
37 | |
38 my $inputstream = Bio::SeqIO->new( -file => "seqfile", | |
39 -format => 'Fasta'); | |
40 my $seqobj = $inputstream->next_seq(); | |
41 my $seq_word = Bio::Tools::SeqWords->new(-seq => $seqobj); | |
42 | |
43 or: | |
44 | |
45 my $seqobj = Bio::PrimarySeq->new(-seq=>'[cut and paste a sequence here]', | |
46 -alphabet => 'dna', | |
47 -id => 'test'); | |
48 my $seq_word = Bio::Tools::SeqWords->new(-seq => $seqobj); | |
49 | |
50 obtain a hash of word counts, eg: | |
51 | |
52 my $hash_ref = $seq_stats->count_words($word_length); | |
53 | |
54 display hash table, eg: | |
55 | |
56 my %hash = %$hash_ref; | |
57 foreach my $key(sort keys %hash) | |
58 { | |
59 print "\n$key\t$hash{$key}"; | |
60 } | |
61 | |
62 or | |
63 | |
64 my $hash_ref = Bio::SeqWords->count_words($seqobj,$word_length); | |
65 | |
66 | |
67 =head1 DESCRIPTION | |
68 | |
69 Bio:SeqWords is a featherweight object for the calculation of n-mer | |
70 word occurrences in a single sequence. It is envisaged that the | |
71 object will be useful for construction of scripts which use n-mer word | |
72 tables as the raw material for statistical calculations; for instance, | |
73 hexamer frequency for the calculation of coding protential, or the | |
74 calculation of periodicity in repetitive DNA. Triplet frequency is | |
75 already handled by Bio::SeqStats.pm (author: Peter Schattner). There | |
76 are a few possible applications for protein, eg: hypothesised amino | |
77 acid 7-mers in heat shock proteins, or proteins with multiple simple | |
78 motifs. Sometimes these protein periodicities are best seen when the | |
79 amino acid alphabet is truncated, eg Shulman alphabet. Since there | |
80 are quite a few of these shortened alphabets, this module does not | |
81 specify any particular alphabet. | |
82 | |
83 See Synopsis above for object creation code. | |
84 | |
85 =head1 FEEDBACK | |
86 | |
87 =head2 Mailing Lists | |
88 | |
89 User feedback is an integral part of the evolution of this | |
90 and other Bioperl modules. Send your comments and suggestions preferably | |
91 to one of the Bioperl mailing lists. | |
92 Your participation is much appreciated. | |
93 | |
94 bioperl-l@bioperl.org - General discussion | |
95 http://bio.perl.org/MailList.html - About the mailing lists | |
96 | |
97 =head2 Reporting Bugs | |
98 | |
99 Report bugs to the Bioperl bug tracking system to help us keep track | |
100 the bugs and their resolution. | |
101 Bug reports can be submitted via the web: | |
102 | |
103 http://bugzilla.bioperl.org/ | |
104 | |
105 =head1 AUTHOR | |
106 | |
107 Derek Gatherer, in the loosest sense of the word 'author'. The | |
108 general shape of the module is lifted directly from Peter Schattner's | |
109 SeqStats.pm module. The central subroutine to count the words is | |
110 adapted from original code provided by Dave Shivak, in response to a | |
111 query on the bioperl mailing list. At least 2 other people provided | |
112 alternative means (equally good but not used in the end) of performing | |
113 the same calculation. Thanks to all for your assistance. | |
114 | |
115 =head1 CONTRUBITORS | |
116 | |
117 Jason Stajich, jason-at-bioperl.org | |
118 | |
119 =head1 APPENDIX | |
120 | |
121 The rest of the documentation details each of the object methods. | |
122 Internal methods are usually preceded with a _ | |
123 | |
124 =cut | |
125 | |
126 package Bio::Tools::SeqWords; | |
127 use vars qw(@ISA); | |
128 use strict; | |
129 | |
130 use Bio::Root::Root; | |
131 | |
132 @ISA = qw(Bio::Root::Root); | |
133 | |
134 sub new { | |
135 my($class,@args) = @_; | |
136 # our new standard way of instantiation | |
137 my $self = $class->SUPER::new(@args); | |
138 | |
139 my ($seqobj) = $self->_rearrange([qw(SEQ)],@args); | |
140 if((! defined($seqobj)) && @args && ref($args[0])) { | |
141 # parameter not passed as named parameter? | |
142 $seqobj = $args[0]; | |
143 } | |
144 | |
145 if(! $seqobj->isa("Bio::PrimarySeqI")) { | |
146 $self->throw(ref($self) . " works only on PrimarySeqI objects\n"); | |
147 } | |
148 | |
149 $self->{'_seqref'} = $seqobj; | |
150 return $self; | |
151 } | |
152 | |
153 | |
154 =head2 count_words | |
155 | |
156 Title : count_words | |
157 Usage : $word_count = $seq_stats->count_words($word_length); | |
158 or : $word_count = $seq_stats->Bio::SeqWords->($seqobj,$word_length); | |
159 Function: Counts non-overlapping words within a string | |
160 : any alphabet is used | |
161 Example : a sequence ACCGTCCGT, counted at word length 4, | |
162 : will give the hash | |
163 : ACCG 1, TCCG 1 | |
164 Returns : Reference to a hash in which keys are words (any length) of the | |
165 : alphabet used and values are number of occurrences of the word | |
166 : in the sequence. | |
167 Args : Word length as scalar and, reference to sequence object if | |
168 : required | |
169 | |
170 Throws an exception word length is not a positive integer | |
171 or if word length is longer than the sequence. | |
172 | |
173 =cut | |
174 | |
175 sub count_words | |
176 { | |
177 my ($self,$seqobj,$word_length) = @_; | |
178 | |
179 # check how we were called, and if necessary rearrange arguments | |
180 if(ref($seqobj)) { | |
181 # call as SeqWords->count_words($seq, $wordlen) | |
182 if(! $seqobj->isa("Bio::PrimarySeqI")) { | |
183 $self->throw("SeqWords works only on PrimarySeqI objects\n"); | |
184 } | |
185 } else { | |
186 # call as $obj->count_words($wordlen) | |
187 $word_length = $seqobj; | |
188 $seqobj = undef; | |
189 } | |
190 | |
191 if($word_length eq "" || $word_length =~ /[a-z]/i) | |
192 { | |
193 $self->throw("SeqWords cannot accept non-numeric characters". | |
194 " or a null value in the \$word_length variable\n"); | |
195 } | |
196 elsif ($word_length <1 || ($word_length - int($word_length)) >0) | |
197 { | |
198 $self->throw("SeqWords requires the word length to be a ". | |
199 "positive integer\n"); | |
200 } | |
201 | |
202 if(! defined($seqobj)) { | |
203 $seqobj = $self->{'_seqref'}; | |
204 } | |
205 my $seqstring = uc $seqobj->seq(); | |
206 | |
207 if($word_length > length($seqstring)) | |
208 { | |
209 $self->throw("die in count words, \$word_length is bigger ". | |
210 "than sequence length\n"); | |
211 } | |
212 | |
213 my %codon = (); | |
214 | |
215 # now the real business | |
216 # JS - remove DNA assumption | |
217 while($seqstring =~ /((\w){$word_length})/gim) { | |
218 $codon{uc($1)}++; | |
219 } | |
220 return \%codon; | |
221 | |
222 # and that's it | |
223 } | |
224 | |
225 1; |