0
|
1 # $Id: IUPAC.pm,v 1.19 2002/11/30 15:39:53 jason Exp $
|
|
2 #
|
|
3 # BioPerl module for IUPAC
|
|
4 #
|
|
5 # Cared for by Aaron Mackey <amackey@virginia.edu>
|
|
6 #
|
|
7 # Copyright Aaron Mackey
|
|
8 #
|
|
9 # You may distribute this module under the same terms as perl itself
|
|
10
|
|
11 # POD documentation - main docs before the code
|
|
12
|
|
13 =head1 NAME
|
|
14
|
|
15 Bio::Tools::IUPAC - Generates unique Seq objects from an ambiguous Seq object
|
|
16
|
|
17 =head1 SYNOPSIS
|
|
18
|
|
19 use Bio::Seq;
|
|
20 use Bio::Tools::IUPAC;
|
|
21
|
|
22 my $ambiseq = new Bio::Seq (-seq => 'ARTCGUTGR', -alphabet => 'dna');
|
|
23 my $stream = new Bio::Tools::IUPAC(-seq => $ambiseq);
|
|
24
|
|
25 while ($uniqueseq = $stream->next_seq()) {
|
|
26 # process the unique Seq object.
|
|
27 }
|
|
28
|
|
29 =head1 DESCRIPTION
|
|
30
|
|
31 IUPAC is a tool that produces a stream of unique, "strict"-satisfying Seq
|
|
32 objects from an ambiquous Seq object (containing non-standard characters given
|
|
33 the meaning shown below)
|
|
34
|
|
35 Extended Dna / Rna alphabet :
|
|
36 (includes symbols for nucleotide ambiguity)
|
|
37 ------------------------------------------
|
|
38 Symbol Meaning Nucleic Acid
|
|
39 ------------------------------------------
|
|
40 A A Adenine
|
|
41 C C Cytosine
|
|
42 G G Guanine
|
|
43 T T Thymine
|
|
44 U U Uracil
|
|
45 M A or C
|
|
46 R A or G
|
|
47 W A or T
|
|
48 S C or G
|
|
49 Y C or T
|
|
50 K G or T
|
|
51 V A or C or G
|
|
52 H A or C or T
|
|
53 D A or G or T
|
|
54 B C or G or T
|
|
55 X G or A or T or C
|
|
56 N G or A or T or C
|
|
57
|
|
58 IUPAC-IUB SYMBOLS FOR NUCLEOTIDE NOMENCLATURE:
|
|
59 Cornish-Bowden (1985) Nucl. Acids Res. 13: 3021-3030.
|
|
60
|
|
61 -----------------------------------
|
|
62
|
|
63 Amino Acid alphabet:
|
|
64 ------------------------------------------
|
|
65 Symbol Meaning
|
|
66 ------------------------------------------
|
|
67 A Alanine
|
|
68 B Aspartic Acid, Asparagine
|
|
69 C Cystine
|
|
70 D Aspartic Acid
|
|
71 E Glutamic Acid
|
|
72 F Phenylalanine
|
|
73 G Glycine
|
|
74 H Histidine
|
|
75 I Isoleucine
|
|
76 K Lysine
|
|
77 L Leucine
|
|
78 M Methionine
|
|
79 N Asparagine
|
|
80 P Proline
|
|
81 Q Glutamine
|
|
82 R Arginine
|
|
83 S Serine
|
|
84 T Threonine
|
|
85 V Valine
|
|
86 W Tryptophan
|
|
87 X Unknown
|
|
88 Y Tyrosine
|
|
89 Z Glutamic Acid, Glutamine
|
|
90 * Terminator
|
|
91
|
|
92
|
|
93 IUPAC-IUP AMINO ACID SYMBOLS:
|
|
94 Biochem J. 1984 Apr 15; 219(2): 345-373
|
|
95 Eur J Biochem. 1993 Apr 1; 213(1): 2
|
|
96
|
|
97 =head1 FEEDBACK
|
|
98
|
|
99 =head2 Mailing Lists
|
|
100
|
|
101 User feedback is an integral part of the evolution of this and other
|
|
102 Bioperl modules. Send your comments and suggestions preferably to one
|
|
103 of the Bioperl mailing lists. Your participation is much appreciated.
|
|
104
|
|
105 bioperl-l@bioperl.org - General discussion
|
|
106 http://www.bioperl.org/MailList.html - About the mailing lists
|
|
107
|
|
108 =head2 Reporting Bugs
|
|
109
|
|
110 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
111 the bugs and their resolution. Bug reports can be submitted via email
|
|
112 or the web:
|
|
113
|
|
114 bioperl-bugs@bioperl.org
|
|
115 http://www.bugzilla.bioperl.org/
|
|
116
|
|
117 =head1 AUTHOR - Aaron Mackey
|
|
118
|
|
119 Email amackey@virginia.edu
|
|
120
|
|
121 =head1 APPENDIX
|
|
122
|
|
123 The rest of the documentation details each of the object
|
|
124 methods. Internal methods are usually preceded with a _
|
|
125
|
|
126 =cut
|
|
127
|
|
128
|
|
129 # Let the code begin...
|
|
130
|
|
131 package Bio::Tools::IUPAC;
|
|
132
|
|
133 use strict;
|
|
134 use vars qw(@ISA %IUP %IUB $AUTOLOAD);
|
|
135
|
|
136 BEGIN {
|
|
137 %IUB = ( A => [qw(A)],
|
|
138 C => [qw(C)],
|
|
139 G => [qw(G)],
|
|
140 T => [qw(T)],
|
|
141 U => [qw(U)],
|
|
142 M => [qw(A C)],
|
|
143 R => [qw(A G)],
|
|
144 W => [qw(A T)],
|
|
145 S => [qw(C G)],
|
|
146 Y => [qw(C T)],
|
|
147 K => [qw(G T)],
|
|
148 V => [qw(A C G)],
|
|
149 H => [qw(A C T)],
|
|
150 D => [qw(A G T)],
|
|
151 B => [qw(C G T)],
|
|
152 X => [qw(G A T C)],
|
|
153 N => [qw(G A T C)]
|
|
154 );
|
|
155
|
|
156 %IUP = (A => [qw(A)],
|
|
157 B => [qw(D N)],
|
|
158 C => [qw(C)],
|
|
159 D => [qw(D)],
|
|
160 E => [qw(E)],
|
|
161 F => [qw(F)],
|
|
162 G => [qw(G)],
|
|
163 H => [qw(H)],
|
|
164 I => [qw(I)],
|
|
165 K => [qw(K)],
|
|
166 L => [qw(L)],
|
|
167 M => [qw(M)],
|
|
168 N => [qw(N)],
|
|
169 P => [qw(P)],
|
|
170 Q => [qw(Q)],
|
|
171 R => [qw(R)],
|
|
172 S => [qw(S)],
|
|
173 T => [qw(T)],
|
|
174 U => [qw(U)],
|
|
175 V => [qw(V)],
|
|
176 W => [qw(W)],
|
|
177 X => [qw(X)],
|
|
178 Y => [qw(Y)],
|
|
179 Z => [qw(E Q)],
|
|
180 '*' => ['*']
|
|
181 );
|
|
182
|
|
183 }
|
|
184 use Bio::Root::Root;
|
|
185 @ISA = qw(Bio::Root::Root);
|
|
186
|
|
187 =head2 new
|
|
188
|
|
189 Title : new
|
|
190 Usage : new Bio::Tools::IUPAC $seq;
|
|
191 Function: returns a new seq stream (akin to SeqIO)
|
|
192 Returns : a Bio::Tools::IUPAC stream object that will produce unique
|
|
193 Seq objects on demand.
|
|
194 Args : an ambiguously coded Seq.pm object that has a specified 'type'
|
|
195
|
|
196
|
|
197 =cut
|
|
198
|
|
199
|
|
200 sub new {
|
|
201 my($class,@args) = @_;
|
|
202 my $self = $class->SUPER::new(@args);
|
|
203
|
|
204 my ($seq) = $self->_rearrange([qw(SEQ)],@args);
|
|
205 if((! defined($seq)) && @args && ref($args[0])) {
|
|
206 # parameter not passed as named parameter?
|
|
207 $seq = $args[0];
|
|
208 }
|
|
209 $seq->isa('Bio::Seq') or
|
|
210 $self->throw("Must supply a Seq.pm object to IUPAC!");
|
|
211 $self->{'_SeqObj'} = $seq;
|
|
212 if ($self->{'_SeqObj'}->alphabet() =~ m/^[dr]na$/i ) {
|
|
213 # nucleotide seq object
|
|
214 $self->{'_alpha'} = [ map { $IUB{uc($_)} }
|
|
215 split('', $self->{'_SeqObj'}->seq()) ];
|
|
216 } elsif ($self->{'_SeqObj'}->alphabet() =~ m/^protein$/i ) {
|
|
217 # amino acid seq object
|
|
218 $self->{'_alpha'} = [ map { $IUP{uc($_)} }
|
|
219 split('', $self->{'_SeqObj'}->seq()) ];
|
|
220 } else { # unknown type: we could make a guess, but let's not.
|
|
221 $self->throw("You must specify the 'type' of sequence provided to IUPAC");
|
|
222 }
|
|
223 $self->{'_string'} = [(0) x length($self->{'_SeqObj'}->seq())];
|
|
224 scalar @{$self->{'_string'}} or $self->throw("Sequence has zero-length!");
|
|
225 $self->{'_string'}->[0] = -1;
|
|
226 return $self;
|
|
227 }
|
|
228
|
|
229 =head2 next_seq
|
|
230
|
|
231 Title : next_seq
|
|
232 Usage : $iupac->next_seq()
|
|
233 Function: returns the next unique Seq object
|
|
234 Returns : a Seq.pm object
|
|
235 Args : none.
|
|
236
|
|
237
|
|
238 =cut
|
|
239
|
|
240 sub next_seq{
|
|
241 my ($self) = @_;
|
|
242
|
|
243 for my $i ( 0 .. $#{$self->{'_string'}} ) {
|
|
244 next unless $self->{'_string'}->[$i] || @{$self->{'_alpha'}->[$i]} > 1;
|
|
245 if ( $self->{'_string'}->[$i] == $#{$self->{'_alpha'}->[$i]} ) { # rollover
|
|
246 if ( $i == $#{$self->{'_string'}} ) { # end of possibilities
|
|
247 return undef;
|
|
248 } else {
|
|
249 $self->{'_string'}->[$i] = 0;
|
|
250 next;
|
|
251 }
|
|
252 } else {
|
|
253 $self->{'_string'}->[$i]++;
|
|
254 my $j = -1;
|
|
255 $self->{'_SeqObj'}->seq(join('', map { $j++; $self->{'_alpha'}->[$j]->[$_]; } @{$self->{'_string'}}));
|
|
256 my $desc = $self->{'_SeqObj'}->desc();
|
|
257 if ( !defined $desc ) { $desc = ""; }
|
|
258
|
|
259 $self->{'_num'}++;
|
|
260 1 while $self->{'_num'} =~ s/(\d)(\d\d\d)(?!\d)/$1,$2/;
|
|
261 $desc =~ s/( \[Bio::Tools::IUPAC-generated\sunique sequence # [^\]]*\])|$/ \[Bio::Tools::IUPAC-generated unique sequence # $self->{'_num'}\]/;
|
|
262 $self->{'_SeqObj'}->desc($desc);
|
|
263 $self->{'_num'} =~ s/,//g;
|
|
264 return $self->{'_SeqObj'};
|
|
265 }
|
|
266 }
|
|
267 }
|
|
268
|
|
269 =head2 iupac_iup
|
|
270
|
|
271 Title : iupac_iup
|
|
272 Usage : my %aasymbols = $iupac->iupac_iup
|
|
273 Function: Returns a hash of PROTEIN symbols -> symbol components
|
|
274 Returns : Hash
|
|
275 Args : none
|
|
276
|
|
277 =cut
|
|
278
|
|
279 sub iupac_iup{
|
|
280 return %IUP;
|
|
281
|
|
282 }
|
|
283
|
|
284 =head2 iupac_iub
|
|
285
|
|
286 Title : iupac_iub
|
|
287 Usage : my %dnasymbols = $iupac->iupac_iub
|
|
288 Function: Returns a hash of DNA symbols -> symbol components
|
|
289 Returns : Hash
|
|
290 Args : none
|
|
291
|
|
292 =cut
|
|
293
|
|
294 sub iupac_iub{
|
|
295 return %IUB;
|
|
296 }
|
|
297
|
|
298 sub AUTOLOAD {
|
|
299
|
|
300 my $self = shift @_;
|
|
301 my $method = $AUTOLOAD;
|
|
302 $method =~ s/.*:://;
|
|
303 return $self->{'_SeqObj'}->$method(@_)
|
|
304 unless $method eq 'DESTROY';
|
|
305 }
|
|
306
|
|
307 1;
|
|
308
|