comparison variant_effect_predictor/Bio/Tools/IUPAC.pm @ 0:2bc9b66ada89 draft default tip

Uploaded
author mahtabm
date Thu, 11 Apr 2013 06:29:17 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:2bc9b66ada89
1 # $Id: IUPAC.pm,v 1.19 2002/11/30 15:39:53 jason Exp $
2 #
3 # BioPerl module for IUPAC
4 #
5 # Cared for by Aaron Mackey <amackey@virginia.edu>
6 #
7 # Copyright Aaron Mackey
8 #
9 # You may distribute this module under the same terms as perl itself
10
11 # POD documentation - main docs before the code
12
13 =head1 NAME
14
15 Bio::Tools::IUPAC - Generates unique Seq objects from an ambiguous Seq object
16
17 =head1 SYNOPSIS
18
19 use Bio::Seq;
20 use Bio::Tools::IUPAC;
21
22 my $ambiseq = new Bio::Seq (-seq => 'ARTCGUTGR', -alphabet => 'dna');
23 my $stream = new Bio::Tools::IUPAC(-seq => $ambiseq);
24
25 while ($uniqueseq = $stream->next_seq()) {
26 # process the unique Seq object.
27 }
28
29 =head1 DESCRIPTION
30
31 IUPAC is a tool that produces a stream of unique, "strict"-satisfying Seq
32 objects from an ambiquous Seq object (containing non-standard characters given
33 the meaning shown below)
34
35 Extended Dna / Rna alphabet :
36 (includes symbols for nucleotide ambiguity)
37 ------------------------------------------
38 Symbol Meaning Nucleic Acid
39 ------------------------------------------
40 A A Adenine
41 C C Cytosine
42 G G Guanine
43 T T Thymine
44 U U Uracil
45 M A or C
46 R A or G
47 W A or T
48 S C or G
49 Y C or T
50 K G or T
51 V A or C or G
52 H A or C or T
53 D A or G or T
54 B C or G or T
55 X G or A or T or C
56 N G or A or T or C
57
58 IUPAC-IUB SYMBOLS FOR NUCLEOTIDE NOMENCLATURE:
59 Cornish-Bowden (1985) Nucl. Acids Res. 13: 3021-3030.
60
61 -----------------------------------
62
63 Amino Acid alphabet:
64 ------------------------------------------
65 Symbol Meaning
66 ------------------------------------------
67 A Alanine
68 B Aspartic Acid, Asparagine
69 C Cystine
70 D Aspartic Acid
71 E Glutamic Acid
72 F Phenylalanine
73 G Glycine
74 H Histidine
75 I Isoleucine
76 K Lysine
77 L Leucine
78 M Methionine
79 N Asparagine
80 P Proline
81 Q Glutamine
82 R Arginine
83 S Serine
84 T Threonine
85 V Valine
86 W Tryptophan
87 X Unknown
88 Y Tyrosine
89 Z Glutamic Acid, Glutamine
90 * Terminator
91
92
93 IUPAC-IUP AMINO ACID SYMBOLS:
94 Biochem J. 1984 Apr 15; 219(2): 345-373
95 Eur J Biochem. 1993 Apr 1; 213(1): 2
96
97 =head1 FEEDBACK
98
99 =head2 Mailing Lists
100
101 User feedback is an integral part of the evolution of this and other
102 Bioperl modules. Send your comments and suggestions preferably to one
103 of the Bioperl mailing lists. Your participation is much appreciated.
104
105 bioperl-l@bioperl.org - General discussion
106 http://www.bioperl.org/MailList.html - About the mailing lists
107
108 =head2 Reporting Bugs
109
110 Report bugs to the Bioperl bug tracking system to help us keep track
111 the bugs and their resolution. Bug reports can be submitted via email
112 or the web:
113
114 bioperl-bugs@bioperl.org
115 http://www.bugzilla.bioperl.org/
116
117 =head1 AUTHOR - Aaron Mackey
118
119 Email amackey@virginia.edu
120
121 =head1 APPENDIX
122
123 The rest of the documentation details each of the object
124 methods. Internal methods are usually preceded with a _
125
126 =cut
127
128
129 # Let the code begin...
130
131 package Bio::Tools::IUPAC;
132
133 use strict;
134 use vars qw(@ISA %IUP %IUB $AUTOLOAD);
135
136 BEGIN {
137 %IUB = ( A => [qw(A)],
138 C => [qw(C)],
139 G => [qw(G)],
140 T => [qw(T)],
141 U => [qw(U)],
142 M => [qw(A C)],
143 R => [qw(A G)],
144 W => [qw(A T)],
145 S => [qw(C G)],
146 Y => [qw(C T)],
147 K => [qw(G T)],
148 V => [qw(A C G)],
149 H => [qw(A C T)],
150 D => [qw(A G T)],
151 B => [qw(C G T)],
152 X => [qw(G A T C)],
153 N => [qw(G A T C)]
154 );
155
156 %IUP = (A => [qw(A)],
157 B => [qw(D N)],
158 C => [qw(C)],
159 D => [qw(D)],
160 E => [qw(E)],
161 F => [qw(F)],
162 G => [qw(G)],
163 H => [qw(H)],
164 I => [qw(I)],
165 K => [qw(K)],
166 L => [qw(L)],
167 M => [qw(M)],
168 N => [qw(N)],
169 P => [qw(P)],
170 Q => [qw(Q)],
171 R => [qw(R)],
172 S => [qw(S)],
173 T => [qw(T)],
174 U => [qw(U)],
175 V => [qw(V)],
176 W => [qw(W)],
177 X => [qw(X)],
178 Y => [qw(Y)],
179 Z => [qw(E Q)],
180 '*' => ['*']
181 );
182
183 }
184 use Bio::Root::Root;
185 @ISA = qw(Bio::Root::Root);
186
187 =head2 new
188
189 Title : new
190 Usage : new Bio::Tools::IUPAC $seq;
191 Function: returns a new seq stream (akin to SeqIO)
192 Returns : a Bio::Tools::IUPAC stream object that will produce unique
193 Seq objects on demand.
194 Args : an ambiguously coded Seq.pm object that has a specified 'type'
195
196
197 =cut
198
199
200 sub new {
201 my($class,@args) = @_;
202 my $self = $class->SUPER::new(@args);
203
204 my ($seq) = $self->_rearrange([qw(SEQ)],@args);
205 if((! defined($seq)) && @args && ref($args[0])) {
206 # parameter not passed as named parameter?
207 $seq = $args[0];
208 }
209 $seq->isa('Bio::Seq') or
210 $self->throw("Must supply a Seq.pm object to IUPAC!");
211 $self->{'_SeqObj'} = $seq;
212 if ($self->{'_SeqObj'}->alphabet() =~ m/^[dr]na$/i ) {
213 # nucleotide seq object
214 $self->{'_alpha'} = [ map { $IUB{uc($_)} }
215 split('', $self->{'_SeqObj'}->seq()) ];
216 } elsif ($self->{'_SeqObj'}->alphabet() =~ m/^protein$/i ) {
217 # amino acid seq object
218 $self->{'_alpha'} = [ map { $IUP{uc($_)} }
219 split('', $self->{'_SeqObj'}->seq()) ];
220 } else { # unknown type: we could make a guess, but let's not.
221 $self->throw("You must specify the 'type' of sequence provided to IUPAC");
222 }
223 $self->{'_string'} = [(0) x length($self->{'_SeqObj'}->seq())];
224 scalar @{$self->{'_string'}} or $self->throw("Sequence has zero-length!");
225 $self->{'_string'}->[0] = -1;
226 return $self;
227 }
228
229 =head2 next_seq
230
231 Title : next_seq
232 Usage : $iupac->next_seq()
233 Function: returns the next unique Seq object
234 Returns : a Seq.pm object
235 Args : none.
236
237
238 =cut
239
240 sub next_seq{
241 my ($self) = @_;
242
243 for my $i ( 0 .. $#{$self->{'_string'}} ) {
244 next unless $self->{'_string'}->[$i] || @{$self->{'_alpha'}->[$i]} > 1;
245 if ( $self->{'_string'}->[$i] == $#{$self->{'_alpha'}->[$i]} ) { # rollover
246 if ( $i == $#{$self->{'_string'}} ) { # end of possibilities
247 return undef;
248 } else {
249 $self->{'_string'}->[$i] = 0;
250 next;
251 }
252 } else {
253 $self->{'_string'}->[$i]++;
254 my $j = -1;
255 $self->{'_SeqObj'}->seq(join('', map { $j++; $self->{'_alpha'}->[$j]->[$_]; } @{$self->{'_string'}}));
256 my $desc = $self->{'_SeqObj'}->desc();
257 if ( !defined $desc ) { $desc = ""; }
258
259 $self->{'_num'}++;
260 1 while $self->{'_num'} =~ s/(\d)(\d\d\d)(?!\d)/$1,$2/;
261 $desc =~ s/( \[Bio::Tools::IUPAC-generated\sunique sequence # [^\]]*\])|$/ \[Bio::Tools::IUPAC-generated unique sequence # $self->{'_num'}\]/;
262 $self->{'_SeqObj'}->desc($desc);
263 $self->{'_num'} =~ s/,//g;
264 return $self->{'_SeqObj'};
265 }
266 }
267 }
268
269 =head2 iupac_iup
270
271 Title : iupac_iup
272 Usage : my %aasymbols = $iupac->iupac_iup
273 Function: Returns a hash of PROTEIN symbols -> symbol components
274 Returns : Hash
275 Args : none
276
277 =cut
278
279 sub iupac_iup{
280 return %IUP;
281
282 }
283
284 =head2 iupac_iub
285
286 Title : iupac_iub
287 Usage : my %dnasymbols = $iupac->iupac_iub
288 Function: Returns a hash of DNA symbols -> symbol components
289 Returns : Hash
290 Args : none
291
292 =cut
293
294 sub iupac_iub{
295 return %IUB;
296 }
297
298 sub AUTOLOAD {
299
300 my $self = shift @_;
301 my $method = $AUTOLOAD;
302 $method =~ s/.*:://;
303 return $self->{'_SeqObj'}->$method(@_)
304 unless $method eq 'DESTROY';
305 }
306
307 1;
308