Mercurial > repos > mahtabm > ensemb_rep_gvl
comparison variant_effect_predictor/Bio/Tools/IUPAC.pm @ 0:2bc9b66ada89 draft default tip
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 06:29:17 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:2bc9b66ada89 |
---|---|
1 # $Id: IUPAC.pm,v 1.19 2002/11/30 15:39:53 jason Exp $ | |
2 # | |
3 # BioPerl module for IUPAC | |
4 # | |
5 # Cared for by Aaron Mackey <amackey@virginia.edu> | |
6 # | |
7 # Copyright Aaron Mackey | |
8 # | |
9 # You may distribute this module under the same terms as perl itself | |
10 | |
11 # POD documentation - main docs before the code | |
12 | |
13 =head1 NAME | |
14 | |
15 Bio::Tools::IUPAC - Generates unique Seq objects from an ambiguous Seq object | |
16 | |
17 =head1 SYNOPSIS | |
18 | |
19 use Bio::Seq; | |
20 use Bio::Tools::IUPAC; | |
21 | |
22 my $ambiseq = new Bio::Seq (-seq => 'ARTCGUTGR', -alphabet => 'dna'); | |
23 my $stream = new Bio::Tools::IUPAC(-seq => $ambiseq); | |
24 | |
25 while ($uniqueseq = $stream->next_seq()) { | |
26 # process the unique Seq object. | |
27 } | |
28 | |
29 =head1 DESCRIPTION | |
30 | |
31 IUPAC is a tool that produces a stream of unique, "strict"-satisfying Seq | |
32 objects from an ambiquous Seq object (containing non-standard characters given | |
33 the meaning shown below) | |
34 | |
35 Extended Dna / Rna alphabet : | |
36 (includes symbols for nucleotide ambiguity) | |
37 ------------------------------------------ | |
38 Symbol Meaning Nucleic Acid | |
39 ------------------------------------------ | |
40 A A Adenine | |
41 C C Cytosine | |
42 G G Guanine | |
43 T T Thymine | |
44 U U Uracil | |
45 M A or C | |
46 R A or G | |
47 W A or T | |
48 S C or G | |
49 Y C or T | |
50 K G or T | |
51 V A or C or G | |
52 H A or C or T | |
53 D A or G or T | |
54 B C or G or T | |
55 X G or A or T or C | |
56 N G or A or T or C | |
57 | |
58 IUPAC-IUB SYMBOLS FOR NUCLEOTIDE NOMENCLATURE: | |
59 Cornish-Bowden (1985) Nucl. Acids Res. 13: 3021-3030. | |
60 | |
61 ----------------------------------- | |
62 | |
63 Amino Acid alphabet: | |
64 ------------------------------------------ | |
65 Symbol Meaning | |
66 ------------------------------------------ | |
67 A Alanine | |
68 B Aspartic Acid, Asparagine | |
69 C Cystine | |
70 D Aspartic Acid | |
71 E Glutamic Acid | |
72 F Phenylalanine | |
73 G Glycine | |
74 H Histidine | |
75 I Isoleucine | |
76 K Lysine | |
77 L Leucine | |
78 M Methionine | |
79 N Asparagine | |
80 P Proline | |
81 Q Glutamine | |
82 R Arginine | |
83 S Serine | |
84 T Threonine | |
85 V Valine | |
86 W Tryptophan | |
87 X Unknown | |
88 Y Tyrosine | |
89 Z Glutamic Acid, Glutamine | |
90 * Terminator | |
91 | |
92 | |
93 IUPAC-IUP AMINO ACID SYMBOLS: | |
94 Biochem J. 1984 Apr 15; 219(2): 345-373 | |
95 Eur J Biochem. 1993 Apr 1; 213(1): 2 | |
96 | |
97 =head1 FEEDBACK | |
98 | |
99 =head2 Mailing Lists | |
100 | |
101 User feedback is an integral part of the evolution of this and other | |
102 Bioperl modules. Send your comments and suggestions preferably to one | |
103 of the Bioperl mailing lists. Your participation is much appreciated. | |
104 | |
105 bioperl-l@bioperl.org - General discussion | |
106 http://www.bioperl.org/MailList.html - About the mailing lists | |
107 | |
108 =head2 Reporting Bugs | |
109 | |
110 Report bugs to the Bioperl bug tracking system to help us keep track | |
111 the bugs and their resolution. Bug reports can be submitted via email | |
112 or the web: | |
113 | |
114 bioperl-bugs@bioperl.org | |
115 http://www.bugzilla.bioperl.org/ | |
116 | |
117 =head1 AUTHOR - Aaron Mackey | |
118 | |
119 Email amackey@virginia.edu | |
120 | |
121 =head1 APPENDIX | |
122 | |
123 The rest of the documentation details each of the object | |
124 methods. Internal methods are usually preceded with a _ | |
125 | |
126 =cut | |
127 | |
128 | |
129 # Let the code begin... | |
130 | |
131 package Bio::Tools::IUPAC; | |
132 | |
133 use strict; | |
134 use vars qw(@ISA %IUP %IUB $AUTOLOAD); | |
135 | |
136 BEGIN { | |
137 %IUB = ( A => [qw(A)], | |
138 C => [qw(C)], | |
139 G => [qw(G)], | |
140 T => [qw(T)], | |
141 U => [qw(U)], | |
142 M => [qw(A C)], | |
143 R => [qw(A G)], | |
144 W => [qw(A T)], | |
145 S => [qw(C G)], | |
146 Y => [qw(C T)], | |
147 K => [qw(G T)], | |
148 V => [qw(A C G)], | |
149 H => [qw(A C T)], | |
150 D => [qw(A G T)], | |
151 B => [qw(C G T)], | |
152 X => [qw(G A T C)], | |
153 N => [qw(G A T C)] | |
154 ); | |
155 | |
156 %IUP = (A => [qw(A)], | |
157 B => [qw(D N)], | |
158 C => [qw(C)], | |
159 D => [qw(D)], | |
160 E => [qw(E)], | |
161 F => [qw(F)], | |
162 G => [qw(G)], | |
163 H => [qw(H)], | |
164 I => [qw(I)], | |
165 K => [qw(K)], | |
166 L => [qw(L)], | |
167 M => [qw(M)], | |
168 N => [qw(N)], | |
169 P => [qw(P)], | |
170 Q => [qw(Q)], | |
171 R => [qw(R)], | |
172 S => [qw(S)], | |
173 T => [qw(T)], | |
174 U => [qw(U)], | |
175 V => [qw(V)], | |
176 W => [qw(W)], | |
177 X => [qw(X)], | |
178 Y => [qw(Y)], | |
179 Z => [qw(E Q)], | |
180 '*' => ['*'] | |
181 ); | |
182 | |
183 } | |
184 use Bio::Root::Root; | |
185 @ISA = qw(Bio::Root::Root); | |
186 | |
187 =head2 new | |
188 | |
189 Title : new | |
190 Usage : new Bio::Tools::IUPAC $seq; | |
191 Function: returns a new seq stream (akin to SeqIO) | |
192 Returns : a Bio::Tools::IUPAC stream object that will produce unique | |
193 Seq objects on demand. | |
194 Args : an ambiguously coded Seq.pm object that has a specified 'type' | |
195 | |
196 | |
197 =cut | |
198 | |
199 | |
200 sub new { | |
201 my($class,@args) = @_; | |
202 my $self = $class->SUPER::new(@args); | |
203 | |
204 my ($seq) = $self->_rearrange([qw(SEQ)],@args); | |
205 if((! defined($seq)) && @args && ref($args[0])) { | |
206 # parameter not passed as named parameter? | |
207 $seq = $args[0]; | |
208 } | |
209 $seq->isa('Bio::Seq') or | |
210 $self->throw("Must supply a Seq.pm object to IUPAC!"); | |
211 $self->{'_SeqObj'} = $seq; | |
212 if ($self->{'_SeqObj'}->alphabet() =~ m/^[dr]na$/i ) { | |
213 # nucleotide seq object | |
214 $self->{'_alpha'} = [ map { $IUB{uc($_)} } | |
215 split('', $self->{'_SeqObj'}->seq()) ]; | |
216 } elsif ($self->{'_SeqObj'}->alphabet() =~ m/^protein$/i ) { | |
217 # amino acid seq object | |
218 $self->{'_alpha'} = [ map { $IUP{uc($_)} } | |
219 split('', $self->{'_SeqObj'}->seq()) ]; | |
220 } else { # unknown type: we could make a guess, but let's not. | |
221 $self->throw("You must specify the 'type' of sequence provided to IUPAC"); | |
222 } | |
223 $self->{'_string'} = [(0) x length($self->{'_SeqObj'}->seq())]; | |
224 scalar @{$self->{'_string'}} or $self->throw("Sequence has zero-length!"); | |
225 $self->{'_string'}->[0] = -1; | |
226 return $self; | |
227 } | |
228 | |
229 =head2 next_seq | |
230 | |
231 Title : next_seq | |
232 Usage : $iupac->next_seq() | |
233 Function: returns the next unique Seq object | |
234 Returns : a Seq.pm object | |
235 Args : none. | |
236 | |
237 | |
238 =cut | |
239 | |
240 sub next_seq{ | |
241 my ($self) = @_; | |
242 | |
243 for my $i ( 0 .. $#{$self->{'_string'}} ) { | |
244 next unless $self->{'_string'}->[$i] || @{$self->{'_alpha'}->[$i]} > 1; | |
245 if ( $self->{'_string'}->[$i] == $#{$self->{'_alpha'}->[$i]} ) { # rollover | |
246 if ( $i == $#{$self->{'_string'}} ) { # end of possibilities | |
247 return undef; | |
248 } else { | |
249 $self->{'_string'}->[$i] = 0; | |
250 next; | |
251 } | |
252 } else { | |
253 $self->{'_string'}->[$i]++; | |
254 my $j = -1; | |
255 $self->{'_SeqObj'}->seq(join('', map { $j++; $self->{'_alpha'}->[$j]->[$_]; } @{$self->{'_string'}})); | |
256 my $desc = $self->{'_SeqObj'}->desc(); | |
257 if ( !defined $desc ) { $desc = ""; } | |
258 | |
259 $self->{'_num'}++; | |
260 1 while $self->{'_num'} =~ s/(\d)(\d\d\d)(?!\d)/$1,$2/; | |
261 $desc =~ s/( \[Bio::Tools::IUPAC-generated\sunique sequence # [^\]]*\])|$/ \[Bio::Tools::IUPAC-generated unique sequence # $self->{'_num'}\]/; | |
262 $self->{'_SeqObj'}->desc($desc); | |
263 $self->{'_num'} =~ s/,//g; | |
264 return $self->{'_SeqObj'}; | |
265 } | |
266 } | |
267 } | |
268 | |
269 =head2 iupac_iup | |
270 | |
271 Title : iupac_iup | |
272 Usage : my %aasymbols = $iupac->iupac_iup | |
273 Function: Returns a hash of PROTEIN symbols -> symbol components | |
274 Returns : Hash | |
275 Args : none | |
276 | |
277 =cut | |
278 | |
279 sub iupac_iup{ | |
280 return %IUP; | |
281 | |
282 } | |
283 | |
284 =head2 iupac_iub | |
285 | |
286 Title : iupac_iub | |
287 Usage : my %dnasymbols = $iupac->iupac_iub | |
288 Function: Returns a hash of DNA symbols -> symbol components | |
289 Returns : Hash | |
290 Args : none | |
291 | |
292 =cut | |
293 | |
294 sub iupac_iub{ | |
295 return %IUB; | |
296 } | |
297 | |
298 sub AUTOLOAD { | |
299 | |
300 my $self = shift @_; | |
301 my $method = $AUTOLOAD; | |
302 $method =~ s/.*:://; | |
303 return $self->{'_SeqObj'}->$method(@_) | |
304 unless $method eq 'DESTROY'; | |
305 } | |
306 | |
307 1; | |
308 |