Mercurial > repos > mahtabm > ensemb_rep_gvl
diff variant_effect_predictor/Bio/Tools/IUPAC.pm @ 0:2bc9b66ada89 draft default tip
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 06:29:17 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/IUPAC.pm Thu Apr 11 06:29:17 2013 -0400 @@ -0,0 +1,308 @@ +# $Id: IUPAC.pm,v 1.19 2002/11/30 15:39:53 jason Exp $ +# +# BioPerl module for IUPAC +# +# Cared for by Aaron Mackey <amackey@virginia.edu> +# +# Copyright Aaron Mackey +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::IUPAC - Generates unique Seq objects from an ambiguous Seq object + +=head1 SYNOPSIS + + use Bio::Seq; + use Bio::Tools::IUPAC; + + my $ambiseq = new Bio::Seq (-seq => 'ARTCGUTGR', -alphabet => 'dna'); + my $stream = new Bio::Tools::IUPAC(-seq => $ambiseq); + + while ($uniqueseq = $stream->next_seq()) { + # process the unique Seq object. + } + +=head1 DESCRIPTION + +IUPAC is a tool that produces a stream of unique, "strict"-satisfying Seq +objects from an ambiquous Seq object (containing non-standard characters given +the meaning shown below) + + Extended Dna / Rna alphabet : + (includes symbols for nucleotide ambiguity) + ------------------------------------------ + Symbol Meaning Nucleic Acid + ------------------------------------------ + A A Adenine + C C Cytosine + G G Guanine + T T Thymine + U U Uracil + M A or C + R A or G + W A or T + S C or G + Y C or T + K G or T + V A or C or G + H A or C or T + D A or G or T + B C or G or T + X G or A or T or C + N G or A or T or C + + IUPAC-IUB SYMBOLS FOR NUCLEOTIDE NOMENCLATURE: + Cornish-Bowden (1985) Nucl. Acids Res. 13: 3021-3030. + +----------------------------------- + + Amino Acid alphabet: + ------------------------------------------ + Symbol Meaning + ------------------------------------------ + A Alanine + B Aspartic Acid, Asparagine + C Cystine + D Aspartic Acid + E Glutamic Acid + F Phenylalanine + G Glycine + H Histidine + I Isoleucine + K Lysine + L Leucine + M Methionine + N Asparagine + P Proline + Q Glutamine + R Arginine + S Serine + T Threonine + V Valine + W Tryptophan + X Unknown + Y Tyrosine + Z Glutamic Acid, Glutamine + * Terminator + + + IUPAC-IUP AMINO ACID SYMBOLS: + Biochem J. 1984 Apr 15; 219(2): 345-373 + Eur J Biochem. 1993 Apr 1; 213(1): 2 + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://www.bioperl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bioperl.org + http://www.bugzilla.bioperl.org/ + +=head1 AUTHOR - Aaron Mackey + +Email amackey@virginia.edu + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + +package Bio::Tools::IUPAC; + +use strict; +use vars qw(@ISA %IUP %IUB $AUTOLOAD); + +BEGIN { + %IUB = ( A => [qw(A)], + C => [qw(C)], + G => [qw(G)], + T => [qw(T)], + U => [qw(U)], + M => [qw(A C)], + R => [qw(A G)], + W => [qw(A T)], + S => [qw(C G)], + Y => [qw(C T)], + K => [qw(G T)], + V => [qw(A C G)], + H => [qw(A C T)], + D => [qw(A G T)], + B => [qw(C G T)], + X => [qw(G A T C)], + N => [qw(G A T C)] + ); + + %IUP = (A => [qw(A)], + B => [qw(D N)], + C => [qw(C)], + D => [qw(D)], + E => [qw(E)], + F => [qw(F)], + G => [qw(G)], + H => [qw(H)], + I => [qw(I)], + K => [qw(K)], + L => [qw(L)], + M => [qw(M)], + N => [qw(N)], + P => [qw(P)], + Q => [qw(Q)], + R => [qw(R)], + S => [qw(S)], + T => [qw(T)], + U => [qw(U)], + V => [qw(V)], + W => [qw(W)], + X => [qw(X)], + Y => [qw(Y)], + Z => [qw(E Q)], + '*' => ['*'] + ); + +} +use Bio::Root::Root; +@ISA = qw(Bio::Root::Root); + +=head2 new + + Title : new + Usage : new Bio::Tools::IUPAC $seq; + Function: returns a new seq stream (akin to SeqIO) + Returns : a Bio::Tools::IUPAC stream object that will produce unique + Seq objects on demand. + Args : an ambiguously coded Seq.pm object that has a specified 'type' + + +=cut + + +sub new { + my($class,@args) = @_; + my $self = $class->SUPER::new(@args); + + my ($seq) = $self->_rearrange([qw(SEQ)],@args); + if((! defined($seq)) && @args && ref($args[0])) { + # parameter not passed as named parameter? + $seq = $args[0]; + } + $seq->isa('Bio::Seq') or + $self->throw("Must supply a Seq.pm object to IUPAC!"); + $self->{'_SeqObj'} = $seq; + if ($self->{'_SeqObj'}->alphabet() =~ m/^[dr]na$/i ) { + # nucleotide seq object + $self->{'_alpha'} = [ map { $IUB{uc($_)} } + split('', $self->{'_SeqObj'}->seq()) ]; + } elsif ($self->{'_SeqObj'}->alphabet() =~ m/^protein$/i ) { + # amino acid seq object + $self->{'_alpha'} = [ map { $IUP{uc($_)} } + split('', $self->{'_SeqObj'}->seq()) ]; + } else { # unknown type: we could make a guess, but let's not. + $self->throw("You must specify the 'type' of sequence provided to IUPAC"); + } + $self->{'_string'} = [(0) x length($self->{'_SeqObj'}->seq())]; + scalar @{$self->{'_string'}} or $self->throw("Sequence has zero-length!"); + $self->{'_string'}->[0] = -1; + return $self; +} + +=head2 next_seq + + Title : next_seq + Usage : $iupac->next_seq() + Function: returns the next unique Seq object + Returns : a Seq.pm object + Args : none. + + +=cut + +sub next_seq{ + my ($self) = @_; + + for my $i ( 0 .. $#{$self->{'_string'}} ) { + next unless $self->{'_string'}->[$i] || @{$self->{'_alpha'}->[$i]} > 1; + if ( $self->{'_string'}->[$i] == $#{$self->{'_alpha'}->[$i]} ) { # rollover + if ( $i == $#{$self->{'_string'}} ) { # end of possibilities + return undef; + } else { + $self->{'_string'}->[$i] = 0; + next; + } + } else { + $self->{'_string'}->[$i]++; + my $j = -1; + $self->{'_SeqObj'}->seq(join('', map { $j++; $self->{'_alpha'}->[$j]->[$_]; } @{$self->{'_string'}})); + my $desc = $self->{'_SeqObj'}->desc(); + if ( !defined $desc ) { $desc = ""; } + + $self->{'_num'}++; + 1 while $self->{'_num'} =~ s/(\d)(\d\d\d)(?!\d)/$1,$2/; + $desc =~ s/( \[Bio::Tools::IUPAC-generated\sunique sequence # [^\]]*\])|$/ \[Bio::Tools::IUPAC-generated unique sequence # $self->{'_num'}\]/; + $self->{'_SeqObj'}->desc($desc); + $self->{'_num'} =~ s/,//g; + return $self->{'_SeqObj'}; + } + } +} + +=head2 iupac_iup + + Title : iupac_iup + Usage : my %aasymbols = $iupac->iupac_iup + Function: Returns a hash of PROTEIN symbols -> symbol components + Returns : Hash + Args : none + +=cut + +sub iupac_iup{ + return %IUP; + +} + +=head2 iupac_iub + + Title : iupac_iub + Usage : my %dnasymbols = $iupac->iupac_iub + Function: Returns a hash of DNA symbols -> symbol components + Returns : Hash + Args : none + +=cut + +sub iupac_iub{ + return %IUB; +} + +sub AUTOLOAD { + + my $self = shift @_; + my $method = $AUTOLOAD; + $method =~ s/.*:://; + return $self->{'_SeqObj'}->$method(@_) + unless $method eq 'DESTROY'; +} + +1; +