diff variant_effect_predictor/Bio/EnsEMBL/Compara/NCBITaxon.pm @ 0:21066c0abaf5 draft

Uploaded
author willmclaren
date Fri, 03 Aug 2012 10:04:48 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/EnsEMBL/Compara/NCBITaxon.pm	Fri Aug 03 10:04:48 2012 -0400
@@ -0,0 +1,479 @@
+=head1 NAME
+
+NCBITaxon - DESCRIPTION of Object
+
+=head1 DESCRIPTION
+  
+  An object that hold a node within a taxonomic tree.  Inherits from NestedSet.
+
+  From Bio::Species
+   classification
+   common_name
+   binomial
+
+  Here are also the additional methods in Bio::Species that "might" be useful, but let us
+  forget about these for now.
+   genus
+   species
+   sub_species
+   variant
+   organelle
+   division
+
+=head1 CONTACT
+
+  Contact Jessica Severin on implemetation/design detail: jessica@ebi.ac.uk
+  Contact Ewan Birney on EnsEMBL in general: birney@sanger.ac.uk
+
+=head1 APPENDIX
+
+The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
+
+=cut
+
+package Bio::EnsEMBL::Compara::NCBITaxon;
+
+use strict;
+use Bio::Species;
+use Bio::EnsEMBL::Compara::NestedSet;
+use Bio::EnsEMBL::Utils::Exception;
+use Bio::EnsEMBL::Utils::Argument;
+
+our @ISA = qw(Bio::EnsEMBL::Compara::NestedSet);
+
+=head2 copy
+
+  Arg [1]    : int $member_id (optional)
+  Example    :
+  Description: returns copy of object, calling superclass copy method
+  Returntype :
+  Exceptions :
+  Caller     :
+
+=cut
+
+sub copy {
+  my $self = shift;
+  
+  my $mycopy = $self->SUPER::copy;
+  bless $mycopy, "Bio::EnsEMBL::Compara::NCBITaxon";
+  
+  $mycopy->ncbi_taxid($self->ncbi_taxid);
+  $mycopy->rank($self->rank);
+  $mycopy->genbank_hidden_flag($self->genbank_hidden_flag);
+
+  return $mycopy;
+}
+
+
+sub ncbi_taxid {
+  my $self = shift;
+  my $value = shift;
+  $self->node_id($value) if($value); 
+  return $self->node_id;
+}
+
+sub taxon_id {
+  my $self = shift;
+  my $value = shift;
+  $self->node_id($value) if($value); 
+  return $self->node_id;
+}
+
+sub dbID {
+  my $self = shift;
+  my $value = shift;
+  $self->node_id($value) if($value); 
+  return $self->node_id;
+}
+
+sub rank {
+  my $self = shift;
+  $self->{'_rank'} = shift if(@_);
+  return $self->{'_rank'};
+}
+
+sub genbank_hidden_flag {
+  my $self = shift;
+  $self->{'_genbank_hidden_flag'} = shift if(@_);
+  return $self->{'_genbank_hidden_flag'};
+}
+
+=head2 classification
+
+  Arg[SEPARATOR]  : String (optional); used to separate the classification by
+                    when returning as a string. If not specified then a single
+                    space will be used.
+  Arg[FULL]       : Boolean (optional); indicates we want all nodes including
+                    those which Genbank sets as hidden
+  Arg[AS_ARRAY]   : Boolean (optional); says the return type will be an 
+                    ArrayRef of all nodes in the classification as instances
+                    of NCBITaxon.
+  Example         : my $classification_string = $node->classification();
+  Description     : Returns the String representation of a taxon node's 
+                    classification or the objects which constitute it (
+                    including the current node). The String return when
+                    split is compatible with BioPerl's Species classification
+                    code and will return a data structure compatible with
+                    that found in core species MetaContainers.
+                    
+                    This code is a redevelopment of existing code which
+                    descended down the taxonomy which had disadvanatages 
+                    when a classification was requested on nodes causing
+                    the taxonomy to bi/multi-furcate.
+                    
+                    Note the String representation does have some disadvantages
+                    when working with the poorer end of the taxonomy where
+                    species nodes are not well defined. For these situations
+                    you are better using the array representation and 
+                    capturing the required information from the nodes.
+                    
+                    Also to maintain the original functionality of the method
+                    we filter any species, subspecies or subgenus nodes above
+                    the current node. For the true classification always
+                    call using the array structure.
+                    
+                    Recalling this subroutine with the same parameters for
+                    separators will return a cached representation. Calling
+                    for AS_ARRAY will cause the classificaiton to be 
+                    recalculated each time.
+  Returntype      : String if not asking for an array otherwise the array
+  Exceptions      : - 
+  Caller          : Public
+
+=cut
+ 
+sub classification {
+  my ($self, @args) = @_;
+  my ($separator, $full, $as_array) = rearrange([qw( SEPARATOR FULL AS_ARRAY )], @args);
+
+  #setup defaults
+  $separator = ' ' unless(defined $separator);
+  $full = 0 unless (defined $full);
+  
+  if(!$as_array) {
+    #Reset the separators & classifications if we already had one & it 
+    #differed from the input
+    if(defined $self->{_separator} && $self->{_separator} ne $separator) {
+      $self->{_separator} = undef;
+      $self->{_classification} = undef;
+    }
+    if(defined $self->{_separator_full} && $self->{_separator_full} ne $separator) {
+      $self->{_separator_full} = undef;
+      $self->{_classification_full} = undef;
+    }
+    
+    $self->{_separator} = $separator unless (defined $self->{_separator});
+    $self->{_separator_full} = $separator unless (defined $self->{_separator_full});
+    
+    return $self->{_classification_full} if ($full && defined $self->{_classification_full});
+    return $self->{_classification} if (!$full && defined $self->{_classification});
+  }  
+
+  my $node = $self;
+  my @classification;
+  while( $node->name() ne 'root' ) {
+    my $subgenus = $node->rank() eq 'subgenus';
+    if($full) {
+      push(@classification, $node);
+    }
+    else {
+      unless($node->genbank_hidden_flag() || $subgenus) {
+        push(@classification, $node);
+      }
+    }
+    
+    $node = $node->parent();
+  }
+  
+  if($as_array) {
+    return \@classification;
+  }
+
+  #Once we have a normal array we can do top-down as before to replicate 
+  #the original functionality
+  my $text_classification = $self->_to_text_classification(\@classification);
+  
+  if ($full) {
+    $self->{_classification_full} = join($separator, @{$text_classification});
+    $self->{_separator_full} = $separator;
+    return $self->{_classification_full};
+  } else {
+    $self->{_classification} = join($separator, @{$text_classification});
+    $self->{_separator} = $separator;
+    return $self->{_classification};
+  }
+}
+
+=head2 _to_text_classification
+
+  Arg[1]          : ArrayRef of the classification array to be converted to 
+                    the text classification 
+  Example         : my $array = $node->_to_text_classification(\@classification);
+  Description     : Returns the Array representation of a taxon node's 
+                    classification or the objects which constitute it (
+                    including the current node) as the species names or split
+                    according to the rules for working with BioPerl.
+  Returntype      : ArrayRef of Strings
+  Exceptions      : - 
+  Caller          : Private
+
+=cut
+
+sub _to_text_classification {
+  my ($self, $classification) = @_;
+  my @text_classification;
+  my $first = 1;
+  for my $node ( @{$classification}) {
+    my $subgenus = $node->rank() eq 'subgenus';
+    my $species = $node->rank() eq 'species';
+    my $subspecies = $node->rank() eq 'subspecies';
+    
+    if($first) {
+      if($species || $subspecies) {
+        my ($genus, $species, $subspecies) = split(q{ }, $node->binomial());
+        unshift @text_classification, $species;
+        unshift @text_classification, $subspecies if (defined $subspecies);
+      }
+      $first = 0;
+      next;
+    }
+    
+    next if $subgenus || $species || $subspecies;
+    push(@text_classification, $node->name());
+  }
+  return \@text_classification;
+}
+
+=head2 subspecies
+
+  Example    : $ncbi->subspecies;
+  Description: Returns the subspeceis name for this species
+  Example    : "verus" for Pan troglodytes verus
+  Returntype : string
+  Exceptions :
+  Caller     : general
+
+=cut
+
+sub subspecies {
+  my $self = shift;
+
+  unless (defined $self->{'_species'}) {
+    my ($genus, $species, $subspecies) = split(" ", $self->binomial);
+    $self->{'_species'} = $species;
+    $self->{'_genus'} = $genus;
+    $self->{'_subspecies'} = $subspecies;
+  }
+
+  return $self->{'_species'};
+}
+
+
+=head2 species
+
+  Example    : $ncbi->species;
+  Description: Returns the speceis name for this species
+  Example    : "sapiens" for Homo sapiens
+  Returntype : string
+  Exceptions :
+  Caller     : general
+
+=cut
+
+sub species {
+  my $self = shift;
+
+  unless (defined $self->{'_species'}) {
+    my ($genus, $species, $subspecies) = split(" ", $self->binomial);
+    $self->{'_species'} = $species;
+    $self->{'_genus'} = $genus;
+    $self->{'_subspecies'} = $subspecies;
+  }
+
+  return $self->{'_species'};
+}
+
+
+=head2 genus
+
+  Example    : $ncbi->genus;
+  Description: Returns the genus name for this species
+  Returntype : string
+  Example    : "Homo" for Homo sapiens
+  Exceptions :
+  Caller     : general
+
+=cut
+
+sub genus {
+  my $self = shift;
+
+  unless (defined $self->{'_genus'}) {
+    my ($genus, $species, $subspecies) = split(" ", $self->binomial);
+    $self->{'_species'} = $species;
+    $self->{'_genus'} = $genus;
+    $self->{'_subspecies'} = $subspecies;
+  }
+
+  return $self->{'_genus'};
+}
+
+=head2 common_name
+
+  Example    : $ncbi->common_name;
+  Description: The comon name as defined by Genbank
+  Returntype : string
+  Exceptions : returns undef if no genbank common name exists.
+  Caller     : general
+
+=cut
+
+sub common_name {
+  my $self = shift;
+  if ($self->has_tag('genbank common name') && $self->rank eq 'species') {
+    return $self->get_tagvalue('genbank common name');
+  } else {
+    return undef;
+  }
+}
+
+=head2 ensembl_alias_name
+
+  Example    : $ncbi->ensembl_alias_name;
+  Description: The comon name as defined by ensembl alias
+  Returntype : string
+  Exceptions : returns undef if no ensembl alias name exists.
+  Caller     : general
+
+=cut
+
+sub ensembl_alias_name {
+  my $self = shift;
+
+  #Not checking for rank as we do above, because we do not get dog since the
+  #rank for dog is subspecies (ensembl-51).
+  if ($self->has_tag('ensembl alias name')) {
+    return $self->get_tagvalue('ensembl alias name');
+  } else {
+    return undef;
+  }
+}
+
+
+=head scientific_name
+
+  Example    : $ncbi->scientific_name;
+  Description: The scientific name of this taxon
+  Returntype : string
+  Exceptions :
+  Caller     : general
+
+=cut
+
+sub scientific_name {
+    my ($self) = @_;
+    return $self->get_tagvalue('scientific name');
+}
+
+=head2 binomial
+
+  Example    : $ncbi->binomial;
+  Description: The binomial name (AKA the scientific name) of this genome
+  Returntype : string
+  Exceptions : warns when node is not a species or has no scientific name
+  Caller     : general
+
+=cut
+
+sub binomial {
+  my $self = shift;
+  if ($self->has_tag('scientific name') && ($self->rank eq 'species' || $self->rank eq 'subspecies')) {
+      return $self->scientific_name;
+  } else {
+    warning("taxon_id=",$self->node_id," is not a species or subspecies. So binomial is undef (try the scientific_name method)\n");
+    return undef;
+  }
+}
+
+=head2 ensembl_alias
+
+  Example    : $ncbi->ensembl_alias;
+  Description: The ensembl_alias name (AKA the name in the ensembl website) of this genome
+  Returntype : string
+  Exceptions : warns when node is not a species or has no ensembl_alias
+  Caller     : general
+
+=cut
+
+sub ensembl_alias {
+  my $self = shift;
+  if ($self->has_tag('ensembl alias name')) {
+    return $self->get_tagvalue('ensembl alias name');
+  } else {
+    warning("taxon_id=",$self->node_id," is not a species or subspecies. So ensembl_alias is undef\n");
+    return undef;
+  }
+}
+
+
+=head2 short_name
+
+  Example    : $ncbi->short_name;
+  Description: The name of this genome in the Gspe ('G'enera
+               'spe'cies) format.
+  Returntype : string
+  Exceptions : none
+  Caller     : general
+
+=cut
+
+sub short_name {
+  my $self = shift;
+  my $name = $self->name;
+  $name =~  s/(\S)\S+\s(\S{3})\S+/$1$2/;
+  $name =~ s/\ //g;
+  return $name;
+}
+
+sub get_short_name {
+  my $self = shift;
+  return $self->short_name;
+}
+
+
+sub RAP_species_format {
+  my $self = shift;
+  my $newick = "";
+  
+  if($self->get_child_count() > 0) {
+    $newick .= "(";
+    my $first_child=1;
+    foreach my $child (@{$self->sorted_children}) {  
+      $newick .= "," unless($first_child);
+      $newick .= $child->newick_format;
+      $first_child = 0;
+    }
+    $newick .= ")";
+  }
+  
+  $newick .= sprintf("\"%s\"", $self->name,);
+  $newick .= sprintf(":%1.4f", $self->distance_to_parent) if($self->distance_to_parent > 0);
+
+  if(!($self->has_parent)) {
+    $newick .= ";";
+  }
+  return $newick;
+}
+
+
+sub print_node {
+  my $self  = shift;
+  printf("(%s", $self->node_id);
+  printf(" %s", $self->rank) if($self->rank);
+  print(")");
+  printf("%s", $self->name) if($self->name);
+  print("\n");
+}
+
+1;