view variant_effect_predictor/Bio/EnsEMBL/Compara/NestedSet.pm @ 0:21066c0abaf5 draft

Uploaded
author willmclaren
date Fri, 03 Aug 2012 10:04:48 -0400
parents
children
line wrap: on
line source

=head1 NAME

NestedSet - DESCRIPTION of Object

=head1 SYNOPSIS

=head1 DESCRIPTION

Abstract superclass to encapsulate the process of storing and manipulating a
nested-set representation tree.  Also implements a 'reference count' system 
based on the ObjectiveC retain/release design. 
Designed to be used as the Root class for all Compara 'proxy' classes 
(Member, GenomeDB, DnaFrag, NCBITaxon) to allow them to be made into sets and trees.

=head1 CONTACT

  Contact Albert Vilella on implementation detail: avilella@ebi.ac.uk
  Contact Jessica Severin on implementation/design detail: jessica@ebi.ac.uk
  Contact Ewan Birney on EnsEMBL in general: birney@sanger.ac.uk

=head1 APPENDIX

The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _

=cut



package Bio::EnsEMBL::Compara::NestedSet;

use strict;
use warnings;
use Bio::EnsEMBL::Utils::Exception;
use Bio::EnsEMBL::Utils::Argument;

use Bio::EnsEMBL::Utils::Exception qw(deprecate throw);

use Bio::TreeIO;
use Bio::EnsEMBL::Compara::Graph::Node;
use Bio::EnsEMBL::Compara::Member;
our @ISA = qw(Bio::EnsEMBL::Compara::Graph::Node);

#################################################
# Factory methods
#################################################

sub init {
  my $self = shift;
  $self->SUPER::init;
  return $self;
}

sub dealloc {
  my $self = shift;

  #printf("DEALLOC NestedSet refcount:%d ", $self->refcount); $self->print_node;
  #$self->release_children;
  return $self->SUPER::dealloc;
}

=head2 copy

  Overview   : creates copy of tree starting at this node going down
  Example    : my $clone = $self->copy;
  Returntype : Bio::EnsEMBL::Compara::NestedSet
  Exceptions : none
  Caller     : general

=cut

sub copy {
  my $self = shift;
  
  my $mycopy = $self->SUPER::copy; 
  bless $mycopy, ref $self;

  $mycopy->distance_to_parent($self->distance_to_parent);
  $mycopy->left_index($self->left_index);
  $mycopy->right_index($self->right_index);

  foreach my $child (@{$self->children}) {  
    $mycopy->add_child($child->copy);
  }
  return $mycopy;
}


=head2 release_tree

  Overview   : deletes and frees the memory used by this tree
               and all the underlying nodes.
  Example    : $self->release_tree;
  Returntype : undef
  Exceptions : none
  Caller     : general

=cut

sub release_tree {
  my $self = shift;
  
  my $child_count = $self->get_child_count;
  $self->disavow_parent;
  $self->cascade_unlink if($child_count);
  return undef;
}

#################################################
# Object variable methods
#################################################

sub left_index {
  my $self = shift;
  $self->{'_left_index'} = shift if(@_);
  $self->{'_left_index'} = 0 unless(defined($self->{'_left_index'}));
  return $self->{'_left_index'};
}

sub right_index {
  my $self = shift;
  $self->{'_right_index'} = shift if(@_);
  $self->{'_right_index'} = 0 unless(defined($self->{'_right_index'}));
  return $self->{'_right_index'};
}


#######################################
# Set manipulation methods
#######################################

=head2 add_child

  Overview   : attaches child nestedset node to this nested set
  Arg [1]    : Bio::EnsEMBL::Compara::NestedSet $child
  Arg [2]    : (opt.) distance between this node and child
  Example    : $self->add_child($child);
  Returntype : undef
  Exceptions : if child is undef or not a NestedSet subclass
  Caller     : general

=cut

sub add_child {
  my $self = shift;
  my $child = shift;
  my $dist = shift;
  
  throw("child not defined") 
     unless(defined($child));
  throw("arg must be a [Bio::EnsEMBL::Compara::NestedSet] not a [$child]")
     unless($child->isa('Bio::EnsEMBL::Compara::NestedSet'));
  
  #printf("add_child: parent(%s) <-> child(%s)\n", $self->node_id, $child->node_id);
  
  unless(defined($dist)) { $dist = $child->_distance; }

  $child->disavow_parent;
  #create_link_to_node is a safe method which checks if connection exists
  my $link = $self->create_link_to_node($child);
  $child->_set_parent_link($link);
  $self->{'_children_loaded'} = 1; 
  $link->distance_between($dist);
  return $link;
}


=head2 disavow_parent

  Overview   : unlink and release self from its parent
               might cause self to delete if refcount reaches Zero.
  Example    : $self->disavow_parent
  Returntype : undef
  Caller     : general

=cut

sub disavow_parent {
  my $self = shift;

  if($self->{'_parent_link'}) {
    my $link = $self->{'_parent_link'};
    #print("DISAVOW parent : "); $parent->print_node;
    #print("        child  : "); $self->print_node;
    $link->dealloc;
  }
  $self->_set_parent_link(undef);
  return undef;
}


=head2 release_children

  Overview   : recursive releases all children
               will cause potential deletion of children if refcount reaches Zero.
  Example    : $self->release_children
  Returntype : $self
  Exceptions : none
  Caller     : general

=cut

sub release_children {
  my $self = shift;
  
  # by calling with parent, this preserved the link to the parent
  # and thus doesn't unlink self
  foreach my $child (@{$self->children}) {
    $child->disavow_parent;
    $child->release_children;
  }
  #$self->cascade_unlink($self->{'_parent_node'});
  return $self;
}


=head2 parent

  Overview   : returns the parent NestedSet object for this node
  Example    : my $my_parent = $object->parent();
  Returntype : undef or Bio::EnsEMBL::Compara::NestedSet
  Exceptions : none
  Caller     : general

=cut

sub parent {
    my $self = shift;
    return $self->adaptor->fetch_parent_for_node($self) if defined $self->adaptor;
    return $self->{'_parent_link'}->get_neighbor($self) if defined $self->{'_parent_link'};
    return undef;
}


sub parent_link {
  my $self = shift;
  return $self->{'_parent_link'};
}

sub has_parent {
  my $self = shift;
  return 1 if($self->{'_parent_link'} or $self->_parent_id);
  return 0;
}


sub has_ancestor {
  my $self = shift;
  my $ancestor = shift;
  throw "[$ancestor] must be a Bio::EnsEMBL::Compara::NestedSet object"
       unless ($ancestor and $ancestor->isa("Bio::EnsEMBL::Compara::NestedSet"));
  my $node = $self->parent;
  while($node) {
    return 1 if($node->equals($ancestor));
    $node = $node->parent;
  }
  return 0;
}


=head2 root

  Overview   : returns the root NestedSet object for this node
               returns $self if node has no parent (this is the root)
  Example    : my $root = $object->root();
  Description: Returns the root of the tree for this node
               with links to all the intermediate nodes. Sister nodes
               are not included in the result.
  Returntype : undef or Bio::EnsEMBL::Compara::NestedSet
  Exceptions : none
  Caller     : general

=cut

sub root {
  my $self = shift;

  # Only if we don't have it cached
  # Only if we have left and right and it's not a leaf
  # Only if it's for release clusterset (1 genetrees - 0 genomic align trees)
  if (!defined($self->{'_parent_link'}) and $self->adaptor 
      and ($self->right_index-$self->left_index)>1
      and (defined $self->{'_parent_id'})
      and (1==$self->{'_parent_id'})
     ) {
    return $self->adaptor->fetch_root_by_node($self);
  }

  # Otherwise, go through the step-by-step method
  return $self unless(defined($self->parent));
 #  return $self if($self->node_id eq $self->parent->node_id);
  return $self->parent->root;
}

sub subroot {
  deprecate('subroot() should not be used and will be removed in release 70.
  If you are using it, please contact the dev mailing-list dev@ensembl.org');
  my $self = shift;

  return undef unless($self->parent);
  return $self unless(defined($self->parent->parent));
  return $self->parent->subroot;
}


=head2 children

  Overview   : returns a list of NestedSet nodes directly under this parent node
  Example    : my @children = @{$object->children()};
  Returntype : array reference of Bio::EnsEMBL::Compara::NestedSet objects (could be empty)
  Exceptions : none
  Caller     : general
  Algorithm  : new algorithm for fetching children:
                for each link connected to this NestedsSet node, a child is defined if
                  old: the link is not my parent_link
                  new: the link's neighbors' parent_link is the link
               This allows one (with a carefully coded algorithm) to overlay a tree on top
               of a fully connected graph and use the parent/children methods of NestedSet
               to walk the 'tree' substructure of the graph.  
               Trees that are really just trees are still trees.

=cut

sub children {
  my $self = shift;
  $self->load_children_if_needed;
  my @kids;
  foreach my $link (@{$self->links}) {
    next unless(defined($link));
    my $neighbor = $link->get_neighbor($self);
    my $parent_link = $neighbor->parent_link;
    next unless($parent_link);
    next unless($parent_link eq $link);
    push @kids, $neighbor;
  }
  return \@kids;
}

sub each_child {
    my $self = shift;

    # Return an iterator over the children (most effective when children list is LONG)
    my $count = -1;
    $self->load_children_if_needed;
    my @links = @{$self->links};

    return sub {
	while ($count < scalar(@links)) {
	    $count++;
	    my $link = $links[$count];
	    next unless(defined $link);

	    my $neighbor = $link->get_neighbor($self);
	    next unless($neighbor->parent_link);
	    next unless($neighbor->parent_link->equals($link));
	    return $neighbor;
	}
	return undef;
    };
}

=head2 sorted_children

  Overview   : returns a sorted list of NestedSet nodes directly under this parent node
               sort so that internal nodes<leaves and then on distance
  Example    : my @kids = @{$object->ordered_children()};
  Returntype : array reference of Bio::EnsEMBL::Compara::NestedSet objects (could be empty)
  Exceptions : none
  Caller     : general

=cut

sub sorted_children {
  my $self = shift;

  my @sortedkids = 
     sort { $a->is_leaf <=> $b->is_leaf     
                     ||
            $a->get_child_count <=> $b->get_child_count         
                     ||
            $a->distance_to_parent <=> $b->distance_to_parent
          }  @{$self->children;};
  return \@sortedkids;
}


=head2 get_all_nodes

  Arg 1       : hashref $node_hash [used for recursivity, do not use it!]
  Example     : my $all_nodes = $root->get_all_nodes();
  Description : Returns this and all underlying sub nodes
  ReturnType  : listref of Bio::EnsEMBL::Compara::NestedSet objects
  Exceptions  : none
  Caller      : general
  Status      : Stable

=cut

sub get_all_nodes {
  my $self = shift;
  my $node_hash = shift;

  my $toplevel = 0;
  unless($node_hash) {
   $node_hash = {};
   $toplevel =1;
  }

  $node_hash->{$self->obj_id} = $self; 
  foreach my $child (@{$self->children}) {
    $child->get_all_nodes($node_hash);
  }

  if ($toplevel) {
    return [values(%$node_hash)];
  }
  return undef;
}


=head2 get_all_nodes_by_tag_value

  Arg 1       : tag_name
  Arg 2       : tag_value (optional)
  Example     : my $all_nodes = $root->get_all_nodes_by_tagvalue('taxon_name'=>'Mamalia');
  Description : Returns all underlying nodes that have a tag of the given name, and optionally a value of the given value.
  ReturnType  : listref of Bio::EnsEMBL::Compara::NestedSet objects
  Exceptions  : none
  Caller      : general
  Status      : 

=cut

sub get_all_nodes_by_tag_value {
  my $self  = shift;
  my $tag   = shift || die( "Need a tag name" );
  my $value = shift;
  my @found;
  foreach my $node( @{$self->get_all_nodes} ){
    my $tval = $node->get_tagvalue($tag);
    if( defined $tval and $value ? $tval eq $value : 1 ){
      push @found, $node;
    }
  }
  return [@found];
}


=head2 get_all_subnodes

  Arg 1       : hashref $node_hash [used for recursivity, do not use it!]
  Example     : my $all_nodes = $root->get_all_nodes();
  Description : Returns all underlying sub nodes
  ReturnType  : listref of Bio::EnsEMBL::Compara::NestedSet objects
  Exceptions  : none
  Caller      : general
  Status      : Stable

=cut

sub get_all_subnodes {
  my $self = shift;
  my $node_hash = shift;

  my $toplevel = 0;
  unless($node_hash) {
   $node_hash = {};
   $toplevel =1;
  }

  foreach my $child (@{$self->children}) {
    $node_hash->{$child->obj_id} = $child; 
    $child->get_all_subnodes($node_hash);
  }
  return values(%$node_hash) if($toplevel);
  return undef;
}

=head2 get_all_ancestors

  Arg 1       : 
  Example     : my @ancestors = @{$node->get_all_ancestors};
  Description : Returns all ancestor nodes for a given node
  ReturnType  : listref of Bio::EnsEMBL::Compara::NestedSet objects
  Exceptions  : none
  Caller      : general
  Status      :

=cut

sub get_all_ancestors {
  my $self = shift;
  my $this = $self;
  my @ancestors;
  while( $this = $this->parent ){
    push @ancestors, $this;
  }
  return [@ancestors]
}

=head2 get_all_adjacent_subtrees

  Arg 1       : 
  Example     : my @subtrees = @{$node->get_all_adjacent_subtrees};
  Description : Returns subtree 'root' nodes where the subtree is adjacent
                to this node. Used e.g. by the web code for the 'collapse 
                other nodes' action 
  ReturnType  : listref of Bio::EnsEMBL::Compara::NestedSet objects
  Exceptions  : none
  Caller      : EnsEMBL::Web::Component::Gene::ComparaTree
  Status      :

=cut

sub get_all_adjacent_subtrees {
  my $self = shift;
  my $node_id = $self->node_id;
  my @node_path_to_root = ($self, @{$self->get_all_ancestors} );
  my %path_node_ids = map{ $_->node_id => 1 } @node_path_to_root;

  my $this = $self->root; # Start at the root node

  my @adjacent_subtrees;
  while( $this ){
    last if $this->node_id == $node_id; # Stop on reaching current node
    my $next;
    foreach my $child (@{$this->children}){
      next if $child->is_leaf; # Leaves cannot be subtrees
      if( $path_node_ids{$child->node_id} ){ # Ancestor node
        $next = $child;
      } else {
        push @adjacent_subtrees, $child;
      }
    }
    $this = $next || undef;
  }

  return [@adjacent_subtrees]
}


=head2 num_leaves

  Example     : my $num_leaves = $node->num_leaves
  Description : Returns the number of leaves underlying the node
  ReturnType  : integer
  Exceptions  : none
  Caller      : general
  Status      : At risk (relies on left and right indexes)

=cut
#'
sub num_leaves {
   my $self = shift;

   my $left = $self->left_index;
   my $right = $self->right_index;

   return unless( $left && $right );

   my $num = $right - $left + 1;
   my $num_leaves = ( ($num/2) + 1 ) / 2;

   return $num_leaves;
}


sub get_child_count {
  my $self = shift;
  $self->load_children_if_needed;
  return scalar @{$self->children};
#  my $count = $self->link_count;
#  $count-- if($self->has_parent);
#  return $count;
}

sub load_children_if_needed {
  my $self = shift;
  if(!defined($self->{'_children_loaded'}) and $self->adaptor) {
    #define _children_id_hash thereby signally that I've tried to load my children
    $self->{'_children_loaded'} = 1; 
    #print("load_children_if_needed : "); $self->print_node;
    $self->adaptor->fetch_all_children_for_node($self);
  }
}

sub no_autoload_children {
  my $self = shift;

  return if($self->{'_children_loaded'});
  $self->{'_children_loaded'} = 1;
}


=head2 distance_to_parent

  Arg [1]    : (opt.) <int or double> distance
  Example    : my $dist = $object->distance_to_parent();
  Example    : $object->distance_to_parent(1.618);
  Description: Getter/Setter for the distance between this child and its parent
  Returntype : integer node_id
  Exceptions : none
  Caller     : general

=cut

sub distance_to_parent {
  my $self = shift;
  my $dist = shift;

  if($self->{'_parent_link'}) {
    if(defined($dist)) { $self->{'_parent_link'}->distance_between($dist); }
    else { $dist = $self->{'_parent_link'}->distance_between; }
  } else {
    if(defined($dist)) { $self->_distance($dist); }
    else { $dist = $self->_distance; } 
  }
  return $dist;
}

sub _distance  {
  my $self = shift;
  $self->{'_distance_to_parent'} = shift if(@_);
  $self->{'_distance_to_parent'} = 0.0 unless(defined($self->{'_distance_to_parent'}));
  return $self->{'_distance_to_parent'};
}

sub distance_to_root {
  my $self = shift;
  my $dist = $self->distance_to_parent;
  $dist += $self->parent->distance_to_root if($self->parent);
  return $dist;
}


=head2 distance_to_ancestor

  Arg [1]     : Bio::EnsEMBL::Compara::NestedSet $ancestor
  Example     : my $distance = $this_node->distance_to_ancestor($ancestor);
  Description : Calculates the distance in the tree between this node and
                its ancestor $ancestor
  Returntype  : float
  Exceptions  : throws if $ancestor is not an ancestor of this node.
  Caller      : general
  Status      : Stable

=cut

sub distance_to_ancestor {
  my $self = shift;
  my $ancestor = shift;

  if ($ancestor->node_id eq $self->node_id) {
    return 0;
  }
  unless (defined $self->parent) {
    throw("Ancestor not found\n");
  }
  return $self->distance_to_parent + $self->parent->distance_to_ancestor($ancestor);
}


=head2 distance_to_node

  Arg [1]     : Bio::EnsEMBL::Compara::NestedSet $node
  Example     : my $distance = $this_node->distance_to_node($other_node);
  Description : Calculates the distance in the tree between these
                two nodes.
  Returntype  : float
  Exceptions  : returns undef if no ancestor can be found, no distances are
                defined in the tree, etc.
  Caller      : general
  Status      : Stable

=cut

sub distance_to_node {
  my $self = shift;
  my $node = shift;

  my $ancestor = $self->find_first_shared_ancestor($node);
  if (!$ancestor) {
    return undef;
  }
  my $distance = $self->distance_to_ancestor($ancestor);
  $distance += $node->distance_to_ancestor($ancestor);

  return $distance;
}


# Returns a TreeI-compliant object based on this NestedSet.
sub get_TreeI {
    my $self = shift;
    my $newick = $self->newick_format();

    open(my $fake_fh, "+<", \$newick);
    my $treein = new Bio::TreeIO(-fh => $fake_fh, -format => 'newick');
    my $treeI = $treein->next_tree;
    $treein->close;

    return $treeI;
}

sub new_from_newick {
    my $class = shift;
    my $file = shift;
    my $treein = new Bio::TreeIO(-file => $file, -format => 'newick');
    my $treeI = $treein->next_tree;
    $treein->close;

    return $class->new_from_TreeI($treeI);
}

sub new_from_TreeI {
    my $class = shift;
    my $treeI = shift;

    my $rootI = $treeI->get_root_node;
    my $node = new $class;

    # Kick off the recursive, parallel node adding.
    _add_nodeI_to_node($node,$rootI);

    return $node;
}

# Recursive helper for new_from_TreeI.
sub _add_nodeI_to_node {
    my $node = shift; # Our node object (Compara)
    my $nodeI = shift; # Our nodeI object (BioPerl)

    foreach my $c ($nodeI->each_Descendent) {
	my $child = ref($node)->new;

	my $name = $c->id || '';
	$name =~ s/^\s+//;
	$name =~ s/\s+$//;

	# Set name.
	$child->name($name);

	# Set branch length.
	$node->add_child($child,$c->branch_length);

	# Recurse.
	_add_nodeI_to_node($child,$c);
    }
}

=head2 print_tree

  Arg [1]     : int $scale
  Example     : $this_node->print_tree(100);
  Description : Prints this tree in ASCII format. The scale is used to define
                the width of the tree in the output
  Returntype  : undef
  Exceptions  :
  Caller      : general
  Status      : At risk (as the output might change)

=cut

sub print_tree {
  my $self  = shift;
  my $scale = shift;

  $scale = 100 unless($scale);
  $self->_internal_print_tree(undef, 0, $scale);
}

sub string_tree {
  my $self  = shift;
  my $scale = shift;
     $scale ||= 100;
  my $buffer = '';
  $self->_internal_string_tree(undef, 0, $scale, \$buffer);
  return $buffer;
}

sub _internal_string_tree {
  my $self    = shift;
  my $indent  = shift;
  my $lastone = shift;
  my $scale   = shift;
  my $buffer  = shift;

  if(defined($indent)) {
    $$buffer .= $indent;
    for(my $i=0; $i<$self->distance_to_parent()*$scale; $i++) { $$buffer .= '-'; }
  }

  $$buffer .= $self->string_node($indent);

  if(defined($indent)) {
    if($lastone) {
      chop($indent);
      $indent .= " ";
    }
    for(my $i=0; $i<$self->distance_to_parent()*$scale; $i++) { $indent .= ' '; }
  }
  $indent = '' unless(defined($indent));
  $indent .= "|";

  my $children = $self->sorted_children;
  my $count=0;
  $lastone = 0;
  foreach my $child_node (@$children) {
    $count++;
    $lastone = 1 if($count == scalar(@$children));
    $child_node->_internal_string_tree($indent,$lastone,$scale,$buffer);
  }
}

sub _internal_print_tree {
  my $self  = shift;
  my $indent = shift;
  my $lastone = shift;
  my $scale = shift; 

  if(defined($indent)) {
    print($indent);
    for(my $i=0; $i<$self->distance_to_parent()*$scale; $i++) { print('-'); }
  }
  
  $self->print_node($indent);

  if(defined($indent)) {
    if($lastone) {
      chop($indent);
      $indent .= " ";
    }
    for(my $i=0; $i<$self->distance_to_parent()*$scale; $i++) { $indent .= ' '; }
  }
  $indent = '' unless(defined($indent));
  $indent .= "|";

  my $children = $self->sorted_children;
  my $count=0;
  $lastone = 0;
  foreach my $child_node (@$children) {  
    $count++;
    $lastone = 1 if($count == scalar(@$children));
    $child_node->_internal_print_tree($indent,$lastone,$scale);
  }
}


sub print_node {
  my $self = shift;
  print $self->string_node;
}

sub string_node {
    my $self  = shift;
    my $str = '(';

    my $isdup = 0;
    $isdup = 1 if ($self->get_tagvalue('Duplication', 0) > 0 and not $self->get_tagvalue('dubious_duplication', 0));
    $isdup = 1 if $self->get_tagvalue('node_type', '') eq 'duplication';

    my $isdub = ($self->get_tagvalue('node_type', '') eq 'dubious');

    if ($isdup) {
        my $taxon_name = $self->get_tagvalue('taxon_name', '');
        if ($taxon_name =~ /\S+\ \S+/) {
            $str .= "Dup ";
        } else {
            $str .= "DUP ";
        }
       my $sis = $self->get_tagvalue('duplication_confidence_score', 0) * 100;
       $str .= sprintf('SIS=%d ', $sis);
    } elsif ($isdub) {
        $str .= "DD  ";
       $str .= 'SIS=0 ';
    }
    if($self->has_tag("bootstrap")) { my $bootstrap_value = $self->get_tagvalue("bootstrap"); $str .= "B=$bootstrap_value "; }
    if($self->has_tag("taxon_name")) { my $taxon_name_value = $self->get_tagvalue("taxon_name"); $str .="T=$taxon_name_value "; }
    $str .= sprintf("%s %d,%d)", $self->node_id, $self->left_index, $self->right_index);
    $str .= sprintf("%s\n", $self->name || '');
    return $str;
}


=head2 newick_format

  Arg [1]     : string $format_mode
  Example     : $this_node->newick_format("full");
  Description : Prints this tree in Newick format. Several modes are
                available: full, display_label_composite, simple, species,
                species_short_name, ncbi_taxon, ncbi_name, njtree and phylip
  Returntype  : string
  Exceptions  :
  Caller      : general
  Status      : Stable

=cut

my %ryo_modes = (
    'member_id' => '%{^-m}:%{d}',
    'member_id_taxon_id' => '%{-m}%{"_"-x}:%{d}',
    'display_label_composite' => '%{-l"_"}%{n}%{"_"-s}:%{d}',
    'full_common' => '%{n}%{" "-c.^}%{"."-g}%{"_"-t"_MYA"}:%{d}',
    'gene_stable_id_composite' => '%{-i"_"}%{n}%{"_"-s}:%{d}',
    'gene_stable_id' => '%{-i}:%{d}',
    'ncbi_taxon' => '%{o}',
    'ncbi_name' => '%{n}',
    'simple' => '%{^-n}:%{d}',
    'full' => '%{n}:%{d}',
    'species' => '%{^-S|p}',
    'species_short_name' => '%{^-s|p}',
    'otu_id' => '%{-s"|"}%{-l"|"}%{n}:%{d}',
    'int_node_id' => '%{-n}%{o-}:%{d}',
    'full_web' => '%{n-}%{-n|p}%{"_"-s"_"}%{":"d}',
    'phylip' => '%21{n,}:%{d}',
    'njtree' => '%{o}%{-T(is_incomplete)|E"*"}%{-T(is_incomplete,0,*)}',
);

my $nhx0 = '%{n-_|T(taxon_name)}:%{d}';
my $nhx1 = ':D=%{-E"N"}%{T(node_type,duplication,Y)-}%{T(node_type,dubious,Y)-}%{T(node_type,gene_split,Y)-}%{T(node_type,speciation,N)}%{":B="T(bootstrap)}';
my $nhx2 = ':T=%{-x}%{T(taxon_id)-}';

my %nhx_ryo_modes_1 = (
    'member_id_taxon_id' => '%{-m}%{o-}_%{-x}%{T(taxon_id)-}:%{d}',
    'protein_id' => '%{-n}'.$nhx0,
    'transcript_id' => '%{-r}'.$nhx0,
    'gene_id' => '%{-i}'.$nhx0,
    'full' => $nhx0,
    'full_web' => $nhx0,
    'display_label' => '%{-L|i}%{"_"-s}'.$nhx0,
    'display_label_composite' => '%{-L"_"}%{-i}%{"_"-s}'.$nhx0,
    'treebest_ortho' => '%{-m}%{"_"-x}'.$nhx0,
    'simple' => $ryo_modes{'simple'},
    'phylip' => $ryo_modes{'phylip'},
);

my %nhx_ryo_modes_2 = (
    'member_id_taxon_id' => $nhx1.$nhx2,
    'protein_id' => $nhx1.'%{":G="-i}'.$nhx2,
    'transcript_id' => $nhx1.'%{":G="-i}'.$nhx2,
    'gene_id' => $nhx1.'%{":G="-r}'.$nhx2,
    'full' => $nhx1.$nhx2,
    'full_web' => $nhx1.$nhx2,
    'display_label' => $nhx1.$nhx2,
    'display_label_composite' => $nhx1.$nhx2,
    'treebest_ortho' => $nhx1.$nhx2.':S=%{-x}%{T(taxon_id)-}',
);


sub newick_format {
    my $self = shift;
    my $format_mode = shift;

    my $ryo_string;

    if (not defined $format_mode) {
        $ryo_string = $ryo_modes{'full'};

    } elsif ($format_mode eq "ryo") {
        $ryo_string = shift @_;

    } elsif (defined $ryo_modes{$format_mode}) {
        $ryo_string = $ryo_modes{$format_mode};

    } else {
        throw("Unrecognized format '$format_mode'. Please use 'ryo' to introduce a roll-your-own format string\n");
    }
    return $self->_internal_newick_format_ryo($ryo_string);
}

sub nhx_format {
    my ($self, $format_mode) = @_;
    my $ryo_string1;
    my $ryo_string2;

    if (not defined $format_mode) {
        $ryo_string1 = $nhx_ryo_modes_1{'protein_id'};
        $ryo_string2 = $nhx_ryo_modes_2{'protein_id'};

    } elsif ($format_mode eq "ryo") {
        $ryo_string1 = shift @_;
        $ryo_string2 = shift @_;

    } elsif (defined $nhx_ryo_modes_1{$format_mode}) {
        $ryo_string1 = $nhx_ryo_modes_1{$format_mode};
        $ryo_string2 = $nhx_ryo_modes_2{$format_mode};

    } else {
        throw("Unrecognized format '$format_mode'. Please use 'ryo' to introduce a roll-your-own format string\n");
    }
    my $fmt = $ryo_string1;
    $fmt = $ryo_string1.'[&&NHX'.$ryo_string2.']' if defined $ryo_string2;
    return $self->_internal_newick_format_ryo($fmt);
}

sub _internal_newick_format_ryo {
    my ($self, $ryo_string) = @_;
    my $newick_str;
    eval {
        use Bio::EnsEMBL::Compara::FormatTree;
        my $ryo_formatter = Bio::EnsEMBL::Compara::FormatTree->new($ryo_string);
        $newick_str = $ryo_formatter->format_newick($self);
    };
    if ($@) {
        throw("Something bad happened while trying to stringify the tree: $@\n");
    }
    return "$newick_str;";
}

=head2 newick_simple_format

    DEPRECATED. Use newick_format("simple") instead

=cut

sub newick_simple_format {
  my $self = shift;
  deprecate('Use newick_format("simple") instead.');
  return $self->newick_format('simple'); 
}


##################################
#
# Set theory methods
#
##################################

#sub equals {
#  my $self = shift;
#  my $other = shift;
#  throw("arg must be a [Bio::EnsEMBL::Compara::NestedSet] not a [$other]")
#        unless($other->isa('Bio::EnsEMBL::Compara::NestedSet'));
#  return 1 if($self->node_id eq $other->node_id);
#  foreach my $child (@{$self->children}) {
#    return 0 unless($other->has_child($child));
#  }
#  return 1;
#}

sub has_child {
  my $self = shift;
  my $child = shift;
  throw("arg must be a [Bio::EnsEMBL::Compara::NestedSet] not a [$child]")
        unless($child->isa('Bio::EnsEMBL::Compara::NestedSet'));
  $self->load_children_if_needed;
  my $link = $self->link_for_neighbor($child);
  return 0 unless($link);
  return 0 if($self->{'_parent_link'} and ($self->{'_parent_link'}->equals($link)));
  return 1;
}

sub is_member_of {
  my $A = shift;
  my $B = shift;
  return 1 if($B->has_child($A));
  return 0; 
}

sub is_subset_of {
  my $A = shift;
  my $B = shift;
  foreach my $child (@{$A->children}) {
    return 0 unless($B->has_child($child));
  }
  return 1; 
}

sub is_leaf {
  my $self = shift;
  return 1 unless($self->get_child_count);
  return 0;
}

sub merge_children {
  my $self = shift;
  my $nset = shift;
  throw("arg must be a [Bio::EnsEMBL::Compara::NestedSet] not a [$nset]")
        unless($nset->isa('Bio::EnsEMBL::Compara::NestedSet'));
  foreach my $child_node (@{$nset->children}) {
    $self->add_child($child_node, $child_node->distance_to_parent);
  }
  return $self;
}

sub merge_node_via_shared_ancestor {
  my $self = shift;
  my $node = shift;

  my $node_dup = $self->find_node_by_node_id($node->node_id);
  if($node_dup) {
    #warn("trying to merge in a node with already exists\n");
    return $node_dup;
  }
  return undef unless($node->parent);
  
  my $ancestor = $self->find_node_by_node_id($node->parent->node_id);
  if($ancestor) {
    $ancestor->add_child($node);
    #print("common ancestor at : "); $ancestor->print_node;
    return $ancestor;
  }
  return $self->merge_node_via_shared_ancestor($node->parent);
}


sub extract_subtree_from_leaves {
    my $self = shift;
    my $copy = $self->copy;
    my $node_ids = shift;	# Array ref of node_ids.
    my @keepers = @{$node_ids};
    my @all = @{$copy->get_all_nodes};

    # Add all ancestors of kept nodes to the keep list.
    my @all_keepers = ();
    foreach my $keeper (@keepers) {
	my $node = $copy->find_node_by_node_id($keeper);
	push @all_keepers, $keeper;

	my $parent = $node->parent;
	while (defined $parent) {
	    push @all_keepers, $parent->node_id;
	    $parent = $parent->parent;
	}
    }

    my @remove_me = ();
    foreach my $node (@all) {
	push @remove_me, $node unless (grep {$node->node_id == $_} @all_keepers);
    }
    $copy->remove_nodes(\@remove_me);
    return $copy;
}


##################################
#
# nested_set manipulations
#
##################################


=head2 flatten_tree

  Overview   : Removes all internal nodes and attaches leaves to the tree root, creating
               a "flattened" star tree structure.
  Example    : $node->flatten_tree();
  Returntype : undef or Bio::EnsEMBL::Compara::NestedSet
  Exceptions : none
  Caller     : general

=cut

sub flatten_tree {
  my $self = shift;
  
  my $leaves = $self->get_all_leaves;
  foreach my $leaf (@{$leaves}) { 
    $leaf->disavow_parent;
  }

  $self->release_children;
  foreach my $leaf (@{$leaves}) {
    $self->add_child($leaf, 0.0);
  }
  
  return $self;
}

=head2 re_root

  Overview   : rearranges the tree structure so that the root is moved to 
               beetween this node and its parent.  If the old root was more than
	       bifurcated (2 children) a new node is created where it was to hold
	       the multiple children that arises from the re-rooting.  
	       The old root is returned.
  Example    : $node->re_root();
  Returntype : undef or Bio::EnsEMBL::Compara::NestedSet
  Exceptions : none
  Caller     : general

=cut

sub re_root {
  my $self = shift;
  
  return $self unless($self->parent); #I'm root so just return self

  my $root = $self->root;
  my $tmp_root = new Bio::EnsEMBL::Compara::NestedSet;
  $tmp_root->merge_children($root);
    
  my $parent = $self->parent;
  my $dist = $self->distance_to_parent;
  $self->disavow_parent;

  my $old_root = $parent->_invert_tree_above;
  $old_root->minimize_node;
  
  $root->add_child($parent, $dist / 2.0);
  $root->add_child($self, $dist / 2.0);
  
  return $root;
}


sub _invert_tree_above {
  my $self = shift;
  return $self unless($self->parent);
  
  my $old_root =  $self->parent->_invert_tree_above;
  #now my parent has been inverted so it is the new root
  
  #flip the direction of the link between myself and my parent
  $self->parent->_set_parent_link($self->{'_parent_link'});
  $self->_set_parent_link(undef);
  
  #now I'm the new root and the old root might need to be modified
  return $old_root;
}


sub build_leftright_indexing {
  my $self = shift;
  my $counter = shift;
  
  $counter = 1 unless($counter);
  
  $self->left_index($counter++);
  foreach my $child_node (@{$self->sorted_children}) {
    $counter = $child_node->build_leftright_indexing($counter);
  }
  $self->right_index($counter++);
  return $counter;
}


=head2 remove_nodes

  Arg [1]     : arrayref Bio::EnsEMBL::Compara::NestedSet $nodes
  Example     : my $ret_tree = $tree->remove_nodes($nodes);
  Description : Returns the tree with removed nodes in list. Nodes should be in the tree.
  Returntype  : Bio::EnsEMBL::Compara::NestedSet object
  Exceptions  :
  Caller      : general
  Status      : At risk (behaviour on exceptions could change)

=cut

sub remove_nodes {
  my $self = shift;
  my $nodes = shift;

  foreach my $node (@$nodes) {
    if ($node->is_leaf) {
      $node->disavow_parent;
      $self = $self->minimize_tree;
    } else {
      my $node_children = $node->children;
      foreach my $child (@$node_children) {
        $node->parent->add_child($child);
      }
      $node->disavow_parent;
    }
    # Delete dangling one-child trees (help memory manager)
    if ($self->get_child_count == 1) {
      my $child = $self->children->[0];
      $child->parent->merge_children($child);
      $child->disavow_parent;
      return undef;
    }
    # Could be zero if all asked to delete, so return undef instead of
    # fake one-node tree.
    if ($self->get_child_count < 2) {
      return undef;
    }
  }
  return $self;
}


=head2 delete_lineage

  Arg [1]     : Bio::EnsEMBL::Compara::NestedSet $node
  Example     : $tree->delete_lineage($node);
  Description : Removes $node from tree. Nodes should be in the tree.
  Returntype  : 
  Exceptions  :
  Caller      : general
  Status      : At risk (behaviour on exceptions could change)

=cut

sub delete_lineage {
  my $self = shift;
  my $del_me = shift;

  throw("arg must be a [Bio::EnsEMBL::Compara::NestedSet] not a [$self]") 
    unless ($self->isa('Bio::EnsEMBL::Compara::NestedSet'));

  my $parent = $del_me->parent;
  while ($parent) {
    my $num_children = scalar @{$parent->children};
    if ($num_children > 1) {
      $self->remove_nodes([$del_me]);
      return $self;
    } elsif ($num_children == 1) {
      $self->remove_nodes([$del_me]);
      $del_me = $parent;
      $parent = $del_me->parent;
    }
  }
  return $self;
}

=head2 minimize_tree

  Arg [1]     : -none-
  Example     : $leaf->disavow_parent();
                $tree = $tree->minimize_tree();
  Description : Returns the tree after removing internal nodes that do not
                represent an multi- or bi-furcation anymore. This is typically
                required after disavowing a node. Please ensure you use the
                object returned by the method and not the original object
                anymore!
  Returntype  : Bio::EnsEMBL::Compara::NestedSet object
  Exceptions  :
  Caller      : general
  Status      : Stable

=cut

sub minimize_tree {
  my $self = shift;
  return $self if($self->is_leaf);
  
  foreach my $child (@{$self->children}) { 
    $child->minimize_tree;
  }
  return $self->minimize_node;
}


sub minimize_node {
  my $self = shift;
  
  return $self unless($self->get_child_count() == 1);
  
  my $child = $self->children->[0];
  my $dist = $child->distance_to_parent + $self->distance_to_parent;
  if ($self->parent) {
     $self->parent->add_child($child, $dist); 
     $self->disavow_parent;
  } else {
     $child->disavow_parent;
  }
  return $child
}


sub scale {
  my $self = shift;
  my $scale = shift;

  foreach my $node (@{$self->get_all_nodes}) {
    my $bl = $node->distance_to_parent;
    $bl = 0 unless (defined $bl);
    $node->distance_to_parent($bl*$scale);
  }
  return $self;
}


sub scale_max_to {
  my $self = shift;
  my $new_max = shift;

  my $max_dist = $self->max_distance;
  my $scale_factor = $new_max / $max_dist;
  return $self->scale($scale_factor);
}



##################################
#
# search methods
#
##################################

sub find_node_by_name {
  my $self = shift;
  my $name = shift;
  
  return $self if((defined $self->name) && $name eq $self->name);
  
  my $children = $self->children;
  foreach my $child_node (@$children) {
    my $found = $child_node->find_node_by_name($name);
    return $found if(defined($found));
  }
  
  return undef;
}

sub find_node_by_node_id {
  my $self = shift;
  my $node_id = shift;
  
  return $self if($node_id eq $self->node_id);
  
  my $children = $self->children;
  foreach my $child_node (@$children) {
    my $found = $child_node->find_node_by_node_id($node_id);
    return $found if(defined($found));
  }
  
  return undef;
}

sub find_leaf_by_name {
  my $self = shift;
  my $name = shift;

  return $self if((defined $self->name) and ($name eq $self->name));

  my $leaves = $self->get_all_leaves;
  foreach my $leaf (@$leaves) {
    return $leaf if((defined $leaf->name) and ($name eq $leaf->name));
  }

  return undef;
}

sub find_leaf_by_node_id {
  my $self = shift;
  my $node_id = shift;

  return $self if($node_id eq $self->node_id);

  my $leaves = $self->get_all_leaves;
  foreach my $leaf (@$leaves) {
    return $leaf if($node_id eq $leaf->node_id);
  }

  return undef;
}


=head2 get_all_sorted_leaves

  Arg [1]     : Bio::EnsEMBL::Compara::NestedSet $top_leaf
  Arg [...]   : (optional) Bio::EnsEMBL::Compara::NestedSet $secondary_priority_leaf
  Example     : my $sorted_leaves = $object->get_all_sorted_leaves($human_leaf);
  Example     : my $sorted_leaves = $object->get_all_sorted_leaves($human_leaf, $mouse_leaf);
  Description : Sorts the tree such as $top_leaf is the first leave and returns
                all the other leaves in the order defined by the tree.
                It is possible to define as many secondary top leaves as you require
                to sort other branches of the tree. The priority to sort the trees
                is defined by the order in which you specify the leaves.
  Returntype  : listref of Bio::EnsEMBL::Compara::NestedSet (all sorted leaves)
  Exceptions  : none
  Caller      : general
  Status      : Stable

=cut

sub get_all_sorted_leaves {
  my ($self, @priority_leaves) = @_;

  if (!@priority_leaves) {
    return $self->get_all_leaves;
  }

  # Assign priority scores for all parent nodes of the priority leaves
  my $score_by_node;
  my $score = 0;
  # Loop through all the priority leaves, starting from the last one (lowest score)
  while (my $priority_leaf = pop @priority_leaves) {
    $score++; # Increases the score, next priority leaves (earlier in the argument list) will overwrite the score if needed
    my $this_node = $priority_leaf;
    # Loop through all the parent node up to the root of the tree
    do {
      $score_by_node->{$this_node} = $score;
      $this_node = $this_node->parent;
    } while ($this_node);
  }

  my $sorted_leaves = $self->_recursive_get_all_sorted_leaves($score_by_node);

  return $sorted_leaves;
}

=head2 _recursive_get_all_sorted_leaves

  Arg [1]     : hashref $score_by_node
  Example     : my $sorted_leaves = $object->_recursive_get_all_sorted_leaves($score_by_node);
  Description : Recursive code for the get_all_sorted_leaves() method
  Returntype  : listref of Bio::EnsEMBL::Compara::NestedSet (sorted leaves)
  Exceptions  : none
  Caller      : private
  Status      : Stable

=cut

sub _recursive_get_all_sorted_leaves {
  my $self = shift;
  my $score_by_node = shift;

  my $sorted_leaves = [];
  my $children = $self->children;

  if (@$children == 0) {
    $sorted_leaves = [$self];
  } else {
    $children = [sort {
        ($score_by_node->{$b} || $score_by_node->{$a}) ? (($score_by_node->{$b} || 0)<=>($score_by_node->{$a} || 0)) : ($a->node_id <=> $b->node_id)
      } @$children];
    for (my $i = 0; $i < @$children; $i++) {
      push(@$sorted_leaves, @{$children->[$i]->_recursive_get_all_sorted_leaves($score_by_node)});
    }
  }

  return $sorted_leaves;
}

=head2 get_all_leaves

 Title   : get_all_leaves
 Usage   : my @leaves = @{$tree->get_all_leaves};
 Function: searching from the given starting node, searches and creates list
           of all leaves in this subtree and returns by reference
 Example :
 Returns : reference to list of NestedSet objects (all leaves)
 Args    : none

=cut

sub get_all_leaves {
  my $self = shift;
  
  my $leaves = {};
  $self->_recursive_get_all_leaves($leaves);
  my @leaf_list = sort {$a->node_id <=> $b->node_id} values(%{$leaves});
  return \@leaf_list;
}

sub _recursive_get_all_leaves {
  my $self = shift;
  my $leaves = shift;
    
  $leaves->{$self->obj_id} = $self if($self->is_leaf);

  foreach my $child (@{$self->children}) {
     no warnings 'recursion';
     $child->_recursive_get_all_leaves($leaves);
  }
}


sub get_all_leaves_indexed {
  my $self = shift;
  deprecate("Use Bio::EnsEMBL::Compara::DBSQL::GeneTreeNodeAdaptor->fetch_all_leaves_indexed() method instead");

  my @leaf_list = @{$self->adaptor->fetch_all_leaves_indexed($self)};

  return \@leaf_list;
}

=head2 max_distance

 Title   : max_distance
 Args    : none
 Usage   : $tree_node->max_distance;
 Function: searching from the given starting node, calculates the maximum distance to a leaf
 Returns : int

=cut

sub max_distance {
  my $self = shift;

  my $max_distance = 0;
  
  foreach my $child (@{$self->children}) {
    my $distance = $child->max_distance;
    $max_distance = $distance if($distance>$max_distance);
  }

  return ($self->distance_to_parent + $max_distance);
}


=head2 max_depth

 Title   : max_depth
 Args    : none
 Usage   : $tree_node->max_depth;
 Function: searching from the given starting node, calculates the maximum depth to a leaf
 Returns : int

=cut

sub max_depth {
  my $self = shift;

  my $max_depth = 0;
  
  foreach my $child (@{$self->children}) {
    my $depth = $child->max_depth + 1;
    $max_depth=$depth if($depth>$max_depth);
  }
  return $max_depth;  
}


=head2 find_first_shared_ancestor

  Arg [1]     : Bio::EnsEMBL::Compara::NestedSet $node
  Example     : my $ancestor = $this_node->find_first_shared_ancestor($other_node);
  Description : Gets the first common ancestor between this node and the other one.
  Returntype  : Bio::EnsEMBL::Compara::NestedSet object
  Exceptions  :
  Caller      : general
  Status      : Stable

=cut

sub find_first_shared_ancestor {
  my $self = shift;
  my $node = shift;

  return $self if($self->equals($node));
  return $node if($self->has_ancestor($node));
  return $self->find_first_shared_ancestor($node->parent);
}


sub find_first_shared_ancestor_from_leaves {
  my $self = shift;
  my $leaf_list = shift;

  my @leaves = @{$leaf_list};

  my $ancestor = shift @leaves;
  while (scalar @leaves > 0) {
    my $node = shift @leaves;
    $ancestor = $ancestor->find_first_shared_ancestor($node);
  }
  return $ancestor;
}


##################################
#
# developer/adaptor API methods
#
##################################


# used for building tree from a DB fetch, want to restrict users to create trees
# by only -add_child method
sub _set_parent_link {
  my ($self, $link) = @_;
  
  $self->{'_parent_id'} = 0;
  $self->{'_parent_link'} = $link;
  $self->{'_parent_id'} = $link->get_neighbor($self)->node_id if($link);
  return $self;
}


# used for building tree from a DB fetch until all the objects are in memory
sub _parent_id {
  my $self = shift;
  $self->{'_parent_id'} = shift if(@_);
  return $self->{'_parent_id'};
}

# used for building tree from a DB fetch until all the objects are in memory
sub _root_id {
  my $self = shift;
  $self->{'_root_id'} = shift if(@_);
  return $self->{'_root_id'};
}

1;