Mercurial > repos > willmclaren > ensembl_vep
diff variant_effect_predictor/Bio/EnsEMBL/Compara/NestedSet.pm @ 0:21066c0abaf5 draft
Uploaded
author | willmclaren |
---|---|
date | Fri, 03 Aug 2012 10:04:48 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Compara/NestedSet.pm Fri Aug 03 10:04:48 2012 -0400 @@ -0,0 +1,1687 @@ +=head1 NAME + +NestedSet - DESCRIPTION of Object + +=head1 SYNOPSIS + +=head1 DESCRIPTION + +Abstract superclass to encapsulate the process of storing and manipulating a +nested-set representation tree. Also implements a 'reference count' system +based on the ObjectiveC retain/release design. +Designed to be used as the Root class for all Compara 'proxy' classes +(Member, GenomeDB, DnaFrag, NCBITaxon) to allow them to be made into sets and trees. + +=head1 CONTACT + + Contact Albert Vilella on implementation detail: avilella@ebi.ac.uk + Contact Jessica Severin on implementation/design detail: jessica@ebi.ac.uk + Contact Ewan Birney on EnsEMBL in general: birney@sanger.ac.uk + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + + + +package Bio::EnsEMBL::Compara::NestedSet; + +use strict; +use warnings; +use Bio::EnsEMBL::Utils::Exception; +use Bio::EnsEMBL::Utils::Argument; + +use Bio::EnsEMBL::Utils::Exception qw(deprecate throw); + +use Bio::TreeIO; +use Bio::EnsEMBL::Compara::Graph::Node; +use Bio::EnsEMBL::Compara::Member; +our @ISA = qw(Bio::EnsEMBL::Compara::Graph::Node); + +################################################# +# Factory methods +################################################# + +sub init { + my $self = shift; + $self->SUPER::init; + return $self; +} + +sub dealloc { + my $self = shift; + + #printf("DEALLOC NestedSet refcount:%d ", $self->refcount); $self->print_node; + #$self->release_children; + return $self->SUPER::dealloc; +} + +=head2 copy + + Overview : creates copy of tree starting at this node going down + Example : my $clone = $self->copy; + Returntype : Bio::EnsEMBL::Compara::NestedSet + Exceptions : none + Caller : general + +=cut + +sub copy { + my $self = shift; + + my $mycopy = $self->SUPER::copy; + bless $mycopy, ref $self; + + $mycopy->distance_to_parent($self->distance_to_parent); + $mycopy->left_index($self->left_index); + $mycopy->right_index($self->right_index); + + foreach my $child (@{$self->children}) { + $mycopy->add_child($child->copy); + } + return $mycopy; +} + + +=head2 release_tree + + Overview : deletes and frees the memory used by this tree + and all the underlying nodes. + Example : $self->release_tree; + Returntype : undef + Exceptions : none + Caller : general + +=cut + +sub release_tree { + my $self = shift; + + my $child_count = $self->get_child_count; + $self->disavow_parent; + $self->cascade_unlink if($child_count); + return undef; +} + +################################################# +# Object variable methods +################################################# + +sub left_index { + my $self = shift; + $self->{'_left_index'} = shift if(@_); + $self->{'_left_index'} = 0 unless(defined($self->{'_left_index'})); + return $self->{'_left_index'}; +} + +sub right_index { + my $self = shift; + $self->{'_right_index'} = shift if(@_); + $self->{'_right_index'} = 0 unless(defined($self->{'_right_index'})); + return $self->{'_right_index'}; +} + + +####################################### +# Set manipulation methods +####################################### + +=head2 add_child + + Overview : attaches child nestedset node to this nested set + Arg [1] : Bio::EnsEMBL::Compara::NestedSet $child + Arg [2] : (opt.) distance between this node and child + Example : $self->add_child($child); + Returntype : undef + Exceptions : if child is undef or not a NestedSet subclass + Caller : general + +=cut + +sub add_child { + my $self = shift; + my $child = shift; + my $dist = shift; + + throw("child not defined") + unless(defined($child)); + throw("arg must be a [Bio::EnsEMBL::Compara::NestedSet] not a [$child]") + unless($child->isa('Bio::EnsEMBL::Compara::NestedSet')); + + #printf("add_child: parent(%s) <-> child(%s)\n", $self->node_id, $child->node_id); + + unless(defined($dist)) { $dist = $child->_distance; } + + $child->disavow_parent; + #create_link_to_node is a safe method which checks if connection exists + my $link = $self->create_link_to_node($child); + $child->_set_parent_link($link); + $self->{'_children_loaded'} = 1; + $link->distance_between($dist); + return $link; +} + + +=head2 disavow_parent + + Overview : unlink and release self from its parent + might cause self to delete if refcount reaches Zero. + Example : $self->disavow_parent + Returntype : undef + Caller : general + +=cut + +sub disavow_parent { + my $self = shift; + + if($self->{'_parent_link'}) { + my $link = $self->{'_parent_link'}; + #print("DISAVOW parent : "); $parent->print_node; + #print(" child : "); $self->print_node; + $link->dealloc; + } + $self->_set_parent_link(undef); + return undef; +} + + +=head2 release_children + + Overview : recursive releases all children + will cause potential deletion of children if refcount reaches Zero. + Example : $self->release_children + Returntype : $self + Exceptions : none + Caller : general + +=cut + +sub release_children { + my $self = shift; + + # by calling with parent, this preserved the link to the parent + # and thus doesn't unlink self + foreach my $child (@{$self->children}) { + $child->disavow_parent; + $child->release_children; + } + #$self->cascade_unlink($self->{'_parent_node'}); + return $self; +} + + +=head2 parent + + Overview : returns the parent NestedSet object for this node + Example : my $my_parent = $object->parent(); + Returntype : undef or Bio::EnsEMBL::Compara::NestedSet + Exceptions : none + Caller : general + +=cut + +sub parent { + my $self = shift; + return $self->adaptor->fetch_parent_for_node($self) if defined $self->adaptor; + return $self->{'_parent_link'}->get_neighbor($self) if defined $self->{'_parent_link'}; + return undef; +} + + +sub parent_link { + my $self = shift; + return $self->{'_parent_link'}; +} + +sub has_parent { + my $self = shift; + return 1 if($self->{'_parent_link'} or $self->_parent_id); + return 0; +} + + +sub has_ancestor { + my $self = shift; + my $ancestor = shift; + throw "[$ancestor] must be a Bio::EnsEMBL::Compara::NestedSet object" + unless ($ancestor and $ancestor->isa("Bio::EnsEMBL::Compara::NestedSet")); + my $node = $self->parent; + while($node) { + return 1 if($node->equals($ancestor)); + $node = $node->parent; + } + return 0; +} + + +=head2 root + + Overview : returns the root NestedSet object for this node + returns $self if node has no parent (this is the root) + Example : my $root = $object->root(); + Description: Returns the root of the tree for this node + with links to all the intermediate nodes. Sister nodes + are not included in the result. + Returntype : undef or Bio::EnsEMBL::Compara::NestedSet + Exceptions : none + Caller : general + +=cut + +sub root { + my $self = shift; + + # Only if we don't have it cached + # Only if we have left and right and it's not a leaf + # Only if it's for release clusterset (1 genetrees - 0 genomic align trees) + if (!defined($self->{'_parent_link'}) and $self->adaptor + and ($self->right_index-$self->left_index)>1 + and (defined $self->{'_parent_id'}) + and (1==$self->{'_parent_id'}) + ) { + return $self->adaptor->fetch_root_by_node($self); + } + + # Otherwise, go through the step-by-step method + return $self unless(defined($self->parent)); + # return $self if($self->node_id eq $self->parent->node_id); + return $self->parent->root; +} + +sub subroot { + deprecate('subroot() should not be used and will be removed in release 70. + If you are using it, please contact the dev mailing-list dev@ensembl.org'); + my $self = shift; + + return undef unless($self->parent); + return $self unless(defined($self->parent->parent)); + return $self->parent->subroot; +} + + +=head2 children + + Overview : returns a list of NestedSet nodes directly under this parent node + Example : my @children = @{$object->children()}; + Returntype : array reference of Bio::EnsEMBL::Compara::NestedSet objects (could be empty) + Exceptions : none + Caller : general + Algorithm : new algorithm for fetching children: + for each link connected to this NestedsSet node, a child is defined if + old: the link is not my parent_link + new: the link's neighbors' parent_link is the link + This allows one (with a carefully coded algorithm) to overlay a tree on top + of a fully connected graph and use the parent/children methods of NestedSet + to walk the 'tree' substructure of the graph. + Trees that are really just trees are still trees. + +=cut + +sub children { + my $self = shift; + $self->load_children_if_needed; + my @kids; + foreach my $link (@{$self->links}) { + next unless(defined($link)); + my $neighbor = $link->get_neighbor($self); + my $parent_link = $neighbor->parent_link; + next unless($parent_link); + next unless($parent_link eq $link); + push @kids, $neighbor; + } + return \@kids; +} + +sub each_child { + my $self = shift; + + # Return an iterator over the children (most effective when children list is LONG) + my $count = -1; + $self->load_children_if_needed; + my @links = @{$self->links}; + + return sub { + while ($count < scalar(@links)) { + $count++; + my $link = $links[$count]; + next unless(defined $link); + + my $neighbor = $link->get_neighbor($self); + next unless($neighbor->parent_link); + next unless($neighbor->parent_link->equals($link)); + return $neighbor; + } + return undef; + }; +} + +=head2 sorted_children + + Overview : returns a sorted list of NestedSet nodes directly under this parent node + sort so that internal nodes<leaves and then on distance + Example : my @kids = @{$object->ordered_children()}; + Returntype : array reference of Bio::EnsEMBL::Compara::NestedSet objects (could be empty) + Exceptions : none + Caller : general + +=cut + +sub sorted_children { + my $self = shift; + + my @sortedkids = + sort { $a->is_leaf <=> $b->is_leaf + || + $a->get_child_count <=> $b->get_child_count + || + $a->distance_to_parent <=> $b->distance_to_parent + } @{$self->children;}; + return \@sortedkids; +} + + +=head2 get_all_nodes + + Arg 1 : hashref $node_hash [used for recursivity, do not use it!] + Example : my $all_nodes = $root->get_all_nodes(); + Description : Returns this and all underlying sub nodes + ReturnType : listref of Bio::EnsEMBL::Compara::NestedSet objects + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub get_all_nodes { + my $self = shift; + my $node_hash = shift; + + my $toplevel = 0; + unless($node_hash) { + $node_hash = {}; + $toplevel =1; + } + + $node_hash->{$self->obj_id} = $self; + foreach my $child (@{$self->children}) { + $child->get_all_nodes($node_hash); + } + + if ($toplevel) { + return [values(%$node_hash)]; + } + return undef; +} + + +=head2 get_all_nodes_by_tag_value + + Arg 1 : tag_name + Arg 2 : tag_value (optional) + Example : my $all_nodes = $root->get_all_nodes_by_tagvalue('taxon_name'=>'Mamalia'); + Description : Returns all underlying nodes that have a tag of the given name, and optionally a value of the given value. + ReturnType : listref of Bio::EnsEMBL::Compara::NestedSet objects + Exceptions : none + Caller : general + Status : + +=cut + +sub get_all_nodes_by_tag_value { + my $self = shift; + my $tag = shift || die( "Need a tag name" ); + my $value = shift; + my @found; + foreach my $node( @{$self->get_all_nodes} ){ + my $tval = $node->get_tagvalue($tag); + if( defined $tval and $value ? $tval eq $value : 1 ){ + push @found, $node; + } + } + return [@found]; +} + + +=head2 get_all_subnodes + + Arg 1 : hashref $node_hash [used for recursivity, do not use it!] + Example : my $all_nodes = $root->get_all_nodes(); + Description : Returns all underlying sub nodes + ReturnType : listref of Bio::EnsEMBL::Compara::NestedSet objects + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub get_all_subnodes { + my $self = shift; + my $node_hash = shift; + + my $toplevel = 0; + unless($node_hash) { + $node_hash = {}; + $toplevel =1; + } + + foreach my $child (@{$self->children}) { + $node_hash->{$child->obj_id} = $child; + $child->get_all_subnodes($node_hash); + } + return values(%$node_hash) if($toplevel); + return undef; +} + +=head2 get_all_ancestors + + Arg 1 : + Example : my @ancestors = @{$node->get_all_ancestors}; + Description : Returns all ancestor nodes for a given node + ReturnType : listref of Bio::EnsEMBL::Compara::NestedSet objects + Exceptions : none + Caller : general + Status : + +=cut + +sub get_all_ancestors { + my $self = shift; + my $this = $self; + my @ancestors; + while( $this = $this->parent ){ + push @ancestors, $this; + } + return [@ancestors] +} + +=head2 get_all_adjacent_subtrees + + Arg 1 : + Example : my @subtrees = @{$node->get_all_adjacent_subtrees}; + Description : Returns subtree 'root' nodes where the subtree is adjacent + to this node. Used e.g. by the web code for the 'collapse + other nodes' action + ReturnType : listref of Bio::EnsEMBL::Compara::NestedSet objects + Exceptions : none + Caller : EnsEMBL::Web::Component::Gene::ComparaTree + Status : + +=cut + +sub get_all_adjacent_subtrees { + my $self = shift; + my $node_id = $self->node_id; + my @node_path_to_root = ($self, @{$self->get_all_ancestors} ); + my %path_node_ids = map{ $_->node_id => 1 } @node_path_to_root; + + my $this = $self->root; # Start at the root node + + my @adjacent_subtrees; + while( $this ){ + last if $this->node_id == $node_id; # Stop on reaching current node + my $next; + foreach my $child (@{$this->children}){ + next if $child->is_leaf; # Leaves cannot be subtrees + if( $path_node_ids{$child->node_id} ){ # Ancestor node + $next = $child; + } else { + push @adjacent_subtrees, $child; + } + } + $this = $next || undef; + } + + return [@adjacent_subtrees] +} + + +=head2 num_leaves + + Example : my $num_leaves = $node->num_leaves + Description : Returns the number of leaves underlying the node + ReturnType : integer + Exceptions : none + Caller : general + Status : At risk (relies on left and right indexes) + +=cut +#' +sub num_leaves { + my $self = shift; + + my $left = $self->left_index; + my $right = $self->right_index; + + return unless( $left && $right ); + + my $num = $right - $left + 1; + my $num_leaves = ( ($num/2) + 1 ) / 2; + + return $num_leaves; +} + + +sub get_child_count { + my $self = shift; + $self->load_children_if_needed; + return scalar @{$self->children}; +# my $count = $self->link_count; +# $count-- if($self->has_parent); +# return $count; +} + +sub load_children_if_needed { + my $self = shift; + if(!defined($self->{'_children_loaded'}) and $self->adaptor) { + #define _children_id_hash thereby signally that I've tried to load my children + $self->{'_children_loaded'} = 1; + #print("load_children_if_needed : "); $self->print_node; + $self->adaptor->fetch_all_children_for_node($self); + } +} + +sub no_autoload_children { + my $self = shift; + + return if($self->{'_children_loaded'}); + $self->{'_children_loaded'} = 1; +} + + +=head2 distance_to_parent + + Arg [1] : (opt.) <int or double> distance + Example : my $dist = $object->distance_to_parent(); + Example : $object->distance_to_parent(1.618); + Description: Getter/Setter for the distance between this child and its parent + Returntype : integer node_id + Exceptions : none + Caller : general + +=cut + +sub distance_to_parent { + my $self = shift; + my $dist = shift; + + if($self->{'_parent_link'}) { + if(defined($dist)) { $self->{'_parent_link'}->distance_between($dist); } + else { $dist = $self->{'_parent_link'}->distance_between; } + } else { + if(defined($dist)) { $self->_distance($dist); } + else { $dist = $self->_distance; } + } + return $dist; +} + +sub _distance { + my $self = shift; + $self->{'_distance_to_parent'} = shift if(@_); + $self->{'_distance_to_parent'} = 0.0 unless(defined($self->{'_distance_to_parent'})); + return $self->{'_distance_to_parent'}; +} + +sub distance_to_root { + my $self = shift; + my $dist = $self->distance_to_parent; + $dist += $self->parent->distance_to_root if($self->parent); + return $dist; +} + + +=head2 distance_to_ancestor + + Arg [1] : Bio::EnsEMBL::Compara::NestedSet $ancestor + Example : my $distance = $this_node->distance_to_ancestor($ancestor); + Description : Calculates the distance in the tree between this node and + its ancestor $ancestor + Returntype : float + Exceptions : throws if $ancestor is not an ancestor of this node. + Caller : general + Status : Stable + +=cut + +sub distance_to_ancestor { + my $self = shift; + my $ancestor = shift; + + if ($ancestor->node_id eq $self->node_id) { + return 0; + } + unless (defined $self->parent) { + throw("Ancestor not found\n"); + } + return $self->distance_to_parent + $self->parent->distance_to_ancestor($ancestor); +} + + +=head2 distance_to_node + + Arg [1] : Bio::EnsEMBL::Compara::NestedSet $node + Example : my $distance = $this_node->distance_to_node($other_node); + Description : Calculates the distance in the tree between these + two nodes. + Returntype : float + Exceptions : returns undef if no ancestor can be found, no distances are + defined in the tree, etc. + Caller : general + Status : Stable + +=cut + +sub distance_to_node { + my $self = shift; + my $node = shift; + + my $ancestor = $self->find_first_shared_ancestor($node); + if (!$ancestor) { + return undef; + } + my $distance = $self->distance_to_ancestor($ancestor); + $distance += $node->distance_to_ancestor($ancestor); + + return $distance; +} + + +# Returns a TreeI-compliant object based on this NestedSet. +sub get_TreeI { + my $self = shift; + my $newick = $self->newick_format(); + + open(my $fake_fh, "+<", \$newick); + my $treein = new Bio::TreeIO(-fh => $fake_fh, -format => 'newick'); + my $treeI = $treein->next_tree; + $treein->close; + + return $treeI; +} + +sub new_from_newick { + my $class = shift; + my $file = shift; + my $treein = new Bio::TreeIO(-file => $file, -format => 'newick'); + my $treeI = $treein->next_tree; + $treein->close; + + return $class->new_from_TreeI($treeI); +} + +sub new_from_TreeI { + my $class = shift; + my $treeI = shift; + + my $rootI = $treeI->get_root_node; + my $node = new $class; + + # Kick off the recursive, parallel node adding. + _add_nodeI_to_node($node,$rootI); + + return $node; +} + +# Recursive helper for new_from_TreeI. +sub _add_nodeI_to_node { + my $node = shift; # Our node object (Compara) + my $nodeI = shift; # Our nodeI object (BioPerl) + + foreach my $c ($nodeI->each_Descendent) { + my $child = ref($node)->new; + + my $name = $c->id || ''; + $name =~ s/^\s+//; + $name =~ s/\s+$//; + + # Set name. + $child->name($name); + + # Set branch length. + $node->add_child($child,$c->branch_length); + + # Recurse. + _add_nodeI_to_node($child,$c); + } +} + +=head2 print_tree + + Arg [1] : int $scale + Example : $this_node->print_tree(100); + Description : Prints this tree in ASCII format. The scale is used to define + the width of the tree in the output + Returntype : undef + Exceptions : + Caller : general + Status : At risk (as the output might change) + +=cut + +sub print_tree { + my $self = shift; + my $scale = shift; + + $scale = 100 unless($scale); + $self->_internal_print_tree(undef, 0, $scale); +} + +sub string_tree { + my $self = shift; + my $scale = shift; + $scale ||= 100; + my $buffer = ''; + $self->_internal_string_tree(undef, 0, $scale, \$buffer); + return $buffer; +} + +sub _internal_string_tree { + my $self = shift; + my $indent = shift; + my $lastone = shift; + my $scale = shift; + my $buffer = shift; + + if(defined($indent)) { + $$buffer .= $indent; + for(my $i=0; $i<$self->distance_to_parent()*$scale; $i++) { $$buffer .= '-'; } + } + + $$buffer .= $self->string_node($indent); + + if(defined($indent)) { + if($lastone) { + chop($indent); + $indent .= " "; + } + for(my $i=0; $i<$self->distance_to_parent()*$scale; $i++) { $indent .= ' '; } + } + $indent = '' unless(defined($indent)); + $indent .= "|"; + + my $children = $self->sorted_children; + my $count=0; + $lastone = 0; + foreach my $child_node (@$children) { + $count++; + $lastone = 1 if($count == scalar(@$children)); + $child_node->_internal_string_tree($indent,$lastone,$scale,$buffer); + } +} + +sub _internal_print_tree { + my $self = shift; + my $indent = shift; + my $lastone = shift; + my $scale = shift; + + if(defined($indent)) { + print($indent); + for(my $i=0; $i<$self->distance_to_parent()*$scale; $i++) { print('-'); } + } + + $self->print_node($indent); + + if(defined($indent)) { + if($lastone) { + chop($indent); + $indent .= " "; + } + for(my $i=0; $i<$self->distance_to_parent()*$scale; $i++) { $indent .= ' '; } + } + $indent = '' unless(defined($indent)); + $indent .= "|"; + + my $children = $self->sorted_children; + my $count=0; + $lastone = 0; + foreach my $child_node (@$children) { + $count++; + $lastone = 1 if($count == scalar(@$children)); + $child_node->_internal_print_tree($indent,$lastone,$scale); + } +} + + +sub print_node { + my $self = shift; + print $self->string_node; +} + +sub string_node { + my $self = shift; + my $str = '('; + + my $isdup = 0; + $isdup = 1 if ($self->get_tagvalue('Duplication', 0) > 0 and not $self->get_tagvalue('dubious_duplication', 0)); + $isdup = 1 if $self->get_tagvalue('node_type', '') eq 'duplication'; + + my $isdub = ($self->get_tagvalue('node_type', '') eq 'dubious'); + + if ($isdup) { + my $taxon_name = $self->get_tagvalue('taxon_name', ''); + if ($taxon_name =~ /\S+\ \S+/) { + $str .= "Dup "; + } else { + $str .= "DUP "; + } + my $sis = $self->get_tagvalue('duplication_confidence_score', 0) * 100; + $str .= sprintf('SIS=%d ', $sis); + } elsif ($isdub) { + $str .= "DD "; + $str .= 'SIS=0 '; + } + if($self->has_tag("bootstrap")) { my $bootstrap_value = $self->get_tagvalue("bootstrap"); $str .= "B=$bootstrap_value "; } + if($self->has_tag("taxon_name")) { my $taxon_name_value = $self->get_tagvalue("taxon_name"); $str .="T=$taxon_name_value "; } + $str .= sprintf("%s %d,%d)", $self->node_id, $self->left_index, $self->right_index); + $str .= sprintf("%s\n", $self->name || ''); + return $str; +} + + +=head2 newick_format + + Arg [1] : string $format_mode + Example : $this_node->newick_format("full"); + Description : Prints this tree in Newick format. Several modes are + available: full, display_label_composite, simple, species, + species_short_name, ncbi_taxon, ncbi_name, njtree and phylip + Returntype : string + Exceptions : + Caller : general + Status : Stable + +=cut + +my %ryo_modes = ( + 'member_id' => '%{^-m}:%{d}', + 'member_id_taxon_id' => '%{-m}%{"_"-x}:%{d}', + 'display_label_composite' => '%{-l"_"}%{n}%{"_"-s}:%{d}', + 'full_common' => '%{n}%{" "-c.^}%{"."-g}%{"_"-t"_MYA"}:%{d}', + 'gene_stable_id_composite' => '%{-i"_"}%{n}%{"_"-s}:%{d}', + 'gene_stable_id' => '%{-i}:%{d}', + 'ncbi_taxon' => '%{o}', + 'ncbi_name' => '%{n}', + 'simple' => '%{^-n}:%{d}', + 'full' => '%{n}:%{d}', + 'species' => '%{^-S|p}', + 'species_short_name' => '%{^-s|p}', + 'otu_id' => '%{-s"|"}%{-l"|"}%{n}:%{d}', + 'int_node_id' => '%{-n}%{o-}:%{d}', + 'full_web' => '%{n-}%{-n|p}%{"_"-s"_"}%{":"d}', + 'phylip' => '%21{n,}:%{d}', + 'njtree' => '%{o}%{-T(is_incomplete)|E"*"}%{-T(is_incomplete,0,*)}', +); + +my $nhx0 = '%{n-_|T(taxon_name)}:%{d}'; +my $nhx1 = ':D=%{-E"N"}%{T(node_type,duplication,Y)-}%{T(node_type,dubious,Y)-}%{T(node_type,gene_split,Y)-}%{T(node_type,speciation,N)}%{":B="T(bootstrap)}'; +my $nhx2 = ':T=%{-x}%{T(taxon_id)-}'; + +my %nhx_ryo_modes_1 = ( + 'member_id_taxon_id' => '%{-m}%{o-}_%{-x}%{T(taxon_id)-}:%{d}', + 'protein_id' => '%{-n}'.$nhx0, + 'transcript_id' => '%{-r}'.$nhx0, + 'gene_id' => '%{-i}'.$nhx0, + 'full' => $nhx0, + 'full_web' => $nhx0, + 'display_label' => '%{-L|i}%{"_"-s}'.$nhx0, + 'display_label_composite' => '%{-L"_"}%{-i}%{"_"-s}'.$nhx0, + 'treebest_ortho' => '%{-m}%{"_"-x}'.$nhx0, + 'simple' => $ryo_modes{'simple'}, + 'phylip' => $ryo_modes{'phylip'}, +); + +my %nhx_ryo_modes_2 = ( + 'member_id_taxon_id' => $nhx1.$nhx2, + 'protein_id' => $nhx1.'%{":G="-i}'.$nhx2, + 'transcript_id' => $nhx1.'%{":G="-i}'.$nhx2, + 'gene_id' => $nhx1.'%{":G="-r}'.$nhx2, + 'full' => $nhx1.$nhx2, + 'full_web' => $nhx1.$nhx2, + 'display_label' => $nhx1.$nhx2, + 'display_label_composite' => $nhx1.$nhx2, + 'treebest_ortho' => $nhx1.$nhx2.':S=%{-x}%{T(taxon_id)-}', +); + + +sub newick_format { + my $self = shift; + my $format_mode = shift; + + my $ryo_string; + + if (not defined $format_mode) { + $ryo_string = $ryo_modes{'full'}; + + } elsif ($format_mode eq "ryo") { + $ryo_string = shift @_; + + } elsif (defined $ryo_modes{$format_mode}) { + $ryo_string = $ryo_modes{$format_mode}; + + } else { + throw("Unrecognized format '$format_mode'. Please use 'ryo' to introduce a roll-your-own format string\n"); + } + return $self->_internal_newick_format_ryo($ryo_string); +} + +sub nhx_format { + my ($self, $format_mode) = @_; + my $ryo_string1; + my $ryo_string2; + + if (not defined $format_mode) { + $ryo_string1 = $nhx_ryo_modes_1{'protein_id'}; + $ryo_string2 = $nhx_ryo_modes_2{'protein_id'}; + + } elsif ($format_mode eq "ryo") { + $ryo_string1 = shift @_; + $ryo_string2 = shift @_; + + } elsif (defined $nhx_ryo_modes_1{$format_mode}) { + $ryo_string1 = $nhx_ryo_modes_1{$format_mode}; + $ryo_string2 = $nhx_ryo_modes_2{$format_mode}; + + } else { + throw("Unrecognized format '$format_mode'. Please use 'ryo' to introduce a roll-your-own format string\n"); + } + my $fmt = $ryo_string1; + $fmt = $ryo_string1.'[&&NHX'.$ryo_string2.']' if defined $ryo_string2; + return $self->_internal_newick_format_ryo($fmt); +} + +sub _internal_newick_format_ryo { + my ($self, $ryo_string) = @_; + my $newick_str; + eval { + use Bio::EnsEMBL::Compara::FormatTree; + my $ryo_formatter = Bio::EnsEMBL::Compara::FormatTree->new($ryo_string); + $newick_str = $ryo_formatter->format_newick($self); + }; + if ($@) { + throw("Something bad happened while trying to stringify the tree: $@\n"); + } + return "$newick_str;"; +} + +=head2 newick_simple_format + + DEPRECATED. Use newick_format("simple") instead + +=cut + +sub newick_simple_format { + my $self = shift; + deprecate('Use newick_format("simple") instead.'); + return $self->newick_format('simple'); +} + + +################################## +# +# Set theory methods +# +################################## + +#sub equals { +# my $self = shift; +# my $other = shift; +# throw("arg must be a [Bio::EnsEMBL::Compara::NestedSet] not a [$other]") +# unless($other->isa('Bio::EnsEMBL::Compara::NestedSet')); +# return 1 if($self->node_id eq $other->node_id); +# foreach my $child (@{$self->children}) { +# return 0 unless($other->has_child($child)); +# } +# return 1; +#} + +sub has_child { + my $self = shift; + my $child = shift; + throw("arg must be a [Bio::EnsEMBL::Compara::NestedSet] not a [$child]") + unless($child->isa('Bio::EnsEMBL::Compara::NestedSet')); + $self->load_children_if_needed; + my $link = $self->link_for_neighbor($child); + return 0 unless($link); + return 0 if($self->{'_parent_link'} and ($self->{'_parent_link'}->equals($link))); + return 1; +} + +sub is_member_of { + my $A = shift; + my $B = shift; + return 1 if($B->has_child($A)); + return 0; +} + +sub is_subset_of { + my $A = shift; + my $B = shift; + foreach my $child (@{$A->children}) { + return 0 unless($B->has_child($child)); + } + return 1; +} + +sub is_leaf { + my $self = shift; + return 1 unless($self->get_child_count); + return 0; +} + +sub merge_children { + my $self = shift; + my $nset = shift; + throw("arg must be a [Bio::EnsEMBL::Compara::NestedSet] not a [$nset]") + unless($nset->isa('Bio::EnsEMBL::Compara::NestedSet')); + foreach my $child_node (@{$nset->children}) { + $self->add_child($child_node, $child_node->distance_to_parent); + } + return $self; +} + +sub merge_node_via_shared_ancestor { + my $self = shift; + my $node = shift; + + my $node_dup = $self->find_node_by_node_id($node->node_id); + if($node_dup) { + #warn("trying to merge in a node with already exists\n"); + return $node_dup; + } + return undef unless($node->parent); + + my $ancestor = $self->find_node_by_node_id($node->parent->node_id); + if($ancestor) { + $ancestor->add_child($node); + #print("common ancestor at : "); $ancestor->print_node; + return $ancestor; + } + return $self->merge_node_via_shared_ancestor($node->parent); +} + + +sub extract_subtree_from_leaves { + my $self = shift; + my $copy = $self->copy; + my $node_ids = shift; # Array ref of node_ids. + my @keepers = @{$node_ids}; + my @all = @{$copy->get_all_nodes}; + + # Add all ancestors of kept nodes to the keep list. + my @all_keepers = (); + foreach my $keeper (@keepers) { + my $node = $copy->find_node_by_node_id($keeper); + push @all_keepers, $keeper; + + my $parent = $node->parent; + while (defined $parent) { + push @all_keepers, $parent->node_id; + $parent = $parent->parent; + } + } + + my @remove_me = (); + foreach my $node (@all) { + push @remove_me, $node unless (grep {$node->node_id == $_} @all_keepers); + } + $copy->remove_nodes(\@remove_me); + return $copy; +} + + +################################## +# +# nested_set manipulations +# +################################## + + +=head2 flatten_tree + + Overview : Removes all internal nodes and attaches leaves to the tree root, creating + a "flattened" star tree structure. + Example : $node->flatten_tree(); + Returntype : undef or Bio::EnsEMBL::Compara::NestedSet + Exceptions : none + Caller : general + +=cut + +sub flatten_tree { + my $self = shift; + + my $leaves = $self->get_all_leaves; + foreach my $leaf (@{$leaves}) { + $leaf->disavow_parent; + } + + $self->release_children; + foreach my $leaf (@{$leaves}) { + $self->add_child($leaf, 0.0); + } + + return $self; +} + +=head2 re_root + + Overview : rearranges the tree structure so that the root is moved to + beetween this node and its parent. If the old root was more than + bifurcated (2 children) a new node is created where it was to hold + the multiple children that arises from the re-rooting. + The old root is returned. + Example : $node->re_root(); + Returntype : undef or Bio::EnsEMBL::Compara::NestedSet + Exceptions : none + Caller : general + +=cut + +sub re_root { + my $self = shift; + + return $self unless($self->parent); #I'm root so just return self + + my $root = $self->root; + my $tmp_root = new Bio::EnsEMBL::Compara::NestedSet; + $tmp_root->merge_children($root); + + my $parent = $self->parent; + my $dist = $self->distance_to_parent; + $self->disavow_parent; + + my $old_root = $parent->_invert_tree_above; + $old_root->minimize_node; + + $root->add_child($parent, $dist / 2.0); + $root->add_child($self, $dist / 2.0); + + return $root; +} + + +sub _invert_tree_above { + my $self = shift; + return $self unless($self->parent); + + my $old_root = $self->parent->_invert_tree_above; + #now my parent has been inverted so it is the new root + + #flip the direction of the link between myself and my parent + $self->parent->_set_parent_link($self->{'_parent_link'}); + $self->_set_parent_link(undef); + + #now I'm the new root and the old root might need to be modified + return $old_root; +} + + +sub build_leftright_indexing { + my $self = shift; + my $counter = shift; + + $counter = 1 unless($counter); + + $self->left_index($counter++); + foreach my $child_node (@{$self->sorted_children}) { + $counter = $child_node->build_leftright_indexing($counter); + } + $self->right_index($counter++); + return $counter; +} + + +=head2 remove_nodes + + Arg [1] : arrayref Bio::EnsEMBL::Compara::NestedSet $nodes + Example : my $ret_tree = $tree->remove_nodes($nodes); + Description : Returns the tree with removed nodes in list. Nodes should be in the tree. + Returntype : Bio::EnsEMBL::Compara::NestedSet object + Exceptions : + Caller : general + Status : At risk (behaviour on exceptions could change) + +=cut + +sub remove_nodes { + my $self = shift; + my $nodes = shift; + + foreach my $node (@$nodes) { + if ($node->is_leaf) { + $node->disavow_parent; + $self = $self->minimize_tree; + } else { + my $node_children = $node->children; + foreach my $child (@$node_children) { + $node->parent->add_child($child); + } + $node->disavow_parent; + } + # Delete dangling one-child trees (help memory manager) + if ($self->get_child_count == 1) { + my $child = $self->children->[0]; + $child->parent->merge_children($child); + $child->disavow_parent; + return undef; + } + # Could be zero if all asked to delete, so return undef instead of + # fake one-node tree. + if ($self->get_child_count < 2) { + return undef; + } + } + return $self; +} + + +=head2 delete_lineage + + Arg [1] : Bio::EnsEMBL::Compara::NestedSet $node + Example : $tree->delete_lineage($node); + Description : Removes $node from tree. Nodes should be in the tree. + Returntype : + Exceptions : + Caller : general + Status : At risk (behaviour on exceptions could change) + +=cut + +sub delete_lineage { + my $self = shift; + my $del_me = shift; + + throw("arg must be a [Bio::EnsEMBL::Compara::NestedSet] not a [$self]") + unless ($self->isa('Bio::EnsEMBL::Compara::NestedSet')); + + my $parent = $del_me->parent; + while ($parent) { + my $num_children = scalar @{$parent->children}; + if ($num_children > 1) { + $self->remove_nodes([$del_me]); + return $self; + } elsif ($num_children == 1) { + $self->remove_nodes([$del_me]); + $del_me = $parent; + $parent = $del_me->parent; + } + } + return $self; +} + +=head2 minimize_tree + + Arg [1] : -none- + Example : $leaf->disavow_parent(); + $tree = $tree->minimize_tree(); + Description : Returns the tree after removing internal nodes that do not + represent an multi- or bi-furcation anymore. This is typically + required after disavowing a node. Please ensure you use the + object returned by the method and not the original object + anymore! + Returntype : Bio::EnsEMBL::Compara::NestedSet object + Exceptions : + Caller : general + Status : Stable + +=cut + +sub minimize_tree { + my $self = shift; + return $self if($self->is_leaf); + + foreach my $child (@{$self->children}) { + $child->minimize_tree; + } + return $self->minimize_node; +} + + +sub minimize_node { + my $self = shift; + + return $self unless($self->get_child_count() == 1); + + my $child = $self->children->[0]; + my $dist = $child->distance_to_parent + $self->distance_to_parent; + if ($self->parent) { + $self->parent->add_child($child, $dist); + $self->disavow_parent; + } else { + $child->disavow_parent; + } + return $child +} + + +sub scale { + my $self = shift; + my $scale = shift; + + foreach my $node (@{$self->get_all_nodes}) { + my $bl = $node->distance_to_parent; + $bl = 0 unless (defined $bl); + $node->distance_to_parent($bl*$scale); + } + return $self; +} + + +sub scale_max_to { + my $self = shift; + my $new_max = shift; + + my $max_dist = $self->max_distance; + my $scale_factor = $new_max / $max_dist; + return $self->scale($scale_factor); +} + + + +################################## +# +# search methods +# +################################## + +sub find_node_by_name { + my $self = shift; + my $name = shift; + + return $self if((defined $self->name) && $name eq $self->name); + + my $children = $self->children; + foreach my $child_node (@$children) { + my $found = $child_node->find_node_by_name($name); + return $found if(defined($found)); + } + + return undef; +} + +sub find_node_by_node_id { + my $self = shift; + my $node_id = shift; + + return $self if($node_id eq $self->node_id); + + my $children = $self->children; + foreach my $child_node (@$children) { + my $found = $child_node->find_node_by_node_id($node_id); + return $found if(defined($found)); + } + + return undef; +} + +sub find_leaf_by_name { + my $self = shift; + my $name = shift; + + return $self if((defined $self->name) and ($name eq $self->name)); + + my $leaves = $self->get_all_leaves; + foreach my $leaf (@$leaves) { + return $leaf if((defined $leaf->name) and ($name eq $leaf->name)); + } + + return undef; +} + +sub find_leaf_by_node_id { + my $self = shift; + my $node_id = shift; + + return $self if($node_id eq $self->node_id); + + my $leaves = $self->get_all_leaves; + foreach my $leaf (@$leaves) { + return $leaf if($node_id eq $leaf->node_id); + } + + return undef; +} + + +=head2 get_all_sorted_leaves + + Arg [1] : Bio::EnsEMBL::Compara::NestedSet $top_leaf + Arg [...] : (optional) Bio::EnsEMBL::Compara::NestedSet $secondary_priority_leaf + Example : my $sorted_leaves = $object->get_all_sorted_leaves($human_leaf); + Example : my $sorted_leaves = $object->get_all_sorted_leaves($human_leaf, $mouse_leaf); + Description : Sorts the tree such as $top_leaf is the first leave and returns + all the other leaves in the order defined by the tree. + It is possible to define as many secondary top leaves as you require + to sort other branches of the tree. The priority to sort the trees + is defined by the order in which you specify the leaves. + Returntype : listref of Bio::EnsEMBL::Compara::NestedSet (all sorted leaves) + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub get_all_sorted_leaves { + my ($self, @priority_leaves) = @_; + + if (!@priority_leaves) { + return $self->get_all_leaves; + } + + # Assign priority scores for all parent nodes of the priority leaves + my $score_by_node; + my $score = 0; + # Loop through all the priority leaves, starting from the last one (lowest score) + while (my $priority_leaf = pop @priority_leaves) { + $score++; # Increases the score, next priority leaves (earlier in the argument list) will overwrite the score if needed + my $this_node = $priority_leaf; + # Loop through all the parent node up to the root of the tree + do { + $score_by_node->{$this_node} = $score; + $this_node = $this_node->parent; + } while ($this_node); + } + + my $sorted_leaves = $self->_recursive_get_all_sorted_leaves($score_by_node); + + return $sorted_leaves; +} + +=head2 _recursive_get_all_sorted_leaves + + Arg [1] : hashref $score_by_node + Example : my $sorted_leaves = $object->_recursive_get_all_sorted_leaves($score_by_node); + Description : Recursive code for the get_all_sorted_leaves() method + Returntype : listref of Bio::EnsEMBL::Compara::NestedSet (sorted leaves) + Exceptions : none + Caller : private + Status : Stable + +=cut + +sub _recursive_get_all_sorted_leaves { + my $self = shift; + my $score_by_node = shift; + + my $sorted_leaves = []; + my $children = $self->children; + + if (@$children == 0) { + $sorted_leaves = [$self]; + } else { + $children = [sort { + ($score_by_node->{$b} || $score_by_node->{$a}) ? (($score_by_node->{$b} || 0)<=>($score_by_node->{$a} || 0)) : ($a->node_id <=> $b->node_id) + } @$children]; + for (my $i = 0; $i < @$children; $i++) { + push(@$sorted_leaves, @{$children->[$i]->_recursive_get_all_sorted_leaves($score_by_node)}); + } + } + + return $sorted_leaves; +} + +=head2 get_all_leaves + + Title : get_all_leaves + Usage : my @leaves = @{$tree->get_all_leaves}; + Function: searching from the given starting node, searches and creates list + of all leaves in this subtree and returns by reference + Example : + Returns : reference to list of NestedSet objects (all leaves) + Args : none + +=cut + +sub get_all_leaves { + my $self = shift; + + my $leaves = {}; + $self->_recursive_get_all_leaves($leaves); + my @leaf_list = sort {$a->node_id <=> $b->node_id} values(%{$leaves}); + return \@leaf_list; +} + +sub _recursive_get_all_leaves { + my $self = shift; + my $leaves = shift; + + $leaves->{$self->obj_id} = $self if($self->is_leaf); + + foreach my $child (@{$self->children}) { + no warnings 'recursion'; + $child->_recursive_get_all_leaves($leaves); + } +} + + +sub get_all_leaves_indexed { + my $self = shift; + deprecate("Use Bio::EnsEMBL::Compara::DBSQL::GeneTreeNodeAdaptor->fetch_all_leaves_indexed() method instead"); + + my @leaf_list = @{$self->adaptor->fetch_all_leaves_indexed($self)}; + + return \@leaf_list; +} + +=head2 max_distance + + Title : max_distance + Args : none + Usage : $tree_node->max_distance; + Function: searching from the given starting node, calculates the maximum distance to a leaf + Returns : int + +=cut + +sub max_distance { + my $self = shift; + + my $max_distance = 0; + + foreach my $child (@{$self->children}) { + my $distance = $child->max_distance; + $max_distance = $distance if($distance>$max_distance); + } + + return ($self->distance_to_parent + $max_distance); +} + + +=head2 max_depth + + Title : max_depth + Args : none + Usage : $tree_node->max_depth; + Function: searching from the given starting node, calculates the maximum depth to a leaf + Returns : int + +=cut + +sub max_depth { + my $self = shift; + + my $max_depth = 0; + + foreach my $child (@{$self->children}) { + my $depth = $child->max_depth + 1; + $max_depth=$depth if($depth>$max_depth); + } + return $max_depth; +} + + +=head2 find_first_shared_ancestor + + Arg [1] : Bio::EnsEMBL::Compara::NestedSet $node + Example : my $ancestor = $this_node->find_first_shared_ancestor($other_node); + Description : Gets the first common ancestor between this node and the other one. + Returntype : Bio::EnsEMBL::Compara::NestedSet object + Exceptions : + Caller : general + Status : Stable + +=cut + +sub find_first_shared_ancestor { + my $self = shift; + my $node = shift; + + return $self if($self->equals($node)); + return $node if($self->has_ancestor($node)); + return $self->find_first_shared_ancestor($node->parent); +} + + +sub find_first_shared_ancestor_from_leaves { + my $self = shift; + my $leaf_list = shift; + + my @leaves = @{$leaf_list}; + + my $ancestor = shift @leaves; + while (scalar @leaves > 0) { + my $node = shift @leaves; + $ancestor = $ancestor->find_first_shared_ancestor($node); + } + return $ancestor; +} + + +################################## +# +# developer/adaptor API methods +# +################################## + + +# used for building tree from a DB fetch, want to restrict users to create trees +# by only -add_child method +sub _set_parent_link { + my ($self, $link) = @_; + + $self->{'_parent_id'} = 0; + $self->{'_parent_link'} = $link; + $self->{'_parent_id'} = $link->get_neighbor($self)->node_id if($link); + return $self; +} + + +# used for building tree from a DB fetch until all the objects are in memory +sub _parent_id { + my $self = shift; + $self->{'_parent_id'} = shift if(@_); + return $self->{'_parent_id'}; +} + +# used for building tree from a DB fetch until all the objects are in memory +sub _root_id { + my $self = shift; + $self->{'_root_id'} = shift if(@_); + return $self->{'_root_id'}; +} + +1; +