view variant_effect_predictor/Bio/Tree/TreeFunctionsI.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
line wrap: on
line source

# $Id: TreeFunctionsI.pm,v 1.5.2.3 2003/09/14 20:18:25 jason Exp $
#
# BioPerl module for Bio::Tree::TreeFunctionsI
#
# Cared for by Jason Stajich <jason@bioperl.org>
#
# Copyright Jason Stajich
#
# You may distribute this module under the same terms as perl itself

# POD documentation - main docs before the code

=head1 NAME

Bio::Tree::TreeFunctionsI - Decorated Interface implementing basic Tree exploration methods

=head1 SYNOPSIS

  use Bio::TreeIO;
  my $in = new Bio::TreeIO(-format => 'newick', -file => 'tree.tre');

  my $tree = $in->next_tree;

  my @nodes = $tree->find_node('id1');

  if( $tree->is_monophyletic(-clade => @nodes, -outgroup => $outnode) ){

  }

=head1 DESCRIPTION

Describe the interface here

=head1 FEEDBACK

=head2 Mailing Lists

User feedback is an integral part of the evolution of this and other
Bioperl modules. Send your comments and suggestions preferably to
the Bioperl mailing list.  Your participation is much appreciated.

  bioperl-l@bioperl.org              - General discussion
  http://bioperl.org/MailList.shtml  - About the mailing lists

=head2 Reporting Bugs

Report bugs to the Bioperl bug tracking system to help us keep track
of the bugs and their resolution. Bug reports can be submitted via
email or the web:

  bioperl-bugs@bioperl.org
  http://bugzilla.bioperl.org/

=head1 AUTHOR - Jason Stajich, Aaron Mackey, Justin Reese

Email jason-at-bioperl-dot-org
Email amackey-at-virginia.edu
Email jtr4v-at-virginia.edu

=head1 CONTRIBUTORS

Additional contributors names and emails here

Rerooting code was worked on by

  Daniel Barker d.barker-at-reading.ac.uk
  Ramiro Barrantes Ramiro.Barrantes-at-uvm.edu

=head1 APPENDIX

The rest of the documentation details each of the object methods.
Internal methods are usually preceded with a _

=cut


# Let the code begin...


package Bio::Tree::TreeFunctionsI;
use vars qw(@ISA);
use strict;
use  Bio::Tree::TreeI;

@ISA = qw(Bio::Tree::TreeI);

=head2 find_node

 Title   : find_node
 Usage   : my @nodes = $self->find_node(-id => 'node1');
 Function: returns all nodes that match a specific field, by default this
           is id, but different branch_length, 
 Returns : List of nodes which matched search
 Args    : text string to search for
           OR
           -fieldname => $textstring

=cut

sub find_node {
   my ($self,$type,$field) = @_;
   if( ! defined $type ) { 
       $self->warn("Must request a either a string or field and string when searching");
   }

   # all this work for a '-' named field
   # is so that we could potentially 
   # expand to other constraints in 
   # different implementations
   # like 'find all nodes with boostrap < XX'

   if( ! defined $field ) { 
       # only 1 argument, default to searching by id
       $field= $type; 
       $type = 'id';
   } else {   
       $type =~ s/^-//;
   }

   # could actually do this by testing $rootnode->can($type) but
   # it is possible that a tree is implemeted with different node types
   # - although it is unlikely that the root node would be richer than the 
   # leaf nodes.  Can't handle NHX tags right now

   unless( $type eq 'id' || $type eq 'name' ||
	   $type eq 'bootstrap' || $type eq 'description' ||
	   $type eq 'internal_id') {
       $self->warn("unknown search type $type - will try anyways");
   } 
   my @nodes = grep { $_->can($type) && defined $_->$type() &&
		     $_->$type() eq $field } $self->get_nodes();

   if ( wantarray) { 
       return @nodes;
   } else { 
       if( @nodes > 1 ) { 
	   $self->warn("More than 1 node found but caller requested scalar, only returning first node");
       }
       return shift @nodes;
   }
}

=head2 remove_Node

 Title   : remove_Node
 Usage   : $tree->remove_Node($node)
 Function: Removes a node from the tree
 Returns : boolean represent status of success
 Args    : either Bio::Tree::NodeI or string of the node id


=cut

sub remove_Node {
   my ($self,$input) = @_;
   my $node = undef;
   unless( ref($input) ) {
       $node = $self->find_node($input);
   }  elsif( ! $input->isa('Bio::Tree::NodeI') ) {
       $self->warn("Did not provide either a valid Bio::Tree::NodeI object to remove_node or the node name");
       return 0;
   } else { 
       $node = $input;
   }
   if( ! $node->ancestor && $self->get_root_node->internal_id != $node->internal_id) {
       $self->warn("Node (".$node->to_string . ") has no ancestor, can't remove!");
   } else { 
       $node->ancestor->remove_Descendent($node);
   }
}


# Added for Justin Reese by Jason

=head2 get_lca

 Title   : get_lca
 Usage   : get_lca(-nodes => \@nodes )
 Function: given two nodes, returns the lowest common ancestor
 Returns : node object
 Args    : -nodes => arrayref of nodes to test


=cut

sub get_lca {
    my ($self,@args) = @_;
    my ($nodes) = $self->_rearrange([qw(NODES)],@args);
   if( ! defined $nodes ) {
       $self->warn("Must supply -nodes parameter to get_lca() method");
       return undef;
   }
    my ($node1,$node2) = $self->_check_two_nodes($nodes);
    return undef unless $node1 && $node2;

    # algorithm: Start with first node, find and save every node from it to
    #    root. Then start with second node; for it and each of its ancestor
    #    nodes, check to see if it's in the first node's ancestor list - if
    #    so it is the lca.
    #
    # This is very slow and naive, but I somehow doubt the overhead
    # of mapping the tree to a complete binary tree and doing the linear
    # lca search would be worth the overhead, especially for small trees.
    # Maybe someday I'll write a linear get_lca and find out.

    # find and save every ancestor of node1 (including itself)

    my %node1_ancestors;	# keys are internal ids, values are objects
    my $place = $node1;		# start at node1

    while ( $place ){
	$node1_ancestors{$place->internal_id} = $place;
	$place = $place->ancestor;
    }

    # now climb up node2, for each node checking whether 
    # it's in node1_ancestors
    $place = $node2;		# start at node2
    while ( $place ){
	foreach my $key ( keys %node1_ancestors ){ # ugh
	    if ( $place->internal_id == $key){
		return $node1_ancestors{$key};
	    }
	}
	$place = $place->ancestor;
    }
    $self->warn("Could not find lca!"); # should never execute, 
                                        # if so, there's a problem
    return undef;
}

# Added for Justin Reese by Jason

=head2 distance

 Title   : distance
 Usage   : distance(-nodes => \@nodes )
 Function: returns the distance between two given nodes
 Returns : numerical distance
 Args    : -nodes => arrayref of nodes to test


=cut

sub distance {
    my ($self,@args) = @_;
    my ($nodes) = $self->_rearrange([qw(NODES)],@args);
    if( ! defined $nodes ) {
	$self->warn("Must supply -nodes parameter to distance() method");
	return undef;
    }
    my ($node1,$node2) = $self->_check_two_nodes($nodes);
    # algorithm:

    # Find lca: Start with first node, find and save every node from it
    # to root, saving cumulative distance. Then start with second node;
    # for it and each of its ancestor nodes, check to see if it's in
    # the first node's ancestor list - if so it is the lca. Return sum
    # of (cumul. distance from node1 to lca) and (cumul. distance from
    # node2 to lca)

    # find and save every ancestor of node1 (including itself)

    my %node1_ancestors;	# keys are internal ids, values are objects
    my %node1_cumul_dist;	# keys are internal ids, values 
    # are cumulative distance from node1 to given node
    my $place = $node1;		# start at node1
    my $cumul_dist = 0;

    while ( $place ){
	$node1_ancestors{$place->internal_id} = $place;
	$node1_cumul_dist{$place->internal_id} = $cumul_dist;
	if ($place->branch_length) {
	    $cumul_dist += $place->branch_length; # include current branch
	                                          # length in next iteration
	}
	$place = $place->ancestor;
    }

    # now climb up node2, for each node checking whether 
    # it's in node1_ancestors
    $place = $node2;  # start at node2
    $cumul_dist = 0;
    while ( $place ){
	foreach my $key ( keys %node1_ancestors ){ # ugh
	    if ( $place->internal_id == $key){ # we're at lca
		return $node1_cumul_dist{$key} + $cumul_dist;
	    }
	}
	# include current branch length in next iteration
	$cumul_dist += $place->branch_length; 
	$place = $place->ancestor;
    }
    $self->warn("Could not find distance!"); # should never execute, 
    # if so, there's a problem
    return undef;
}

# helper function to check lca and distance arguments

sub _check_two_nodes {
    my ($self, $nodes) = @_;

   if( ref($nodes) !~ /ARRAY/i ||
       !ref($nodes->[0]) ||
       !ref($nodes->[1])
       ) {
       $self->warn("Must provide a valid array reference for -nodes");
       return undef;
   } elsif( scalar(@$nodes) > 2 ){
       $self->warn("More than two nodes given, using first two");
   } elsif( scalar(@$nodes) < 2 ){
       $self->warn("-nodes parameter does not contain reference to two nodes");
       return undef;
   }
    unless( $nodes->[0]->isa('Bio::Tree::NodeI') &&
	    $nodes->[1]->isa('Bio::Tree::NodeI') ) {
	$self->warn("Did not provide valid Bio::Tree::NodeI objects as nodes\n");
	return undef;
    }
    return @$nodes;
}


=head2 is_monophyletic

 Title   : is_monophyletic
 Usage   : if( $tree->is_monophyletic(-nodes => \@nodes, 
				      -outgroup => $outgroup)
 Function: Will do a test of monophyly for the nodes specified
           in comparison to a chosen outgroup
 Returns : boolean
 Args    : -nodes => arrayref of nodes to test
           -outgroup => outgroup to serve as a reference


=cut

sub is_monophyletic{
   my ($self,@args) = @_;
   my ($nodes,$outgroup) = $self->_rearrange([qw(NODES OUTGROUP)],@args);

   if( ! defined $nodes || ! defined $outgroup ) {
       $self->warn("Must supply -nodes and -outgroup parameters to the method
is_monophyletic");
       return undef;
   }
   if( ref($nodes) !~ /ARRAY/i ) {
       $self->warn("Must provide a valid array reference for -nodes");
   }
   my $clade_root;
   # this is to combine multiple tests into a single node
   # order doesn't really matter as long as get_lca does its job right
   while( @$nodes > 2 ) { 
       my ($a,$b) = ( shift @$nodes, shift @$nodes);
       $clade_root = $self->get_lca(-nodes => [$a,$b] );
       unshift @$nodes, $clade_root;
   }
   $clade_root = $self->get_lca(-nodes => $nodes );
   my $og_ancestor = $outgroup->ancestor;
   while( defined ($og_ancestor ) ) {
       if( $og_ancestor->internal_id == $clade_root->internal_id ) {
           # monophyly is violated
           return 0;
       }
       $og_ancestor = $og_ancestor->ancestor;
   }
   return 1;
}

=head2 is_paraphyletic

 Title   : is_paraphyletic
 Usage   : if( $tree->is_paraphyletic(-nodes =>\@nodes,
				      -outgroup => $node) ){ }
 Function: Tests whether or not a given set of nodes are paraphyletic
           (representing the full clade) given an outgroup
 Returns : [-1,0,1] , -1 if the group is not monophyletic
                       0 if the group is not paraphyletic
                       1 if the group is paraphyletic
 Args    : -nodes => Array of Bio::Tree::NodeI objects which are in the tree
           -outgroup => a Bio::Tree::NodeI to compare the nodes to


=cut

sub is_paraphyletic{
   my ($self,@args) = @_;
   my ($nodes,$outgroup) = $self->_rearrange([qw(NODES OUTGROUP)],@args);

   if( ! defined $nodes || ! defined $outgroup ) {
       $self->warn("Must suply -nodes and -outgroup parameters to the method is_paraphyletic");
       return undef;
   }
   if( ref($nodes) !~ /ARRAY/i ) { 
       $self->warn("Must provide a valid array reference for -nodes");
       return undef;
   }

   # Algorithm
   # Find the lca
   # Find all the nodes beneath the lca
   # Test to see that none are missing from the nodes list
   my %nodehash;
   foreach my $n ( @$nodes ) {
       $nodehash{$n->internal_id} = $n;
   }
   while( @$nodes > 2 ) { 
       unshift @$nodes, $self->get_lca(-nodes => [( shift @$nodes, 
						    shift @$nodes)] );
   }
   my $clade_root = $self->get_lca(-nodes => $nodes );
   unless( defined $clade_root ) { 
       $self->warn("could not find clade root via lca");
       return undef;
   }
   my $og_ancestor = $outgroup->ancestor;

   # Is this necessary/correct for paraphyly test?
   while( defined ($og_ancestor ) ) {
       if( $og_ancestor->internal_id == $clade_root->internal_id ) {
           # monophyly is violated, could be paraphyletic
           return -1;
       }
       $og_ancestor = $og_ancestor->ancestor;
   }
   my $tree = new Bio::Tree::Tree(-root     => $clade_root,
				  -nodelete => 1);

   foreach my $n ( $tree->get_nodes() ) { 
       next unless $n->is_Leaf();
       # if any leaf node is not in the list
       # then it is part of the clade and so the list
       # must be paraphyletic
       return 1 unless (  $nodehash{$n->internal_id} );
   }
   return 0;
}


=head2 reroot

 Title   : reroot_tree
 Usage   : $tree->reroot($node);
 Function: Reroots a tree either making a new node the root
 Returns : 1 on success, 0 on failure
 Args    : Bio::Tree::NodeI that is in the tree, but is not the current root

=cut

sub reroot {
    my ($self,$new_root) = @_;
    unless (defined $new_root && $new_root->isa("Bio::Tree::NodeI")) {
	$self->warn("Must provide a valid Bio::Tree::NodeI when rerooting");
	return 0;
    }
    if( $new_root->is_Leaf() ) {
	$self->warn("Asking to root with a leaf, will use the leaf's ancestor");
	$new_root = $new_root->ancestor;
    }

    my $old_root = $self->get_root_node;
    if( $new_root == $old_root ) {
	$self->warn("Node requested for reroot is already the root node!");
	return 0;
    }

    my @path = ();	# along tree, from newroot to oldroot
    my $node = $new_root;
    while ($node) {
	push @path, $node;
	$node = $node->ancestor;
    }

    my @path_from_oldroot = reverse @path;
    for (my $i = 0; $i < @path_from_oldroot - 1; $i++) {
	my $current = $path_from_oldroot[$i];
	my $next = $path_from_oldroot[$i + 1];
	$current->remove_Descendent($next);
	$current->branch_length($next->branch_length);
	$next->add_Descendent($current);
	
    }
    $new_root->branch_length(undef);
    $self->set_root_node($new_root);

    return 1;
}

=head2 reverse_edge

 Title   : reverse_edge
 Usage   : $node->reverse_edge(child);
 Function: makes child be a parent of node
 Requires: child must be a direct descendent of node
 Returns : nothing
 Args    : Bio::Tree::NodeI that is in the tree

=cut

sub reverse_edge {
    my ($self,$node) = @_;
    delete_edge($self, $node);
    $node->add_Descendent($self);
    1;
}

=head2 delete_edge

 Title   : delete_edge
 Usage   : $node->reverse_edge(child); 
 Function: makes child be a parent of node
 Requires: child must be a direct descendent of node
 Returns : nothing
 Args    : Bio::Tree::NodeI that is in the tree

=cut

sub delete_edge {
    my ($self,$node) = @_;
    unless (defined $self && $self->isa("Bio::Tree::NodeI")) {
        $self->warn("Must provide a valid Bio::Tree::NodeI when rerooting");
        return 1;
    }
    unless (defined $node && $node->isa("Bio::Tree::NodeI")) {
        $self->warn("Must provide a valid Bio::Tree::NodeI when rerooting");
        return 1;
    }
    if( $self->{'_desc'}->{$node->internal_id} ) {
        $node->ancestor(undef);
        $self->{'_desc'}->{$node->internal_id}->ancestor(undef);
        delete $self->{'_desc'}->{$node->internal_id};
    } else {
        $self->warn("First argument must be direct parent of node");
        return 1;
    }
    1;
}

sub findnode_by_id {
    my $tree = shift;
    my $id = shift;
    my $rootnode = $tree->get_root_node;
    if ( ($rootnode->id) and ($rootnode->id eq $id) ) {
        return $rootnode;
    }
    # process all the children
    foreach my $node ( $rootnode->get_Descendents ) {
        if ( ($node->id) and ($node->id eq $id ) ) {
            return $node;
        }
    }
}

1;