diff variant_effect_predictor/Bio/Tree/TreeFunctionsI.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/Tree/TreeFunctionsI.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,555 @@
+# $Id: TreeFunctionsI.pm,v 1.5.2.3 2003/09/14 20:18:25 jason Exp $
+#
+# BioPerl module for Bio::Tree::TreeFunctionsI
+#
+# Cared for by Jason Stajich <jason@bioperl.org>
+#
+# Copyright Jason Stajich
+#
+# You may distribute this module under the same terms as perl itself
+
+# POD documentation - main docs before the code
+
+=head1 NAME
+
+Bio::Tree::TreeFunctionsI - Decorated Interface implementing basic Tree exploration methods
+
+=head1 SYNOPSIS
+
+  use Bio::TreeIO;
+  my $in = new Bio::TreeIO(-format => 'newick', -file => 'tree.tre');
+
+  my $tree = $in->next_tree;
+
+  my @nodes = $tree->find_node('id1');
+
+  if( $tree->is_monophyletic(-clade => @nodes, -outgroup => $outnode) ){
+
+  }
+
+=head1 DESCRIPTION
+
+Describe the interface here
+
+=head1 FEEDBACK
+
+=head2 Mailing Lists
+
+User feedback is an integral part of the evolution of this and other
+Bioperl modules. Send your comments and suggestions preferably to
+the Bioperl mailing list.  Your participation is much appreciated.
+
+  bioperl-l@bioperl.org              - General discussion
+  http://bioperl.org/MailList.shtml  - About the mailing lists
+
+=head2 Reporting Bugs
+
+Report bugs to the Bioperl bug tracking system to help us keep track
+of the bugs and their resolution. Bug reports can be submitted via
+email or the web:
+
+  bioperl-bugs@bioperl.org
+  http://bugzilla.bioperl.org/
+
+=head1 AUTHOR - Jason Stajich, Aaron Mackey, Justin Reese
+
+Email jason-at-bioperl-dot-org
+Email amackey-at-virginia.edu
+Email jtr4v-at-virginia.edu
+
+=head1 CONTRIBUTORS
+
+Additional contributors names and emails here
+
+Rerooting code was worked on by
+
+  Daniel Barker d.barker-at-reading.ac.uk
+  Ramiro Barrantes Ramiro.Barrantes-at-uvm.edu
+
+=head1 APPENDIX
+
+The rest of the documentation details each of the object methods.
+Internal methods are usually preceded with a _
+
+=cut
+
+
+# Let the code begin...
+
+
+package Bio::Tree::TreeFunctionsI;
+use vars qw(@ISA);
+use strict;
+use  Bio::Tree::TreeI;
+
+@ISA = qw(Bio::Tree::TreeI);
+
+=head2 find_node
+
+ Title   : find_node
+ Usage   : my @nodes = $self->find_node(-id => 'node1');
+ Function: returns all nodes that match a specific field, by default this
+           is id, but different branch_length, 
+ Returns : List of nodes which matched search
+ Args    : text string to search for
+           OR
+           -fieldname => $textstring
+
+=cut
+
+sub find_node {
+   my ($self,$type,$field) = @_;
+   if( ! defined $type ) { 
+       $self->warn("Must request a either a string or field and string when searching");
+   }
+
+   # all this work for a '-' named field
+   # is so that we could potentially 
+   # expand to other constraints in 
+   # different implementations
+   # like 'find all nodes with boostrap < XX'
+
+   if( ! defined $field ) { 
+       # only 1 argument, default to searching by id
+       $field= $type; 
+       $type = 'id';
+   } else {   
+       $type =~ s/^-//;
+   }
+
+   # could actually do this by testing $rootnode->can($type) but
+   # it is possible that a tree is implemeted with different node types
+   # - although it is unlikely that the root node would be richer than the 
+   # leaf nodes.  Can't handle NHX tags right now
+
+   unless( $type eq 'id' || $type eq 'name' ||
+	   $type eq 'bootstrap' || $type eq 'description' ||
+	   $type eq 'internal_id') {
+       $self->warn("unknown search type $type - will try anyways");
+   } 
+   my @nodes = grep { $_->can($type) && defined $_->$type() &&
+		     $_->$type() eq $field } $self->get_nodes();
+
+   if ( wantarray) { 
+       return @nodes;
+   } else { 
+       if( @nodes > 1 ) { 
+	   $self->warn("More than 1 node found but caller requested scalar, only returning first node");
+       }
+       return shift @nodes;
+   }
+}
+
+=head2 remove_Node
+
+ Title   : remove_Node
+ Usage   : $tree->remove_Node($node)
+ Function: Removes a node from the tree
+ Returns : boolean represent status of success
+ Args    : either Bio::Tree::NodeI or string of the node id
+
+
+=cut
+
+sub remove_Node {
+   my ($self,$input) = @_;
+   my $node = undef;
+   unless( ref($input) ) {
+       $node = $self->find_node($input);
+   }  elsif( ! $input->isa('Bio::Tree::NodeI') ) {
+       $self->warn("Did not provide either a valid Bio::Tree::NodeI object to remove_node or the node name");
+       return 0;
+   } else { 
+       $node = $input;
+   }
+   if( ! $node->ancestor && $self->get_root_node->internal_id != $node->internal_id) {
+       $self->warn("Node (".$node->to_string . ") has no ancestor, can't remove!");
+   } else { 
+       $node->ancestor->remove_Descendent($node);
+   }
+}
+
+
+# Added for Justin Reese by Jason
+
+=head2 get_lca
+
+ Title   : get_lca
+ Usage   : get_lca(-nodes => \@nodes )
+ Function: given two nodes, returns the lowest common ancestor
+ Returns : node object
+ Args    : -nodes => arrayref of nodes to test
+
+
+=cut
+
+sub get_lca {
+    my ($self,@args) = @_;
+    my ($nodes) = $self->_rearrange([qw(NODES)],@args);
+   if( ! defined $nodes ) {
+       $self->warn("Must supply -nodes parameter to get_lca() method");
+       return undef;
+   }
+    my ($node1,$node2) = $self->_check_two_nodes($nodes);
+    return undef unless $node1 && $node2;
+
+    # algorithm: Start with first node, find and save every node from it to
+    #    root. Then start with second node; for it and each of its ancestor
+    #    nodes, check to see if it's in the first node's ancestor list - if
+    #    so it is the lca.
+    #
+    # This is very slow and naive, but I somehow doubt the overhead
+    # of mapping the tree to a complete binary tree and doing the linear
+    # lca search would be worth the overhead, especially for small trees.
+    # Maybe someday I'll write a linear get_lca and find out.
+
+    # find and save every ancestor of node1 (including itself)
+
+    my %node1_ancestors;	# keys are internal ids, values are objects
+    my $place = $node1;		# start at node1
+
+    while ( $place ){
+	$node1_ancestors{$place->internal_id} = $place;
+	$place = $place->ancestor;
+    }
+
+    # now climb up node2, for each node checking whether 
+    # it's in node1_ancestors
+    $place = $node2;		# start at node2
+    while ( $place ){
+	foreach my $key ( keys %node1_ancestors ){ # ugh
+	    if ( $place->internal_id == $key){
+		return $node1_ancestors{$key};
+	    }
+	}
+	$place = $place->ancestor;
+    }
+    $self->warn("Could not find lca!"); # should never execute, 
+                                        # if so, there's a problem
+    return undef;
+}
+
+# Added for Justin Reese by Jason
+
+=head2 distance
+
+ Title   : distance
+ Usage   : distance(-nodes => \@nodes )
+ Function: returns the distance between two given nodes
+ Returns : numerical distance
+ Args    : -nodes => arrayref of nodes to test
+
+
+=cut
+
+sub distance {
+    my ($self,@args) = @_;
+    my ($nodes) = $self->_rearrange([qw(NODES)],@args);
+    if( ! defined $nodes ) {
+	$self->warn("Must supply -nodes parameter to distance() method");
+	return undef;
+    }
+    my ($node1,$node2) = $self->_check_two_nodes($nodes);
+    # algorithm:
+
+    # Find lca: Start with first node, find and save every node from it
+    # to root, saving cumulative distance. Then start with second node;
+    # for it and each of its ancestor nodes, check to see if it's in
+    # the first node's ancestor list - if so it is the lca. Return sum
+    # of (cumul. distance from node1 to lca) and (cumul. distance from
+    # node2 to lca)
+
+    # find and save every ancestor of node1 (including itself)
+
+    my %node1_ancestors;	# keys are internal ids, values are objects
+    my %node1_cumul_dist;	# keys are internal ids, values 
+    # are cumulative distance from node1 to given node
+    my $place = $node1;		# start at node1
+    my $cumul_dist = 0;
+
+    while ( $place ){
+	$node1_ancestors{$place->internal_id} = $place;
+	$node1_cumul_dist{$place->internal_id} = $cumul_dist;
+	if ($place->branch_length) {
+	    $cumul_dist += $place->branch_length; # include current branch
+	                                          # length in next iteration
+	}
+	$place = $place->ancestor;
+    }
+
+    # now climb up node2, for each node checking whether 
+    # it's in node1_ancestors
+    $place = $node2;  # start at node2
+    $cumul_dist = 0;
+    while ( $place ){
+	foreach my $key ( keys %node1_ancestors ){ # ugh
+	    if ( $place->internal_id == $key){ # we're at lca
+		return $node1_cumul_dist{$key} + $cumul_dist;
+	    }
+	}
+	# include current branch length in next iteration
+	$cumul_dist += $place->branch_length; 
+	$place = $place->ancestor;
+    }
+    $self->warn("Could not find distance!"); # should never execute, 
+    # if so, there's a problem
+    return undef;
+}
+
+# helper function to check lca and distance arguments
+
+sub _check_two_nodes {
+    my ($self, $nodes) = @_;
+
+   if( ref($nodes) !~ /ARRAY/i ||
+       !ref($nodes->[0]) ||
+       !ref($nodes->[1])
+       ) {
+       $self->warn("Must provide a valid array reference for -nodes");
+       return undef;
+   } elsif( scalar(@$nodes) > 2 ){
+       $self->warn("More than two nodes given, using first two");
+   } elsif( scalar(@$nodes) < 2 ){
+       $self->warn("-nodes parameter does not contain reference to two nodes");
+       return undef;
+   }
+    unless( $nodes->[0]->isa('Bio::Tree::NodeI') &&
+	    $nodes->[1]->isa('Bio::Tree::NodeI') ) {
+	$self->warn("Did not provide valid Bio::Tree::NodeI objects as nodes\n");
+	return undef;
+    }
+    return @$nodes;
+}
+
+
+=head2 is_monophyletic
+
+ Title   : is_monophyletic
+ Usage   : if( $tree->is_monophyletic(-nodes => \@nodes, 
+				      -outgroup => $outgroup)
+ Function: Will do a test of monophyly for the nodes specified
+           in comparison to a chosen outgroup
+ Returns : boolean
+ Args    : -nodes => arrayref of nodes to test
+           -outgroup => outgroup to serve as a reference
+
+
+=cut
+
+sub is_monophyletic{
+   my ($self,@args) = @_;
+   my ($nodes,$outgroup) = $self->_rearrange([qw(NODES OUTGROUP)],@args);
+
+   if( ! defined $nodes || ! defined $outgroup ) {
+       $self->warn("Must supply -nodes and -outgroup parameters to the method
+is_monophyletic");
+       return undef;
+   }
+   if( ref($nodes) !~ /ARRAY/i ) {
+       $self->warn("Must provide a valid array reference for -nodes");
+   }
+   my $clade_root;
+   # this is to combine multiple tests into a single node
+   # order doesn't really matter as long as get_lca does its job right
+   while( @$nodes > 2 ) { 
+       my ($a,$b) = ( shift @$nodes, shift @$nodes);
+       $clade_root = $self->get_lca(-nodes => [$a,$b] );
+       unshift @$nodes, $clade_root;
+   }
+   $clade_root = $self->get_lca(-nodes => $nodes );
+   my $og_ancestor = $outgroup->ancestor;
+   while( defined ($og_ancestor ) ) {
+       if( $og_ancestor->internal_id == $clade_root->internal_id ) {
+           # monophyly is violated
+           return 0;
+       }
+       $og_ancestor = $og_ancestor->ancestor;
+   }
+   return 1;
+}
+
+=head2 is_paraphyletic
+
+ Title   : is_paraphyletic
+ Usage   : if( $tree->is_paraphyletic(-nodes =>\@nodes,
+				      -outgroup => $node) ){ }
+ Function: Tests whether or not a given set of nodes are paraphyletic
+           (representing the full clade) given an outgroup
+ Returns : [-1,0,1] , -1 if the group is not monophyletic
+                       0 if the group is not paraphyletic
+                       1 if the group is paraphyletic
+ Args    : -nodes => Array of Bio::Tree::NodeI objects which are in the tree
+           -outgroup => a Bio::Tree::NodeI to compare the nodes to
+
+
+=cut
+
+sub is_paraphyletic{
+   my ($self,@args) = @_;
+   my ($nodes,$outgroup) = $self->_rearrange([qw(NODES OUTGROUP)],@args);
+
+   if( ! defined $nodes || ! defined $outgroup ) {
+       $self->warn("Must suply -nodes and -outgroup parameters to the method is_paraphyletic");
+       return undef;
+   }
+   if( ref($nodes) !~ /ARRAY/i ) { 
+       $self->warn("Must provide a valid array reference for -nodes");
+       return undef;
+   }
+
+   # Algorithm
+   # Find the lca
+   # Find all the nodes beneath the lca
+   # Test to see that none are missing from the nodes list
+   my %nodehash;
+   foreach my $n ( @$nodes ) {
+       $nodehash{$n->internal_id} = $n;
+   }
+   while( @$nodes > 2 ) { 
+       unshift @$nodes, $self->get_lca(-nodes => [( shift @$nodes, 
+						    shift @$nodes)] );
+   }
+   my $clade_root = $self->get_lca(-nodes => $nodes );
+   unless( defined $clade_root ) { 
+       $self->warn("could not find clade root via lca");
+       return undef;
+   }
+   my $og_ancestor = $outgroup->ancestor;
+
+   # Is this necessary/correct for paraphyly test?
+   while( defined ($og_ancestor ) ) {
+       if( $og_ancestor->internal_id == $clade_root->internal_id ) {
+           # monophyly is violated, could be paraphyletic
+           return -1;
+       }
+       $og_ancestor = $og_ancestor->ancestor;
+   }
+   my $tree = new Bio::Tree::Tree(-root     => $clade_root,
+				  -nodelete => 1);
+
+   foreach my $n ( $tree->get_nodes() ) { 
+       next unless $n->is_Leaf();
+       # if any leaf node is not in the list
+       # then it is part of the clade and so the list
+       # must be paraphyletic
+       return 1 unless (  $nodehash{$n->internal_id} );
+   }
+   return 0;
+}
+
+
+=head2 reroot
+
+ Title   : reroot_tree
+ Usage   : $tree->reroot($node);
+ Function: Reroots a tree either making a new node the root
+ Returns : 1 on success, 0 on failure
+ Args    : Bio::Tree::NodeI that is in the tree, but is not the current root
+
+=cut
+
+sub reroot {
+    my ($self,$new_root) = @_;
+    unless (defined $new_root && $new_root->isa("Bio::Tree::NodeI")) {
+	$self->warn("Must provide a valid Bio::Tree::NodeI when rerooting");
+	return 0;
+    }
+    if( $new_root->is_Leaf() ) {
+	$self->warn("Asking to root with a leaf, will use the leaf's ancestor");
+	$new_root = $new_root->ancestor;
+    }
+
+    my $old_root = $self->get_root_node;
+    if( $new_root == $old_root ) {
+	$self->warn("Node requested for reroot is already the root node!");
+	return 0;
+    }
+
+    my @path = ();	# along tree, from newroot to oldroot
+    my $node = $new_root;
+    while ($node) {
+	push @path, $node;
+	$node = $node->ancestor;
+    }
+
+    my @path_from_oldroot = reverse @path;
+    for (my $i = 0; $i < @path_from_oldroot - 1; $i++) {
+	my $current = $path_from_oldroot[$i];
+	my $next = $path_from_oldroot[$i + 1];
+	$current->remove_Descendent($next);
+	$current->branch_length($next->branch_length);
+	$next->add_Descendent($current);
+	
+    }
+    $new_root->branch_length(undef);
+    $self->set_root_node($new_root);
+
+    return 1;
+}
+
+=head2 reverse_edge
+
+ Title   : reverse_edge
+ Usage   : $node->reverse_edge(child);
+ Function: makes child be a parent of node
+ Requires: child must be a direct descendent of node
+ Returns : nothing
+ Args    : Bio::Tree::NodeI that is in the tree
+
+=cut
+
+sub reverse_edge {
+    my ($self,$node) = @_;
+    delete_edge($self, $node);
+    $node->add_Descendent($self);
+    1;
+}
+
+=head2 delete_edge
+
+ Title   : delete_edge
+ Usage   : $node->reverse_edge(child); 
+ Function: makes child be a parent of node
+ Requires: child must be a direct descendent of node
+ Returns : nothing
+ Args    : Bio::Tree::NodeI that is in the tree
+
+=cut
+
+sub delete_edge {
+    my ($self,$node) = @_;
+    unless (defined $self && $self->isa("Bio::Tree::NodeI")) {
+        $self->warn("Must provide a valid Bio::Tree::NodeI when rerooting");
+        return 1;
+    }
+    unless (defined $node && $node->isa("Bio::Tree::NodeI")) {
+        $self->warn("Must provide a valid Bio::Tree::NodeI when rerooting");
+        return 1;
+    }
+    if( $self->{'_desc'}->{$node->internal_id} ) {
+        $node->ancestor(undef);
+        $self->{'_desc'}->{$node->internal_id}->ancestor(undef);
+        delete $self->{'_desc'}->{$node->internal_id};
+    } else {
+        $self->warn("First argument must be direct parent of node");
+        return 1;
+    }
+    1;
+}
+
+sub findnode_by_id {
+    my $tree = shift;
+    my $id = shift;
+    my $rootnode = $tree->get_root_node;
+    if ( ($rootnode->id) and ($rootnode->id eq $id) ) {
+        return $rootnode;
+    }
+    # process all the children
+    foreach my $node ( $rootnode->get_Descendents ) {
+        if ( ($node->id) and ($node->id eq $id ) ) {
+            return $node;
+        }
+    }
+}
+
+1;