comparison variant_effect_predictor/Bio/Tree/TreeFunctionsI.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 # $Id: TreeFunctionsI.pm,v 1.5.2.3 2003/09/14 20:18:25 jason Exp $
2 #
3 # BioPerl module for Bio::Tree::TreeFunctionsI
4 #
5 # Cared for by Jason Stajich <jason@bioperl.org>
6 #
7 # Copyright Jason Stajich
8 #
9 # You may distribute this module under the same terms as perl itself
10
11 # POD documentation - main docs before the code
12
13 =head1 NAME
14
15 Bio::Tree::TreeFunctionsI - Decorated Interface implementing basic Tree exploration methods
16
17 =head1 SYNOPSIS
18
19 use Bio::TreeIO;
20 my $in = new Bio::TreeIO(-format => 'newick', -file => 'tree.tre');
21
22 my $tree = $in->next_tree;
23
24 my @nodes = $tree->find_node('id1');
25
26 if( $tree->is_monophyletic(-clade => @nodes, -outgroup => $outnode) ){
27
28 }
29
30 =head1 DESCRIPTION
31
32 Describe the interface here
33
34 =head1 FEEDBACK
35
36 =head2 Mailing Lists
37
38 User feedback is an integral part of the evolution of this and other
39 Bioperl modules. Send your comments and suggestions preferably to
40 the Bioperl mailing list. Your participation is much appreciated.
41
42 bioperl-l@bioperl.org - General discussion
43 http://bioperl.org/MailList.shtml - About the mailing lists
44
45 =head2 Reporting Bugs
46
47 Report bugs to the Bioperl bug tracking system to help us keep track
48 of the bugs and their resolution. Bug reports can be submitted via
49 email or the web:
50
51 bioperl-bugs@bioperl.org
52 http://bugzilla.bioperl.org/
53
54 =head1 AUTHOR - Jason Stajich, Aaron Mackey, Justin Reese
55
56 Email jason-at-bioperl-dot-org
57 Email amackey-at-virginia.edu
58 Email jtr4v-at-virginia.edu
59
60 =head1 CONTRIBUTORS
61
62 Additional contributors names and emails here
63
64 Rerooting code was worked on by
65
66 Daniel Barker d.barker-at-reading.ac.uk
67 Ramiro Barrantes Ramiro.Barrantes-at-uvm.edu
68
69 =head1 APPENDIX
70
71 The rest of the documentation details each of the object methods.
72 Internal methods are usually preceded with a _
73
74 =cut
75
76
77 # Let the code begin...
78
79
80 package Bio::Tree::TreeFunctionsI;
81 use vars qw(@ISA);
82 use strict;
83 use Bio::Tree::TreeI;
84
85 @ISA = qw(Bio::Tree::TreeI);
86
87 =head2 find_node
88
89 Title : find_node
90 Usage : my @nodes = $self->find_node(-id => 'node1');
91 Function: returns all nodes that match a specific field, by default this
92 is id, but different branch_length,
93 Returns : List of nodes which matched search
94 Args : text string to search for
95 OR
96 -fieldname => $textstring
97
98 =cut
99
100 sub find_node {
101 my ($self,$type,$field) = @_;
102 if( ! defined $type ) {
103 $self->warn("Must request a either a string or field and string when searching");
104 }
105
106 # all this work for a '-' named field
107 # is so that we could potentially
108 # expand to other constraints in
109 # different implementations
110 # like 'find all nodes with boostrap < XX'
111
112 if( ! defined $field ) {
113 # only 1 argument, default to searching by id
114 $field= $type;
115 $type = 'id';
116 } else {
117 $type =~ s/^-//;
118 }
119
120 # could actually do this by testing $rootnode->can($type) but
121 # it is possible that a tree is implemeted with different node types
122 # - although it is unlikely that the root node would be richer than the
123 # leaf nodes. Can't handle NHX tags right now
124
125 unless( $type eq 'id' || $type eq 'name' ||
126 $type eq 'bootstrap' || $type eq 'description' ||
127 $type eq 'internal_id') {
128 $self->warn("unknown search type $type - will try anyways");
129 }
130 my @nodes = grep { $_->can($type) && defined $_->$type() &&
131 $_->$type() eq $field } $self->get_nodes();
132
133 if ( wantarray) {
134 return @nodes;
135 } else {
136 if( @nodes > 1 ) {
137 $self->warn("More than 1 node found but caller requested scalar, only returning first node");
138 }
139 return shift @nodes;
140 }
141 }
142
143 =head2 remove_Node
144
145 Title : remove_Node
146 Usage : $tree->remove_Node($node)
147 Function: Removes a node from the tree
148 Returns : boolean represent status of success
149 Args : either Bio::Tree::NodeI or string of the node id
150
151
152 =cut
153
154 sub remove_Node {
155 my ($self,$input) = @_;
156 my $node = undef;
157 unless( ref($input) ) {
158 $node = $self->find_node($input);
159 } elsif( ! $input->isa('Bio::Tree::NodeI') ) {
160 $self->warn("Did not provide either a valid Bio::Tree::NodeI object to remove_node or the node name");
161 return 0;
162 } else {
163 $node = $input;
164 }
165 if( ! $node->ancestor && $self->get_root_node->internal_id != $node->internal_id) {
166 $self->warn("Node (".$node->to_string . ") has no ancestor, can't remove!");
167 } else {
168 $node->ancestor->remove_Descendent($node);
169 }
170 }
171
172
173 # Added for Justin Reese by Jason
174
175 =head2 get_lca
176
177 Title : get_lca
178 Usage : get_lca(-nodes => \@nodes )
179 Function: given two nodes, returns the lowest common ancestor
180 Returns : node object
181 Args : -nodes => arrayref of nodes to test
182
183
184 =cut
185
186 sub get_lca {
187 my ($self,@args) = @_;
188 my ($nodes) = $self->_rearrange([qw(NODES)],@args);
189 if( ! defined $nodes ) {
190 $self->warn("Must supply -nodes parameter to get_lca() method");
191 return undef;
192 }
193 my ($node1,$node2) = $self->_check_two_nodes($nodes);
194 return undef unless $node1 && $node2;
195
196 # algorithm: Start with first node, find and save every node from it to
197 # root. Then start with second node; for it and each of its ancestor
198 # nodes, check to see if it's in the first node's ancestor list - if
199 # so it is the lca.
200 #
201 # This is very slow and naive, but I somehow doubt the overhead
202 # of mapping the tree to a complete binary tree and doing the linear
203 # lca search would be worth the overhead, especially for small trees.
204 # Maybe someday I'll write a linear get_lca and find out.
205
206 # find and save every ancestor of node1 (including itself)
207
208 my %node1_ancestors; # keys are internal ids, values are objects
209 my $place = $node1; # start at node1
210
211 while ( $place ){
212 $node1_ancestors{$place->internal_id} = $place;
213 $place = $place->ancestor;
214 }
215
216 # now climb up node2, for each node checking whether
217 # it's in node1_ancestors
218 $place = $node2; # start at node2
219 while ( $place ){
220 foreach my $key ( keys %node1_ancestors ){ # ugh
221 if ( $place->internal_id == $key){
222 return $node1_ancestors{$key};
223 }
224 }
225 $place = $place->ancestor;
226 }
227 $self->warn("Could not find lca!"); # should never execute,
228 # if so, there's a problem
229 return undef;
230 }
231
232 # Added for Justin Reese by Jason
233
234 =head2 distance
235
236 Title : distance
237 Usage : distance(-nodes => \@nodes )
238 Function: returns the distance between two given nodes
239 Returns : numerical distance
240 Args : -nodes => arrayref of nodes to test
241
242
243 =cut
244
245 sub distance {
246 my ($self,@args) = @_;
247 my ($nodes) = $self->_rearrange([qw(NODES)],@args);
248 if( ! defined $nodes ) {
249 $self->warn("Must supply -nodes parameter to distance() method");
250 return undef;
251 }
252 my ($node1,$node2) = $self->_check_two_nodes($nodes);
253 # algorithm:
254
255 # Find lca: Start with first node, find and save every node from it
256 # to root, saving cumulative distance. Then start with second node;
257 # for it and each of its ancestor nodes, check to see if it's in
258 # the first node's ancestor list - if so it is the lca. Return sum
259 # of (cumul. distance from node1 to lca) and (cumul. distance from
260 # node2 to lca)
261
262 # find and save every ancestor of node1 (including itself)
263
264 my %node1_ancestors; # keys are internal ids, values are objects
265 my %node1_cumul_dist; # keys are internal ids, values
266 # are cumulative distance from node1 to given node
267 my $place = $node1; # start at node1
268 my $cumul_dist = 0;
269
270 while ( $place ){
271 $node1_ancestors{$place->internal_id} = $place;
272 $node1_cumul_dist{$place->internal_id} = $cumul_dist;
273 if ($place->branch_length) {
274 $cumul_dist += $place->branch_length; # include current branch
275 # length in next iteration
276 }
277 $place = $place->ancestor;
278 }
279
280 # now climb up node2, for each node checking whether
281 # it's in node1_ancestors
282 $place = $node2; # start at node2
283 $cumul_dist = 0;
284 while ( $place ){
285 foreach my $key ( keys %node1_ancestors ){ # ugh
286 if ( $place->internal_id == $key){ # we're at lca
287 return $node1_cumul_dist{$key} + $cumul_dist;
288 }
289 }
290 # include current branch length in next iteration
291 $cumul_dist += $place->branch_length;
292 $place = $place->ancestor;
293 }
294 $self->warn("Could not find distance!"); # should never execute,
295 # if so, there's a problem
296 return undef;
297 }
298
299 # helper function to check lca and distance arguments
300
301 sub _check_two_nodes {
302 my ($self, $nodes) = @_;
303
304 if( ref($nodes) !~ /ARRAY/i ||
305 !ref($nodes->[0]) ||
306 !ref($nodes->[1])
307 ) {
308 $self->warn("Must provide a valid array reference for -nodes");
309 return undef;
310 } elsif( scalar(@$nodes) > 2 ){
311 $self->warn("More than two nodes given, using first two");
312 } elsif( scalar(@$nodes) < 2 ){
313 $self->warn("-nodes parameter does not contain reference to two nodes");
314 return undef;
315 }
316 unless( $nodes->[0]->isa('Bio::Tree::NodeI') &&
317 $nodes->[1]->isa('Bio::Tree::NodeI') ) {
318 $self->warn("Did not provide valid Bio::Tree::NodeI objects as nodes\n");
319 return undef;
320 }
321 return @$nodes;
322 }
323
324
325 =head2 is_monophyletic
326
327 Title : is_monophyletic
328 Usage : if( $tree->is_monophyletic(-nodes => \@nodes,
329 -outgroup => $outgroup)
330 Function: Will do a test of monophyly for the nodes specified
331 in comparison to a chosen outgroup
332 Returns : boolean
333 Args : -nodes => arrayref of nodes to test
334 -outgroup => outgroup to serve as a reference
335
336
337 =cut
338
339 sub is_monophyletic{
340 my ($self,@args) = @_;
341 my ($nodes,$outgroup) = $self->_rearrange([qw(NODES OUTGROUP)],@args);
342
343 if( ! defined $nodes || ! defined $outgroup ) {
344 $self->warn("Must supply -nodes and -outgroup parameters to the method
345 is_monophyletic");
346 return undef;
347 }
348 if( ref($nodes) !~ /ARRAY/i ) {
349 $self->warn("Must provide a valid array reference for -nodes");
350 }
351 my $clade_root;
352 # this is to combine multiple tests into a single node
353 # order doesn't really matter as long as get_lca does its job right
354 while( @$nodes > 2 ) {
355 my ($a,$b) = ( shift @$nodes, shift @$nodes);
356 $clade_root = $self->get_lca(-nodes => [$a,$b] );
357 unshift @$nodes, $clade_root;
358 }
359 $clade_root = $self->get_lca(-nodes => $nodes );
360 my $og_ancestor = $outgroup->ancestor;
361 while( defined ($og_ancestor ) ) {
362 if( $og_ancestor->internal_id == $clade_root->internal_id ) {
363 # monophyly is violated
364 return 0;
365 }
366 $og_ancestor = $og_ancestor->ancestor;
367 }
368 return 1;
369 }
370
371 =head2 is_paraphyletic
372
373 Title : is_paraphyletic
374 Usage : if( $tree->is_paraphyletic(-nodes =>\@nodes,
375 -outgroup => $node) ){ }
376 Function: Tests whether or not a given set of nodes are paraphyletic
377 (representing the full clade) given an outgroup
378 Returns : [-1,0,1] , -1 if the group is not monophyletic
379 0 if the group is not paraphyletic
380 1 if the group is paraphyletic
381 Args : -nodes => Array of Bio::Tree::NodeI objects which are in the tree
382 -outgroup => a Bio::Tree::NodeI to compare the nodes to
383
384
385 =cut
386
387 sub is_paraphyletic{
388 my ($self,@args) = @_;
389 my ($nodes,$outgroup) = $self->_rearrange([qw(NODES OUTGROUP)],@args);
390
391 if( ! defined $nodes || ! defined $outgroup ) {
392 $self->warn("Must suply -nodes and -outgroup parameters to the method is_paraphyletic");
393 return undef;
394 }
395 if( ref($nodes) !~ /ARRAY/i ) {
396 $self->warn("Must provide a valid array reference for -nodes");
397 return undef;
398 }
399
400 # Algorithm
401 # Find the lca
402 # Find all the nodes beneath the lca
403 # Test to see that none are missing from the nodes list
404 my %nodehash;
405 foreach my $n ( @$nodes ) {
406 $nodehash{$n->internal_id} = $n;
407 }
408 while( @$nodes > 2 ) {
409 unshift @$nodes, $self->get_lca(-nodes => [( shift @$nodes,
410 shift @$nodes)] );
411 }
412 my $clade_root = $self->get_lca(-nodes => $nodes );
413 unless( defined $clade_root ) {
414 $self->warn("could not find clade root via lca");
415 return undef;
416 }
417 my $og_ancestor = $outgroup->ancestor;
418
419 # Is this necessary/correct for paraphyly test?
420 while( defined ($og_ancestor ) ) {
421 if( $og_ancestor->internal_id == $clade_root->internal_id ) {
422 # monophyly is violated, could be paraphyletic
423 return -1;
424 }
425 $og_ancestor = $og_ancestor->ancestor;
426 }
427 my $tree = new Bio::Tree::Tree(-root => $clade_root,
428 -nodelete => 1);
429
430 foreach my $n ( $tree->get_nodes() ) {
431 next unless $n->is_Leaf();
432 # if any leaf node is not in the list
433 # then it is part of the clade and so the list
434 # must be paraphyletic
435 return 1 unless ( $nodehash{$n->internal_id} );
436 }
437 return 0;
438 }
439
440
441 =head2 reroot
442
443 Title : reroot_tree
444 Usage : $tree->reroot($node);
445 Function: Reroots a tree either making a new node the root
446 Returns : 1 on success, 0 on failure
447 Args : Bio::Tree::NodeI that is in the tree, but is not the current root
448
449 =cut
450
451 sub reroot {
452 my ($self,$new_root) = @_;
453 unless (defined $new_root && $new_root->isa("Bio::Tree::NodeI")) {
454 $self->warn("Must provide a valid Bio::Tree::NodeI when rerooting");
455 return 0;
456 }
457 if( $new_root->is_Leaf() ) {
458 $self->warn("Asking to root with a leaf, will use the leaf's ancestor");
459 $new_root = $new_root->ancestor;
460 }
461
462 my $old_root = $self->get_root_node;
463 if( $new_root == $old_root ) {
464 $self->warn("Node requested for reroot is already the root node!");
465 return 0;
466 }
467
468 my @path = (); # along tree, from newroot to oldroot
469 my $node = $new_root;
470 while ($node) {
471 push @path, $node;
472 $node = $node->ancestor;
473 }
474
475 my @path_from_oldroot = reverse @path;
476 for (my $i = 0; $i < @path_from_oldroot - 1; $i++) {
477 my $current = $path_from_oldroot[$i];
478 my $next = $path_from_oldroot[$i + 1];
479 $current->remove_Descendent($next);
480 $current->branch_length($next->branch_length);
481 $next->add_Descendent($current);
482
483 }
484 $new_root->branch_length(undef);
485 $self->set_root_node($new_root);
486
487 return 1;
488 }
489
490 =head2 reverse_edge
491
492 Title : reverse_edge
493 Usage : $node->reverse_edge(child);
494 Function: makes child be a parent of node
495 Requires: child must be a direct descendent of node
496 Returns : nothing
497 Args : Bio::Tree::NodeI that is in the tree
498
499 =cut
500
501 sub reverse_edge {
502 my ($self,$node) = @_;
503 delete_edge($self, $node);
504 $node->add_Descendent($self);
505 1;
506 }
507
508 =head2 delete_edge
509
510 Title : delete_edge
511 Usage : $node->reverse_edge(child);
512 Function: makes child be a parent of node
513 Requires: child must be a direct descendent of node
514 Returns : nothing
515 Args : Bio::Tree::NodeI that is in the tree
516
517 =cut
518
519 sub delete_edge {
520 my ($self,$node) = @_;
521 unless (defined $self && $self->isa("Bio::Tree::NodeI")) {
522 $self->warn("Must provide a valid Bio::Tree::NodeI when rerooting");
523 return 1;
524 }
525 unless (defined $node && $node->isa("Bio::Tree::NodeI")) {
526 $self->warn("Must provide a valid Bio::Tree::NodeI when rerooting");
527 return 1;
528 }
529 if( $self->{'_desc'}->{$node->internal_id} ) {
530 $node->ancestor(undef);
531 $self->{'_desc'}->{$node->internal_id}->ancestor(undef);
532 delete $self->{'_desc'}->{$node->internal_id};
533 } else {
534 $self->warn("First argument must be direct parent of node");
535 return 1;
536 }
537 1;
538 }
539
540 sub findnode_by_id {
541 my $tree = shift;
542 my $id = shift;
543 my $rootnode = $tree->get_root_node;
544 if ( ($rootnode->id) and ($rootnode->id eq $id) ) {
545 return $rootnode;
546 }
547 # process all the children
548 foreach my $node ( $rootnode->get_Descendents ) {
549 if ( ($node->id) and ($node->id eq $id ) ) {
550 return $node;
551 }
552 }
553 }
554
555 1;