0
|
1 # $Id: TreeFunctionsI.pm,v 1.5.2.3 2003/09/14 20:18:25 jason Exp $
|
|
2 #
|
|
3 # BioPerl module for Bio::Tree::TreeFunctionsI
|
|
4 #
|
|
5 # Cared for by Jason Stajich <jason@bioperl.org>
|
|
6 #
|
|
7 # Copyright Jason Stajich
|
|
8 #
|
|
9 # You may distribute this module under the same terms as perl itself
|
|
10
|
|
11 # POD documentation - main docs before the code
|
|
12
|
|
13 =head1 NAME
|
|
14
|
|
15 Bio::Tree::TreeFunctionsI - Decorated Interface implementing basic Tree exploration methods
|
|
16
|
|
17 =head1 SYNOPSIS
|
|
18
|
|
19 use Bio::TreeIO;
|
|
20 my $in = new Bio::TreeIO(-format => 'newick', -file => 'tree.tre');
|
|
21
|
|
22 my $tree = $in->next_tree;
|
|
23
|
|
24 my @nodes = $tree->find_node('id1');
|
|
25
|
|
26 if( $tree->is_monophyletic(-clade => @nodes, -outgroup => $outnode) ){
|
|
27
|
|
28 }
|
|
29
|
|
30 =head1 DESCRIPTION
|
|
31
|
|
32 Describe the interface here
|
|
33
|
|
34 =head1 FEEDBACK
|
|
35
|
|
36 =head2 Mailing Lists
|
|
37
|
|
38 User feedback is an integral part of the evolution of this and other
|
|
39 Bioperl modules. Send your comments and suggestions preferably to
|
|
40 the Bioperl mailing list. Your participation is much appreciated.
|
|
41
|
|
42 bioperl-l@bioperl.org - General discussion
|
|
43 http://bioperl.org/MailList.shtml - About the mailing lists
|
|
44
|
|
45 =head2 Reporting Bugs
|
|
46
|
|
47 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
48 of the bugs and their resolution. Bug reports can be submitted via
|
|
49 email or the web:
|
|
50
|
|
51 bioperl-bugs@bioperl.org
|
|
52 http://bugzilla.bioperl.org/
|
|
53
|
|
54 =head1 AUTHOR - Jason Stajich, Aaron Mackey, Justin Reese
|
|
55
|
|
56 Email jason-at-bioperl-dot-org
|
|
57 Email amackey-at-virginia.edu
|
|
58 Email jtr4v-at-virginia.edu
|
|
59
|
|
60 =head1 CONTRIBUTORS
|
|
61
|
|
62 Additional contributors names and emails here
|
|
63
|
|
64 Rerooting code was worked on by
|
|
65
|
|
66 Daniel Barker d.barker-at-reading.ac.uk
|
|
67 Ramiro Barrantes Ramiro.Barrantes-at-uvm.edu
|
|
68
|
|
69 =head1 APPENDIX
|
|
70
|
|
71 The rest of the documentation details each of the object methods.
|
|
72 Internal methods are usually preceded with a _
|
|
73
|
|
74 =cut
|
|
75
|
|
76
|
|
77 # Let the code begin...
|
|
78
|
|
79
|
|
80 package Bio::Tree::TreeFunctionsI;
|
|
81 use vars qw(@ISA);
|
|
82 use strict;
|
|
83 use Bio::Tree::TreeI;
|
|
84
|
|
85 @ISA = qw(Bio::Tree::TreeI);
|
|
86
|
|
87 =head2 find_node
|
|
88
|
|
89 Title : find_node
|
|
90 Usage : my @nodes = $self->find_node(-id => 'node1');
|
|
91 Function: returns all nodes that match a specific field, by default this
|
|
92 is id, but different branch_length,
|
|
93 Returns : List of nodes which matched search
|
|
94 Args : text string to search for
|
|
95 OR
|
|
96 -fieldname => $textstring
|
|
97
|
|
98 =cut
|
|
99
|
|
100 sub find_node {
|
|
101 my ($self,$type,$field) = @_;
|
|
102 if( ! defined $type ) {
|
|
103 $self->warn("Must request a either a string or field and string when searching");
|
|
104 }
|
|
105
|
|
106 # all this work for a '-' named field
|
|
107 # is so that we could potentially
|
|
108 # expand to other constraints in
|
|
109 # different implementations
|
|
110 # like 'find all nodes with boostrap < XX'
|
|
111
|
|
112 if( ! defined $field ) {
|
|
113 # only 1 argument, default to searching by id
|
|
114 $field= $type;
|
|
115 $type = 'id';
|
|
116 } else {
|
|
117 $type =~ s/^-//;
|
|
118 }
|
|
119
|
|
120 # could actually do this by testing $rootnode->can($type) but
|
|
121 # it is possible that a tree is implemeted with different node types
|
|
122 # - although it is unlikely that the root node would be richer than the
|
|
123 # leaf nodes. Can't handle NHX tags right now
|
|
124
|
|
125 unless( $type eq 'id' || $type eq 'name' ||
|
|
126 $type eq 'bootstrap' || $type eq 'description' ||
|
|
127 $type eq 'internal_id') {
|
|
128 $self->warn("unknown search type $type - will try anyways");
|
|
129 }
|
|
130 my @nodes = grep { $_->can($type) && defined $_->$type() &&
|
|
131 $_->$type() eq $field } $self->get_nodes();
|
|
132
|
|
133 if ( wantarray) {
|
|
134 return @nodes;
|
|
135 } else {
|
|
136 if( @nodes > 1 ) {
|
|
137 $self->warn("More than 1 node found but caller requested scalar, only returning first node");
|
|
138 }
|
|
139 return shift @nodes;
|
|
140 }
|
|
141 }
|
|
142
|
|
143 =head2 remove_Node
|
|
144
|
|
145 Title : remove_Node
|
|
146 Usage : $tree->remove_Node($node)
|
|
147 Function: Removes a node from the tree
|
|
148 Returns : boolean represent status of success
|
|
149 Args : either Bio::Tree::NodeI or string of the node id
|
|
150
|
|
151
|
|
152 =cut
|
|
153
|
|
154 sub remove_Node {
|
|
155 my ($self,$input) = @_;
|
|
156 my $node = undef;
|
|
157 unless( ref($input) ) {
|
|
158 $node = $self->find_node($input);
|
|
159 } elsif( ! $input->isa('Bio::Tree::NodeI') ) {
|
|
160 $self->warn("Did not provide either a valid Bio::Tree::NodeI object to remove_node or the node name");
|
|
161 return 0;
|
|
162 } else {
|
|
163 $node = $input;
|
|
164 }
|
|
165 if( ! $node->ancestor && $self->get_root_node->internal_id != $node->internal_id) {
|
|
166 $self->warn("Node (".$node->to_string . ") has no ancestor, can't remove!");
|
|
167 } else {
|
|
168 $node->ancestor->remove_Descendent($node);
|
|
169 }
|
|
170 }
|
|
171
|
|
172
|
|
173 # Added for Justin Reese by Jason
|
|
174
|
|
175 =head2 get_lca
|
|
176
|
|
177 Title : get_lca
|
|
178 Usage : get_lca(-nodes => \@nodes )
|
|
179 Function: given two nodes, returns the lowest common ancestor
|
|
180 Returns : node object
|
|
181 Args : -nodes => arrayref of nodes to test
|
|
182
|
|
183
|
|
184 =cut
|
|
185
|
|
186 sub get_lca {
|
|
187 my ($self,@args) = @_;
|
|
188 my ($nodes) = $self->_rearrange([qw(NODES)],@args);
|
|
189 if( ! defined $nodes ) {
|
|
190 $self->warn("Must supply -nodes parameter to get_lca() method");
|
|
191 return undef;
|
|
192 }
|
|
193 my ($node1,$node2) = $self->_check_two_nodes($nodes);
|
|
194 return undef unless $node1 && $node2;
|
|
195
|
|
196 # algorithm: Start with first node, find and save every node from it to
|
|
197 # root. Then start with second node; for it and each of its ancestor
|
|
198 # nodes, check to see if it's in the first node's ancestor list - if
|
|
199 # so it is the lca.
|
|
200 #
|
|
201 # This is very slow and naive, but I somehow doubt the overhead
|
|
202 # of mapping the tree to a complete binary tree and doing the linear
|
|
203 # lca search would be worth the overhead, especially for small trees.
|
|
204 # Maybe someday I'll write a linear get_lca and find out.
|
|
205
|
|
206 # find and save every ancestor of node1 (including itself)
|
|
207
|
|
208 my %node1_ancestors; # keys are internal ids, values are objects
|
|
209 my $place = $node1; # start at node1
|
|
210
|
|
211 while ( $place ){
|
|
212 $node1_ancestors{$place->internal_id} = $place;
|
|
213 $place = $place->ancestor;
|
|
214 }
|
|
215
|
|
216 # now climb up node2, for each node checking whether
|
|
217 # it's in node1_ancestors
|
|
218 $place = $node2; # start at node2
|
|
219 while ( $place ){
|
|
220 foreach my $key ( keys %node1_ancestors ){ # ugh
|
|
221 if ( $place->internal_id == $key){
|
|
222 return $node1_ancestors{$key};
|
|
223 }
|
|
224 }
|
|
225 $place = $place->ancestor;
|
|
226 }
|
|
227 $self->warn("Could not find lca!"); # should never execute,
|
|
228 # if so, there's a problem
|
|
229 return undef;
|
|
230 }
|
|
231
|
|
232 # Added for Justin Reese by Jason
|
|
233
|
|
234 =head2 distance
|
|
235
|
|
236 Title : distance
|
|
237 Usage : distance(-nodes => \@nodes )
|
|
238 Function: returns the distance between two given nodes
|
|
239 Returns : numerical distance
|
|
240 Args : -nodes => arrayref of nodes to test
|
|
241
|
|
242
|
|
243 =cut
|
|
244
|
|
245 sub distance {
|
|
246 my ($self,@args) = @_;
|
|
247 my ($nodes) = $self->_rearrange([qw(NODES)],@args);
|
|
248 if( ! defined $nodes ) {
|
|
249 $self->warn("Must supply -nodes parameter to distance() method");
|
|
250 return undef;
|
|
251 }
|
|
252 my ($node1,$node2) = $self->_check_two_nodes($nodes);
|
|
253 # algorithm:
|
|
254
|
|
255 # Find lca: Start with first node, find and save every node from it
|
|
256 # to root, saving cumulative distance. Then start with second node;
|
|
257 # for it and each of its ancestor nodes, check to see if it's in
|
|
258 # the first node's ancestor list - if so it is the lca. Return sum
|
|
259 # of (cumul. distance from node1 to lca) and (cumul. distance from
|
|
260 # node2 to lca)
|
|
261
|
|
262 # find and save every ancestor of node1 (including itself)
|
|
263
|
|
264 my %node1_ancestors; # keys are internal ids, values are objects
|
|
265 my %node1_cumul_dist; # keys are internal ids, values
|
|
266 # are cumulative distance from node1 to given node
|
|
267 my $place = $node1; # start at node1
|
|
268 my $cumul_dist = 0;
|
|
269
|
|
270 while ( $place ){
|
|
271 $node1_ancestors{$place->internal_id} = $place;
|
|
272 $node1_cumul_dist{$place->internal_id} = $cumul_dist;
|
|
273 if ($place->branch_length) {
|
|
274 $cumul_dist += $place->branch_length; # include current branch
|
|
275 # length in next iteration
|
|
276 }
|
|
277 $place = $place->ancestor;
|
|
278 }
|
|
279
|
|
280 # now climb up node2, for each node checking whether
|
|
281 # it's in node1_ancestors
|
|
282 $place = $node2; # start at node2
|
|
283 $cumul_dist = 0;
|
|
284 while ( $place ){
|
|
285 foreach my $key ( keys %node1_ancestors ){ # ugh
|
|
286 if ( $place->internal_id == $key){ # we're at lca
|
|
287 return $node1_cumul_dist{$key} + $cumul_dist;
|
|
288 }
|
|
289 }
|
|
290 # include current branch length in next iteration
|
|
291 $cumul_dist += $place->branch_length;
|
|
292 $place = $place->ancestor;
|
|
293 }
|
|
294 $self->warn("Could not find distance!"); # should never execute,
|
|
295 # if so, there's a problem
|
|
296 return undef;
|
|
297 }
|
|
298
|
|
299 # helper function to check lca and distance arguments
|
|
300
|
|
301 sub _check_two_nodes {
|
|
302 my ($self, $nodes) = @_;
|
|
303
|
|
304 if( ref($nodes) !~ /ARRAY/i ||
|
|
305 !ref($nodes->[0]) ||
|
|
306 !ref($nodes->[1])
|
|
307 ) {
|
|
308 $self->warn("Must provide a valid array reference for -nodes");
|
|
309 return undef;
|
|
310 } elsif( scalar(@$nodes) > 2 ){
|
|
311 $self->warn("More than two nodes given, using first two");
|
|
312 } elsif( scalar(@$nodes) < 2 ){
|
|
313 $self->warn("-nodes parameter does not contain reference to two nodes");
|
|
314 return undef;
|
|
315 }
|
|
316 unless( $nodes->[0]->isa('Bio::Tree::NodeI') &&
|
|
317 $nodes->[1]->isa('Bio::Tree::NodeI') ) {
|
|
318 $self->warn("Did not provide valid Bio::Tree::NodeI objects as nodes\n");
|
|
319 return undef;
|
|
320 }
|
|
321 return @$nodes;
|
|
322 }
|
|
323
|
|
324
|
|
325 =head2 is_monophyletic
|
|
326
|
|
327 Title : is_monophyletic
|
|
328 Usage : if( $tree->is_monophyletic(-nodes => \@nodes,
|
|
329 -outgroup => $outgroup)
|
|
330 Function: Will do a test of monophyly for the nodes specified
|
|
331 in comparison to a chosen outgroup
|
|
332 Returns : boolean
|
|
333 Args : -nodes => arrayref of nodes to test
|
|
334 -outgroup => outgroup to serve as a reference
|
|
335
|
|
336
|
|
337 =cut
|
|
338
|
|
339 sub is_monophyletic{
|
|
340 my ($self,@args) = @_;
|
|
341 my ($nodes,$outgroup) = $self->_rearrange([qw(NODES OUTGROUP)],@args);
|
|
342
|
|
343 if( ! defined $nodes || ! defined $outgroup ) {
|
|
344 $self->warn("Must supply -nodes and -outgroup parameters to the method
|
|
345 is_monophyletic");
|
|
346 return undef;
|
|
347 }
|
|
348 if( ref($nodes) !~ /ARRAY/i ) {
|
|
349 $self->warn("Must provide a valid array reference for -nodes");
|
|
350 }
|
|
351 my $clade_root;
|
|
352 # this is to combine multiple tests into a single node
|
|
353 # order doesn't really matter as long as get_lca does its job right
|
|
354 while( @$nodes > 2 ) {
|
|
355 my ($a,$b) = ( shift @$nodes, shift @$nodes);
|
|
356 $clade_root = $self->get_lca(-nodes => [$a,$b] );
|
|
357 unshift @$nodes, $clade_root;
|
|
358 }
|
|
359 $clade_root = $self->get_lca(-nodes => $nodes );
|
|
360 my $og_ancestor = $outgroup->ancestor;
|
|
361 while( defined ($og_ancestor ) ) {
|
|
362 if( $og_ancestor->internal_id == $clade_root->internal_id ) {
|
|
363 # monophyly is violated
|
|
364 return 0;
|
|
365 }
|
|
366 $og_ancestor = $og_ancestor->ancestor;
|
|
367 }
|
|
368 return 1;
|
|
369 }
|
|
370
|
|
371 =head2 is_paraphyletic
|
|
372
|
|
373 Title : is_paraphyletic
|
|
374 Usage : if( $tree->is_paraphyletic(-nodes =>\@nodes,
|
|
375 -outgroup => $node) ){ }
|
|
376 Function: Tests whether or not a given set of nodes are paraphyletic
|
|
377 (representing the full clade) given an outgroup
|
|
378 Returns : [-1,0,1] , -1 if the group is not monophyletic
|
|
379 0 if the group is not paraphyletic
|
|
380 1 if the group is paraphyletic
|
|
381 Args : -nodes => Array of Bio::Tree::NodeI objects which are in the tree
|
|
382 -outgroup => a Bio::Tree::NodeI to compare the nodes to
|
|
383
|
|
384
|
|
385 =cut
|
|
386
|
|
387 sub is_paraphyletic{
|
|
388 my ($self,@args) = @_;
|
|
389 my ($nodes,$outgroup) = $self->_rearrange([qw(NODES OUTGROUP)],@args);
|
|
390
|
|
391 if( ! defined $nodes || ! defined $outgroup ) {
|
|
392 $self->warn("Must suply -nodes and -outgroup parameters to the method is_paraphyletic");
|
|
393 return undef;
|
|
394 }
|
|
395 if( ref($nodes) !~ /ARRAY/i ) {
|
|
396 $self->warn("Must provide a valid array reference for -nodes");
|
|
397 return undef;
|
|
398 }
|
|
399
|
|
400 # Algorithm
|
|
401 # Find the lca
|
|
402 # Find all the nodes beneath the lca
|
|
403 # Test to see that none are missing from the nodes list
|
|
404 my %nodehash;
|
|
405 foreach my $n ( @$nodes ) {
|
|
406 $nodehash{$n->internal_id} = $n;
|
|
407 }
|
|
408 while( @$nodes > 2 ) {
|
|
409 unshift @$nodes, $self->get_lca(-nodes => [( shift @$nodes,
|
|
410 shift @$nodes)] );
|
|
411 }
|
|
412 my $clade_root = $self->get_lca(-nodes => $nodes );
|
|
413 unless( defined $clade_root ) {
|
|
414 $self->warn("could not find clade root via lca");
|
|
415 return undef;
|
|
416 }
|
|
417 my $og_ancestor = $outgroup->ancestor;
|
|
418
|
|
419 # Is this necessary/correct for paraphyly test?
|
|
420 while( defined ($og_ancestor ) ) {
|
|
421 if( $og_ancestor->internal_id == $clade_root->internal_id ) {
|
|
422 # monophyly is violated, could be paraphyletic
|
|
423 return -1;
|
|
424 }
|
|
425 $og_ancestor = $og_ancestor->ancestor;
|
|
426 }
|
|
427 my $tree = new Bio::Tree::Tree(-root => $clade_root,
|
|
428 -nodelete => 1);
|
|
429
|
|
430 foreach my $n ( $tree->get_nodes() ) {
|
|
431 next unless $n->is_Leaf();
|
|
432 # if any leaf node is not in the list
|
|
433 # then it is part of the clade and so the list
|
|
434 # must be paraphyletic
|
|
435 return 1 unless ( $nodehash{$n->internal_id} );
|
|
436 }
|
|
437 return 0;
|
|
438 }
|
|
439
|
|
440
|
|
441 =head2 reroot
|
|
442
|
|
443 Title : reroot_tree
|
|
444 Usage : $tree->reroot($node);
|
|
445 Function: Reroots a tree either making a new node the root
|
|
446 Returns : 1 on success, 0 on failure
|
|
447 Args : Bio::Tree::NodeI that is in the tree, but is not the current root
|
|
448
|
|
449 =cut
|
|
450
|
|
451 sub reroot {
|
|
452 my ($self,$new_root) = @_;
|
|
453 unless (defined $new_root && $new_root->isa("Bio::Tree::NodeI")) {
|
|
454 $self->warn("Must provide a valid Bio::Tree::NodeI when rerooting");
|
|
455 return 0;
|
|
456 }
|
|
457 if( $new_root->is_Leaf() ) {
|
|
458 $self->warn("Asking to root with a leaf, will use the leaf's ancestor");
|
|
459 $new_root = $new_root->ancestor;
|
|
460 }
|
|
461
|
|
462 my $old_root = $self->get_root_node;
|
|
463 if( $new_root == $old_root ) {
|
|
464 $self->warn("Node requested for reroot is already the root node!");
|
|
465 return 0;
|
|
466 }
|
|
467
|
|
468 my @path = (); # along tree, from newroot to oldroot
|
|
469 my $node = $new_root;
|
|
470 while ($node) {
|
|
471 push @path, $node;
|
|
472 $node = $node->ancestor;
|
|
473 }
|
|
474
|
|
475 my @path_from_oldroot = reverse @path;
|
|
476 for (my $i = 0; $i < @path_from_oldroot - 1; $i++) {
|
|
477 my $current = $path_from_oldroot[$i];
|
|
478 my $next = $path_from_oldroot[$i + 1];
|
|
479 $current->remove_Descendent($next);
|
|
480 $current->branch_length($next->branch_length);
|
|
481 $next->add_Descendent($current);
|
|
482
|
|
483 }
|
|
484 $new_root->branch_length(undef);
|
|
485 $self->set_root_node($new_root);
|
|
486
|
|
487 return 1;
|
|
488 }
|
|
489
|
|
490 =head2 reverse_edge
|
|
491
|
|
492 Title : reverse_edge
|
|
493 Usage : $node->reverse_edge(child);
|
|
494 Function: makes child be a parent of node
|
|
495 Requires: child must be a direct descendent of node
|
|
496 Returns : nothing
|
|
497 Args : Bio::Tree::NodeI that is in the tree
|
|
498
|
|
499 =cut
|
|
500
|
|
501 sub reverse_edge {
|
|
502 my ($self,$node) = @_;
|
|
503 delete_edge($self, $node);
|
|
504 $node->add_Descendent($self);
|
|
505 1;
|
|
506 }
|
|
507
|
|
508 =head2 delete_edge
|
|
509
|
|
510 Title : delete_edge
|
|
511 Usage : $node->reverse_edge(child);
|
|
512 Function: makes child be a parent of node
|
|
513 Requires: child must be a direct descendent of node
|
|
514 Returns : nothing
|
|
515 Args : Bio::Tree::NodeI that is in the tree
|
|
516
|
|
517 =cut
|
|
518
|
|
519 sub delete_edge {
|
|
520 my ($self,$node) = @_;
|
|
521 unless (defined $self && $self->isa("Bio::Tree::NodeI")) {
|
|
522 $self->warn("Must provide a valid Bio::Tree::NodeI when rerooting");
|
|
523 return 1;
|
|
524 }
|
|
525 unless (defined $node && $node->isa("Bio::Tree::NodeI")) {
|
|
526 $self->warn("Must provide a valid Bio::Tree::NodeI when rerooting");
|
|
527 return 1;
|
|
528 }
|
|
529 if( $self->{'_desc'}->{$node->internal_id} ) {
|
|
530 $node->ancestor(undef);
|
|
531 $self->{'_desc'}->{$node->internal_id}->ancestor(undef);
|
|
532 delete $self->{'_desc'}->{$node->internal_id};
|
|
533 } else {
|
|
534 $self->warn("First argument must be direct parent of node");
|
|
535 return 1;
|
|
536 }
|
|
537 1;
|
|
538 }
|
|
539
|
|
540 sub findnode_by_id {
|
|
541 my $tree = shift;
|
|
542 my $id = shift;
|
|
543 my $rootnode = $tree->get_root_node;
|
|
544 if ( ($rootnode->id) and ($rootnode->id eq $id) ) {
|
|
545 return $rootnode;
|
|
546 }
|
|
547 # process all the children
|
|
548 foreach my $node ( $rootnode->get_Descendents ) {
|
|
549 if ( ($node->id) and ($node->id eq $id ) ) {
|
|
550 return $node;
|
|
551 }
|
|
552 }
|
|
553 }
|
|
554
|
|
555 1;
|