Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/Tree/TreeFunctionsI.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 # $Id: TreeFunctionsI.pm,v 1.5.2.3 2003/09/14 20:18:25 jason Exp $ | |
| 2 # | |
| 3 # BioPerl module for Bio::Tree::TreeFunctionsI | |
| 4 # | |
| 5 # Cared for by Jason Stajich <jason@bioperl.org> | |
| 6 # | |
| 7 # Copyright Jason Stajich | |
| 8 # | |
| 9 # You may distribute this module under the same terms as perl itself | |
| 10 | |
| 11 # POD documentation - main docs before the code | |
| 12 | |
| 13 =head1 NAME | |
| 14 | |
| 15 Bio::Tree::TreeFunctionsI - Decorated Interface implementing basic Tree exploration methods | |
| 16 | |
| 17 =head1 SYNOPSIS | |
| 18 | |
| 19 use Bio::TreeIO; | |
| 20 my $in = new Bio::TreeIO(-format => 'newick', -file => 'tree.tre'); | |
| 21 | |
| 22 my $tree = $in->next_tree; | |
| 23 | |
| 24 my @nodes = $tree->find_node('id1'); | |
| 25 | |
| 26 if( $tree->is_monophyletic(-clade => @nodes, -outgroup => $outnode) ){ | |
| 27 | |
| 28 } | |
| 29 | |
| 30 =head1 DESCRIPTION | |
| 31 | |
| 32 Describe the interface here | |
| 33 | |
| 34 =head1 FEEDBACK | |
| 35 | |
| 36 =head2 Mailing Lists | |
| 37 | |
| 38 User feedback is an integral part of the evolution of this and other | |
| 39 Bioperl modules. Send your comments and suggestions preferably to | |
| 40 the Bioperl mailing list. Your participation is much appreciated. | |
| 41 | |
| 42 bioperl-l@bioperl.org - General discussion | |
| 43 http://bioperl.org/MailList.shtml - About the mailing lists | |
| 44 | |
| 45 =head2 Reporting Bugs | |
| 46 | |
| 47 Report bugs to the Bioperl bug tracking system to help us keep track | |
| 48 of the bugs and their resolution. Bug reports can be submitted via | |
| 49 email or the web: | |
| 50 | |
| 51 bioperl-bugs@bioperl.org | |
| 52 http://bugzilla.bioperl.org/ | |
| 53 | |
| 54 =head1 AUTHOR - Jason Stajich, Aaron Mackey, Justin Reese | |
| 55 | |
| 56 Email jason-at-bioperl-dot-org | |
| 57 Email amackey-at-virginia.edu | |
| 58 Email jtr4v-at-virginia.edu | |
| 59 | |
| 60 =head1 CONTRIBUTORS | |
| 61 | |
| 62 Additional contributors names and emails here | |
| 63 | |
| 64 Rerooting code was worked on by | |
| 65 | |
| 66 Daniel Barker d.barker-at-reading.ac.uk | |
| 67 Ramiro Barrantes Ramiro.Barrantes-at-uvm.edu | |
| 68 | |
| 69 =head1 APPENDIX | |
| 70 | |
| 71 The rest of the documentation details each of the object methods. | |
| 72 Internal methods are usually preceded with a _ | |
| 73 | |
| 74 =cut | |
| 75 | |
| 76 | |
| 77 # Let the code begin... | |
| 78 | |
| 79 | |
| 80 package Bio::Tree::TreeFunctionsI; | |
| 81 use vars qw(@ISA); | |
| 82 use strict; | |
| 83 use Bio::Tree::TreeI; | |
| 84 | |
| 85 @ISA = qw(Bio::Tree::TreeI); | |
| 86 | |
| 87 =head2 find_node | |
| 88 | |
| 89 Title : find_node | |
| 90 Usage : my @nodes = $self->find_node(-id => 'node1'); | |
| 91 Function: returns all nodes that match a specific field, by default this | |
| 92 is id, but different branch_length, | |
| 93 Returns : List of nodes which matched search | |
| 94 Args : text string to search for | |
| 95 OR | |
| 96 -fieldname => $textstring | |
| 97 | |
| 98 =cut | |
| 99 | |
| 100 sub find_node { | |
| 101 my ($self,$type,$field) = @_; | |
| 102 if( ! defined $type ) { | |
| 103 $self->warn("Must request a either a string or field and string when searching"); | |
| 104 } | |
| 105 | |
| 106 # all this work for a '-' named field | |
| 107 # is so that we could potentially | |
| 108 # expand to other constraints in | |
| 109 # different implementations | |
| 110 # like 'find all nodes with boostrap < XX' | |
| 111 | |
| 112 if( ! defined $field ) { | |
| 113 # only 1 argument, default to searching by id | |
| 114 $field= $type; | |
| 115 $type = 'id'; | |
| 116 } else { | |
| 117 $type =~ s/^-//; | |
| 118 } | |
| 119 | |
| 120 # could actually do this by testing $rootnode->can($type) but | |
| 121 # it is possible that a tree is implemeted with different node types | |
| 122 # - although it is unlikely that the root node would be richer than the | |
| 123 # leaf nodes. Can't handle NHX tags right now | |
| 124 | |
| 125 unless( $type eq 'id' || $type eq 'name' || | |
| 126 $type eq 'bootstrap' || $type eq 'description' || | |
| 127 $type eq 'internal_id') { | |
| 128 $self->warn("unknown search type $type - will try anyways"); | |
| 129 } | |
| 130 my @nodes = grep { $_->can($type) && defined $_->$type() && | |
| 131 $_->$type() eq $field } $self->get_nodes(); | |
| 132 | |
| 133 if ( wantarray) { | |
| 134 return @nodes; | |
| 135 } else { | |
| 136 if( @nodes > 1 ) { | |
| 137 $self->warn("More than 1 node found but caller requested scalar, only returning first node"); | |
| 138 } | |
| 139 return shift @nodes; | |
| 140 } | |
| 141 } | |
| 142 | |
| 143 =head2 remove_Node | |
| 144 | |
| 145 Title : remove_Node | |
| 146 Usage : $tree->remove_Node($node) | |
| 147 Function: Removes a node from the tree | |
| 148 Returns : boolean represent status of success | |
| 149 Args : either Bio::Tree::NodeI or string of the node id | |
| 150 | |
| 151 | |
| 152 =cut | |
| 153 | |
| 154 sub remove_Node { | |
| 155 my ($self,$input) = @_; | |
| 156 my $node = undef; | |
| 157 unless( ref($input) ) { | |
| 158 $node = $self->find_node($input); | |
| 159 } elsif( ! $input->isa('Bio::Tree::NodeI') ) { | |
| 160 $self->warn("Did not provide either a valid Bio::Tree::NodeI object to remove_node or the node name"); | |
| 161 return 0; | |
| 162 } else { | |
| 163 $node = $input; | |
| 164 } | |
| 165 if( ! $node->ancestor && $self->get_root_node->internal_id != $node->internal_id) { | |
| 166 $self->warn("Node (".$node->to_string . ") has no ancestor, can't remove!"); | |
| 167 } else { | |
| 168 $node->ancestor->remove_Descendent($node); | |
| 169 } | |
| 170 } | |
| 171 | |
| 172 | |
| 173 # Added for Justin Reese by Jason | |
| 174 | |
| 175 =head2 get_lca | |
| 176 | |
| 177 Title : get_lca | |
| 178 Usage : get_lca(-nodes => \@nodes ) | |
| 179 Function: given two nodes, returns the lowest common ancestor | |
| 180 Returns : node object | |
| 181 Args : -nodes => arrayref of nodes to test | |
| 182 | |
| 183 | |
| 184 =cut | |
| 185 | |
| 186 sub get_lca { | |
| 187 my ($self,@args) = @_; | |
| 188 my ($nodes) = $self->_rearrange([qw(NODES)],@args); | |
| 189 if( ! defined $nodes ) { | |
| 190 $self->warn("Must supply -nodes parameter to get_lca() method"); | |
| 191 return undef; | |
| 192 } | |
| 193 my ($node1,$node2) = $self->_check_two_nodes($nodes); | |
| 194 return undef unless $node1 && $node2; | |
| 195 | |
| 196 # algorithm: Start with first node, find and save every node from it to | |
| 197 # root. Then start with second node; for it and each of its ancestor | |
| 198 # nodes, check to see if it's in the first node's ancestor list - if | |
| 199 # so it is the lca. | |
| 200 # | |
| 201 # This is very slow and naive, but I somehow doubt the overhead | |
| 202 # of mapping the tree to a complete binary tree and doing the linear | |
| 203 # lca search would be worth the overhead, especially for small trees. | |
| 204 # Maybe someday I'll write a linear get_lca and find out. | |
| 205 | |
| 206 # find and save every ancestor of node1 (including itself) | |
| 207 | |
| 208 my %node1_ancestors; # keys are internal ids, values are objects | |
| 209 my $place = $node1; # start at node1 | |
| 210 | |
| 211 while ( $place ){ | |
| 212 $node1_ancestors{$place->internal_id} = $place; | |
| 213 $place = $place->ancestor; | |
| 214 } | |
| 215 | |
| 216 # now climb up node2, for each node checking whether | |
| 217 # it's in node1_ancestors | |
| 218 $place = $node2; # start at node2 | |
| 219 while ( $place ){ | |
| 220 foreach my $key ( keys %node1_ancestors ){ # ugh | |
| 221 if ( $place->internal_id == $key){ | |
| 222 return $node1_ancestors{$key}; | |
| 223 } | |
| 224 } | |
| 225 $place = $place->ancestor; | |
| 226 } | |
| 227 $self->warn("Could not find lca!"); # should never execute, | |
| 228 # if so, there's a problem | |
| 229 return undef; | |
| 230 } | |
| 231 | |
| 232 # Added for Justin Reese by Jason | |
| 233 | |
| 234 =head2 distance | |
| 235 | |
| 236 Title : distance | |
| 237 Usage : distance(-nodes => \@nodes ) | |
| 238 Function: returns the distance between two given nodes | |
| 239 Returns : numerical distance | |
| 240 Args : -nodes => arrayref of nodes to test | |
| 241 | |
| 242 | |
| 243 =cut | |
| 244 | |
| 245 sub distance { | |
| 246 my ($self,@args) = @_; | |
| 247 my ($nodes) = $self->_rearrange([qw(NODES)],@args); | |
| 248 if( ! defined $nodes ) { | |
| 249 $self->warn("Must supply -nodes parameter to distance() method"); | |
| 250 return undef; | |
| 251 } | |
| 252 my ($node1,$node2) = $self->_check_two_nodes($nodes); | |
| 253 # algorithm: | |
| 254 | |
| 255 # Find lca: Start with first node, find and save every node from it | |
| 256 # to root, saving cumulative distance. Then start with second node; | |
| 257 # for it and each of its ancestor nodes, check to see if it's in | |
| 258 # the first node's ancestor list - if so it is the lca. Return sum | |
| 259 # of (cumul. distance from node1 to lca) and (cumul. distance from | |
| 260 # node2 to lca) | |
| 261 | |
| 262 # find and save every ancestor of node1 (including itself) | |
| 263 | |
| 264 my %node1_ancestors; # keys are internal ids, values are objects | |
| 265 my %node1_cumul_dist; # keys are internal ids, values | |
| 266 # are cumulative distance from node1 to given node | |
| 267 my $place = $node1; # start at node1 | |
| 268 my $cumul_dist = 0; | |
| 269 | |
| 270 while ( $place ){ | |
| 271 $node1_ancestors{$place->internal_id} = $place; | |
| 272 $node1_cumul_dist{$place->internal_id} = $cumul_dist; | |
| 273 if ($place->branch_length) { | |
| 274 $cumul_dist += $place->branch_length; # include current branch | |
| 275 # length in next iteration | |
| 276 } | |
| 277 $place = $place->ancestor; | |
| 278 } | |
| 279 | |
| 280 # now climb up node2, for each node checking whether | |
| 281 # it's in node1_ancestors | |
| 282 $place = $node2; # start at node2 | |
| 283 $cumul_dist = 0; | |
| 284 while ( $place ){ | |
| 285 foreach my $key ( keys %node1_ancestors ){ # ugh | |
| 286 if ( $place->internal_id == $key){ # we're at lca | |
| 287 return $node1_cumul_dist{$key} + $cumul_dist; | |
| 288 } | |
| 289 } | |
| 290 # include current branch length in next iteration | |
| 291 $cumul_dist += $place->branch_length; | |
| 292 $place = $place->ancestor; | |
| 293 } | |
| 294 $self->warn("Could not find distance!"); # should never execute, | |
| 295 # if so, there's a problem | |
| 296 return undef; | |
| 297 } | |
| 298 | |
| 299 # helper function to check lca and distance arguments | |
| 300 | |
| 301 sub _check_two_nodes { | |
| 302 my ($self, $nodes) = @_; | |
| 303 | |
| 304 if( ref($nodes) !~ /ARRAY/i || | |
| 305 !ref($nodes->[0]) || | |
| 306 !ref($nodes->[1]) | |
| 307 ) { | |
| 308 $self->warn("Must provide a valid array reference for -nodes"); | |
| 309 return undef; | |
| 310 } elsif( scalar(@$nodes) > 2 ){ | |
| 311 $self->warn("More than two nodes given, using first two"); | |
| 312 } elsif( scalar(@$nodes) < 2 ){ | |
| 313 $self->warn("-nodes parameter does not contain reference to two nodes"); | |
| 314 return undef; | |
| 315 } | |
| 316 unless( $nodes->[0]->isa('Bio::Tree::NodeI') && | |
| 317 $nodes->[1]->isa('Bio::Tree::NodeI') ) { | |
| 318 $self->warn("Did not provide valid Bio::Tree::NodeI objects as nodes\n"); | |
| 319 return undef; | |
| 320 } | |
| 321 return @$nodes; | |
| 322 } | |
| 323 | |
| 324 | |
| 325 =head2 is_monophyletic | |
| 326 | |
| 327 Title : is_monophyletic | |
| 328 Usage : if( $tree->is_monophyletic(-nodes => \@nodes, | |
| 329 -outgroup => $outgroup) | |
| 330 Function: Will do a test of monophyly for the nodes specified | |
| 331 in comparison to a chosen outgroup | |
| 332 Returns : boolean | |
| 333 Args : -nodes => arrayref of nodes to test | |
| 334 -outgroup => outgroup to serve as a reference | |
| 335 | |
| 336 | |
| 337 =cut | |
| 338 | |
| 339 sub is_monophyletic{ | |
| 340 my ($self,@args) = @_; | |
| 341 my ($nodes,$outgroup) = $self->_rearrange([qw(NODES OUTGROUP)],@args); | |
| 342 | |
| 343 if( ! defined $nodes || ! defined $outgroup ) { | |
| 344 $self->warn("Must supply -nodes and -outgroup parameters to the method | |
| 345 is_monophyletic"); | |
| 346 return undef; | |
| 347 } | |
| 348 if( ref($nodes) !~ /ARRAY/i ) { | |
| 349 $self->warn("Must provide a valid array reference for -nodes"); | |
| 350 } | |
| 351 my $clade_root; | |
| 352 # this is to combine multiple tests into a single node | |
| 353 # order doesn't really matter as long as get_lca does its job right | |
| 354 while( @$nodes > 2 ) { | |
| 355 my ($a,$b) = ( shift @$nodes, shift @$nodes); | |
| 356 $clade_root = $self->get_lca(-nodes => [$a,$b] ); | |
| 357 unshift @$nodes, $clade_root; | |
| 358 } | |
| 359 $clade_root = $self->get_lca(-nodes => $nodes ); | |
| 360 my $og_ancestor = $outgroup->ancestor; | |
| 361 while( defined ($og_ancestor ) ) { | |
| 362 if( $og_ancestor->internal_id == $clade_root->internal_id ) { | |
| 363 # monophyly is violated | |
| 364 return 0; | |
| 365 } | |
| 366 $og_ancestor = $og_ancestor->ancestor; | |
| 367 } | |
| 368 return 1; | |
| 369 } | |
| 370 | |
| 371 =head2 is_paraphyletic | |
| 372 | |
| 373 Title : is_paraphyletic | |
| 374 Usage : if( $tree->is_paraphyletic(-nodes =>\@nodes, | |
| 375 -outgroup => $node) ){ } | |
| 376 Function: Tests whether or not a given set of nodes are paraphyletic | |
| 377 (representing the full clade) given an outgroup | |
| 378 Returns : [-1,0,1] , -1 if the group is not monophyletic | |
| 379 0 if the group is not paraphyletic | |
| 380 1 if the group is paraphyletic | |
| 381 Args : -nodes => Array of Bio::Tree::NodeI objects which are in the tree | |
| 382 -outgroup => a Bio::Tree::NodeI to compare the nodes to | |
| 383 | |
| 384 | |
| 385 =cut | |
| 386 | |
| 387 sub is_paraphyletic{ | |
| 388 my ($self,@args) = @_; | |
| 389 my ($nodes,$outgroup) = $self->_rearrange([qw(NODES OUTGROUP)],@args); | |
| 390 | |
| 391 if( ! defined $nodes || ! defined $outgroup ) { | |
| 392 $self->warn("Must suply -nodes and -outgroup parameters to the method is_paraphyletic"); | |
| 393 return undef; | |
| 394 } | |
| 395 if( ref($nodes) !~ /ARRAY/i ) { | |
| 396 $self->warn("Must provide a valid array reference for -nodes"); | |
| 397 return undef; | |
| 398 } | |
| 399 | |
| 400 # Algorithm | |
| 401 # Find the lca | |
| 402 # Find all the nodes beneath the lca | |
| 403 # Test to see that none are missing from the nodes list | |
| 404 my %nodehash; | |
| 405 foreach my $n ( @$nodes ) { | |
| 406 $nodehash{$n->internal_id} = $n; | |
| 407 } | |
| 408 while( @$nodes > 2 ) { | |
| 409 unshift @$nodes, $self->get_lca(-nodes => [( shift @$nodes, | |
| 410 shift @$nodes)] ); | |
| 411 } | |
| 412 my $clade_root = $self->get_lca(-nodes => $nodes ); | |
| 413 unless( defined $clade_root ) { | |
| 414 $self->warn("could not find clade root via lca"); | |
| 415 return undef; | |
| 416 } | |
| 417 my $og_ancestor = $outgroup->ancestor; | |
| 418 | |
| 419 # Is this necessary/correct for paraphyly test? | |
| 420 while( defined ($og_ancestor ) ) { | |
| 421 if( $og_ancestor->internal_id == $clade_root->internal_id ) { | |
| 422 # monophyly is violated, could be paraphyletic | |
| 423 return -1; | |
| 424 } | |
| 425 $og_ancestor = $og_ancestor->ancestor; | |
| 426 } | |
| 427 my $tree = new Bio::Tree::Tree(-root => $clade_root, | |
| 428 -nodelete => 1); | |
| 429 | |
| 430 foreach my $n ( $tree->get_nodes() ) { | |
| 431 next unless $n->is_Leaf(); | |
| 432 # if any leaf node is not in the list | |
| 433 # then it is part of the clade and so the list | |
| 434 # must be paraphyletic | |
| 435 return 1 unless ( $nodehash{$n->internal_id} ); | |
| 436 } | |
| 437 return 0; | |
| 438 } | |
| 439 | |
| 440 | |
| 441 =head2 reroot | |
| 442 | |
| 443 Title : reroot_tree | |
| 444 Usage : $tree->reroot($node); | |
| 445 Function: Reroots a tree either making a new node the root | |
| 446 Returns : 1 on success, 0 on failure | |
| 447 Args : Bio::Tree::NodeI that is in the tree, but is not the current root | |
| 448 | |
| 449 =cut | |
| 450 | |
| 451 sub reroot { | |
| 452 my ($self,$new_root) = @_; | |
| 453 unless (defined $new_root && $new_root->isa("Bio::Tree::NodeI")) { | |
| 454 $self->warn("Must provide a valid Bio::Tree::NodeI when rerooting"); | |
| 455 return 0; | |
| 456 } | |
| 457 if( $new_root->is_Leaf() ) { | |
| 458 $self->warn("Asking to root with a leaf, will use the leaf's ancestor"); | |
| 459 $new_root = $new_root->ancestor; | |
| 460 } | |
| 461 | |
| 462 my $old_root = $self->get_root_node; | |
| 463 if( $new_root == $old_root ) { | |
| 464 $self->warn("Node requested for reroot is already the root node!"); | |
| 465 return 0; | |
| 466 } | |
| 467 | |
| 468 my @path = (); # along tree, from newroot to oldroot | |
| 469 my $node = $new_root; | |
| 470 while ($node) { | |
| 471 push @path, $node; | |
| 472 $node = $node->ancestor; | |
| 473 } | |
| 474 | |
| 475 my @path_from_oldroot = reverse @path; | |
| 476 for (my $i = 0; $i < @path_from_oldroot - 1; $i++) { | |
| 477 my $current = $path_from_oldroot[$i]; | |
| 478 my $next = $path_from_oldroot[$i + 1]; | |
| 479 $current->remove_Descendent($next); | |
| 480 $current->branch_length($next->branch_length); | |
| 481 $next->add_Descendent($current); | |
| 482 | |
| 483 } | |
| 484 $new_root->branch_length(undef); | |
| 485 $self->set_root_node($new_root); | |
| 486 | |
| 487 return 1; | |
| 488 } | |
| 489 | |
| 490 =head2 reverse_edge | |
| 491 | |
| 492 Title : reverse_edge | |
| 493 Usage : $node->reverse_edge(child); | |
| 494 Function: makes child be a parent of node | |
| 495 Requires: child must be a direct descendent of node | |
| 496 Returns : nothing | |
| 497 Args : Bio::Tree::NodeI that is in the tree | |
| 498 | |
| 499 =cut | |
| 500 | |
| 501 sub reverse_edge { | |
| 502 my ($self,$node) = @_; | |
| 503 delete_edge($self, $node); | |
| 504 $node->add_Descendent($self); | |
| 505 1; | |
| 506 } | |
| 507 | |
| 508 =head2 delete_edge | |
| 509 | |
| 510 Title : delete_edge | |
| 511 Usage : $node->reverse_edge(child); | |
| 512 Function: makes child be a parent of node | |
| 513 Requires: child must be a direct descendent of node | |
| 514 Returns : nothing | |
| 515 Args : Bio::Tree::NodeI that is in the tree | |
| 516 | |
| 517 =cut | |
| 518 | |
| 519 sub delete_edge { | |
| 520 my ($self,$node) = @_; | |
| 521 unless (defined $self && $self->isa("Bio::Tree::NodeI")) { | |
| 522 $self->warn("Must provide a valid Bio::Tree::NodeI when rerooting"); | |
| 523 return 1; | |
| 524 } | |
| 525 unless (defined $node && $node->isa("Bio::Tree::NodeI")) { | |
| 526 $self->warn("Must provide a valid Bio::Tree::NodeI when rerooting"); | |
| 527 return 1; | |
| 528 } | |
| 529 if( $self->{'_desc'}->{$node->internal_id} ) { | |
| 530 $node->ancestor(undef); | |
| 531 $self->{'_desc'}->{$node->internal_id}->ancestor(undef); | |
| 532 delete $self->{'_desc'}->{$node->internal_id}; | |
| 533 } else { | |
| 534 $self->warn("First argument must be direct parent of node"); | |
| 535 return 1; | |
| 536 } | |
| 537 1; | |
| 538 } | |
| 539 | |
| 540 sub findnode_by_id { | |
| 541 my $tree = shift; | |
| 542 my $id = shift; | |
| 543 my $rootnode = $tree->get_root_node; | |
| 544 if ( ($rootnode->id) and ($rootnode->id eq $id) ) { | |
| 545 return $rootnode; | |
| 546 } | |
| 547 # process all the children | |
| 548 foreach my $node ( $rootnode->get_Descendents ) { | |
| 549 if ( ($node->id) and ($node->id eq $id ) ) { | |
| 550 return $node; | |
| 551 } | |
| 552 } | |
| 553 } | |
| 554 | |
| 555 1; |
