Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/Coordinate/Graph.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 # $Id: Graph.pm,v 1.2.2.2 2003/09/08 12:16:18 heikki Exp $ | |
| 2 # | |
| 3 # bioperl module for Bio::Coordinate::Graph | |
| 4 # | |
| 5 # Cared for by Heikki Lehvaslaiho <heikki@ebi.ac.uk> | |
| 6 # | |
| 7 # Copyright Heikki Lehvaslaiho | |
| 8 # | |
| 9 # You may distribute this module under the same terms as perl itself | |
| 10 | |
| 11 # POD documentation - main docs before the code | |
| 12 | |
| 13 =head1 NAME | |
| 14 | |
| 15 Bio::Coordinate::Graph - Finds shortest path between nodes in a graph | |
| 16 | |
| 17 =head1 SYNOPSIS | |
| 18 | |
| 19 # get a hash of hashes representing the graph. E.g.: | |
| 20 my $hash= { | |
| 21 '1' => { | |
| 22 '2' => 1 | |
| 23 }, | |
| 24 '2' => { | |
| 25 '4' => 1, | |
| 26 '3' => 1 | |
| 27 }, | |
| 28 '3' => undef, | |
| 29 '4' => { | |
| 30 '5' => 1 | |
| 31 }, | |
| 32 '5' => undef | |
| 33 }; | |
| 34 | |
| 35 # create the object; | |
| 36 my $graph = Bio::Coordinate::Graph->new(-graph => $hash); | |
| 37 | |
| 38 # find the shortest path between two nodes | |
| 39 my $a = 1; | |
| 40 my $b = 6; | |
| 41 my @path = $graph->shortest_paths($a); | |
| 42 print join (", ", @path), "\n"; | |
| 43 | |
| 44 | |
| 45 =head1 DESCRIPTION | |
| 46 | |
| 47 This class calculates the shortest path between input and output | |
| 48 coordinate systems in a graph that defines the relationships between | |
| 49 them. This class is primarely designed to analyze gene-related | |
| 50 coordinate systems. See L<Bio::Coordinate::GeneMapper>. | |
| 51 | |
| 52 Note that this module can not be used to manage graphs. | |
| 53 | |
| 54 Technically the graph implemented here is known as Directed Acyclic | |
| 55 Graph (DAG). DAG is composed of vertices (nodes) and edges (with | |
| 56 optional weights) linking them. Nodes of the graph are the coordinate | |
| 57 systems in gene mapper. | |
| 58 | |
| 59 The shortest path is found using the Dijkstra's algorithm. This | |
| 60 algorithm is fast and greedy and requires all weights to be | |
| 61 positive. All weights in the gene coordinate system graph are | |
| 62 currently equal (1) making the graph unweighted. That makes the use of | |
| 63 Dijkstra's algorithm an overkill. A impler and faster breadth-first | |
| 64 would be enough. Luckily the difference for small graphs is not | |
| 65 signigicant and the implementation is capable to take weights into | |
| 66 account if needed at some later time. | |
| 67 | |
| 68 =head2 Input format | |
| 69 | |
| 70 The graph needs to be primed using a hash of hashes where there is a | |
| 71 key for each node. The second keys are the names of the downstream | |
| 72 neighboring nodes and values are the weights for reaching them. Here | |
| 73 is part of the gene coordiante system graph:: | |
| 74 | |
| 75 | |
| 76 $hash = { | |
| 77 '6' => undef, | |
| 78 '3' => { | |
| 79 '6' => 1 | |
| 80 }, | |
| 81 '2' => { | |
| 82 '6' => 1, | |
| 83 '4' => 1, | |
| 84 '3' => 1 | |
| 85 }, | |
| 86 '1' => { | |
| 87 '2' => 1 | |
| 88 }, | |
| 89 '4' => { | |
| 90 '5' => 1 | |
| 91 }, | |
| 92 '5' => undef | |
| 93 }; | |
| 94 | |
| 95 | |
| 96 Note that the names need to be positive integrers. Root should be '1' | |
| 97 and directness of the graph is taken advantage of to speed | |
| 98 calculations by assuming that downsream nodes always have larger | |
| 99 number as name. | |
| 100 | |
| 101 An alternative (shorter) way of describing input is to use hash of | |
| 102 arrays. See L<Bio::Coordinate::Graph::hash_of_arrays>. | |
| 103 | |
| 104 | |
| 105 =head1 FEEDBACK | |
| 106 | |
| 107 =head2 Mailing Lists | |
| 108 | |
| 109 User feedback is an integral part of the evolution of this and other | |
| 110 Bioperl modules. Send your comments and suggestions preferably to the | |
| 111 Bioperl mailing lists Your participation is much appreciated. | |
| 112 | |
| 113 bioperl-l@bioperl.org - General discussion | |
| 114 http://bio.perl.org/MailList.html - About the mailing lists | |
| 115 | |
| 116 =head2 Reporting Bugs | |
| 117 | |
| 118 report bugs to the Bioperl bug tracking system to help us keep track | |
| 119 the bugs and their resolution. Bug reports can be submitted via | |
| 120 email or the web: | |
| 121 | |
| 122 bioperl-bugs@bio.perl.org | |
| 123 http://bugzilla.bioperl.org/ | |
| 124 | |
| 125 =head1 AUTHOR - Heikki Lehvaslaiho | |
| 126 | |
| 127 Email: heikki@ebi.ac.uk | |
| 128 Address: | |
| 129 | |
| 130 EMBL Outstation, European Bioinformatics Institute | |
| 131 Wellcome Trust Genome Campus, Hinxton | |
| 132 Cambs. CB10 1SD, United Kingdom | |
| 133 | |
| 134 =head1 APPENDIX | |
| 135 | |
| 136 The rest of the documentation details each of the object | |
| 137 methods. Internal methods are usually preceded with a _ | |
| 138 | |
| 139 =cut | |
| 140 | |
| 141 | |
| 142 # Let the code begin... | |
| 143 | |
| 144 package Bio::Coordinate::Graph; | |
| 145 use vars qw(@ISA ); | |
| 146 use strict; | |
| 147 | |
| 148 # Object preamble - inherits from Bio::Root::Root | |
| 149 use Bio::Root::Root; | |
| 150 | |
| 151 @ISA = qw(Bio::Root::Root); | |
| 152 | |
| 153 | |
| 154 sub new { | |
| 155 my($class,@args) = @_; | |
| 156 my $self = $class->SUPER::new(@args); | |
| 157 | |
| 158 my($graph, $hasharray) = | |
| 159 $self->_rearrange([qw( | |
| 160 GRAPH | |
| 161 HASHARRAY | |
| 162 )], | |
| 163 @args); | |
| 164 | |
| 165 $graph && $self->graph($graph); | |
| 166 $hasharray && $self->hasharray($hasharray); | |
| 167 | |
| 168 $self->{'_root'} = undef; | |
| 169 | |
| 170 return $self; # success - we hope! | |
| 171 | |
| 172 } | |
| 173 | |
| 174 =head2 Graph structure input methods | |
| 175 | |
| 176 =cut | |
| 177 | |
| 178 =head2 graph | |
| 179 | |
| 180 Title : graph | |
| 181 Usage : $obj->graph($my_graph) | |
| 182 Function: Read/write method for the graph structure | |
| 183 Example : | |
| 184 Returns : hash of hashes grah structure | |
| 185 Args : reference to a hash of hashes | |
| 186 | |
| 187 =cut | |
| 188 | |
| 189 sub graph { | |
| 190 | |
| 191 my ($self,$value) = @_; | |
| 192 | |
| 193 if ($value) { | |
| 194 $self->throw("Need a hash of hashes") | |
| 195 unless ref($value) eq 'HASH' ; | |
| 196 $self->{'_dag'} = $value; | |
| 197 | |
| 198 # empty the cache | |
| 199 $self->{'_root'} = undef; | |
| 200 | |
| 201 } | |
| 202 | |
| 203 return $self->{'_dag'}; | |
| 204 | |
| 205 } | |
| 206 | |
| 207 | |
| 208 =head2 hash_of_arrays | |
| 209 | |
| 210 Title : hash_of_arrays | |
| 211 Usage : $obj->hash_of_array(%hasharray) | |
| 212 Function: An alternative method to read in the graph structure. | |
| 213 Hash arrays are easier to type. This method converts | |
| 214 arrays into hashes and assigns equal values "1" to | |
| 215 weights. | |
| 216 | |
| 217 Example : Here is an example of simple structure containing a graph. | |
| 218 | |
| 219 my $DAG = { | |
| 220 6 => [], | |
| 221 5 => [], | |
| 222 4 => [5], | |
| 223 3 => [6], | |
| 224 2 => [3, 4, 6], | |
| 225 1 => [2] | |
| 226 }; | |
| 227 | |
| 228 Returns : hash of hashes graph structure | |
| 229 Args : reference to a hash of arrays | |
| 230 | |
| 231 =cut | |
| 232 | |
| 233 sub hash_of_arrays { | |
| 234 | |
| 235 my ($self,$value) = @_; | |
| 236 | |
| 237 # empty the cache | |
| 238 $self->{'_root'} = undef; | |
| 239 | |
| 240 if ($value) { | |
| 241 | |
| 242 $self->throw("Need a hash of hashes") | |
| 243 unless ref($value) eq 'HASH' ; | |
| 244 | |
| 245 #copy the hash of arrays into a hash of hashes; | |
| 246 my %hash; | |
| 247 foreach my $start ( keys %{$value}){ | |
| 248 $hash{$start} = undef; | |
| 249 map { $hash{$start}{$_} = 1 } @{$value->{$start}}; | |
| 250 } | |
| 251 | |
| 252 $self->{'_dag'} = \%hash; | |
| 253 } | |
| 254 | |
| 255 return $self->{'_dag'}; | |
| 256 | |
| 257 } | |
| 258 | |
| 259 =head2 Methods for determining the shortest path in the graph | |
| 260 | |
| 261 =cut | |
| 262 | |
| 263 =head2 shortest_path | |
| 264 | |
| 265 Title : shortest_path | |
| 266 Usage : $obj->shortest_path($a, $b); | |
| 267 Function: Method for retrieving the shortest path between nodes. | |
| 268 If the start node remains the same, the method is sometimes | |
| 269 able to use cached results, otherwise it will recalculate | |
| 270 the paths. | |
| 271 Example : | |
| 272 Returns : array of node names, only the start node name if no path | |
| 273 Args : name of the start node | |
| 274 : name of the end node | |
| 275 | |
| 276 =cut | |
| 277 | |
| 278 | |
| 279 sub shortest_path { | |
| 280 my ($self, $root, $end) = @_; | |
| 281 | |
| 282 $self->throw("Two arguments needed") unless @_ == 3; | |
| 283 $self->throw("No node name [$root]") | |
| 284 unless exists $self->{'_dag'}->{$root}; | |
| 285 $self->throw("No node name [$end]") | |
| 286 unless exists $self->{'_dag'}->{$end}; | |
| 287 | |
| 288 my @res; # results | |
| 289 my $reverse; | |
| 290 | |
| 291 if ($root > $end) { | |
| 292 ($root, $end) = ($end, $root ); | |
| 293 $reverse++; | |
| 294 } | |
| 295 | |
| 296 # try to use cached paths | |
| 297 $self->dijkstra($root) unless | |
| 298 defined $self->{'_root'} and $self->{'_root'} eq $root; | |
| 299 | |
| 300 return @res unless $self->{'_paths'} ; | |
| 301 | |
| 302 # create the list | |
| 303 my $node = $end; | |
| 304 my $prev = $self->{'_paths'}->{$end}{'prev'}; | |
| 305 while ($prev) { | |
| 306 unshift @res, $node; | |
| 307 $node = $self->{'_paths'}->{$node}{'prev'}; | |
| 308 $prev = $self->{'_paths'}->{$node}{'prev'}; | |
| 309 } | |
| 310 unshift @res, $node; | |
| 311 | |
| 312 $reverse ? return reverse @res : return @res; | |
| 313 } | |
| 314 | |
| 315 | |
| 316 =head2 dijkstra | |
| 317 | |
| 318 Title : dijkstra | |
| 319 Usage : $graph->dijkstra(1); | |
| 320 Function: Implements Dijkstra's algorithm. | |
| 321 Returns or sets a list of mappers. The returned path | |
| 322 description is always directed down from the root. | |
| 323 Called from shortest_path(). | |
| 324 Example : | |
| 325 Returns : Reference to a hash of hashes representing a linked list | |
| 326 which contains shortest path down to all nodes from the start | |
| 327 node. E.g.: | |
| 328 | |
| 329 $res = { | |
| 330 '2' => { | |
| 331 'prev' => '1', | |
| 332 'dist' => 1 | |
| 333 }, | |
| 334 '1' => { | |
| 335 'prev' => undef, | |
| 336 'dist' => 0 | |
| 337 }, | |
| 338 }; | |
| 339 | |
| 340 Args : name of the start node | |
| 341 | |
| 342 =cut | |
| 343 | |
| 344 sub dijkstra { | |
| 345 my ($self,$root) = @_; | |
| 346 | |
| 347 $self->throw("I need the name of the root node input") unless $root; | |
| 348 $self->throw("No node name [$root]") | |
| 349 unless exists $self->{'_dag'}->{$root}; | |
| 350 | |
| 351 my %est = (); # estimate hash | |
| 352 my %res = (); # result hash | |
| 353 my $nodes = keys %{$self->{'_dag'}}; | |
| 354 my $maxdist = 1000000; | |
| 355 | |
| 356 # cache the root value | |
| 357 $self->{'_root'} = $root; | |
| 358 | |
| 359 foreach my $node ( keys %{$self->{'_dag'}} ){ | |
| 360 if ($node eq $root) { | |
| 361 $est{$node}{'prev'} = undef; | |
| 362 $est{$node}{'dist'} = 0; | |
| 363 } else { | |
| 364 $est{$node}{'prev'} = undef; | |
| 365 $est{$node}{'dist'} = $maxdist; | |
| 366 } | |
| 367 } | |
| 368 | |
| 369 # remove nodes from %est until it is empty | |
| 370 while (keys %est) { | |
| 371 | |
| 372 #select the node closest to current one, or root node | |
| 373 my $min_node; | |
| 374 my $min = $maxdist; | |
| 375 foreach my $node (reverse sort keys %est) { | |
| 376 if ( $est{$node}{'dist'} < $min ) { | |
| 377 $min = $est{$node}{'dist'}; | |
| 378 $min_node = $node; | |
| 379 } | |
| 380 } | |
| 381 | |
| 382 # no more links between nodes | |
| 383 last unless ($min_node); | |
| 384 | |
| 385 # move the node from %est into %res; | |
| 386 $res{$min_node} = delete $est{$min_node}; | |
| 387 | |
| 388 # recompute distances to the neighbours | |
| 389 my $dist = $res{$min_node}{'dist'}; | |
| 390 foreach my $neighbour ( keys %{$self->{'_dag'}->{$min_node}} ){ | |
| 391 next unless $est{$neighbour}; # might not be there any more | |
| 392 $est{$neighbour}{'prev'} = $min_node; | |
| 393 $est{$neighbour}{'dist'} = | |
| 394 $dist + $self->{'_dag'}{$min_node}{$neighbour} | |
| 395 if $est{$neighbour}{'dist'} > $dist + 1 ; | |
| 396 } | |
| 397 } | |
| 398 return $self->{'_paths'} = \%res; | |
| 399 } | |
| 400 | |
| 401 | |
| 402 1; | |
| 403 |
