0
|
1 # $Id: Graph.pm,v 1.2.2.2 2003/09/08 12:16:18 heikki Exp $
|
|
2 #
|
|
3 # bioperl module for Bio::Coordinate::Graph
|
|
4 #
|
|
5 # Cared for by Heikki Lehvaslaiho <heikki@ebi.ac.uk>
|
|
6 #
|
|
7 # Copyright Heikki Lehvaslaiho
|
|
8 #
|
|
9 # You may distribute this module under the same terms as perl itself
|
|
10
|
|
11 # POD documentation - main docs before the code
|
|
12
|
|
13 =head1 NAME
|
|
14
|
|
15 Bio::Coordinate::Graph - Finds shortest path between nodes in a graph
|
|
16
|
|
17 =head1 SYNOPSIS
|
|
18
|
|
19 # get a hash of hashes representing the graph. E.g.:
|
|
20 my $hash= {
|
|
21 '1' => {
|
|
22 '2' => 1
|
|
23 },
|
|
24 '2' => {
|
|
25 '4' => 1,
|
|
26 '3' => 1
|
|
27 },
|
|
28 '3' => undef,
|
|
29 '4' => {
|
|
30 '5' => 1
|
|
31 },
|
|
32 '5' => undef
|
|
33 };
|
|
34
|
|
35 # create the object;
|
|
36 my $graph = Bio::Coordinate::Graph->new(-graph => $hash);
|
|
37
|
|
38 # find the shortest path between two nodes
|
|
39 my $a = 1;
|
|
40 my $b = 6;
|
|
41 my @path = $graph->shortest_paths($a);
|
|
42 print join (", ", @path), "\n";
|
|
43
|
|
44
|
|
45 =head1 DESCRIPTION
|
|
46
|
|
47 This class calculates the shortest path between input and output
|
|
48 coordinate systems in a graph that defines the relationships between
|
|
49 them. This class is primarely designed to analyze gene-related
|
|
50 coordinate systems. See L<Bio::Coordinate::GeneMapper>.
|
|
51
|
|
52 Note that this module can not be used to manage graphs.
|
|
53
|
|
54 Technically the graph implemented here is known as Directed Acyclic
|
|
55 Graph (DAG). DAG is composed of vertices (nodes) and edges (with
|
|
56 optional weights) linking them. Nodes of the graph are the coordinate
|
|
57 systems in gene mapper.
|
|
58
|
|
59 The shortest path is found using the Dijkstra's algorithm. This
|
|
60 algorithm is fast and greedy and requires all weights to be
|
|
61 positive. All weights in the gene coordinate system graph are
|
|
62 currently equal (1) making the graph unweighted. That makes the use of
|
|
63 Dijkstra's algorithm an overkill. A impler and faster breadth-first
|
|
64 would be enough. Luckily the difference for small graphs is not
|
|
65 signigicant and the implementation is capable to take weights into
|
|
66 account if needed at some later time.
|
|
67
|
|
68 =head2 Input format
|
|
69
|
|
70 The graph needs to be primed using a hash of hashes where there is a
|
|
71 key for each node. The second keys are the names of the downstream
|
|
72 neighboring nodes and values are the weights for reaching them. Here
|
|
73 is part of the gene coordiante system graph::
|
|
74
|
|
75
|
|
76 $hash = {
|
|
77 '6' => undef,
|
|
78 '3' => {
|
|
79 '6' => 1
|
|
80 },
|
|
81 '2' => {
|
|
82 '6' => 1,
|
|
83 '4' => 1,
|
|
84 '3' => 1
|
|
85 },
|
|
86 '1' => {
|
|
87 '2' => 1
|
|
88 },
|
|
89 '4' => {
|
|
90 '5' => 1
|
|
91 },
|
|
92 '5' => undef
|
|
93 };
|
|
94
|
|
95
|
|
96 Note that the names need to be positive integrers. Root should be '1'
|
|
97 and directness of the graph is taken advantage of to speed
|
|
98 calculations by assuming that downsream nodes always have larger
|
|
99 number as name.
|
|
100
|
|
101 An alternative (shorter) way of describing input is to use hash of
|
|
102 arrays. See L<Bio::Coordinate::Graph::hash_of_arrays>.
|
|
103
|
|
104
|
|
105 =head1 FEEDBACK
|
|
106
|
|
107 =head2 Mailing Lists
|
|
108
|
|
109 User feedback is an integral part of the evolution of this and other
|
|
110 Bioperl modules. Send your comments and suggestions preferably to the
|
|
111 Bioperl mailing lists Your participation is much appreciated.
|
|
112
|
|
113 bioperl-l@bioperl.org - General discussion
|
|
114 http://bio.perl.org/MailList.html - About the mailing lists
|
|
115
|
|
116 =head2 Reporting Bugs
|
|
117
|
|
118 report bugs to the Bioperl bug tracking system to help us keep track
|
|
119 the bugs and their resolution. Bug reports can be submitted via
|
|
120 email or the web:
|
|
121
|
|
122 bioperl-bugs@bio.perl.org
|
|
123 http://bugzilla.bioperl.org/
|
|
124
|
|
125 =head1 AUTHOR - Heikki Lehvaslaiho
|
|
126
|
|
127 Email: heikki@ebi.ac.uk
|
|
128 Address:
|
|
129
|
|
130 EMBL Outstation, European Bioinformatics Institute
|
|
131 Wellcome Trust Genome Campus, Hinxton
|
|
132 Cambs. CB10 1SD, United Kingdom
|
|
133
|
|
134 =head1 APPENDIX
|
|
135
|
|
136 The rest of the documentation details each of the object
|
|
137 methods. Internal methods are usually preceded with a _
|
|
138
|
|
139 =cut
|
|
140
|
|
141
|
|
142 # Let the code begin...
|
|
143
|
|
144 package Bio::Coordinate::Graph;
|
|
145 use vars qw(@ISA );
|
|
146 use strict;
|
|
147
|
|
148 # Object preamble - inherits from Bio::Root::Root
|
|
149 use Bio::Root::Root;
|
|
150
|
|
151 @ISA = qw(Bio::Root::Root);
|
|
152
|
|
153
|
|
154 sub new {
|
|
155 my($class,@args) = @_;
|
|
156 my $self = $class->SUPER::new(@args);
|
|
157
|
|
158 my($graph, $hasharray) =
|
|
159 $self->_rearrange([qw(
|
|
160 GRAPH
|
|
161 HASHARRAY
|
|
162 )],
|
|
163 @args);
|
|
164
|
|
165 $graph && $self->graph($graph);
|
|
166 $hasharray && $self->hasharray($hasharray);
|
|
167
|
|
168 $self->{'_root'} = undef;
|
|
169
|
|
170 return $self; # success - we hope!
|
|
171
|
|
172 }
|
|
173
|
|
174 =head2 Graph structure input methods
|
|
175
|
|
176 =cut
|
|
177
|
|
178 =head2 graph
|
|
179
|
|
180 Title : graph
|
|
181 Usage : $obj->graph($my_graph)
|
|
182 Function: Read/write method for the graph structure
|
|
183 Example :
|
|
184 Returns : hash of hashes grah structure
|
|
185 Args : reference to a hash of hashes
|
|
186
|
|
187 =cut
|
|
188
|
|
189 sub graph {
|
|
190
|
|
191 my ($self,$value) = @_;
|
|
192
|
|
193 if ($value) {
|
|
194 $self->throw("Need a hash of hashes")
|
|
195 unless ref($value) eq 'HASH' ;
|
|
196 $self->{'_dag'} = $value;
|
|
197
|
|
198 # empty the cache
|
|
199 $self->{'_root'} = undef;
|
|
200
|
|
201 }
|
|
202
|
|
203 return $self->{'_dag'};
|
|
204
|
|
205 }
|
|
206
|
|
207
|
|
208 =head2 hash_of_arrays
|
|
209
|
|
210 Title : hash_of_arrays
|
|
211 Usage : $obj->hash_of_array(%hasharray)
|
|
212 Function: An alternative method to read in the graph structure.
|
|
213 Hash arrays are easier to type. This method converts
|
|
214 arrays into hashes and assigns equal values "1" to
|
|
215 weights.
|
|
216
|
|
217 Example : Here is an example of simple structure containing a graph.
|
|
218
|
|
219 my $DAG = {
|
|
220 6 => [],
|
|
221 5 => [],
|
|
222 4 => [5],
|
|
223 3 => [6],
|
|
224 2 => [3, 4, 6],
|
|
225 1 => [2]
|
|
226 };
|
|
227
|
|
228 Returns : hash of hashes graph structure
|
|
229 Args : reference to a hash of arrays
|
|
230
|
|
231 =cut
|
|
232
|
|
233 sub hash_of_arrays {
|
|
234
|
|
235 my ($self,$value) = @_;
|
|
236
|
|
237 # empty the cache
|
|
238 $self->{'_root'} = undef;
|
|
239
|
|
240 if ($value) {
|
|
241
|
|
242 $self->throw("Need a hash of hashes")
|
|
243 unless ref($value) eq 'HASH' ;
|
|
244
|
|
245 #copy the hash of arrays into a hash of hashes;
|
|
246 my %hash;
|
|
247 foreach my $start ( keys %{$value}){
|
|
248 $hash{$start} = undef;
|
|
249 map { $hash{$start}{$_} = 1 } @{$value->{$start}};
|
|
250 }
|
|
251
|
|
252 $self->{'_dag'} = \%hash;
|
|
253 }
|
|
254
|
|
255 return $self->{'_dag'};
|
|
256
|
|
257 }
|
|
258
|
|
259 =head2 Methods for determining the shortest path in the graph
|
|
260
|
|
261 =cut
|
|
262
|
|
263 =head2 shortest_path
|
|
264
|
|
265 Title : shortest_path
|
|
266 Usage : $obj->shortest_path($a, $b);
|
|
267 Function: Method for retrieving the shortest path between nodes.
|
|
268 If the start node remains the same, the method is sometimes
|
|
269 able to use cached results, otherwise it will recalculate
|
|
270 the paths.
|
|
271 Example :
|
|
272 Returns : array of node names, only the start node name if no path
|
|
273 Args : name of the start node
|
|
274 : name of the end node
|
|
275
|
|
276 =cut
|
|
277
|
|
278
|
|
279 sub shortest_path {
|
|
280 my ($self, $root, $end) = @_;
|
|
281
|
|
282 $self->throw("Two arguments needed") unless @_ == 3;
|
|
283 $self->throw("No node name [$root]")
|
|
284 unless exists $self->{'_dag'}->{$root};
|
|
285 $self->throw("No node name [$end]")
|
|
286 unless exists $self->{'_dag'}->{$end};
|
|
287
|
|
288 my @res; # results
|
|
289 my $reverse;
|
|
290
|
|
291 if ($root > $end) {
|
|
292 ($root, $end) = ($end, $root );
|
|
293 $reverse++;
|
|
294 }
|
|
295
|
|
296 # try to use cached paths
|
|
297 $self->dijkstra($root) unless
|
|
298 defined $self->{'_root'} and $self->{'_root'} eq $root;
|
|
299
|
|
300 return @res unless $self->{'_paths'} ;
|
|
301
|
|
302 # create the list
|
|
303 my $node = $end;
|
|
304 my $prev = $self->{'_paths'}->{$end}{'prev'};
|
|
305 while ($prev) {
|
|
306 unshift @res, $node;
|
|
307 $node = $self->{'_paths'}->{$node}{'prev'};
|
|
308 $prev = $self->{'_paths'}->{$node}{'prev'};
|
|
309 }
|
|
310 unshift @res, $node;
|
|
311
|
|
312 $reverse ? return reverse @res : return @res;
|
|
313 }
|
|
314
|
|
315
|
|
316 =head2 dijkstra
|
|
317
|
|
318 Title : dijkstra
|
|
319 Usage : $graph->dijkstra(1);
|
|
320 Function: Implements Dijkstra's algorithm.
|
|
321 Returns or sets a list of mappers. The returned path
|
|
322 description is always directed down from the root.
|
|
323 Called from shortest_path().
|
|
324 Example :
|
|
325 Returns : Reference to a hash of hashes representing a linked list
|
|
326 which contains shortest path down to all nodes from the start
|
|
327 node. E.g.:
|
|
328
|
|
329 $res = {
|
|
330 '2' => {
|
|
331 'prev' => '1',
|
|
332 'dist' => 1
|
|
333 },
|
|
334 '1' => {
|
|
335 'prev' => undef,
|
|
336 'dist' => 0
|
|
337 },
|
|
338 };
|
|
339
|
|
340 Args : name of the start node
|
|
341
|
|
342 =cut
|
|
343
|
|
344 sub dijkstra {
|
|
345 my ($self,$root) = @_;
|
|
346
|
|
347 $self->throw("I need the name of the root node input") unless $root;
|
|
348 $self->throw("No node name [$root]")
|
|
349 unless exists $self->{'_dag'}->{$root};
|
|
350
|
|
351 my %est = (); # estimate hash
|
|
352 my %res = (); # result hash
|
|
353 my $nodes = keys %{$self->{'_dag'}};
|
|
354 my $maxdist = 1000000;
|
|
355
|
|
356 # cache the root value
|
|
357 $self->{'_root'} = $root;
|
|
358
|
|
359 foreach my $node ( keys %{$self->{'_dag'}} ){
|
|
360 if ($node eq $root) {
|
|
361 $est{$node}{'prev'} = undef;
|
|
362 $est{$node}{'dist'} = 0;
|
|
363 } else {
|
|
364 $est{$node}{'prev'} = undef;
|
|
365 $est{$node}{'dist'} = $maxdist;
|
|
366 }
|
|
367 }
|
|
368
|
|
369 # remove nodes from %est until it is empty
|
|
370 while (keys %est) {
|
|
371
|
|
372 #select the node closest to current one, or root node
|
|
373 my $min_node;
|
|
374 my $min = $maxdist;
|
|
375 foreach my $node (reverse sort keys %est) {
|
|
376 if ( $est{$node}{'dist'} < $min ) {
|
|
377 $min = $est{$node}{'dist'};
|
|
378 $min_node = $node;
|
|
379 }
|
|
380 }
|
|
381
|
|
382 # no more links between nodes
|
|
383 last unless ($min_node);
|
|
384
|
|
385 # move the node from %est into %res;
|
|
386 $res{$min_node} = delete $est{$min_node};
|
|
387
|
|
388 # recompute distances to the neighbours
|
|
389 my $dist = $res{$min_node}{'dist'};
|
|
390 foreach my $neighbour ( keys %{$self->{'_dag'}->{$min_node}} ){
|
|
391 next unless $est{$neighbour}; # might not be there any more
|
|
392 $est{$neighbour}{'prev'} = $min_node;
|
|
393 $est{$neighbour}{'dist'} =
|
|
394 $dist + $self->{'_dag'}{$min_node}{$neighbour}
|
|
395 if $est{$neighbour}{'dist'} > $dist + 1 ;
|
|
396 }
|
|
397 }
|
|
398 return $self->{'_paths'} = \%res;
|
|
399 }
|
|
400
|
|
401
|
|
402 1;
|
|
403
|