comparison variant_effect_predictor/Bio/Coordinate/Graph.pm @ 0:21066c0abaf5 draft

Uploaded
author willmclaren
date Fri, 03 Aug 2012 10:04:48 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:21066c0abaf5
1 # $Id: Graph.pm,v 1.2.2.2 2003/09/08 12:16:18 heikki Exp $
2 #
3 # bioperl module for Bio::Coordinate::Graph
4 #
5 # Cared for by Heikki Lehvaslaiho <heikki@ebi.ac.uk>
6 #
7 # Copyright Heikki Lehvaslaiho
8 #
9 # You may distribute this module under the same terms as perl itself
10
11 # POD documentation - main docs before the code
12
13 =head1 NAME
14
15 Bio::Coordinate::Graph - Finds shortest path between nodes in a graph
16
17 =head1 SYNOPSIS
18
19 # get a hash of hashes representing the graph. E.g.:
20 my $hash= {
21 '1' => {
22 '2' => 1
23 },
24 '2' => {
25 '4' => 1,
26 '3' => 1
27 },
28 '3' => undef,
29 '4' => {
30 '5' => 1
31 },
32 '5' => undef
33 };
34
35 # create the object;
36 my $graph = Bio::Coordinate::Graph->new(-graph => $hash);
37
38 # find the shortest path between two nodes
39 my $a = 1;
40 my $b = 6;
41 my @path = $graph->shortest_paths($a);
42 print join (", ", @path), "\n";
43
44
45 =head1 DESCRIPTION
46
47 This class calculates the shortest path between input and output
48 coordinate systems in a graph that defines the relationships between
49 them. This class is primarely designed to analyze gene-related
50 coordinate systems. See L<Bio::Coordinate::GeneMapper>.
51
52 Note that this module can not be used to manage graphs.
53
54 Technically the graph implemented here is known as Directed Acyclic
55 Graph (DAG). DAG is composed of vertices (nodes) and edges (with
56 optional weights) linking them. Nodes of the graph are the coordinate
57 systems in gene mapper.
58
59 The shortest path is found using the Dijkstra's algorithm. This
60 algorithm is fast and greedy and requires all weights to be
61 positive. All weights in the gene coordinate system graph are
62 currently equal (1) making the graph unweighted. That makes the use of
63 Dijkstra's algorithm an overkill. A impler and faster breadth-first
64 would be enough. Luckily the difference for small graphs is not
65 signigicant and the implementation is capable to take weights into
66 account if needed at some later time.
67
68 =head2 Input format
69
70 The graph needs to be primed using a hash of hashes where there is a
71 key for each node. The second keys are the names of the downstream
72 neighboring nodes and values are the weights for reaching them. Here
73 is part of the gene coordiante system graph::
74
75
76 $hash = {
77 '6' => undef,
78 '3' => {
79 '6' => 1
80 },
81 '2' => {
82 '6' => 1,
83 '4' => 1,
84 '3' => 1
85 },
86 '1' => {
87 '2' => 1
88 },
89 '4' => {
90 '5' => 1
91 },
92 '5' => undef
93 };
94
95
96 Note that the names need to be positive integrers. Root should be '1'
97 and directness of the graph is taken advantage of to speed
98 calculations by assuming that downsream nodes always have larger
99 number as name.
100
101 An alternative (shorter) way of describing input is to use hash of
102 arrays. See L<Bio::Coordinate::Graph::hash_of_arrays>.
103
104
105 =head1 FEEDBACK
106
107 =head2 Mailing Lists
108
109 User feedback is an integral part of the evolution of this and other
110 Bioperl modules. Send your comments and suggestions preferably to the
111 Bioperl mailing lists Your participation is much appreciated.
112
113 bioperl-l@bioperl.org - General discussion
114 http://bio.perl.org/MailList.html - About the mailing lists
115
116 =head2 Reporting Bugs
117
118 report bugs to the Bioperl bug tracking system to help us keep track
119 the bugs and their resolution. Bug reports can be submitted via
120 email or the web:
121
122 bioperl-bugs@bio.perl.org
123 http://bugzilla.bioperl.org/
124
125 =head1 AUTHOR - Heikki Lehvaslaiho
126
127 Email: heikki@ebi.ac.uk
128 Address:
129
130 EMBL Outstation, European Bioinformatics Institute
131 Wellcome Trust Genome Campus, Hinxton
132 Cambs. CB10 1SD, United Kingdom
133
134 =head1 APPENDIX
135
136 The rest of the documentation details each of the object
137 methods. Internal methods are usually preceded with a _
138
139 =cut
140
141
142 # Let the code begin...
143
144 package Bio::Coordinate::Graph;
145 use vars qw(@ISA );
146 use strict;
147
148 # Object preamble - inherits from Bio::Root::Root
149 use Bio::Root::Root;
150
151 @ISA = qw(Bio::Root::Root);
152
153
154 sub new {
155 my($class,@args) = @_;
156 my $self = $class->SUPER::new(@args);
157
158 my($graph, $hasharray) =
159 $self->_rearrange([qw(
160 GRAPH
161 HASHARRAY
162 )],
163 @args);
164
165 $graph && $self->graph($graph);
166 $hasharray && $self->hasharray($hasharray);
167
168 $self->{'_root'} = undef;
169
170 return $self; # success - we hope!
171
172 }
173
174 =head2 Graph structure input methods
175
176 =cut
177
178 =head2 graph
179
180 Title : graph
181 Usage : $obj->graph($my_graph)
182 Function: Read/write method for the graph structure
183 Example :
184 Returns : hash of hashes grah structure
185 Args : reference to a hash of hashes
186
187 =cut
188
189 sub graph {
190
191 my ($self,$value) = @_;
192
193 if ($value) {
194 $self->throw("Need a hash of hashes")
195 unless ref($value) eq 'HASH' ;
196 $self->{'_dag'} = $value;
197
198 # empty the cache
199 $self->{'_root'} = undef;
200
201 }
202
203 return $self->{'_dag'};
204
205 }
206
207
208 =head2 hash_of_arrays
209
210 Title : hash_of_arrays
211 Usage : $obj->hash_of_array(%hasharray)
212 Function: An alternative method to read in the graph structure.
213 Hash arrays are easier to type. This method converts
214 arrays into hashes and assigns equal values "1" to
215 weights.
216
217 Example : Here is an example of simple structure containing a graph.
218
219 my $DAG = {
220 6 => [],
221 5 => [],
222 4 => [5],
223 3 => [6],
224 2 => [3, 4, 6],
225 1 => [2]
226 };
227
228 Returns : hash of hashes graph structure
229 Args : reference to a hash of arrays
230
231 =cut
232
233 sub hash_of_arrays {
234
235 my ($self,$value) = @_;
236
237 # empty the cache
238 $self->{'_root'} = undef;
239
240 if ($value) {
241
242 $self->throw("Need a hash of hashes")
243 unless ref($value) eq 'HASH' ;
244
245 #copy the hash of arrays into a hash of hashes;
246 my %hash;
247 foreach my $start ( keys %{$value}){
248 $hash{$start} = undef;
249 map { $hash{$start}{$_} = 1 } @{$value->{$start}};
250 }
251
252 $self->{'_dag'} = \%hash;
253 }
254
255 return $self->{'_dag'};
256
257 }
258
259 =head2 Methods for determining the shortest path in the graph
260
261 =cut
262
263 =head2 shortest_path
264
265 Title : shortest_path
266 Usage : $obj->shortest_path($a, $b);
267 Function: Method for retrieving the shortest path between nodes.
268 If the start node remains the same, the method is sometimes
269 able to use cached results, otherwise it will recalculate
270 the paths.
271 Example :
272 Returns : array of node names, only the start node name if no path
273 Args : name of the start node
274 : name of the end node
275
276 =cut
277
278
279 sub shortest_path {
280 my ($self, $root, $end) = @_;
281
282 $self->throw("Two arguments needed") unless @_ == 3;
283 $self->throw("No node name [$root]")
284 unless exists $self->{'_dag'}->{$root};
285 $self->throw("No node name [$end]")
286 unless exists $self->{'_dag'}->{$end};
287
288 my @res; # results
289 my $reverse;
290
291 if ($root > $end) {
292 ($root, $end) = ($end, $root );
293 $reverse++;
294 }
295
296 # try to use cached paths
297 $self->dijkstra($root) unless
298 defined $self->{'_root'} and $self->{'_root'} eq $root;
299
300 return @res unless $self->{'_paths'} ;
301
302 # create the list
303 my $node = $end;
304 my $prev = $self->{'_paths'}->{$end}{'prev'};
305 while ($prev) {
306 unshift @res, $node;
307 $node = $self->{'_paths'}->{$node}{'prev'};
308 $prev = $self->{'_paths'}->{$node}{'prev'};
309 }
310 unshift @res, $node;
311
312 $reverse ? return reverse @res : return @res;
313 }
314
315
316 =head2 dijkstra
317
318 Title : dijkstra
319 Usage : $graph->dijkstra(1);
320 Function: Implements Dijkstra's algorithm.
321 Returns or sets a list of mappers. The returned path
322 description is always directed down from the root.
323 Called from shortest_path().
324 Example :
325 Returns : Reference to a hash of hashes representing a linked list
326 which contains shortest path down to all nodes from the start
327 node. E.g.:
328
329 $res = {
330 '2' => {
331 'prev' => '1',
332 'dist' => 1
333 },
334 '1' => {
335 'prev' => undef,
336 'dist' => 0
337 },
338 };
339
340 Args : name of the start node
341
342 =cut
343
344 sub dijkstra {
345 my ($self,$root) = @_;
346
347 $self->throw("I need the name of the root node input") unless $root;
348 $self->throw("No node name [$root]")
349 unless exists $self->{'_dag'}->{$root};
350
351 my %est = (); # estimate hash
352 my %res = (); # result hash
353 my $nodes = keys %{$self->{'_dag'}};
354 my $maxdist = 1000000;
355
356 # cache the root value
357 $self->{'_root'} = $root;
358
359 foreach my $node ( keys %{$self->{'_dag'}} ){
360 if ($node eq $root) {
361 $est{$node}{'prev'} = undef;
362 $est{$node}{'dist'} = 0;
363 } else {
364 $est{$node}{'prev'} = undef;
365 $est{$node}{'dist'} = $maxdist;
366 }
367 }
368
369 # remove nodes from %est until it is empty
370 while (keys %est) {
371
372 #select the node closest to current one, or root node
373 my $min_node;
374 my $min = $maxdist;
375 foreach my $node (reverse sort keys %est) {
376 if ( $est{$node}{'dist'} < $min ) {
377 $min = $est{$node}{'dist'};
378 $min_node = $node;
379 }
380 }
381
382 # no more links between nodes
383 last unless ($min_node);
384
385 # move the node from %est into %res;
386 $res{$min_node} = delete $est{$min_node};
387
388 # recompute distances to the neighbours
389 my $dist = $res{$min_node}{'dist'};
390 foreach my $neighbour ( keys %{$self->{'_dag'}->{$min_node}} ){
391 next unless $est{$neighbour}; # might not be there any more
392 $est{$neighbour}{'prev'} = $min_node;
393 $est{$neighbour}{'dist'} =
394 $dist + $self->{'_dag'}{$min_node}{$neighbour}
395 if $est{$neighbour}{'dist'} > $dist + 1 ;
396 }
397 }
398 return $self->{'_paths'} = \%res;
399 }
400
401
402 1;
403