Mercurial > repos > willmclaren > ensembl_vep
comparison variant_effect_predictor/Bio/Coordinate/Graph.pm @ 0:21066c0abaf5 draft
Uploaded
author | willmclaren |
---|---|
date | Fri, 03 Aug 2012 10:04:48 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:21066c0abaf5 |
---|---|
1 # $Id: Graph.pm,v 1.2.2.2 2003/09/08 12:16:18 heikki Exp $ | |
2 # | |
3 # bioperl module for Bio::Coordinate::Graph | |
4 # | |
5 # Cared for by Heikki Lehvaslaiho <heikki@ebi.ac.uk> | |
6 # | |
7 # Copyright Heikki Lehvaslaiho | |
8 # | |
9 # You may distribute this module under the same terms as perl itself | |
10 | |
11 # POD documentation - main docs before the code | |
12 | |
13 =head1 NAME | |
14 | |
15 Bio::Coordinate::Graph - Finds shortest path between nodes in a graph | |
16 | |
17 =head1 SYNOPSIS | |
18 | |
19 # get a hash of hashes representing the graph. E.g.: | |
20 my $hash= { | |
21 '1' => { | |
22 '2' => 1 | |
23 }, | |
24 '2' => { | |
25 '4' => 1, | |
26 '3' => 1 | |
27 }, | |
28 '3' => undef, | |
29 '4' => { | |
30 '5' => 1 | |
31 }, | |
32 '5' => undef | |
33 }; | |
34 | |
35 # create the object; | |
36 my $graph = Bio::Coordinate::Graph->new(-graph => $hash); | |
37 | |
38 # find the shortest path between two nodes | |
39 my $a = 1; | |
40 my $b = 6; | |
41 my @path = $graph->shortest_paths($a); | |
42 print join (", ", @path), "\n"; | |
43 | |
44 | |
45 =head1 DESCRIPTION | |
46 | |
47 This class calculates the shortest path between input and output | |
48 coordinate systems in a graph that defines the relationships between | |
49 them. This class is primarely designed to analyze gene-related | |
50 coordinate systems. See L<Bio::Coordinate::GeneMapper>. | |
51 | |
52 Note that this module can not be used to manage graphs. | |
53 | |
54 Technically the graph implemented here is known as Directed Acyclic | |
55 Graph (DAG). DAG is composed of vertices (nodes) and edges (with | |
56 optional weights) linking them. Nodes of the graph are the coordinate | |
57 systems in gene mapper. | |
58 | |
59 The shortest path is found using the Dijkstra's algorithm. This | |
60 algorithm is fast and greedy and requires all weights to be | |
61 positive. All weights in the gene coordinate system graph are | |
62 currently equal (1) making the graph unweighted. That makes the use of | |
63 Dijkstra's algorithm an overkill. A impler and faster breadth-first | |
64 would be enough. Luckily the difference for small graphs is not | |
65 signigicant and the implementation is capable to take weights into | |
66 account if needed at some later time. | |
67 | |
68 =head2 Input format | |
69 | |
70 The graph needs to be primed using a hash of hashes where there is a | |
71 key for each node. The second keys are the names of the downstream | |
72 neighboring nodes and values are the weights for reaching them. Here | |
73 is part of the gene coordiante system graph:: | |
74 | |
75 | |
76 $hash = { | |
77 '6' => undef, | |
78 '3' => { | |
79 '6' => 1 | |
80 }, | |
81 '2' => { | |
82 '6' => 1, | |
83 '4' => 1, | |
84 '3' => 1 | |
85 }, | |
86 '1' => { | |
87 '2' => 1 | |
88 }, | |
89 '4' => { | |
90 '5' => 1 | |
91 }, | |
92 '5' => undef | |
93 }; | |
94 | |
95 | |
96 Note that the names need to be positive integrers. Root should be '1' | |
97 and directness of the graph is taken advantage of to speed | |
98 calculations by assuming that downsream nodes always have larger | |
99 number as name. | |
100 | |
101 An alternative (shorter) way of describing input is to use hash of | |
102 arrays. See L<Bio::Coordinate::Graph::hash_of_arrays>. | |
103 | |
104 | |
105 =head1 FEEDBACK | |
106 | |
107 =head2 Mailing Lists | |
108 | |
109 User feedback is an integral part of the evolution of this and other | |
110 Bioperl modules. Send your comments and suggestions preferably to the | |
111 Bioperl mailing lists Your participation is much appreciated. | |
112 | |
113 bioperl-l@bioperl.org - General discussion | |
114 http://bio.perl.org/MailList.html - About the mailing lists | |
115 | |
116 =head2 Reporting Bugs | |
117 | |
118 report bugs to the Bioperl bug tracking system to help us keep track | |
119 the bugs and their resolution. Bug reports can be submitted via | |
120 email or the web: | |
121 | |
122 bioperl-bugs@bio.perl.org | |
123 http://bugzilla.bioperl.org/ | |
124 | |
125 =head1 AUTHOR - Heikki Lehvaslaiho | |
126 | |
127 Email: heikki@ebi.ac.uk | |
128 Address: | |
129 | |
130 EMBL Outstation, European Bioinformatics Institute | |
131 Wellcome Trust Genome Campus, Hinxton | |
132 Cambs. CB10 1SD, United Kingdom | |
133 | |
134 =head1 APPENDIX | |
135 | |
136 The rest of the documentation details each of the object | |
137 methods. Internal methods are usually preceded with a _ | |
138 | |
139 =cut | |
140 | |
141 | |
142 # Let the code begin... | |
143 | |
144 package Bio::Coordinate::Graph; | |
145 use vars qw(@ISA ); | |
146 use strict; | |
147 | |
148 # Object preamble - inherits from Bio::Root::Root | |
149 use Bio::Root::Root; | |
150 | |
151 @ISA = qw(Bio::Root::Root); | |
152 | |
153 | |
154 sub new { | |
155 my($class,@args) = @_; | |
156 my $self = $class->SUPER::new(@args); | |
157 | |
158 my($graph, $hasharray) = | |
159 $self->_rearrange([qw( | |
160 GRAPH | |
161 HASHARRAY | |
162 )], | |
163 @args); | |
164 | |
165 $graph && $self->graph($graph); | |
166 $hasharray && $self->hasharray($hasharray); | |
167 | |
168 $self->{'_root'} = undef; | |
169 | |
170 return $self; # success - we hope! | |
171 | |
172 } | |
173 | |
174 =head2 Graph structure input methods | |
175 | |
176 =cut | |
177 | |
178 =head2 graph | |
179 | |
180 Title : graph | |
181 Usage : $obj->graph($my_graph) | |
182 Function: Read/write method for the graph structure | |
183 Example : | |
184 Returns : hash of hashes grah structure | |
185 Args : reference to a hash of hashes | |
186 | |
187 =cut | |
188 | |
189 sub graph { | |
190 | |
191 my ($self,$value) = @_; | |
192 | |
193 if ($value) { | |
194 $self->throw("Need a hash of hashes") | |
195 unless ref($value) eq 'HASH' ; | |
196 $self->{'_dag'} = $value; | |
197 | |
198 # empty the cache | |
199 $self->{'_root'} = undef; | |
200 | |
201 } | |
202 | |
203 return $self->{'_dag'}; | |
204 | |
205 } | |
206 | |
207 | |
208 =head2 hash_of_arrays | |
209 | |
210 Title : hash_of_arrays | |
211 Usage : $obj->hash_of_array(%hasharray) | |
212 Function: An alternative method to read in the graph structure. | |
213 Hash arrays are easier to type. This method converts | |
214 arrays into hashes and assigns equal values "1" to | |
215 weights. | |
216 | |
217 Example : Here is an example of simple structure containing a graph. | |
218 | |
219 my $DAG = { | |
220 6 => [], | |
221 5 => [], | |
222 4 => [5], | |
223 3 => [6], | |
224 2 => [3, 4, 6], | |
225 1 => [2] | |
226 }; | |
227 | |
228 Returns : hash of hashes graph structure | |
229 Args : reference to a hash of arrays | |
230 | |
231 =cut | |
232 | |
233 sub hash_of_arrays { | |
234 | |
235 my ($self,$value) = @_; | |
236 | |
237 # empty the cache | |
238 $self->{'_root'} = undef; | |
239 | |
240 if ($value) { | |
241 | |
242 $self->throw("Need a hash of hashes") | |
243 unless ref($value) eq 'HASH' ; | |
244 | |
245 #copy the hash of arrays into a hash of hashes; | |
246 my %hash; | |
247 foreach my $start ( keys %{$value}){ | |
248 $hash{$start} = undef; | |
249 map { $hash{$start}{$_} = 1 } @{$value->{$start}}; | |
250 } | |
251 | |
252 $self->{'_dag'} = \%hash; | |
253 } | |
254 | |
255 return $self->{'_dag'}; | |
256 | |
257 } | |
258 | |
259 =head2 Methods for determining the shortest path in the graph | |
260 | |
261 =cut | |
262 | |
263 =head2 shortest_path | |
264 | |
265 Title : shortest_path | |
266 Usage : $obj->shortest_path($a, $b); | |
267 Function: Method for retrieving the shortest path between nodes. | |
268 If the start node remains the same, the method is sometimes | |
269 able to use cached results, otherwise it will recalculate | |
270 the paths. | |
271 Example : | |
272 Returns : array of node names, only the start node name if no path | |
273 Args : name of the start node | |
274 : name of the end node | |
275 | |
276 =cut | |
277 | |
278 | |
279 sub shortest_path { | |
280 my ($self, $root, $end) = @_; | |
281 | |
282 $self->throw("Two arguments needed") unless @_ == 3; | |
283 $self->throw("No node name [$root]") | |
284 unless exists $self->{'_dag'}->{$root}; | |
285 $self->throw("No node name [$end]") | |
286 unless exists $self->{'_dag'}->{$end}; | |
287 | |
288 my @res; # results | |
289 my $reverse; | |
290 | |
291 if ($root > $end) { | |
292 ($root, $end) = ($end, $root ); | |
293 $reverse++; | |
294 } | |
295 | |
296 # try to use cached paths | |
297 $self->dijkstra($root) unless | |
298 defined $self->{'_root'} and $self->{'_root'} eq $root; | |
299 | |
300 return @res unless $self->{'_paths'} ; | |
301 | |
302 # create the list | |
303 my $node = $end; | |
304 my $prev = $self->{'_paths'}->{$end}{'prev'}; | |
305 while ($prev) { | |
306 unshift @res, $node; | |
307 $node = $self->{'_paths'}->{$node}{'prev'}; | |
308 $prev = $self->{'_paths'}->{$node}{'prev'}; | |
309 } | |
310 unshift @res, $node; | |
311 | |
312 $reverse ? return reverse @res : return @res; | |
313 } | |
314 | |
315 | |
316 =head2 dijkstra | |
317 | |
318 Title : dijkstra | |
319 Usage : $graph->dijkstra(1); | |
320 Function: Implements Dijkstra's algorithm. | |
321 Returns or sets a list of mappers. The returned path | |
322 description is always directed down from the root. | |
323 Called from shortest_path(). | |
324 Example : | |
325 Returns : Reference to a hash of hashes representing a linked list | |
326 which contains shortest path down to all nodes from the start | |
327 node. E.g.: | |
328 | |
329 $res = { | |
330 '2' => { | |
331 'prev' => '1', | |
332 'dist' => 1 | |
333 }, | |
334 '1' => { | |
335 'prev' => undef, | |
336 'dist' => 0 | |
337 }, | |
338 }; | |
339 | |
340 Args : name of the start node | |
341 | |
342 =cut | |
343 | |
344 sub dijkstra { | |
345 my ($self,$root) = @_; | |
346 | |
347 $self->throw("I need the name of the root node input") unless $root; | |
348 $self->throw("No node name [$root]") | |
349 unless exists $self->{'_dag'}->{$root}; | |
350 | |
351 my %est = (); # estimate hash | |
352 my %res = (); # result hash | |
353 my $nodes = keys %{$self->{'_dag'}}; | |
354 my $maxdist = 1000000; | |
355 | |
356 # cache the root value | |
357 $self->{'_root'} = $root; | |
358 | |
359 foreach my $node ( keys %{$self->{'_dag'}} ){ | |
360 if ($node eq $root) { | |
361 $est{$node}{'prev'} = undef; | |
362 $est{$node}{'dist'} = 0; | |
363 } else { | |
364 $est{$node}{'prev'} = undef; | |
365 $est{$node}{'dist'} = $maxdist; | |
366 } | |
367 } | |
368 | |
369 # remove nodes from %est until it is empty | |
370 while (keys %est) { | |
371 | |
372 #select the node closest to current one, or root node | |
373 my $min_node; | |
374 my $min = $maxdist; | |
375 foreach my $node (reverse sort keys %est) { | |
376 if ( $est{$node}{'dist'} < $min ) { | |
377 $min = $est{$node}{'dist'}; | |
378 $min_node = $node; | |
379 } | |
380 } | |
381 | |
382 # no more links between nodes | |
383 last unless ($min_node); | |
384 | |
385 # move the node from %est into %res; | |
386 $res{$min_node} = delete $est{$min_node}; | |
387 | |
388 # recompute distances to the neighbours | |
389 my $dist = $res{$min_node}{'dist'}; | |
390 foreach my $neighbour ( keys %{$self->{'_dag'}->{$min_node}} ){ | |
391 next unless $est{$neighbour}; # might not be there any more | |
392 $est{$neighbour}{'prev'} = $min_node; | |
393 $est{$neighbour}{'dist'} = | |
394 $dist + $self->{'_dag'}{$min_node}{$neighbour} | |
395 if $est{$neighbour}{'dist'} > $dist + 1 ; | |
396 } | |
397 } | |
398 return $self->{'_paths'} = \%res; | |
399 } | |
400 | |
401 | |
402 1; | |
403 |