Mercurial > repos > willmclaren > ensembl_vep

diff variant_effect_predictor/Bio/Coordinate/Graph.pm @ 0:21066c0abaf5 draft
Uploaded
author: willmclaren
date: Fri, 03 Aug 2012 10:04:48 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/Coordinate/Graph.pm	Fri Aug 03 10:04:48 2012 -0400
@@ -0,0 +1,403 @@
+# $Id: Graph.pm,v 1.2.2.2 2003/09/08 12:16:18 heikki Exp $
+#
+# bioperl module for Bio::Coordinate::Graph
+#
+# Cared for by Heikki Lehvaslaiho <heikki@ebi.ac.uk>
+#
+# Copyright Heikki Lehvaslaiho
+#
+# You may distribute this module under the same terms as perl itself
+
+# POD documentation - main docs before the code
+
+=head1 NAME
+
+Bio::Coordinate::Graph - Finds shortest path between nodes in a graph
+
+=head1 SYNOPSIS
+
+  # get a hash of hashes representing the graph. E.g.:
+  my $hash= {
+	     '1' => {
+		     '2' => 1
+		    },
+	     '2' => {
+		     '4' => 1,
+		     '3' => 1
+		    },
+	     '3' => undef,
+	     '4' => {
+		     '5' => 1
+		    },
+	     '5' => undef
+	    };
+
+  # create the object;
+  my $graph = Bio::Coordinate::Graph->new(-graph => $hash);
+
+  # find the shortest path between two nodes
+  my $a = 1;
+  my $b = 6;
+  my @path = $graph->shortest_paths($a);
+  print join (", ", @path), "\n";
+
+
+=head1 DESCRIPTION
+
+This class calculates the shortest path between input and output
+coordinate systems in a graph that defines the relationships between
+them. This class is primarely designed to analyze gene-related
+coordinate systems. See L<Bio::Coordinate::GeneMapper>.
+
+Note that this module can not be used to manage graphs.
+
+Technically the graph implemented here is known as Directed Acyclic
+Graph (DAG). DAG is composed of vertices (nodes) and edges (with
+optional weights) linking them. Nodes of the graph are the coordinate
+systems in gene mapper.
+
+The shortest path is found using the Dijkstra's algorithm. This
+algorithm is fast and greedy and requires all weights to be
+positive. All weights in the gene coordinate system graph are
+currently equal (1) making the graph unweighted. That makes the use of
+Dijkstra's algorithm an overkill. A impler and faster breadth-first
+would be enough. Luckily the difference for small graphs is not
+signigicant and the implementation is capable to take weights into
+account if needed at some later time.
+
+=head2 Input format
+
+The graph needs to be primed using a hash of hashes where there is a
+key for each node. The second keys are the names of the downstream
+neighboring nodes and values are the weights for reaching them. Here
+is part of the gene coordiante system graph::
+
+
+    $hash = {
+	     '6' => undef,
+	     '3' => {
+		     '6' => 1
+		    },
+	     '2' => {
+		     '6' => 1,
+		     '4' => 1,
+		     '3' => 1
+		    },
+	     '1' => {
+		     '2' => 1
+		    },
+	     '4' => {
+		     '5' => 1
+		    },
+	     '5' => undef
+	    };
+
+
+Note that the names need to be positive integrers. Root should be '1'
+and directness of the graph is taken advantage of to speed
+calculations by assuming that downsream nodes always have larger
+number as name.
+
+An alternative (shorter) way of describing input is to use hash of
+arrays. See L<Bio::Coordinate::Graph::hash_of_arrays>.
+
+
+=head1 FEEDBACK
+
+=head2 Mailing Lists
+
+User feedback is an integral part of the evolution of this and other
+Bioperl modules. Send your comments and suggestions preferably to the
+Bioperl mailing lists  Your participation is much appreciated.
+
+  bioperl-l@bioperl.org                         - General discussion
+  http://bio.perl.org/MailList.html             - About the mailing lists
+
+=head2 Reporting Bugs
+
+report bugs to the Bioperl bug tracking system to help us keep track
+ the bugs and their resolution.  Bug reports can be submitted via
+ email or the web:
+
+  bioperl-bugs@bio.perl.org
+  http://bugzilla.bioperl.org/
+
+=head1 AUTHOR - Heikki Lehvaslaiho
+
+Email:  heikki@ebi.ac.uk
+Address:
+
+     EMBL Outstation, European Bioinformatics Institute
+     Wellcome Trust Genome Campus, Hinxton
+     Cambs. CB10 1SD, United Kingdom
+
+=head1 APPENDIX
+
+The rest of the documentation details each of the object
+methods. Internal methods are usually preceded with a _
+
+=cut
+
+
+# Let the code begin...
+
+package Bio::Coordinate::Graph;
+use vars qw(@ISA );
+use strict;
+
+# Object preamble - inherits from Bio::Root::Root
+use Bio::Root::Root;
+
+@ISA = qw(Bio::Root::Root);
+
+
+sub new {
+    my($class,@args) = @_;
+    my $self = $class->SUPER::new(@args);
+
+    my($graph, $hasharray) =
+	$self->_rearrange([qw(
+                              GRAPH
+                              HASHARRAY
+			     )],
+			 @args);
+
+    $graph  && $self->graph($graph);
+    $hasharray  && $self->hasharray($hasharray);
+
+    $self->{'_root'} = undef;
+
+    return $self; # success - we hope!
+
+}
+
+=head2 Graph structure input methods
+
+=cut
+
+=head2 graph
+
+ Title   : graph
+ Usage   : $obj->graph($my_graph)
+ Function: Read/write method for the graph structure
+ Example : 
+ Returns : hash of hashes grah structure
+ Args    : reference to a hash of hashes
+
+=cut
+
+sub graph {
+
+  my ($self,$value) = @_;
+
+  if ($value) {
+      $self->throw("Need a hash of hashes")
+	  unless  ref($value) eq 'HASH' ;
+      $self->{'_dag'} = $value;
+
+      # empty the cache
+      $self->{'_root'} = undef;
+
+  }
+
+  return $self->{'_dag'};
+
+}
+
+
+=head2 hash_of_arrays
+
+ Title   : hash_of_arrays
+ Usage   : $obj->hash_of_array(%hasharray)
+ Function: An alternative method to read in the graph structure.
+           Hash arrays are easier to type. This method converts
+           arrays into hashes and assigns equal values "1" to
+           weights.
+
+ Example : Here is an example of simple structure containing a graph.
+
+           my $DAG = {
+	              6  => [],
+	              5  => [],
+	              4  => [5],
+	              3  => [6],
+	              2  => [3, 4, 6],
+	              1  => [2]
+	             };
+
+ Returns : hash of hashes graph structure
+ Args    : reference to a hash of arrays
+
+=cut
+
+sub hash_of_arrays {
+
+  my ($self,$value) = @_;
+
+  # empty the cache
+  $self->{'_root'} = undef;
+
+  if ($value) {
+
+      $self->throw("Need a hash of hashes")
+	  unless  ref($value) eq 'HASH' ;
+
+      #copy the hash of arrays into a hash of hashes;
+      my %hash;
+      foreach my $start ( keys %{$value}){
+	  $hash{$start} = undef;
+	  map { $hash{$start}{$_} = 1 } @{$value->{$start}};
+      }
+
+      $self->{'_dag'} = \%hash;
+  }
+
+  return $self->{'_dag'};
+
+}
+
+=head2 Methods for determining the shortest path in the graph
+
+=cut
+
+=head2 shortest_path
+
+ Title   : shortest_path
+ Usage   : $obj->shortest_path($a, $b);
+ Function: Method for retrieving the shortest path between nodes.
+           If the start node remains the same, the method is sometimes
+           able to use cached results, otherwise it will recalculate
+           the paths.
+ Example : 
+ Returns : array of node names, only the start node name if no path
+ Args    : name of the start node
+         : name of the end node
+
+=cut
+
+
+sub shortest_path {
+    my ($self, $root, $end) = @_;
+
+    $self->throw("Two arguments needed") unless @_ == 3;
+    $self->throw("No node name [$root]")
+	unless exists $self->{'_dag'}->{$root};
+    $self->throw("No node name [$end]")
+	unless exists $self->{'_dag'}->{$end};
+
+    my @res;     # results
+    my $reverse;
+
+    if ($root > $end) {
+	($root, $end) = ($end, $root );
+	$reverse++;
+    }
+
+    # try to use cached paths
+    $self->dijkstra($root) unless
+	defined $self->{'_root'} and $self->{'_root'} eq $root;
+
+    return @res unless $self->{'_paths'} ;
+
+    # create the list
+    my $node = $end;
+    my $prev = $self->{'_paths'}->{$end}{'prev'};
+    while ($prev) {
+	unshift @res, $node;
+	$node = $self->{'_paths'}->{$node}{'prev'};
+	$prev = $self->{'_paths'}->{$node}{'prev'};
+    }
+    unshift @res, $node;
+
+    $reverse ? return reverse @res : return @res;
+}
+
+
+=head2 dijkstra
+
+ Title   : dijkstra
+ Usage   : $graph->dijkstra(1);
+ Function: Implements Dijkstra's algorithm.
+           Returns or sets a list of mappers. The returned path
+           description is always directed down from the root.
+           Called from shortest_path().
+ Example : 
+ Returns : Reference to a hash of hashes representing a linked list
+           which contains shortest path down to all nodes from the start
+           node. E.g.:
+
+            $res = {
+                      '2' => {
+                               'prev' => '1',
+                               'dist' => 1
+                             },
+                      '1' => {
+                               'prev' => undef,
+                               'dist' => 0
+                             },
+                    };
+
+ Args    : name of the start node
+
+=cut
+
+sub dijkstra {
+    my ($self,$root) = @_;
+
+    $self->throw("I need the name of the root node input") unless $root;
+    $self->throw("No node name [$root]")
+	unless exists $self->{'_dag'}->{$root};
+
+    my %est = ();          # estimate hash
+    my %res = ();          # result hash
+    my $nodes = keys %{$self->{'_dag'}};
+    my $maxdist = 1000000;
+
+    # cache the root value
+    $self->{'_root'} = $root;
+
+    foreach my $node ( keys %{$self->{'_dag'}} ){
+	if ($node eq $root) {
+	    $est{$node}{'prev'} = undef;
+	    $est{$node}{'dist'} = 0;
+	} else {
+	    $est{$node}{'prev'} = undef;
+	    $est{$node}{'dist'} = $maxdist;
+	}
+    }
+
+    # remove nodes from %est until it is empty
+    while (keys %est) {
+
+	#select the node closest to current one, or root node
+	my $min_node;
+	my $min = $maxdist;
+	foreach my $node (reverse sort keys %est) {
+	    if ( $est{$node}{'dist'} < $min ) {
+		$min = $est{$node}{'dist'};
+		$min_node = $node;
+	    }
+	}
+
+	# no more links between nodes
+	last unless ($min_node);
+
+	# move the node from %est into %res;
+	$res{$min_node} = delete $est{$min_node};
+
+	# recompute distances to the neighbours
+	my $dist = $res{$min_node}{'dist'};
+	foreach my $neighbour ( keys %{$self->{'_dag'}->{$min_node}} ){
+	    next unless $est{$neighbour}; # might not be there any more
+	    $est{$neighbour}{'prev'} = $min_node;
+	    $est{$neighbour}{'dist'} =
+		$dist + $self->{'_dag'}{$min_node}{$neighbour}
+		if $est{$neighbour}{'dist'} > $dist + 1 ;
+	}
+    }
+    return $self->{'_paths'} = \%res;
+}
+
+
+1;
+
author	willmclaren
date	Fri, 03 Aug 2012 10:04:48 -0400
parents
children