ensembl_vep: variant_effect_predictor/Bio/Coordinate/Graph.pm comparison

comparison variant_effect_predictor/Bio/Coordinate/Graph.pm @ 0:21066c0abaf5 draft

Uploaded

author	willmclaren
date	Fri, 03 Aug 2012 10:04:48 -0400
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:21066c0abaf5
+# $Id: Graph.pm,v 1.2.2.2 2003/09/08 12:16:18 heikki Exp $
+#
+# bioperl module for Bio::Coordinate::Graph
+#
+# Cared for by Heikki Lehvaslaiho <heikki@ebi.ac.uk>
+#
+# Copyright Heikki Lehvaslaiho
+#
+# You may distribute this module under the same terms as perl itself
+# POD documentation - main docs before the code
+=head1 NAME
+Bio::Coordinate::Graph - Finds shortest path between nodes in a graph
+=head1 SYNOPSIS
+# get a hash of hashes representing the graph. E.g.:
+my $hash= {
+	     '1' => {
+		     '2' => 1
+		    },
+	     '2' => {
+		     '4' => 1,
+		     '3' => 1
+		    },
+	     '3' => undef,
+	     '4' => {
+		     '5' => 1
+		    },
+	     '5' => undef
+	    };
+# create the object;
+my $graph = Bio::Coordinate::Graph->new(-graph => $hash);
+# find the shortest path between two nodes
+my $a = 1;
+my $b = 6;
+my @path = $graph->shortest_paths($a);
+print join (", ", @path), "\n";
+=head1 DESCRIPTION
+This class calculates the shortest path between input and output
+coordinate systems in a graph that defines the relationships between
+them. This class is primarely designed to analyze gene-related
+coordinate systems. See L<Bio::Coordinate::GeneMapper>.
+Note that this module can not be used to manage graphs.
+Technically the graph implemented here is known as Directed Acyclic
+Graph (DAG). DAG is composed of vertices (nodes) and edges (with
+optional weights) linking them. Nodes of the graph are the coordinate
+systems in gene mapper.
+The shortest path is found using the Dijkstra's algorithm. This
+algorithm is fast and greedy and requires all weights to be
+positive. All weights in the gene coordinate system graph are
+currently equal (1) making the graph unweighted. That makes the use of
+Dijkstra's algorithm an overkill. A impler and faster breadth-first
+would be enough. Luckily the difference for small graphs is not
+signigicant and the implementation is capable to take weights into
+account if needed at some later time.
+=head2 Input format
+The graph needs to be primed using a hash of hashes where there is a
+key for each node. The second keys are the names of the downstream
+neighboring nodes and values are the weights for reaching them. Here
+is part of the gene coordiante system graph::
+$hash = {
+	     '6' => undef,
+	     '3' => {
+		     '6' => 1
+		    },
+	     '2' => {
+		     '6' => 1,
+		     '4' => 1,
+		     '3' => 1
+		    },
+	     '1' => {
+		     '2' => 1
+		    },
+	     '4' => {
+		     '5' => 1
+		    },
+	     '5' => undef
+	    };
+Note that the names need to be positive integrers. Root should be '1'
+and directness of the graph is taken advantage of to speed
+calculations by assuming that downsream nodes always have larger
+number as name.
+An alternative (shorter) way of describing input is to use hash of
+arrays. See L<Bio::Coordinate::Graph::hash_of_arrays>.
+=head1 FEEDBACK
+=head2 Mailing Lists
+User feedback is an integral part of the evolution of this and other
+Bioperl modules. Send your comments and suggestions preferably to the
+Bioperl mailing lists  Your participation is much appreciated.
+bioperl-l@bioperl.org                         - General discussion
+http://bio.perl.org/MailList.html             - About the mailing lists
+=head2 Reporting Bugs
+report bugs to the Bioperl bug tracking system to help us keep track
+the bugs and their resolution.  Bug reports can be submitted via
+email or the web:
+bioperl-bugs@bio.perl.org
+http://bugzilla.bioperl.org/
+=head1 AUTHOR - Heikki Lehvaslaiho
+Email:  heikki@ebi.ac.uk
+Address:
+EMBL Outstation, European Bioinformatics Institute
+Wellcome Trust Genome Campus, Hinxton
+Cambs. CB10 1SD, United Kingdom
+=head1 APPENDIX
+The rest of the documentation details each of the object
+methods. Internal methods are usually preceded with a _
+=cut
+# Let the code begin...
+package Bio::Coordinate::Graph;
+use vars qw(@ISA );
+use strict;
+# Object preamble - inherits from Bio::Root::Root
+use Bio::Root::Root;
+@ISA = qw(Bio::Root::Root);
+sub new {
+my($class,@args) = @_;
+my $self = $class->SUPER::new(@args);
+my($graph, $hasharray) =
+	$self->_rearrange([qw(
+GRAPH
+HASHARRAY
+			     )],
+			 @args);
+$graph  && $self->graph($graph);
+$hasharray  && $self->hasharray($hasharray);
+$self->{'_root'} = undef;
+return $self; # success - we hope!
+}
+=head2 Graph structure input methods
+=cut
+=head2 graph
+Title   : graph
+Usage   : $obj->graph($my_graph)
+Function: Read/write method for the graph structure
+Example :
+Returns : hash of hashes grah structure
+Args    : reference to a hash of hashes
+=cut
+sub graph {
+my ($self,$value) = @_;
+if ($value) {
+$self->throw("Need a hash of hashes")
+	  unless  ref($value) eq 'HASH' ;
+$self->{'_dag'} = $value;
+# empty the cache
+$self->{'_root'} = undef;
+}
+return $self->{'_dag'};
+}
+=head2 hash_of_arrays
+Title   : hash_of_arrays
+Usage   : $obj->hash_of_array(%hasharray)
+Function: An alternative method to read in the graph structure.
+Hash arrays are easier to type. This method converts
+arrays into hashes and assigns equal values "1" to
+weights.
+Example : Here is an example of simple structure containing a graph.
+my $DAG = {
+	              6  => [],
+	              5  => [],
+	              4  => [5],
+	              3  => [6],
+	              2  => [3, 4, 6],
+	              1  => [2]
+	             };
+Returns : hash of hashes graph structure
+Args    : reference to a hash of arrays
+=cut
+sub hash_of_arrays {
+my ($self,$value) = @_;
+# empty the cache
+$self->{'_root'} = undef;
+if ($value) {
+$self->throw("Need a hash of hashes")
+	  unless  ref($value) eq 'HASH' ;
+#copy the hash of arrays into a hash of hashes;
+my %hash;
+foreach my $start ( keys %{$value}){
+	  $hash{$start} = undef;
+	  map { $hash{$start}{$_} = 1 } @{$value->{$start}};
+}
+$self->{'_dag'} = \%hash;
+}
+return $self->{'_dag'};
+}
+=head2 Methods for determining the shortest path in the graph
+=cut
+=head2 shortest_path
+Title   : shortest_path
+Usage   : $obj->shortest_path($a, $b);
+Function: Method for retrieving the shortest path between nodes.
+If the start node remains the same, the method is sometimes
+able to use cached results, otherwise it will recalculate
+the paths.
+Example :
+Returns : array of node names, only the start node name if no path
+Args    : name of the start node
+: name of the end node
+=cut
+sub shortest_path {
+my ($self, $root, $end) = @_;
+$self->throw("Two arguments needed") unless @_ == 3;
+$self->throw("No node name [$root]")
+	unless exists $self->{'_dag'}->{$root};
+$self->throw("No node name [$end]")
+	unless exists $self->{'_dag'}->{$end};
+my @res;     # results
+my $reverse;
+if ($root > $end) {
+	($root, $end) = ($end, $root );
+	$reverse++;
+}
+# try to use cached paths
+$self->dijkstra($root) unless
+	defined $self->{'_root'} and $self->{'_root'} eq $root;
+return @res unless $self->{'_paths'} ;
+# create the list
+my $node = $end;
+my $prev = $self->{'_paths'}->{$end}{'prev'};
+while ($prev) {
+	unshift @res, $node;
+	$node = $self->{'_paths'}->{$node}{'prev'};
+	$prev = $self->{'_paths'}->{$node}{'prev'};
+}
+unshift @res, $node;
+$reverse ? return reverse @res : return @res;
+}
+=head2 dijkstra
+Title   : dijkstra
+Usage   : $graph->dijkstra(1);
+Function: Implements Dijkstra's algorithm.
+Returns or sets a list of mappers. The returned path
+description is always directed down from the root.
+Called from shortest_path().
+Example :
+Returns : Reference to a hash of hashes representing a linked list
+which contains shortest path down to all nodes from the start
+node. E.g.:
+$res = {
+'2' => {
+'prev' => '1',
+'dist' => 1
+},
+'1' => {
+'prev' => undef,
+'dist' => 0
+},
+};
+Args    : name of the start node
+=cut
+sub dijkstra {
+my ($self,$root) = @_;
+$self->throw("I need the name of the root node input") unless $root;
+$self->throw("No node name [$root]")
+	unless exists $self->{'_dag'}->{$root};
+my %est = ();          # estimate hash
+my %res = ();          # result hash
+my $nodes = keys %{$self->{'_dag'}};
+my $maxdist = 1000000;
+# cache the root value
+$self->{'_root'} = $root;
+foreach my $node ( keys %{$self->{'_dag'}} ){
+	if ($node eq $root) {
+	    $est{$node}{'prev'} = undef;
+	    $est{$node}{'dist'} = 0;
+	} else {
+	    $est{$node}{'prev'} = undef;
+	    $est{$node}{'dist'} = $maxdist;
+	}
+}
+# remove nodes from %est until it is empty
+while (keys %est) {
+	#select the node closest to current one, or root node
+	my $min_node;
+	my $min = $maxdist;
+	foreach my $node (reverse sort keys %est) {
+	    if ( $est{$node}{'dist'} < $min ) {
+		$min = $est{$node}{'dist'};
+		$min_node = $node;
+	    }
+	}
+	# no more links between nodes
+	last unless ($min_node);
+	# move the node from %est into %res;
+	$res{$min_node} = delete $est{$min_node};
+	# recompute distances to the neighbours
+	my $dist = $res{$min_node}{'dist'};
+	foreach my $neighbour ( keys %{$self->{'_dag'}->{$min_node}} ){
+	    next unless $est{$neighbour}; # might not be there any more
+	    $est{$neighbour}{'prev'} = $min_node;
+	    $est{$neighbour}{'dist'} =
+		$dist + $self->{'_dag'}{$min_node}{$neighbour}
+		if $est{$neighbour}{'dist'} > $dist + 1 ;
+	}
+}
+return $self->{'_paths'} = \%res;
+}
+1;

Mercurial > repos > willmclaren > ensembl_vep

comparison variant_effect_predictor/Bio/Coordinate/Graph.pm @ 0:21066c0abaf5 draft