diff variant_effect_predictor/Bio/EnsEMBL/Mapper.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/EnsEMBL/Mapper.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,1041 @@
+=head1 LICENSE
+
+  Copyright (c) 1999-2012 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 CONTACT
+
+  Please email comments or questions to the public Ensembl
+  developers list at <dev@ensembl.org>.
+
+  Questions may also be sent to the Ensembl help desk at
+  <helpdesk@ensembl.org>.
+
+=cut
+
+=head1 NAME
+
+Bio::EnsEMBL::Mapper
+
+=head1 SYNOPSIS
+
+  $map = Bio::EnsEMBL::Mapper->new( 'rawcontig', 'chromosome' );
+
+  # add a coodinate mapping - supply two pairs or coordinates
+  $map->add_map_coordinates(
+    $contig_id, $contig_start, $contig_end, $contig_ori,
+    $chr_name,  chr_start,     $chr_end
+  );
+
+  # map from one coordinate system to another
+  my @coordlist =
+    $mapper->map_coordinates( 627012, 2, 5, -1, "rawcontig" );
+
+=head1 DESCRIPTION
+
+Generic mapper to provide coordinate transforms between two disjoint
+coordinate systems. This mapper is intended to be 'context neutral' - in
+that it does not contain any code relating to any particular coordinate
+system. This is provided in, for example, Bio::EnsEMBL::AssemblyMapper.
+
+=head1 METHODS
+
+=cut
+
+package Bio::EnsEMBL::Mapper;
+use strict;
+use integer;
+
+use Bio::EnsEMBL::Utils::Exception qw(throw deprecate warning stack_trace_dump);
+use Bio::EnsEMBL::Mapper::Pair;
+use Bio::EnsEMBL::Mapper::IndelPair;
+use Bio::EnsEMBL::Mapper::Unit;
+use Bio::EnsEMBL::Mapper::Coordinate;
+use Bio::EnsEMBL::Mapper::IndelCoordinate;
+use Bio::EnsEMBL::Mapper::Gap;
+
+use Bio::EnsEMBL::Utils::Exception qw(throw);
+
+# use Data::Dumper;
+
+=head2 new
+
+  Arg [1]    : string $from
+               The name of the 'from' coordinate system
+  Arg [2]    : string $to
+               The name of the 'to' coordinate system
+  Arg [3]    : (optional) Bio::EnsEMBL::CoordSystem $from_cs
+               The 'from' coordinate system
+  Arg [4]    : (optional) Bio::EnsEMBL::CoordSystem $to_cs
+  Example    : my $mapper = Bio::EnsEMBL::Mapper->new('FROM', 'TO');
+  Description: Constructor.  Creates a new Bio::EnsEMBL::Mapper object.
+  Returntype : Bio::EnsEMBL::Mapper
+  Exceptions : none
+  Caller     : general
+
+=cut
+
+sub new {
+  my ( $proto, $from, $to, $from_cs, $to_cs ) = @_;
+
+  if ( !defined($to) || !defined($from) ) {
+    throw("Must supply 'to' and 'from' tags");
+  }
+
+  my $class = ref($proto) || $proto;
+
+  my $self = bless( { "_pair_$from" => {},
+                      "_pair_$to"   => {},
+                      'pair_count'  => 0,
+                      'to'          => $to,
+                      'from'        => $from,
+                      'to_cs'       => $to_cs,
+                      'from_cs'     => $from_cs
+                    },
+                    $class );
+
+  # do sql to get any componente with muliple assemblys.
+
+  return $self;
+}
+
+=head2 flush
+
+  Args       : none
+  Example    : none
+  Description: removes all cached information out of this mapper
+  Returntype : none
+  Exceptions : none
+  Caller     : AssemblyMapper, ChainedAssemblyMapper
+
+=cut
+
+sub flush {
+  my $self = shift;
+  my $from = $self->from();
+  my $to = $self->to();
+
+  $self->{"_pair_$from"} = {};
+  $self->{"_pair_$to"} = {};
+
+  $self->{'pair_count'} = 0;
+}
+
+
+
+=head2 map_coordinates
+
+    Arg  1      string $id
+                id of 'source' sequence
+    Arg  2      int $start
+                start coordinate of 'source' sequence
+    Arg  3      int $end
+                end coordinate of 'source' sequence
+    Arg  4      int $strand
+                raw contig orientation (+/- 1)
+    Arg  5      string $type
+                nature of transform - gives the type of
+                coordinates to be transformed *from*
+    Function    generic map method
+    Returntype  array of Bio::EnsEMBL::Mapper::Coordinate
+                and/or   Bio::EnsEMBL::Mapper::Gap
+    Exceptions  none
+    Caller      Bio::EnsEMBL::Mapper
+
+=cut
+
+sub map_coordinates {
+  my ( $self, $id, $start, $end, $strand, $type ) = @_;
+
+  unless (    defined($id)
+           && defined($start)
+           && defined($end)
+           && defined($strand)
+           && defined($type) )
+  {
+    throw("Expecting 5 arguments");
+  }
+
+  # special case for handling inserts:
+  if ( $start == $end + 1 ) {
+    return $self->map_insert( $id, $start, $end, $strand, $type );
+  }
+
+  if ( !$self->{'_is_sorted'} ) { $self->_sort() }
+
+  my $hash = $self->{"_pair_$type"};
+
+  my ( $from, $to, $cs );
+
+  if ( $type eq $self->{'to'} ) {
+    $from = 'to';
+    $to   = 'from';
+    $cs   = $self->{'from_cs'};
+  } else {
+    $from = 'from';
+    $to   = 'to';
+    $cs   = $self->{'to_cs'};
+  }
+
+  unless ( defined $hash ) {
+    throw("Type $type is neither to or from coordinate systems");
+  }
+
+  if ( !defined $hash->{ uc($id) } ) {
+    # one big gap!
+    my $gap = Bio::EnsEMBL::Mapper::Gap->new( $start, $end );
+    return $gap;
+  }
+
+  my $last_used_pair;
+  my @result;
+
+  my ( $start_idx, $end_idx, $mid_idx, $pair, $self_coord );
+  my $lr = $hash->{ uc($id) };
+
+  $start_idx = 0;
+  $end_idx   = $#{$lr};
+
+  # binary search the relevant pairs
+  # helps if the list is big
+  while ( ( $end_idx - $start_idx ) > 1 ) {
+    $mid_idx    = ( $start_idx + $end_idx ) >> 1;
+    $pair       = $lr->[$mid_idx];
+    $self_coord = $pair->{$from};
+    if ( $self_coord->{'end'} < $start ) {
+      $start_idx = $mid_idx;
+    } else {
+      $end_idx = $mid_idx;
+    }
+  }
+
+  my $rank              = 0;
+  my $orig_start        = $start;
+  my $last_target_coord = undef;
+  for ( my $i = $start_idx; $i <= $#{$lr}; $i++ ) {
+    $pair = $lr->[$i];
+    my $self_coord   = $pair->{$from};
+    my $target_coord = $pair->{$to};
+
+    #
+    # But not the case for haplotypes!! need to test for this case???
+    # so removing this till a better solution is found
+    #
+    #
+    #     if($self_coord->{'start'} < $start){
+    #       $start = $orig_start;
+    #       $rank++;
+    #     }
+
+    if ( defined($last_target_coord)
+         and $target_coord->{'id'} ne $last_target_coord )
+    {
+      if ( $self_coord->{'start'} < $start )
+      {    # i.e. the same bit is being mapped to another assembled bit
+        $start = $orig_start;
+      }
+    } else {
+      $last_target_coord = $target_coord->{'id'};
+    }
+
+    # if we haven't even reached the start, move on
+    if ( $self_coord->{'end'} < $orig_start ) {
+      next;
+    }
+
+    # if we have over run, break
+    if ( $self_coord->{'start'} > $end ) {
+      last;
+    }
+
+    if ( $start < $self_coord->{'start'} ) {
+      # gap detected
+      my $gap = Bio::EnsEMBL::Mapper::Gap->new( $start,
+                                    $self_coord->{'start'} - 1, $rank );
+
+      push( @result, $gap );
+      $start = $gap->{'end'} + 1;
+    }
+    my ( $target_start, $target_end, $target_ori );
+    my $res;
+    if ( exists $pair->{'indel'} ) {
+      # When next pair is an IndelPair and not a Coordinate, create the
+      # new mapping Coordinate, the IndelCoordinate.
+      $target_start = $target_coord->{'start'};
+      $target_end   = $target_coord->{'end'};
+
+      #create a Gap object
+      my $gap = Bio::EnsEMBL::Mapper::Gap->new( $start,
+        ( $self_coord->{'end'} < $end ? $self_coord->{'end'} : $end ) );
+      #create the Coordinate object
+      my $coord =
+        Bio::EnsEMBL::Mapper::Coordinate->new( $target_coord->{'id'},
+              $target_start, $target_end, $pair->{'ori'}*$strand, $cs );
+      #and finally, the IndelCoordinate object with
+      $res = Bio::EnsEMBL::Mapper::IndelCoordinate->new( $gap, $coord );
+    } else {
+      # start is somewhere inside the region
+      if ( $pair->{'ori'} == 1 ) {
+        $target_start =
+          $target_coord->{'start'} +
+          ( $start - $self_coord->{'start'} );
+      } else {
+        $target_end =
+          $target_coord->{'end'} - ( $start - $self_coord->{'start'} );
+      }
+
+      # Either we are enveloping this map or not.  If yes, then end
+      # point (self perspective) is determined solely by target.  If
+      # not we need to adjust.
+
+      if ( $end > $self_coord->{'end'} ) {
+        # enveloped
+        if ( $pair->{'ori'} == 1 ) {
+          $target_end = $target_coord->{'end'};
+        } else {
+          $target_start = $target_coord->{'start'};
+        }
+      } else {
+        # need to adjust end
+        if ( $pair->{'ori'} == 1 ) {
+          $target_end =
+            $target_coord->{'start'} +
+            ( $end - $self_coord->{'start'} );
+        } else {
+          $target_start =
+            $target_coord->{'end'} - ( $end - $self_coord->{'start'} );
+        }
+      }
+
+      $res =
+        Bio::EnsEMBL::Mapper::Coordinate->new( $target_coord->{'id'},
+                     $target_start, $target_end, $pair->{'ori'}*$strand,
+                     $cs, $rank );
+
+    } ## end else [ if ( exists $pair->{'indel'...})]
+
+    push( @result, $res );
+
+    $last_used_pair = $pair;
+    $start          = $self_coord->{'end'} + 1;
+
+  } ## end for ( my $i = $start_idx...)
+
+  if ( !defined $last_used_pair ) {
+    my $gap = Bio::EnsEMBL::Mapper::Gap->new( $start, $end );
+    push( @result, $gap );
+
+  } elsif ( $last_used_pair->{$from}->{'end'} < $end ) {
+    # gap at the end
+    my $gap =
+      Bio::EnsEMBL::Mapper::Gap->new(
+                                  $last_used_pair->{$from}->{'end'} + 1,
+                                  $end, $rank );
+    push( @result, $gap );
+  }
+
+  if ( $strand == -1 ) {
+    @result = reverse(@result);
+  }
+
+  return @result;
+} ## end sub map_coordinates
+
+
+
+=head2 map_insert
+
+  Arg [1]    : string $id
+  Arg [2]    : int $start - start coord. Since this is an insert should always
+               be one greater than end.
+  Arg [3]    : int $end - end coord. Since this is an insert should always
+               be one less than start.
+  Arg [4]    : int $strand (0, 1, -1)
+  Arg [5]    : string $type - the coordinate system name the coords are from.
+  Arg [6]    : boolean $fastmap - if specified, this is being called from
+               the fastmap call. The mapping done is not any faster for
+               inserts, but the return value is different.
+  Example    : 
+  Description: This is in internal function which handles the special mapping
+               case for inserts (start = end +1).  This function will be called
+               automatically by the map function so there is no reason to
+               call it directly.
+  Returntype : list of Bio::EnsEMBL::Mapper::Coordinate and/or Gap objects
+  Exceptions : none
+  Caller     : map_coordinates()
+
+=cut
+
+sub map_insert {
+  my ($self, $id, $start, $end, $strand, $type, $fastmap) = @_;
+
+  # swap start/end and map the resultant 2bp coordinate
+  ($start, $end) =($end,$start);
+
+  my @coords = $self->map_coordinates($id, $start, $end, $strand, $type);
+
+  if(@coords == 1) {
+    my $c = $coords[0];
+    # swap start and end to convert back into insert
+    ($c->{'start'}, $c->{'end'}) = ($c->{'end'}, $c->{'start'});
+  } else {
+    throw("Unexpected: Got ",scalar(@coords)," expected 2.") if(@coords != 2);
+
+    # adjust coordinates, remove gaps
+    my ($c1, $c2);
+    if($strand == -1) {
+      ($c2,$c1) = @coords;
+    } else {
+      ($c1, $c2) = @coords;
+    }
+    @coords = ();
+
+    if(ref($c1) eq 'Bio::EnsEMBL::Mapper::Coordinate') {
+      # insert is after first coord
+      if($c1->{'strand'} * $strand == -1) {
+        $c1->{'end'}--;
+      } else {
+        $c1->{'start'}++;
+      }
+      @coords = ($c1);
+    }
+    if(ref($c2) eq 'Bio::EnsEMBL::Mapper::Coordinate') {
+      # insert is before second coord
+      if($c2->{'strand'} * $strand == -1) {
+        $c2->{'start'}++;
+      } else {
+        $c2->{'end'}--;
+      }
+      if($strand == -1) {
+        unshift @coords, $c2;
+      } else {
+        push @coords, $c2;
+      }
+    }
+  }
+
+  if($fastmap) {
+    return undef if(@coords != 1);
+    my $c = $coords[0];
+    return ($c->{'id'}, $c->{'start'}, $c->{'end'},
+            $c->{'strand'}, $c->{'coord_system'});
+  }
+
+  return @coords;
+}
+
+
+
+
+
+
+=head2 fastmap
+
+    Arg  1      string $id
+                id of 'source' sequence
+    Arg  2      int $start
+                start coordinate of 'source' sequence
+    Arg  3      int $end
+                end coordinate of 'source' sequence
+    Arg  4      int $strand
+                raw contig orientation (+/- 1)
+    Arg  5      int $type
+                nature of transform - gives the type of
+                coordinates to be transformed *from*
+    Function    inferior map method. Will only do ungapped unsplit mapping.
+                Will return id, start, end strand in a list.
+    Returntype  list of results
+    Exceptions  none
+    Caller      Bio::EnsEMBL::AssemblyMapper
+
+=cut
+
+sub fastmap {
+   my ($self, $id, $start, $end, $strand, $type) = @_;
+
+   my ($from, $to, $cs);
+
+   if($end+1 == $start) {
+     return $self->map_insert($id, $start, $end, $strand, $type, 1);
+   }
+
+   if( ! $self->{'_is_sorted'} ) { $self->_sort() }
+
+   if($type eq $self->{'to'}) {
+     $from = 'to';
+     $to = 'from';
+     $cs = $self->{'from_cs'};
+   } else {
+     $from = 'from';
+     $to = 'to';
+     $cs = $self->{'to_cs'};
+   }
+
+   my $hash = $self->{"_pair_$type"} or
+       throw("Type $type is neither to or from coordinate systems");
+
+
+   my $pairs = $hash->{uc($id)};
+
+   foreach my $pair (@$pairs) {
+     my $self_coord   = $pair->{$from};
+     my $target_coord = $pair->{$to};
+
+     # only super easy mapping is done 
+     if( $start < $self_coord->{'start'} ||
+         $end > $self_coord->{'end'} ) {
+       next;
+     }
+
+     if( $pair->{'ori'} == 1 ) {
+       return ( $target_coord->{'id'},
+                $target_coord->{'start'}+$start-$self_coord->{'start'},
+                $target_coord->{'start'}+$end-$self_coord->{'start'},
+                $strand, $cs );
+     } else {
+       return ( $target_coord->{'id'},
+                $target_coord->{'end'} - ($end - $self_coord->{'start'}),
+                $target_coord->{'end'} - ($start - $self_coord->{'start'}),
+                -$strand, $cs );
+     }
+   }
+
+   return ();
+}
+
+
+
+=head2 add_map_coordinates
+
+    Arg  1      int $id
+                id of 'source' sequence
+    Arg  2      int $start
+                start coordinate of 'source' sequence
+    Arg  3      int $end
+                end coordinate of 'source' sequence
+    Arg  4      int $strand
+                relative orientation of source and target (+/- 1)
+    Arg  5      int $id
+                id of 'target' sequence
+    Arg  6      int $start
+                start coordinate of 'target' sequence
+    Arg  7      int $end
+                end coordinate of 'target' sequence
+    Function    Stores details of mapping between
+                'source' and 'target' regions.
+    Returntype  none
+    Exceptions  none
+    Caller      Bio::EnsEMBL::Mapper
+
+=cut
+
+sub add_map_coordinates {
+  my ( $self, $contig_id, $contig_start, $contig_end, $contig_ori,
+       $chr_name, $chr_start, $chr_end )
+    = @_;
+
+  unless (    defined($contig_id)
+           && defined($contig_start)
+           && defined($contig_end)
+           && defined($contig_ori)
+           && defined($chr_name)
+           && defined($chr_start)
+           && defined($chr_end) )
+  {
+    throw("7 arguments expected");
+  }
+
+  if ( ( $contig_end - $contig_start ) != ( $chr_end - $chr_start ) ) {
+    throw("Cannot deal with mis-lengthed mappings so far");
+  }
+
+  my $from = Bio::EnsEMBL::Mapper::Unit->new( $contig_id, $contig_start,
+                                              $contig_end );
+  my $to =
+    Bio::EnsEMBL::Mapper::Unit->new( $chr_name, $chr_start, $chr_end );
+
+  my $pair = Bio::EnsEMBL::Mapper::Pair->new( $from, $to, $contig_ori );
+
+  # place into hash on both ids
+  my $map_to   = $self->{'to'};
+  my $map_from = $self->{'from'};
+
+  push( @{ $self->{"_pair_$map_to"}->{ uc($chr_name) } },    $pair );
+  push( @{ $self->{"_pair_$map_from"}->{ uc($contig_id) } }, $pair );
+
+  $self->{'pair_count'}++;
+  $self->{'_is_sorted'} = 0;
+} ## end sub add_map_coordinates
+
+
+=head2 add_indel_coordinates
+
+    Arg  1      int $id
+                id of 'source' sequence
+    Arg  2      int $start
+                start coordinate of 'source' sequence
+    Arg  3      int $end
+                end coordinate of 'source' sequence
+    Arg  4      int $strand
+                relative orientation of source and target (+/- 1)
+    Arg  5      int $id
+                id of 'targe' sequence
+    Arg  6      int $start
+                start coordinate of 'targe' sequence
+    Arg  7      int $end
+                end coordinate of 'targe' sequence
+    Function    stores details of mapping between two regions:
+                'source' and 'target'. Returns 1 if the pair was added, 0 if it
+                was already in. Used when adding an indel
+    Returntype  int 0,1
+    Exceptions  none
+    Caller      Bio::EnsEMBL::Mapper
+
+=cut
+
+sub add_indel_coordinates{
+  my ($self, $contig_id, $contig_start, $contig_end, 
+      $contig_ori, $chr_name, $chr_start, $chr_end) = @_;
+
+  unless(defined($contig_id) && defined($contig_start) && defined($contig_end)
+	 && defined($contig_ori) && defined($chr_name) && defined($chr_start)
+	 && defined($chr_end)) {
+    throw("7 arguments expected");
+  }
+
+  #we need to create the IndelPair object to add to both lists, to and from
+  my $from =
+    Bio::EnsEMBL::Mapper::Unit->new($contig_id, $contig_start, $contig_end);
+  my $to   =
+    Bio::EnsEMBL::Mapper::Unit->new($chr_name, $chr_start, $chr_end);
+
+  my $pair = Bio::EnsEMBL::Mapper::IndelPair->new($from, $to, $contig_ori);
+
+  # place into hash on both ids
+  my $map_to = $self->{'to'};
+  my $map_from = $self->{'from'};
+
+  push( @{$self->{"_pair_$map_to"}->{uc($chr_name)}}, $pair );
+  push( @{$self->{"_pair_$map_from"}->{uc($contig_id)}}, $pair );
+
+  $self->{'pair_count'}++;
+
+  $self->{'_is_sorted'} = 0;
+  return 1;
+}
+
+
+=head2 map_indel
+
+  Arg [1]    : string $id
+  Arg [2]    : int $start - start coord. Since this is an indel should always
+               be one greater than end.
+  Arg [3]    : int $end - end coord. Since this is an indel should always
+               be one less than start.
+  Arg [4]    : int $strand (0, 1, -1)
+  Arg [5]    : string $type - the coordinate system name the coords are from.
+  Example    : @coords = $mapper->map_indel();
+  Description: This is in internal function which handles the special mapping
+               case for indels (start = end +1). It will be used to map from
+               a coordinate system with a gap to another that contains an
+               insertion. It will be mainly used by the Variation API.
+  Returntype : Bio::EnsEMBL::Mapper::Unit objects
+  Exceptions : none
+  Caller     : general
+
+=cut
+
+sub map_indel {
+  my ( $self, $id, $start, $end, $strand, $type ) = @_;
+
+  # swap start/end and map the resultant 2bp coordinate
+  ( $start, $end ) = ( $end, $start );
+
+  if ( !$self->{'_is_sorted'} ) { $self->_sort() }
+
+  my $hash = $self->{"_pair_$type"};
+
+  my ( $from, $to, $cs );
+
+  if ( $type eq $self->{'to'} ) {
+    $from = 'to';
+    $to   = 'from';
+    $cs   = $self->{'from_cs'};
+  } else {
+    $from = 'from';
+    $to   = 'to';
+    $cs   = $self->{'to_cs'};
+  }
+
+  unless ( defined $hash ) {
+    throw("Type $type is neither to or from coordinate systems");
+  }
+  my @indel_coordinates;
+
+  my ( $start_idx, $end_idx, $mid_idx, $pair, $self_coord );
+  my $lr = $hash->{ uc($id) };
+
+  $start_idx = 0;
+  $end_idx   = $#{$lr};
+
+  # binary search the relevant pairs
+  # helps if the list is big
+  while ( ( $end_idx - $start_idx ) > 1 ) {
+    $mid_idx    = ( $start_idx + $end_idx ) >> 1;
+    $pair       = $lr->[$mid_idx];
+    $self_coord = $pair->{$from};
+    if ( $self_coord->{'end'} <= $start ) {
+      $start_idx = $mid_idx;
+    } else {
+      $end_idx = $mid_idx;
+    }
+  }
+
+  for ( my $i = $start_idx; $i <= $#{$lr}; $i++ ) {
+    $pair = $lr->[$i];
+    my $self_coord   = $pair->{$from};
+    my $target_coord = $pair->{$to};
+
+    if ( exists $pair->{'indel'} ) {
+      #need to return unit coordinate
+      my $to =
+        Bio::EnsEMBL::Mapper::Unit->new( $target_coord->{'id'},
+                                         $target_coord->{'start'},
+                                         $target_coord->{'end'}, );
+      push @indel_coordinates, $to;
+      last;
+    }
+  }
+
+  return @indel_coordinates;
+} ## end sub map_indel
+
+
+=head2 add_Mapper
+
+    Arg  1      Bio::EnsEMBL::Mapper $mapper2
+    Example     $mapper->add_Mapper($mapper2)
+    Function    add all the map coordinates from $mapper to this mapper.
+                This object will contain mapping pairs from both the old
+                object and $mapper2.
+    Returntype  int 0,1
+    Exceptions  throw if 'to' and 'from' from both Bio::EnsEMBL::Mappers
+                are incompatible
+    Caller      $mapper->methodname()
+
+=cut
+
+sub add_Mapper{
+  my ($self, $mapper) = @_;
+
+  my $mapper_to = $mapper->{'to'};
+  my $mapper_from = $mapper->{'from'};
+  if ($mapper_to ne $self->{'to'} or $mapper_from ne $self->{'from'}) {
+    throw("Trying to add an incompatible Mapper");
+  }
+
+  my $count_a = 0;
+  foreach my $seq_name (keys %{$mapper->{"_pair_$mapper_to"}}) {
+    push(@{$self->{"_pair_$mapper_to"}->{$seq_name}},
+        @{$mapper->{"_pair_$mapper_to"}->{$seq_name}});
+    $count_a += scalar(@{$mapper->{"_pair_$mapper_to"}->{$seq_name}});
+  }
+  my $count_b = 0;
+  foreach my $seq_name (keys %{$mapper->{"_pair_$mapper_from"}}) {
+    push(@{$self->{"_pair_$mapper_from"}->{$seq_name}},
+        @{$mapper->{"_pair_$mapper_from"}->{$seq_name}});
+    $count_b += scalar(@{$mapper->{"_pair_$mapper_from"}->{$seq_name}});
+  }
+
+  if ($count_a == $count_b) {
+    $self->{'pair_count'} += $count_a;
+  } else {
+    throw("Trying to add a funny Mapper");
+  }
+
+  $self->{'_is_sorted'} = 0;
+  return 1;
+}
+
+
+
+=head2 list_pairs
+
+    Arg  1      int $id
+                id of 'source' sequence
+    Arg  2      int $start
+                start coordinate of 'source' sequence
+    Arg  3      int $end
+                end coordinate of 'source' sequence
+    Arg  4      string $type
+                nature of transform - gives the type of
+                coordinates to be transformed *from*
+    Function    list all pairs of mappings in a region
+    Returntype  list of Bio::EnsEMBL::Mapper::Pair
+    Exceptions  none
+    Caller      Bio::EnsEMBL::Mapper
+
+=cut
+
+sub list_pairs {
+  my ( $self, $id, $start, $end, $type ) = @_;
+
+  if ( !$self->{'_is_sorted'} ) { $self->_sort() }
+
+  if ( !defined $type ) {
+    throw("Expected 4 arguments");
+  }
+
+  if ( $start > $end ) {
+    throw(   "Start is greater than end "
+           . "for id $id, start $start, end $end\n" );
+  }
+
+  my $hash = $self->{"_pair_$type"};
+
+  my ( $from, $to );
+
+  if ( $type eq $self->{'to'} ) {
+    $from = 'to';
+    $to   = 'from';
+  } else {
+    $from = 'from';
+    $to   = 'to';
+  }
+
+  unless ( defined $hash ) {
+    throw("Type $type is neither to or from coordinate systems");
+  }
+
+  my @list;
+
+  unless ( exists $hash->{ uc($id) } ) {
+    return ();
+  }
+
+  @list = @{ $hash->{ uc($id) } };
+
+  my @output;
+  if ( $start == -1 && $end == -1 ) {
+    return @list;
+  } else {
+
+    foreach my $p (@list) {
+
+      if ( $p->{$from}->{'end'} < $start ) {
+        next;
+      }
+      if ( $p->{$from}->{'start'} > $end ) {
+        last;
+      }
+      push( @output, $p );
+    }
+    return @output;
+  }
+} ## end sub list_pairs
+
+
+=head2 to
+
+    Arg  1      Bio::EnsEMBL::Mapper::Unit $id
+                id of 'source' sequence
+    Function    accessor method form the 'source'
+                and 'target' in a Mapper::Pair
+    Returntype  Bio::EnsEMBL::Mapper::Unit
+    Exceptions  none
+    Caller      Bio::EnsEMBL::Mapper
+
+=cut
+
+sub to {
+  my ( $self, $value ) = @_;
+
+  if ( defined($value) ) {
+    $self->{'to'} = $value;
+  }
+
+  return $self->{'to'};
+}
+
+=head2 from
+
+    Arg  1      Bio::EnsEMBL::Mapper::Unit $id
+                id of 'source' sequence
+    Function    accessor method form the 'source'
+                and 'target' in a Mapper::Pair
+    Returntype  Bio::EnsEMBL::Mapper::Unit
+    Exceptions  none
+    Caller      Bio::EnsEMBL::Mapper
+
+=cut
+sub from {
+  my ( $self, $value ) = @_;
+
+  if ( defined($value) ) {
+    $self->{'from'} = $value;
+  }
+
+  return $self->{'from'};
+}
+
+
+#  _dump
+#
+#     Arg  1      *FileHandle $fh
+#     Function    convenience dump function
+#                 possibly useful for debugging
+#     Returntype  none
+#     Exceptions  none
+#     Caller      internal
+#
+
+sub _dump{
+   my ($self,$fh) = @_;
+
+   if( !defined $fh ) {
+       $fh = \*STDERR;
+   }
+
+   foreach my $id ( keys %{$self->{'_pair_hash_from'}} ) {
+       print $fh "From Hash $id\n";
+       foreach my $pair ( @{$self->{'_pair_hash_from'}->{uc($id)}} ) {
+	   print $fh "    ",$pair->from->start," ",$pair->from->end,":",$pair->to->start," ",$pair->to->end," ",$pair->to->id,"\n";
+       }
+   }
+
+}
+
+
+# _sort
+#
+#    Function    sort function so that all
+#                mappings are sorted by
+#                chromosome start
+#    Returntype  none
+#    Exceptions  none
+#    Caller      internal
+#
+
+sub _sort {
+  my ($self) = @_;
+
+  my $to   = $self->{'to'};
+  my $from = $self->{'from'};
+
+  foreach my $id ( keys %{ $self->{"_pair_$from"} } ) {
+    @{ $self->{"_pair_$from"}->{$id} } =
+      sort { $a->{'from'}->{'start'} <=> $b->{'from'}->{'start'} }
+      @{ $self->{"_pair_$from"}->{$id} };
+  }
+
+  foreach my $id ( keys %{ $self->{"_pair_$to"} } ) {
+    @{ $self->{"_pair_$to"}->{$id} } =
+      sort { $a->{'to'}->{'start'} <=> $b->{'to'}->{'start'} }
+      @{ $self->{"_pair_$to"}->{$id} };
+  }
+
+  $self->_merge_pairs();
+  $self->_is_sorted(1);
+}
+
+# this function merges pairs that are adjacent into one
+sub _merge_pairs {
+  my $self = shift;
+
+  my ( $lr, $lr_from, $del_pair, $next_pair, $current_pair );
+
+  my $map_to = $self->{'to'};
+  my $map_from = $self->{'from'};
+  $self->{'pair_count'} = 0;
+
+  for my $key ( keys %{$self->{"_pair_$map_to"}} ) {
+    $lr = $self->{"_pair_$map_to"}->{$key}; 
+    
+    my $i = 0;
+    my $next = 1;
+    my $length = $#{$lr};
+    while( $next <= $length ) {
+      $current_pair = $lr->[$i];
+      $next_pair = $lr->[$next];
+      $del_pair = undef;
+      
+      if(exists $current_pair->{'indel'} || exists $next_pair->{'indel'}){
+	  #necessary to modify the merge function to not merge indels
+	  $next++;
+	  $i++;
+      }
+      else{
+	  # duplicate filter
+	  if( $current_pair->{'to'}->{'start'} == $next_pair->{'to'}->{'start'} 
+	     and $current_pair->{'from'}->{'id'} == $next_pair->{'from'}->{'id'} ) {
+	    $del_pair = $next_pair;
+	  } elsif(( $current_pair->{'from'}->{'id'} eq $next_pair->{'from'}->{'id'} ) &&
+		  ( $next_pair->{'ori'} == $current_pair->{'ori'} ) &&
+		  ( $next_pair->{'to'}->{'start'} -1 == $current_pair->{'to'}->{'end'} )) {
+	      
+	      if( $current_pair->{'ori'} == 1 ) {
+		  # check forward strand merge
+		  if( $next_pair->{'from'}->{'start'} - 1 == $current_pair->{'from'}->{'end'} ) {
+		      # normal merge with previous element
+		      $current_pair->{'to'}->{'end'} = $next_pair->{'to'}->{'end'};
+		      $current_pair->{'from'}->{'end'} = $next_pair->{'from'}->{'end'};
+		      $del_pair = $next_pair;
+		  }
+	      } else {
+		  # check backward strand merge
+		  if( $next_pair->{'from'}->{'end'} + 1 == $current_pair->{'from'}->{'start'} ) {
+		      # yes its a merge
+		      $current_pair->{'to'}->{'end'} = $next_pair->{'to'}->{'end'};
+		      $current_pair->{'from'}->{'start'} = $next_pair->{'from'}->{'start'};
+		      $del_pair = $next_pair;
+		  }
+	      }
+	  }
+      
+	  if( defined $del_pair ) {
+	      splice( @$lr, $next, 1 );
+	      $lr_from = $self->{"_pair_$map_from"}->{uc($del_pair->{'from'}->{'id'})};
+	      for( my $j=0; $j <= $#{$lr_from}; $j++ ) {
+		  if( $lr_from->[$j] == $del_pair ) {
+		      splice( @$lr_from, $j, 1 );
+		      last;
+		  }
+	      }
+	      $length--;
+	      if( $length < $next ) { last; }
+	  }
+	  else {
+	      $next++;
+	      $i++;
+	  }
+      }  
+      
+   }
+    $self->{'pair_count'} += scalar( @$lr );
+ }
+}
+
+
+# _is_sorted
+#
+#    Arg  1      int $sorted
+#    Function    toggle for whether the (internal)
+#                map data are sorted
+#    Returntype  int
+#    Exceptions  none
+#    Caller      internal
+#
+
+sub _is_sorted {
+   my ($self, $value) = @_;
+   $self->{'_is_sorted'} = $value if (defined($value));
+   return $self->{'_is_sorted'};
+}
+
+
+1;