view variant_effect_predictor/Bio/EnsEMBL/Compara/BaseGenomicAlignSet.pm @ 0:21066c0abaf5 draft

Uploaded
author willmclaren
date Fri, 03 Aug 2012 10:04:48 -0400
parents
children
line wrap: on
line source

=head1 LICENSE

  Copyright (c) 1999-2012 The European Bioinformatics Institute and
  Genome Research Limited.  All rights reserved.

  This software is distributed under a modified Apache license.
  For license details, please see

    http://www.ensembl.org/info/about/code_licence.html

=head1 CONTACT

  Please email comments or questions to the public Ensembl
  developers list at <dev@ensembl.org>.

  Questions may also be sent to the Ensembl help desk at
  <helpdesk@ensembl.org>.

=head1 NAME

Bio::EnsEMBL::Compara::BaseGenomicAlignSet - Base class for GenomicAlignBlock and GenomicAlignTree

=head1 APPENDIX

The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _

=cut


# Let the code begin...

package Bio::EnsEMBL::Compara::BaseGenomicAlignSet;
use strict;

# Object preamble
use Bio::EnsEMBL::Utils::Argument qw(rearrange);
use Bio::EnsEMBL::Utils::Exception qw(throw warning info deprecate verbose);


=head2 slice

  Arg [1]    : (optional) Bio::EnsEMBL::Slice $reference_slice
  Example    : my $slice = $genomic_align_block->slice;
  Example    : $genomic_align_block->slice($slice);
  Description: get/set for attribute slice.
  Returntype : Bio::EnsEMBL::Slice object
  Exceptions : 
  Caller     : general
  Status     : Stable

=cut

sub slice {
  my ($self, $reference_slice) = @_;

  if (defined($reference_slice)) {
#     throw "[$reference_slice] is not a Bio::EnsEMBL::Slice"
#         unless $reference_slice->isa("Bio::EnsEMBL::Slice");
    $self->{'reference_slice'} = $reference_slice;
  }

  return $self->{'reference_slice'};
}

=head2 reference_slice

  Arg [1]    : (optional) Bio::EnsEMBL::Slice $reference_slice
  Example    : my $reference_slice = $genomic_align_block->reference_slice;
  Example    : $genomic_align_block->reference_slice($slice);
  Description: Alias for slice method. TO BE DEPRECATED.
  Returntype : Bio::EnsEMBL::Slice object
  Exceptions : 
  Caller     : general
  Status     : Stable

=cut

sub reference_slice {
  my ($self, $reference_slice) = @_;

  return $self->slice($reference_slice);
}

=head2 start

  Arg [1]    : (optional) integer $start
  Example    : my $start = $genomic_align_block->start;
  Example    : $genomic_align_block->start(1035);
  Description: get/set for attribute reference_slice_start. A value of 0 will set
               the attribute to undefined.
  Returntype : integer
  Exceptions : none
  Caller     : general

=cut

sub start {
  my $self = shift;
  return $self->reference_slice_start(@_);
}


=head2 reference_slice_start

  Arg [1]    : integer $reference_slice_start
  Example    : my $reference_slice_start = $genomic_align_block->reference_slice_start;
  Example    : $genomic_align_block->reference_slice_start(1035);
  Description: get/set for attribute reference_slice_start. A value of 0 will set
               the attribute to undefined.
  Returntype : integer
  Exceptions : none
  Caller     : general
  Status     : Stable

=cut

sub reference_slice_start {
    my ($self, $reference_slice_start) = @_;
    
    if (defined($reference_slice_start)) {
	$self->{'reference_slice_start'} = ($reference_slice_start or undef);
    }
    
    return $self->{'reference_slice_start'};
}


=head2 end

  Arg [1]    : (optional) integer $end
  Example    : my $end = $genomic_align_block->end;
  Example    : $genomic_align_block->end(1283);
  Description: get/set for attribute reference_slice_end. A value of 0 will set
               the attribute to undefined.
  Returntype : integer
  Exceptions : none
  Caller     : general

=cut

sub end {
  my $self = shift;
  return $self->reference_slice_end(@_);
}


=head2 reference_slice_end

  Arg [1]    : integer $reference_slice_end
  Example    : my $reference_slice_end = $genomic_align_block->reference_slice_end;
  Example    : $genomic_align_block->reference_slice_end(1283);
  Description: get/set for attribute reference_slice_end. A value of 0 will set
               the attribute to undefined.
  Returntype : integer
  Exceptions : none
  Caller     : general
  Status     : Stable

=cut

sub reference_slice_end {
  my ($self, $reference_slice_end) = @_;
 
  if (defined($reference_slice_end)) {
    $self->{'reference_slice_end'} = ($reference_slice_end or undef);
  }
  
  return $self->{'reference_slice_end'};

}

=head2 strand

  Arg [1]    : integer $strand
  Example    : my $strand = $genomic_align_block->strand;
  Example    : $genomic_align_block->strand(-1);
  Description: get/set for attribute strand. A value of 0 will set
               the attribute to undefined.
  Returntype : integer
  Exceptions : none
  Caller     : general
  Status     : Stable

=cut

sub strand {
    my ($self, $reference_slice_strand) = @_;
    
    if (defined($reference_slice_strand)) {
	$self->{'reference_slice_strand'} = ($reference_slice_strand or undef);
    }
    
    return $self->{'reference_slice_strand'};
}

=head2 reference_slice_strand

  Arg [1]    : integer $reference_slice_strand
  Example    : my $reference_slice_strand = $genomic_align_block->reference_slice_strand;
  Example    : $genomic_align_block->reference_slice_strand(-1);
  Description: get/set for attribute reference_slice_strand. A value of 0 will set
               the attribute to undefined.
  Returntype : integer
  Exceptions : none
  Caller     : general
  Status     : Stable

=cut

sub reference_slice_strand {
  my ($self, $reference_slice_strand) = @_;
 
  if (defined($reference_slice_strand)) {
    $self->{'reference_slice_strand'} = ($reference_slice_strand or undef);
  }
  
  return $self->{'reference_slice_strand'};
}

=head2 get_original_strand

  Args       : none
  Example    : if (!$genomic_align_block->get_original_strand()) {
                 # original GenomicAlignBlock has been reverse-complemented
               }
  Description: getter for the _orignal_strand attribute
  Returntype : none
  Exceptions : none
  Caller     : general
  Status     : Stable

=cut

sub get_original_strand {
  my ($self) = @_;

  if (!defined($self->{_original_strand})) {
    $self->{_original_strand} = 1;
  }

  return $self->{_original_strand};
}

=head2 restrict_between_reference_positions

  Arg[1]     : [optional] int $start, refers to the reference_dnafrag
  Arg[2]     : [optional] int $end, refers to the reference_dnafrag
  Arg[3]     : [optional] Bio::EnsEMBL::Compara::GenomicAlign $reference_GenomicAlign
  Arg[4]     : [optional] boolean $skip_empty_GenomicAligns
  Example    : none
  Description: restrict this GenomicAlignBlock. It returns a new object unless no
               restriction is needed. In that case, it returns the original unchanged
               object
               It might be the case that the restricted region coincide with a gap
               in one or several GenomicAligns. By default these GenomicAligns are
               returned with a dnafrag_end equals to its dnafrag_start + 1. For instance,
               a GenomicAlign with dnafrag_start = 12345 and dnafrag_end = 12344
               correspond to a block which goes on this region from before 12345 to
               after 12344, ie just between 12344 and 12345. You can choose to remove
               these empty GenomicAligns by setting $skip_empty_GenomicAligns to any
               true value.
  Returntype : Bio::EnsEMBL::Compara::GenomicAlignBlock object in scalar context. In
               list context, returns the previous object and the start and end
               positions of the restriction in alignment coordinates (from 1 to
               alignment_length)
  Exceptions : return undef if reference positions lie outside of the alignment
  Caller     : general
  Status     : At risk

=cut

sub restrict_between_reference_positions {
  my ($self, $start, $end, $reference_genomic_align, $skip_empty_GenomicAligns) = @_;
  my $genomic_align_set;
  my $new_reference_genomic_align;
  my $new_genomic_aligns = [];
  
  $reference_genomic_align ||= $self->reference_genomic_align;
  throw("A reference Bio::EnsEMBL::Compara::GenomicAlign must be given") if (!$reference_genomic_align);
  $start = $reference_genomic_align->dnafrag_start if (!defined($start));
  $end = $reference_genomic_align->dnafrag_end if (!defined($end));

  if ($start > $reference_genomic_align->dnafrag_end or $end < $reference_genomic_align->dnafrag_start) {
    # restricting outside of boundaries => return undef object
    warn("restricting outside of boundaries => return undef object: $start-$end (".$reference_genomic_align->dnafrag_start."-".$reference_genomic_align->dnafrag_end.")");
    return wantarray ? (undef, undef, undef) : undef;
  }
  my $number_of_base_pairs_to_trim_from_the_start = $start - $reference_genomic_align->dnafrag_start;
  my $number_of_base_pairs_to_trim_from_the_end  = $reference_genomic_align->dnafrag_end - $end;

  my $is_ref_low_coverage = 0;
  if ($reference_genomic_align->cigar_line =~ /X/) {
      $is_ref_low_coverage = 1;
  }

  ## Skip if no restriction is needed. Return original object! We are still going on with the
  ## restriction when either excess_at_the_start or excess_at_the_end are 0 as a (multiple)
  ## alignment may start or end with gaps in the reference species. In that case, we want to
  ## trim these gaps from the alignment as they fall just outside of the region of interest

  ##Exception if the reference species is low coverage, then need to continue 
  ##with this routine to find out the correct align positions
  if ($number_of_base_pairs_to_trim_from_the_start < 0 and $number_of_base_pairs_to_trim_from_the_end < 0 and !$is_ref_low_coverage) {
    return wantarray ? ($self, 1, $self->length) : $self;
  }

  my $negative_strand = ($reference_genomic_align->dnafrag_strand == -1);

  ## Create a new Bio::EnsEMBL::Compara::GenomicAlignBlock object
  throw("Reference GenomicAlign not found!") if (!$reference_genomic_align);

  my @reference_cigar = grep {$_} split(/(\d*[GDMXI])/, $reference_genomic_align->cigar_line);
  if ($negative_strand) {
    @reference_cigar = reverse @reference_cigar;
  }

  #If this is negative, eg when a slice starts in one block and ends in
  #another, need to set to 0 to ensure we enter the loop below. This
  #fixes a bug when using a 2x species as the reference and fetching using
  #an expanded align slice. 
  if ($number_of_base_pairs_to_trim_from_the_start < 0) {
      $number_of_base_pairs_to_trim_from_the_start = 0;
  }

  ## Parse start of cigar_line for the reference GenomicAlign
  my $counter_of_trimmed_columns_from_the_start = 0; # num. of bp in the alignment to be trimmed

  if ($number_of_base_pairs_to_trim_from_the_start >= 0) {
    my $counter_of_trimmed_base_pairs = 0; # num of bp in the reference sequence we trim (from the start)
    ## Loop through the cigar pieces
    while (my $cigar = shift(@reference_cigar)) {
      # Parse each cigar piece
      my ($num, $type) = ($cigar =~ /^(\d*)([GDMXI])/);
      $num = 1 if ($num eq "");

      # Insertions are not part of the alignment, don't count them
      if ($type ne "I") {
        $counter_of_trimmed_columns_from_the_start += $num;
      }

      # Matches and insertions are actual base pairs in the reference
      if ($type eq "M" or $type eq "I") {
        $counter_of_trimmed_base_pairs += $num;
        # If this cigar piece is too long and we overshoot the number of base pairs we want to trim,
        # we substitute this cigar piece by a shorter one
        if ($counter_of_trimmed_base_pairs > $number_of_base_pairs_to_trim_from_the_start) {
          my $new_cigar_piece = "";
          # length of the new cigar piece
          my $length = $counter_of_trimmed_base_pairs - $number_of_base_pairs_to_trim_from_the_start;
          if ($length > 1) {
            $new_cigar_piece = $length.$type;
          } elsif ($length == 1) {
            $new_cigar_piece = $type;
          }
          unshift(@reference_cigar, $new_cigar_piece) if ($new_cigar_piece);

          # There is no need to correct the counter of trimmed columns if we are in an insertion
          # when we overshoot
          if ($type eq "M") {
            $counter_of_trimmed_columns_from_the_start -= $length;
          }

          ## We don't want to start with an insertion or a deletion. Trim them!
          while (@reference_cigar and $reference_cigar[0] =~ /[DI]/) {
            my ($num, $type) = ($reference_cigar[0] =~ /^(\d*)([DIGMX])/);
            $num = 1 if ($num eq "");
            # only counts deletions, insertions are not part of the aligment
            $counter_of_trimmed_columns_from_the_start += $num if ($type eq "D");
            shift(@reference_cigar);
          }
          last;
        }
      }
    }
  }

  #If this is negative, eg when a slice starts in one block and ends in
  #another, need to set to 0 to ensure we enter the loop below. This
  #fixes a bug when using a 2x species as the reference and fetching using
  #an expanded align slice. 
  if ($number_of_base_pairs_to_trim_from_the_end < 0) {
      $number_of_base_pairs_to_trim_from_the_end = 0;
  }

  ## Parse end of cigar_line for the reference GenomicAlign
  my $counter_of_trimmed_columns_from_the_end = 0; # num. of bp in the alignment to be trimmed
  if ($number_of_base_pairs_to_trim_from_the_end >= 0) {
    my $counter_of_trimmed_base_pairs = 0; # num of bp in the reference sequence we trim (from the end)
    ## Loop through the cigar pieces
    while (my $cigar = pop(@reference_cigar)) {
      # Parse each cigar piece
      my ($num, $type) = ($cigar =~ /^(\d*)([DIGMX])/);
      $num = 1 if ($num eq "");

      # Insertions are not part of the alignment, don't count them
      if ($type ne "I") {
        $counter_of_trimmed_columns_from_the_end += $num;
      }

      # Matches and insertions are actual base pairs in the reference
      if ($type eq "M" or $type eq "I") {
        $counter_of_trimmed_base_pairs += $num;
        # If this cigar piece is too long and we overshoot the number of base pairs we want to trim,
        # we substitute this cigar piece by a shorter one
        if ($counter_of_trimmed_base_pairs > $number_of_base_pairs_to_trim_from_the_end) {
          my $new_cigar_piece = "";
          # length of the new cigar piece
          my $length = $counter_of_trimmed_base_pairs - $number_of_base_pairs_to_trim_from_the_end;
          if ($length > 1) {
            $new_cigar_piece = $length.$type;
          } elsif ($length == 1) {
            $new_cigar_piece = $type;
          }
          push(@reference_cigar, $new_cigar_piece) if ($new_cigar_piece);

          # There is no need to correct the counter of trimmed columns if we are in an insertion
          # when we overshoot
          if ($type eq "M") {
            $counter_of_trimmed_columns_from_the_end -= $length;
          }

          ## We don't want to end with an insertion or a deletion. Trim them!
          while (@reference_cigar and $reference_cigar[-1] =~ /[DI]/) {
            my ($num, $type) = ($reference_cigar[-1] =~ /^(\d*)([DIGMX])/);
            $num = 1 if ($num eq "");
            # only counts deletions, insertions are not part of the aligment
            $counter_of_trimmed_columns_from_the_end += $num if ($type eq "D");
            pop(@reference_cigar);
          }
          last;
        }
      }
    }
  }

  ## Skip if no restriction is needed. Return original object! This may happen when
  ## either excess_at_the_start or excess_at_the_end are 0 but the alignment does not
  ## start or end with gaps in the reference species.
  if ($counter_of_trimmed_columns_from_the_start <= 0 and $counter_of_trimmed_columns_from_the_end <= 0) {
    return wantarray ? ($self, 1, $self->length) : $self;
  }

  my ($aln_start, $aln_end);
  if ($negative_strand) {
    $aln_start = $counter_of_trimmed_columns_from_the_end + 1;
    $aln_end = $self->length - $counter_of_trimmed_columns_from_the_start;
  } else {
    $aln_start = $counter_of_trimmed_columns_from_the_start + 1;
    $aln_end = $self->length - $counter_of_trimmed_columns_from_the_end;
  }

  $genomic_align_set = $self->restrict_between_alignment_positions($aln_start, $aln_end, $skip_empty_GenomicAligns);
  $new_reference_genomic_align = $genomic_align_set->reference_genomic_align;

  if (!defined $self->{'restricted_aln_start'}) {
      $self->{'restricted_aln_start'} = 0;
  }
  if (!defined $self->{'restricted_aln_end'}) {
      $self->{'restricted_aln_end'} = 0;
  }
  $genomic_align_set->{'restricted_aln_start'} = $counter_of_trimmed_columns_from_the_start + $self->{'restricted_aln_start'};
  $genomic_align_set->{'restricted_aln_end'} = $counter_of_trimmed_columns_from_the_end + $self->{'restricted_aln_end'};
  #$genomic_align_set->{'original_length'} = $self->length;

  #Need to use original gab length. If original_length is not set, length has
  #not changed. Needed when use 2X genome as reference. 
  if (defined $self->{'original_length'}) {
      $genomic_align_set->{'original_length'} = $self->{'original_length'};
  } else {
      $genomic_align_set->{'original_length'} = $self->length;
  }

  if (defined $self->slice) {
    if ($self->strand == 1) {
      $genomic_align_set->start($new_reference_genomic_align->dnafrag_start -
          $self->slice->start + 1);
      $genomic_align_set->end($new_reference_genomic_align->dnafrag_end -
          $self->slice->start + 1);
      $genomic_align_set->strand(1);
    } else {
      $genomic_align_set->start($self->{reference_slice}->{end} -
          $new_reference_genomic_align->{dnafrag_end} + 1);
      $genomic_align_set->end($self->{reference_slice}->{end} -
          $new_reference_genomic_align->{dnafrag_start} + 1);
      $genomic_align_set->strand(-1);
    }
  }

  return wantarray ? ($genomic_align_set, $aln_start, $aln_end) : $genomic_align_set;
}

1;