diff variant_effect_predictor/Bio/EnsEMBL/Compara/AlignedMember.pm @ 0:21066c0abaf5 draft

Uploaded
author willmclaren
date Fri, 03 Aug 2012 10:04:48 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/EnsEMBL/Compara/AlignedMember.pm	Fri Aug 03 10:04:48 2012 -0400
@@ -0,0 +1,486 @@
+=head1 LICENSE
+
+  Copyright (c) 1999-2012 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 CONTACT
+
+  Please email comments or questions to the public Ensembl
+  developers list at <dev@ensembl.org>.
+
+  Questions may also be sent to the Ensembl help desk at
+  <helpdesk@ensembl.org>.
+
+=head1 AUTHORSHIP
+
+Ensembl Team. Individual contributions can be found in the CVS log.
+
+=cut
+
+=head1 NAME
+
+AlignedMember - DESCRIPTION of Object
+
+=head1 DESCRIPTION
+
+A subclass of Member which extends it to allow it to be aligned with other AlignedMember objects.
+General enough to allow for global, local, pair-wise and multiple alignments.
+At the moment used primarily in NestedSet Tree data-structure, but there are plans to extend its usage.
+
+=head1 INHERITANCE TREE
+
+  Bio::EnsEMBL::Compara::AlignedMember
+  +- Bio::EnsEMBL::Compara::Member
+
+=head1 METHODS
+
+=cut
+
+package Bio::EnsEMBL::Compara::AlignedMember;
+
+use strict;
+use Bio::EnsEMBL::Utils::Exception;
+use Bio::EnsEMBL::Compara::Member;
+
+use base ('Bio::EnsEMBL::Compara::Member');
+
+
+##################################
+# overriden superclass methods
+##################################
+
+=head2 copy
+
+  Arg [1]     : none
+  Example     : $copy = $aligned_member->copy();
+  Description : Creates a new AlignedMember object from an existing one
+  Returntype  : Bio::EnsEMBL::Compara::AlignedMember
+  Exceptions  : none
+  Caller      : general
+  Status      : Stable
+
+=cut
+
+sub copy {
+  my $self = shift;
+  
+  my $mycopy = @_ ? shift : {};     # extending or from scratch?
+               $self->SUPER::copy($mycopy);
+  bless $mycopy, 'Bio::EnsEMBL::Compara::AlignedMember';
+
+  # The following does not Work if the initial object is only a Member
+  if (UNIVERSAL::isa($self, 'Bio::EnsEMBL::Compara::AlignedMember')) {
+    $mycopy->cigar_line($self->cigar_line);
+    $mycopy->cigar_start($self->cigar_start);
+    $mycopy->cigar_end($self->cigar_end);
+    $mycopy->perc_cov($self->perc_cov);
+    $mycopy->perc_id($self->perc_id);
+    $mycopy->perc_pos($self->perc_pos);
+    $mycopy->method_link_species_set_id($self->method_link_species_set_id);
+  }
+
+  return $mycopy;
+}
+
+
+#####################################################
+
+
+=head2 cigar_line
+
+  Arg [1]     : (optional) $cigar_line
+  Example     : $object->cigar_line($cigar_line);
+  Example     : $cigar_line = $object->cigar_line();
+  Description : Getter/setter for the cigar_line attribute. The cigar line
+                represents the modifications that are required to go from
+                the original sequence to the aligned sequence. In particular,
+                it shows the location of the gaps in the sequence. The cigar
+                line is built with a series of numbers and characters where
+                the number represents the number of positions in the mode
+                defined by the next charcater. When the number is 1, it can be
+                omitted. For example, the cigar line '23MD4M' means that there
+                are 23 matches or mismatches, then 1 deletion (gap) and then
+                another 4 matches or mismatches. The aligned sequence is
+                obtained by inserting 1 gap at the right location.
+  Returntype  : string
+  Exceptions  : none
+  Caller      : general
+  Status      : Stable
+
+=cut
+
+sub cigar_line {
+  my $self = shift;
+  $self->{'_cigar_line'} = shift if(@_);
+  return $self->{'_cigar_line'};
+}
+
+
+=head2 cigar_start
+
+  Arg [1]     : (optional) $cigar_start
+  Example     : $object->cigar_start($cigar_start);
+  Example     : $cigar_start = $object->cigar_start();
+  Description : Getter/setter for the cigar_start attribute. For non-global
+                alignments, this represent the starting point of the local
+                alignment.
+                Currently the data provided as AlignedMembers (leaves of the
+                GeneTree) are obtained using global alignments and the
+                cigar_start is always undefined.
+  Returntype  : integer
+  Exceptions  : none
+  Caller      : general
+  Status      : Stable
+
+=cut
+
+sub cigar_start {
+  my $self = shift;
+  $self->{'_cigar_start'} = shift if(@_);
+  return $self->{'_cigar_start'};
+}
+
+
+=head2 cigar_end
+
+  Arg [1]     : (optional) $cigar_end
+  Example     : $object->cigar_end($cigar_end);
+  Example     : $cigar_end = $object->cigar_end();
+  Description : Getter/setter for the cigar_end attribute. For non-global
+                alignments, this represent the ending point of the local
+                alignment.
+                Currently the data provided as AlignedMembers (leaves of the
+                GeneTree) are obtained using global alignments and the
+                cigar_end is always undefined.
+  Returntype  : integer
+  Exceptions  : none
+  Caller      : general
+  Status      : Stable
+
+=cut
+
+sub cigar_end {
+  my $self = shift;
+  $self->{'_cigar_end'} = shift if(@_);
+  return $self->{'_cigar_end'};
+}
+
+
+=head2 perc_cov
+
+  Arg [1]     : (optional) $perc_cov
+  Example     : $object->perc_cov($perc_cov);
+  Example     : $perc_cov = $object->perc_cov();
+  Description : Getter/setter for the perc_cov attribute. For non-global
+                alignments, this represent the coverage of the alignment in
+                percentage of the total length of the sequence.
+                Currently the data provided as AlignedMembers (leaves of the
+                GeneTree) are obtained using global alignments (the whole
+                sequence is always included) and the perc_cov is always undefined.
+  Returntype  : integer
+  Exceptions  : none
+  Caller      : general
+  Status      : Stable
+
+=cut
+
+sub perc_cov {
+  my $self = shift;
+  $self->{'perc_cov'} = shift if(@_);
+  return $self->{'perc_cov'};
+}
+
+
+=head2 perc_id
+
+  Arg [1]     : (optional) $perc_id
+  Example     : $object->perc_id($perc_id);
+  Example     : $perc_id = $object->perc_id();
+  Description : Getter/setter for the perc_id attribute. This is generally
+                used for pairwise relationships. The percentage identity
+                reprensents the number of positions that are identical in
+                the alignment in both sequences.
+                Currently the data provided as AlignedMembers (leaves of the
+                GeneTree) are obtained using multiple alignments and the
+                perc_id is always undefined.
+  Returntype  : integer
+  Exceptions  : none
+  Caller      : general
+  Status      : Stable
+
+=cut
+
+sub perc_id {
+  my $self = shift;
+  $self->{'perc_id'} = shift if(@_);
+  return $self->{'perc_id'};
+}
+
+
+=head2 perc_pos
+
+  Arg [1]     : (optional) $perc_pos
+  Example     : $object->perc_pos($perc_pos);
+  Example     : $perc_pos = $object->perc_pos();
+  Description : Getter/setter for the perc_pos attribute. This is generally
+                used for pairwise relationships. The percentage positivity
+                reprensents the number of positions that are positive in
+                the alignment in both sequences. Currently, this is calculated
+                for protein sequences using the BLOSUM62 scoring matrix.
+                Currently the data provided as AlignedMembers (leaves of the
+                GeneTree) are obtained using multiple alignments and the
+                perc_cov is always undefined.
+  Returntype  : integer
+  Exceptions  : none
+  Caller      : general
+  Status      : Stable
+
+=cut
+
+sub perc_pos {
+  my $self = shift;
+  $self->{'perc_pos'} = shift if(@_);
+  return $self->{'perc_pos'};
+}
+
+
+=head2 method_link_species_set_id
+
+  Arg [1]     : (optional) $method_link_species_set_id
+  Example     : $object->method_link_species_set_id($method_link_species_set_id);
+  Example     : $method_link_species_set_id = $object->method_link_species_set_id();
+  Description : Getter/setter for the method_link_species_set_id attribute. Please,
+                refer to the Bio::EnsEMBL::Compara::MethodLinkSpeciesSet module
+                for more information on the method_link_species_set_id.
+  Returntype  : int
+  Exceptions  : Returns 0 if the method_link_species_set_id is not defined.
+  Caller      : general
+  Status      : Stable
+
+=cut
+
+sub method_link_species_set_id {
+  my $self = shift;
+  $self->{'method_link_species_set_id'} = shift if(@_);
+  $self->{'method_link_species_set_id'} = 0 unless(defined($self->{'method_link_species_set_id'}));
+  return $self->{'method_link_species_set_id'};
+}
+
+
+sub set {
+    my $self = shift;
+    return $self->{'set'};
+}
+
+
+=head2 alignment_string
+
+  Arg [1]     : (optional) bool $exon_cased
+  Example     : my $alignment_string = $object->alignment_string();
+  Example     : my $alignment_string = $object->alignment_string(1);
+  Description : Returns the aligned sequence for this object. For sequences
+                split in exons, the $exon_cased flag permits to request
+                that each exon is represented in alternative upper and lower
+                case.
+                For local alignments, when the alignment does not cover the
+                whole protein, only the part of the sequence in the alignemnt
+                is returned. Currently only global alignments are provided.
+                Therefore the alignment_string always returns the whole aligned
+                sequence.
+  Returntype  : string
+  Exceptions  : throws if the cigar_line is not defined for this object.
+  Caller      : general
+  Status      : Stable
+
+=cut
+
+sub alignment_string {
+  my $self = shift;
+  my $exon_cased = shift;
+
+  unless (defined $self->cigar_line && $self->cigar_line ne "") {
+    throw("To get an alignment_string, the cigar_line needs to be define\n");
+  }
+
+  # Use different keys for exon-cased and non exon-cased sequences
+  my $key = 'alignment_string';
+  if ($exon_cased) {
+    $key = 'alignment_string_cased';
+  } elsif (defined $self->{'alignment_string_cased'} and !defined($self->{'alignment_string'})) {
+    # non exon-cased sequence can be easily obtained from the exon-cased one.
+    $self->{'alignment_string'} = uc($self->{'alignment_string_cased'})
+  }
+
+  unless (defined $self->{$key}) {
+    my $sequence;
+    if ($exon_cased) {
+      $sequence = $self->sequence_exon_cased;
+    } else {
+      $sequence = $self->sequence;
+    }
+    if (defined $self->cigar_start || defined $self->cigar_end) {
+      unless (defined $self->cigar_start && defined $self->cigar_end) {
+        throw("both cigar_start and cigar_end should be defined");
+      }
+      my $offset = $self->cigar_start - 1;
+      my $length = $self->cigar_end - $self->cigar_start + 1;
+      $sequence = substr($sequence, $offset, $length);
+    }
+
+    my $cigar_line = $self->cigar_line;
+    $cigar_line =~ s/([MD])/$1 /g;
+
+    my @cigar_segments = split " ",$cigar_line;
+    my $alignment_string = "";
+    my $seq_start = 0;
+    foreach my $segment (@cigar_segments) {
+      if ($segment =~ /^(\d*)D$/) {
+        my $length = $1;
+        $length = 1 if ($length eq "");
+        $alignment_string .= "-" x $length;
+      } elsif ($segment =~ /^(\d*)M$/) {
+        my $length = $1;
+        $length = 1 if ($length eq "");
+        $alignment_string .= substr($sequence,$seq_start,$length);
+        $seq_start += $length;
+      }
+    }
+    $self->{$key} = $alignment_string;
+  }
+
+  return $self->{$key};
+}
+
+
+=head2 alignment_string_bounded
+
+  Arg [1]     : none
+  Example     : my $alignment_string_bounded = $object->alignment_string_bounded();
+  Description : Returns the aligned sequence for this object with padding characters
+                representing the introns.
+  Returntype  : string
+  Exceptions  : throws if the cigar_line is not defined for this object or if the
+                cigar_start or cigar_end are defined.
+  Caller      : general
+  Status      : Stable
+
+=cut
+
+sub alignment_string_bounded {
+  my $self = shift;
+
+  unless (defined $self->cigar_line && $self->cigar_line ne "") {
+    throw("To get an alignment_string, the cigar_line needs to be define\n");
+  }
+  unless (defined $self->{'alignment_string_bounded'}) {
+    my $sequence_exon_bounded = $self->sequence_exon_bounded;
+    if (defined $self->cigar_start || defined $self->cigar_end) {
+      throw("method doesnt implement defined cigar_start and cigar_end");
+    }
+    $sequence_exon_bounded =~ s/b|o|j/\ /g;
+    my $cigar_line = $self->cigar_line;
+    $cigar_line =~ s/([MD])/$1 /g;
+
+    my @cigar_segments = split " ",$cigar_line;
+    my $alignment_string_bounded = "";
+    my $seq_start = 0;
+    my $exon_count = 1;
+    foreach my $segment (@cigar_segments) {
+      if ($segment =~ /^(\d*)D$/) {
+        my $length = $1;
+        $length = 1 if ($length eq "");
+        $alignment_string_bounded .= "-" x $length;
+      } elsif ($segment =~ /^(\d*)M$/) {
+        my $length = $1;
+        $length = 1 if ($length eq "");
+        my $substring = substr($sequence_exon_bounded,$seq_start,$length);
+        if ($substring =~ /\ /) {
+          my $num_boundaries = $substring =~ s/(\ )/$1/g;
+          $length += $num_boundaries;
+          $substring = substr($sequence_exon_bounded,$seq_start,$length);
+        }
+        $alignment_string_bounded .= $substring;
+        $seq_start += $length;
+      }
+    }
+    $self->{'alignment_string_bounded'} = $alignment_string_bounded;
+  }
+
+  return $self->{'alignment_string_bounded'};
+}
+
+
+=head2 cdna_alignment_string
+
+  Arg [1]    : none
+  Example    : my $cdna_alignment = $aligned_member->cdna_alignment_string();
+  Description: Converts the peptide alignment string to a cdna alignment
+               string.  This only works for EnsEMBL peptides whose cdna can
+               be retrieved from the attached core databse.
+               If the cdna cannot be retrieved undef is returned and a
+               warning is thrown.
+  Returntype : string
+  Exceptions : none
+  Caller     : general
+
+=cut
+
+sub cdna_alignment_string {
+  my ($self, $changeSelenos) = @_;
+  $changeSelenos = 0 unless (defined $changeSelenos);
+
+  unless (defined $self->{'cdna_alignment_string'}) {
+
+    my $cdna;
+    eval { $cdna = $self->sequence_cds;};
+    if ($@) {
+      throw("can't connect to CORE to get transcript and cdna for "
+            . "genome_db_id:" . $self->genome_db_id )
+        unless($self->get_Transcript);
+      $cdna = $self->get_Transcript->translateable_seq;
+    }
+
+    if (defined $self->cigar_start || defined $self->cigar_end) {
+      unless (defined $self->cigar_start && defined $self->cigar_end) {
+        throw("both cigar_start and cigar_end should be defined");
+      }
+      my $offset = $self->cigar_start * 3 - 3;
+      my $length = ($self->cigar_end - $self->cigar_start + 1) * 3;
+      $cdna = substr($cdna, $offset, $length);
+    }
+
+    my $start = 0;
+    my $cdna_align_string = '';
+
+    # foreach my $pep (split(//, $self->alignment_string)) { # Speed up below
+    my $alignment_string = $self->alignment_string;
+    foreach my $pep (unpack("A1" x length($alignment_string), $alignment_string)) {
+      if($pep eq '-') {
+        $cdna_align_string .= '--- ';
+      } elsif ((($pep eq 'U') and $changeSelenos) or ($pep eq '*')) {
+	  $cdna_align_string .= 'NNN ';
+	  $start += 3;  
+      } else {
+        my $codon = substr($cdna, $start, 3);
+        unless (length($codon) == 3) {
+          # sometimes the last codon contains only 1 or 2 nucleotides.
+          # making sure that it has 3 by adding as many Ns as necessary
+          $codon .= 'N' x (3 - length($codon));
+        }
+        $cdna_align_string .= $codon . ' ';
+        $start += 3;
+      }
+    }
+    $self->{'cdna_alignment_string'} = $cdna_align_string;
+  }
+  
+  return $self->{'cdna_alignment_string'};
+}
+
+
+1;