diff variant_effect_predictor/Bio/EnsEMBL/DnaDnaAlignFeature.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/EnsEMBL/DnaDnaAlignFeature.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,400 @@
+=head1 LICENSE
+
+  Copyright (c) 1999-2012 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 CONTACT
+
+  Please email comments or questions to the public Ensembl
+  developers list at <dev@ensembl.org>.
+
+  Questions may also be sent to the Ensembl help desk at
+  <helpdesk@ensembl.org>.
+
+=cut
+
+=head1 NAME
+
+Bio::EnsEMBL::DnaDnaAlignFeature - Ensembl specific dna-dna pairwise
+alignment feature
+
+=head1 SYNOPSIS
+
+  See BaseAlignFeature
+
+=cut
+
+
+package Bio::EnsEMBL::DnaDnaAlignFeature;
+
+use strict;
+
+use Bio::EnsEMBL::BaseAlignFeature;
+
+use vars qw(@ISA);
+use Bio::SimpleAlign;
+use Bio::LocatableSeq;
+use Bio::EnsEMBL::Utils::Argument qw(rearrange);
+
+@ISA = qw( Bio::EnsEMBL::BaseAlignFeature );
+
+
+=head2 new
+
+  Arg [..]   : List of named arguments. (-pair_dna_align_feature_id) defined
+               in this constructor, others defined in BaseFeaturePair and 
+               SeqFeature superclasses.  
+  Example    : $daf = new DnaDnaAlignFeature(-cigar_string => '3M3I12M');
+  Description: Creates a new DnaDnaAlignFeature using either a cigarstring or
+               a list of ungapped features.  
+  Returntype : Bio::EnsEMBL::DnaDnaAlignFeature
+  Exceptions : none
+  Caller     : general
+  Status     : Stable
+
+=cut
+
+sub new {
+  
+  my $caller = shift;
+
+  my $class = ref($caller) || $caller;
+
+  my $self = $class->SUPER::new(@_);
+
+  my ($pair_dna_align_feature_id) = rearrange([qw(PAIR_DNA_ALIGN_FEATURE_ID)], @_);
+  if (defined $pair_dna_align_feature_id){
+      $self->{'pair_dna_align_feature_id'} = $pair_dna_align_feature_id;
+  }
+  return $self;
+}
+
+
+=head2 pair_dna_align_feature_id
+
+  Arg[1]     : (optional) String $arg - value to set
+  Example    : $self->pair_dna_align_feature_id($pair_feature_id);
+  Description: Getter/setter for attribute 'pair_dna_align_feature_id'
+               The id of the dna feature aligned
+  Returntype : String
+  Exceptions : none
+  Caller     : general
+  Status     : Stable
+
+=cut
+
+sub pair_dna_align_feature_id{
+    my ($self, $arg) = @_;
+    if (defined $arg){
+	$self->{pair_dna_align_feature_id} = $arg;
+    }
+    return $self->{pair_dna_align_feature_id};
+}
+
+=head2 _hit_unit
+
+  Arg [1]    : none
+  Description: PRIVATE implementation of abstract superclass method.  Returns
+               1 as the 'unit' used for the hit sequence. 
+  Returntype : int
+  Exceptions : none
+  Caller     : Bio::EnsEMBL::BaseAlignFeature
+  Status     : Stable
+
+=cut
+
+sub _hit_unit {
+  return 1;
+}
+
+
+
+=head2 _query_unit
+
+  Arg [1]    : none
+  Description: PRIVATE implementation of abstract superclass method Returns
+               1 as the 'unit' used for the hit sequence.
+  Returntype : int
+  Exceptions : none
+  Caller     : Bio::EnsEMBL::BaseAlignFeature
+  Status     : Stable
+
+=cut
+
+sub _query_unit {
+  return 1;
+}
+
+=head2 restrict_between_positions
+
+  Arg [1]    : int $start
+  Arg [2]    : int $end
+  Arg [3]    : string $flags
+               SEQ = $start and $end apply to the seq sequence
+                     i.e. start and end methods
+               HSEQ = $start and $end apply to the hseq sequence
+                      i.e. hstart and hend methods
+  Example    : $daf->restrict_between_positions(150,543,"SEQ")
+  Description: Build a new DnaDnaAlignFeature object that fits within
+               the new specified coordinates and sequence reference, cutting
+               any pieces hanging upstream and downstream.
+  Returntype : Bio::EnsEMBL::DnaDnaAlignFeature object
+  Exceptions : 
+  Caller     : 
+  Status     : Stable
+
+=cut
+
+sub restrict_between_positions {
+  my ($self,$start,$end,$seqref) = @_;
+
+  unless (defined $start && $start =~ /^\d+$/) {
+    $self->throw("The first argument is not defined or is not an integer");
+  }
+  unless (defined $end && $end =~ /^\d+$/) {
+    $self->throw("The second argument is not defined or is not an integer");
+  }
+  unless (defined $seqref &&
+          ($seqref eq "SEQ" || $seqref eq "HSEQ")) {
+    $self->throw("The third argument is not defined or is not equal to 'SEQ' or 'HSEQ'");
+  }
+
+# symbolic method references should be forbidden!
+# need to be rewrite at some stage.
+
+  my ($start_method1,$end_method1,$strand_method1,$start_method2,$end_method2,$strand_method2) =
+    qw(start end strand hstart hend hstrand);
+
+  if ($seqref eq "HSEQ") {
+    ($start_method1,$end_method1,$strand_method1,$start_method2,$end_method2,$strand_method2) =
+    qw(hstart hend hstrand start end strand);
+  }
+
+  my @restricted_features;
+
+  foreach my $ungapped_feature ($self->ungapped_features) {
+
+    if ($ungapped_feature->$start_method1() > $end ||
+        $ungapped_feature->$end_method1() < $start) {
+
+      next;
+
+    } elsif ($ungapped_feature->$end_method1() <= $end &&
+             $ungapped_feature->$start_method1() >= $start) {
+
+      push @restricted_features, $ungapped_feature;
+
+    } else {
+
+      if ($ungapped_feature->$strand_method1() eq $ungapped_feature->$strand_method2()) {
+
+        if ($ungapped_feature->$start_method1() < $start) {
+
+          my $offset = $start - $ungapped_feature->$start_method1();
+          $ungapped_feature->$start_method1($start);
+          $ungapped_feature->$start_method2($ungapped_feature->$start_method2() + $offset);
+
+        }
+        if ($ungapped_feature->$end_method1() > $end) {
+
+          my $offset = $ungapped_feature->$end_method1() - $end;
+          $ungapped_feature->$end_method1($end);
+          $ungapped_feature->$end_method2($ungapped_feature->$end_method2() - $offset);
+
+        }
+      } else {
+
+        if ($ungapped_feature->$start_method1() < $start) {
+
+          my $offset = $start - $ungapped_feature->$start_method1();
+          $ungapped_feature->$start_method1($start);
+          $ungapped_feature->$end_method2($ungapped_feature->$end_method2() - $offset);
+
+        }
+        if ($ungapped_feature->$end_method1() > $end) {
+
+          my $offset = $ungapped_feature->$end_method1() - $end;
+          $ungapped_feature->$end_method1($end);
+          $ungapped_feature->$start_method2($ungapped_feature->$start_method2() + $offset);
+
+        }
+      }
+      
+      push @restricted_features, $ungapped_feature;
+    }
+  }
+
+  if (scalar @restricted_features) {
+    my $DnaDnaAlignFeature = new Bio::EnsEMBL::DnaDnaAlignFeature('-features' =>\@restricted_features);
+    if (defined $self->slice) {
+      $DnaDnaAlignFeature->slice($self->slice);
+    }
+    if (defined $self->hslice) {
+      $DnaDnaAlignFeature->hslice($self->hslice);
+    }
+    return $DnaDnaAlignFeature;
+  } else {
+    return undef;
+  }
+}
+
+=head2 alignment_strings
+
+  Arg [1]    : list of string $flags
+               FIX_SEQ = does not introduce gaps (dashes) in seq aligned sequence
+                         and delete the corresponding insertions in hseq aligned sequence
+               FIX_HSEQ = does not introduce gaps (dashes) in hseq aligned sequence
+                         and delete the corresponding insertions in seq aligned sequence
+               NO_SEQ = return the seq aligned sequence as an empty string
+               NO_HSEQ = return the hseq aligned sequence as an empty string
+               This 2 last flags would save a bit of time as doing so no querying to the core
+               database in done to get the sequence.
+  Example    : $daf->alignment_strings or
+               $daf->alignment_strings("FIX_HSEQ") or
+               $daf->alignment_strings("NO_SEQ","FIX_SEQ")
+  Description: Allows to rebuild the alignment string of both the seq and hseq sequence
+               using the cigar_string information and the slice and hslice objects
+  Returntype : array reference containing 2 strings
+               the first corresponds to seq
+               the second corresponds to hseq
+  Exceptions : 
+  Caller     : 
+  Status     : Stable
+
+=cut
+
+
+sub alignment_strings {
+  my ( $self, @flags ) = @_;
+
+  # set the flags
+  my $seq_flag = 1;
+  my $hseq_flag = 1;
+  my $fix_seq_flag = 0;
+  my $fix_hseq_flag = 0;
+
+  for my $flag ( @flags ) {
+    $seq_flag = 0 if ($flag eq "NO_SEQ");
+    $hseq_flag = 0 if ($flag eq "NO_HSEQ");
+    $fix_seq_flag = 1 if ($flag eq "FIX_SEQ");
+    $fix_hseq_flag = 1 if ($flag eq "FIX_HSEQ");
+  } 
+
+  my ($seq, $hseq);
+  $seq = $self->slice->subseq($self->start, $self->end, $self->strand) if ($seq_flag || $fix_seq_flag);  
+  $hseq = $self->hslice->subseq($self->hstart, $self->hend, $self->hstrand) if ($hseq_flag || $fix_hseq_flag);
+  
+  my $rseq= "";
+  # rseq - result sequence
+  my $rhseq= "";
+  # rhseq - result hsequence
+
+  my $seq_pos = 0;
+  my $hseq_pos = 0;
+
+  my @cig = ( $self->cigar_string =~ /(\d*[DIM])/g );
+
+  for my $cigElem ( @cig ) {
+    my $cigType = substr( $cigElem, -1, 1 );
+    my $cigCount = substr( $cigElem, 0 ,-1 );
+    $cigCount = 1 unless $cigCount;
+
+    if( $cigType eq "M" ) {
+        $rseq .= substr( $seq, $seq_pos, $cigCount ) if ($seq_flag);
+        $rhseq .= substr( $hseq, $hseq_pos, $cigCount ) if ($hseq_flag);
+      $seq_pos += $cigCount;
+      $hseq_pos += $cigCount;
+    } elsif( $cigType eq "D" ) {
+      if( ! $fix_seq_flag ) {
+        $rseq .=  "-" x $cigCount if ($seq_flag);
+        $rhseq .= substr( $hseq, $hseq_pos, $cigCount ) if ($hseq_flag);
+      }
+      $hseq_pos += $cigCount;
+    } elsif( $cigType eq "I" ) {
+      if( ! $fix_hseq_flag ) {
+        $rseq .= substr( $seq, $seq_pos, $cigCount ) if ($seq_flag);
+        $rhseq .= "-" x $cigCount if ($hseq_flag);
+      }
+      $seq_pos += $cigCount;
+    }
+  }
+  return [ $rseq,$rhseq ];
+}
+
+=head2 get_SimpleAlign
+
+  Arg [1]    : list of string $flags
+               translated = by default, the sequence alignment will be on nucleotide. With translated flag
+                            the aligned sequences are translated.
+               uc = by default aligned sequences are given in lower cases. With uc flag, the aligned 
+                    sequences are given in upper cases.
+  Example    : $daf->get_SimpleAlign or
+               $daf->get_SimpleAlign("translated") or
+               $daf->get_SimpleAlign("translated","uc")
+  Description: Allows to rebuild the alignment string of both the seq and hseq sequence
+               using the cigar_string information and the slice and hslice objects
+  Returntype : a Bio::SimpleAlign object
+  Exceptions : 
+  Caller     : 
+  Status     : Stable
+
+=cut
+
+sub get_SimpleAlign {
+  my ( $self, @flags ) = @_;
+
+  # setting the flags
+  my $uc = 0;
+  my $translated = 0;
+
+  for my $flag ( @flags ) {
+    $uc = 1 if ($flag =~ /^uc$/i);
+    $translated = 1 if ($flag =~ /^translated$/i);
+  }
+
+  my $sa = Bio::SimpleAlign->new();
+
+  #Hack to try to work with both bioperl 0.7 and 1.2:
+  #Check to see if the method is called 'addSeq' or 'add_seq'
+  my $bio07 = 0;
+  if(!$sa->can('add_seq')) {
+    $bio07 = 1;
+  }
+
+  my ($sb_seq,$qy_seq) = @{$self->alignment_strings};
+
+  my $loc_sb_seq = Bio::LocatableSeq->new(-SEQ    => $uc ? uc $sb_seq : lc $sb_seq,
+                                          -START  => $self->seq_region_start,
+                                          -END    => $self->seq_region_end,
+                                          -ID     => $self->seqname,
+                                          -STRAND => $self->strand);
+
+  $loc_sb_seq->seq($uc ? uc $loc_sb_seq->translate->seq
+                   : lc $loc_sb_seq->translate->seq) if ($translated);
+
+  my $loc_qy_seq = Bio::LocatableSeq->new(-SEQ    => $uc ? uc $qy_seq : lc $qy_seq,
+                                          -START  => $self->hseq_region_start,
+                                          -END    => $self->hseq_region_end,
+                                          -ID     => $self->hseqname,
+                                          -STRAND => $self->hstrand);
+
+  $loc_qy_seq->seq($uc ? uc  $loc_qy_seq->translate->seq
+                   : lc $loc_qy_seq->translate->seq) if ($translated);
+
+  if($bio07) {
+    $sa->addSeq($loc_sb_seq);
+    $sa->addSeq($loc_qy_seq);
+  } else {
+    $sa->add_seq($loc_sb_seq);
+    $sa->add_seq($loc_qy_seq);
+  }
+
+  return $sa;
+}
+
+1;