Mercurial > repos > mahtabm > ensemb_rep_gvl

diff variant_effect_predictor/Bio/SimpleAlign.pm @ 0:2bc9b66ada89 draft default tip
Uploaded
author: mahtabm
date: Thu, 11 Apr 2013 06:29:17 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/SimpleAlign.pm	Thu Apr 11 06:29:17 2013 -0400
@@ -0,0 +1,1934 @@
+# $Id: SimpleAlign.pm,v 1.65.2.1 2003/07/02 16:00:19 jason Exp $
+# BioPerl module for SimpleAlign
+#
+# Cared for by Heikki Lehvaslaiho <heikki@ebi.ac.uk>
+#
+# Copyright Ewan Birney
+#
+# You may distribute this module under the same terms as perl itself
+
+# POD documentation - main docs before the code
+#
+#  History:
+#	11/3/00 Added threshold feature to consensus and consensus_aa  - PS
+#	May 2001 major rewrite - Heikki Lehvaslaiho
+
+=head1 NAME
+
+Bio::SimpleAlign - Multiple alignments held as a set of sequences
+
+=head1 SYNOPSIS
+
+  # use Bio::AlignIO to read in the alignment
+  $str = Bio::AlignIO->new('-file' => 't/data/testaln.pfam');
+  $aln = $str->next_aln();
+
+  # some descriptors
+  print $aln->length, "\n";
+  print $aln->no_residues, "\n";
+  print $aln->is_flush, "\n";
+  print $aln->no_sequences, "\n";
+  print $aln->percentage_identity, "\n";
+  print $aln->consensus_string(50), "\n";
+
+  # find the position in the alignment for a sequence location
+  $pos = $aln->column_from_residue_number('1433_LYCES', 14); # = 6;
+
+  # extract sequences and check values for the alignment column $pos
+  foreach $seq ($aln->each_seq) {
+      $res = $seq->subseq($pos, $pos);
+      $count{$res}++;
+  }
+  foreach $res (keys %count) {
+      printf "Res: %s  Count: %2d\n", $res, $count{$res};
+  }
+
+
+=head1 DESCRIPTION
+
+SimpleAlign handles multiple alignments of sequences. It is very
+permissive of types (it won't insist on things being all same length
+etc): really it is a SequenceSet explicitly held in memory with a
+whole series of built in manipulations and especially file format
+systems for read/writing alignments.
+
+SimpleAlign basically views an alignment as an immutable block of
+text.  SimpleAlign *is not* the object to be using if you want to
+perform complex alignment manipulations.
+
+However for lightweight display/formatting and minimal manipulation
+(e.g. removing all-gaps columns) - this is the one to use.
+
+SimpleAlign uses a subclass of L<Bio::PrimarySeq> class
+L<Bio::LocatableSeq> to store its sequences. These are subsequences
+with a start and end positions in the parent reference sequence.
+
+Tricky concepts. SimpleAlign expects name,start,end to be 'unique' in
+the alignment, and this is the key for the internal hashes.
+(name,start,end is abbreviated nse in the code). However, in many
+cases people don't want the name/start-end to be displayed: either
+multiple names in an alignment or names specific to the alignment
+(ROA1_HUMAN_1, ROA1_HUMAN_2 etc). These names are called
+'displayname', and generally is what is used to print out the
+alignment. They default to name/start-end.
+
+The SimpleAlign Module came from Ewan Birney's Align module.
+
+=head1 PROGRESS
+
+SimpleAlign is being slowly converted to bioperl coding standards,
+mainly by Ewan.
+
+=over 3
+
+=item Use Bio::Root::Object - done
+
+=item Use proper exceptions - done
+
+=item Use hashed constructor - not done!
+
+=back
+
+=head1 FEEDBACK
+
+=head2 Mailing Lists
+
+User feedback is an integral part of the evolution of this and other
+Bioperl modules.  Send your comments and suggestions preferably to one
+of the Bioperl mailing lists.  Your participation is much appreciated.
+
+   bioperl-l@bioperl.org             - General discussion
+   http://bioperl.org/MailList.shtml - About the mailing lists
+
+=head2 Reporting Bugs
+
+Report bugs to the Bioperl bug tracking system to help us keep track
+the bugs and their resolution. Bug reports can be submitted via email
+or the web:
+
+    bioperl-bugs@bio.perl.org
+    http://bugzilla.bioperl.org/
+
+=head1 AUTHOR
+
+Ewan Birney, birney@sanger.ac.uk
+
+=head1 CONTRIBUTORS
+
+Richard Adams, Richard.Adams@ed.ac.uk, 
+David J. Evans, David.Evans@vir.gla.ac.uk, 
+Heikki Lehvaslaiho, heikki@ebi.ac.uk, 
+Allen Smith, allens@cpan.org, 
+Jason Stajich, jason@bioperl.org, 
+Anthony Underwood, aunderwood@phls.org.uk, 
+Xintao Wei & Giri Narasimhan, giri@cs.fiu.edu
+
+
+=head1 SEE ALSO
+
+L<Bio::LocatableSeq.pm>
+
+=head1 APPENDIX
+
+The rest of the documentation details each of the object
+methods. Internal methods are usually preceded with a _
+
+=cut
+
+# 'Let the code begin...
+
+package Bio::SimpleAlign;
+use vars qw(@ISA %CONSERVATION_GROUPS);
+use strict;
+
+use Bio::Root::Root;
+use Bio::LocatableSeq;         # uses Seq's as list
+use Bio::Align::AlignI;
+
+BEGIN {
+    # This data should probably be in a more centralized module...
+    # it is taken from Clustalw documentation
+    # These are all the positively scoring groups that occur in the
+    # Gonnet Pam250 matrix. The strong and weak groups are
+    # defined as strong score >0.5 and weak score =<0.5 respectively.
+
+    %CONSERVATION_GROUPS = ( 'strong' => [ qw(STA
+						 NEQK
+						 NHQK
+						 NDEQ
+						 QHRK
+						 MILV
+						 MILF
+						 HY
+						 FYW)
+					      ],
+				'weak' => [ qw(CSA
+					       ATV
+					       SAG
+					       STNK
+					       STPA
+					       SGND
+					       SNDEQK
+					       NDEQHK
+					       NEQHRK
+					       FVLIM
+					       HFY) ],
+				);
+
+}
+
+@ISA = qw(Bio::Root::Root Bio::Align::AlignI);
+
+=head2 new
+
+ Title     : new
+ Usage     : my $aln = new Bio::SimpleAlign();
+ Function  : Creates a new simple align object
+ Returns   : Bio::SimpleAlign
+ Args      : -source => string representing the source program
+                        where this alignment came from
+
+=cut
+
+
+sub new {
+  my($class,@args) = @_;
+
+  my $self = $class->SUPER::new(@args);
+
+  my ($src) = $self->_rearrange([qw(SOURCE)], @args);
+  $src && $self->source($src);
+  # we need to set up internal hashs first!
+
+  $self->{'_seq'} = {};
+  $self->{'_order'} = {};
+  $self->{'_start_end_lists'} = {};
+  $self->{'_dis_name'} = {};
+  $self->{'_id'} = 'NoName';
+  $self->{'_symbols'} = {};
+  # maybe we should automatically read in from args. Hmmm...
+
+  return $self; # success - we hope!
+}
+
+=head1 Modifier methods
+
+These methods modify the MSE by adding, removing or shuffling complete
+sequences.
+
+=head2 add_seq
+
+ Title     : add_seq
+ Usage     : $myalign->add_seq($newseq);
+ Function  : Adds another sequence to the alignment. *Does not* align
+             it - just adds it to the hashes.
+ Returns   : nothing
+ Args      : a Bio::LocatableSeq object
+             order (optional)
+
+See L<Bio::LocatableSeq> for more information
+
+=cut
+
+sub addSeq {
+    my $self = shift;
+    $self->warn(ref($self). "::addSeq - deprecated method. Use add_seq() instead.");
+    $self->add_seq(@_);
+}
+
+sub add_seq {
+    my $self = shift;
+    my $seq  = shift;
+    my $order = shift;
+    my ($name,$id,$start,$end);
+
+    if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
+	$self->throw("Unable to process non locatable sequences [", ref($seq), "]");
+    }
+
+    $id = $seq->id() ||$seq->display_id || $seq->primary_id;
+    $start = $seq->start();
+    $end  = $seq->end();
+
+    # build the symbol list for this sequence,
+    # will prune out the gap and missing/match chars
+    # when actually asked for the symbol list in the
+    # symbol_chars
+    map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq);
+
+    if( !defined $order ) {
+	$order = keys %{$self->{'_seq'}};
+    }
+    $name = sprintf("%s/%d-%d",$id,$start,$end);
+
+    if( $self->{'_seq'}->{$name} ) {
+	$self->warn("Replacing one sequence [$name]\n");
+    }
+    else {
+	#print STDERR "Assigning $name to $order\n";
+
+	$self->{'_order'}->{$order} = $name;
+
+	unless( exists( $self->{'_start_end_lists'}->{$id})) {
+	    $self->{'_start_end_lists'}->{$id} = [];
+	}
+	push @{$self->{'_start_end_lists'}->{$id}}, $seq;
+    }
+
+    $self->{'_seq'}->{$name} = $seq;
+
+}
+
+
+=head2 remove_seq
+
+ Title     : remove_seq
+ Usage     : $aln->remove_seq($seq);
+ Function  : Removes a single sequence from an alignment
+ Returns   :
+ Argument  : a Bio::LocatableSeq object
+
+=cut
+
+sub removeSeq {
+    my $self = shift;
+    $self->warn(ref($self). "::removeSeq - deprecated method. Use remove_seq() instead.");
+    $self->remove_seq(@_);
+}
+
+sub remove_seq {
+    my $self = shift;
+    my $seq = shift;
+    my ($name,$id,$start,$end);
+
+    $self->throw("Need Bio::Locatable seq argument ")
+	unless ref $seq && $seq->isa('Bio::LocatableSeq');
+
+    $id = $seq->id();
+    $start = $seq->start();
+    $end  = $seq->end();
+    $name = sprintf("%s/%d-%d",$id,$start,$end);
+
+    if( !exists $self->{'_seq'}->{$name} ) {
+	$self->throw("Sequence $name does not exist in the alignment to remove!");
+    }
+
+    delete $self->{'_seq'}->{$name};
+
+    # we need to remove this seq from the start_end_lists hash
+
+    if (exists $self->{'_start_end_lists'}->{$id}) {
+	# we need to find the sequence in the array.
+
+	my ($i, $found);;
+	for ($i=0; $i < @{$self->{'_start_end_lists'}->{$id}}; $i++) {
+	    if (${$self->{'_start_end_lists'}->{$id}}[$i] eq $seq) {
+		$found = 1;
+		last;
+	    }
+	}
+	if ($found) {
+	    splice @{$self->{'_start_end_lists'}->{$id}}, $i, 1;
+	}
+	else {
+	    $self->throw("Could not find the sequence to remoce from the start-end list");
+	}
+    }
+    else {
+	$self->throw("There is no seq list for the name $id");
+    }
+    return 1;
+    # we can't do anything about the order hash but that is ok
+    # because each_seq will handle it
+}
+
+
+=head2 purge
+
+ Title   : purge
+ Usage   : $aln->purge(0.7);
+ Function:
+
+           Removes sequences above given sequence similarity
+           This function will grind on large alignments. Beware!
+
+ Example :
+ Returns : An array of the removed sequences
+ Args    : float, threshold for similarity
+
+=cut
+
+sub purge {
+    my ($self,$perc) = @_;
+    my (%duplicate, @dups);
+
+    my @seqs = $self->each_seq();
+
+    for (my $i=0;$i< @seqs - 1;$i++ ) { #for each seq in alignment
+	my $seq = $seqs[$i];
+
+	#skip if already in duplicate hash
+	next if exists $duplicate{$seq->display_id} ;
+	my $one = $seq->seq();
+
+	my @one = split '', $one;	#split to get 1aa per array element
+
+	for (my $j=$i+1;$j < @seqs;$j++) {
+	    my $seq2 = $seqs[$j];
+
+	    #skip if already in duplicate hash
+	    next if exists $duplicate{$seq2->display_id} ;
+
+	    my $two = $seq2->seq();
+	    my @two = split '', $two;
+
+	    my $count = 0;
+	    my $res = 0;
+	    for (my $k=0;$k<@one;$k++) {
+		if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
+		     $one[$k] eq $two[$k]) {
+		    $count++;
+		}
+		if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
+		     $two[$k] ne '.' && $two[$k] ne '-' ) {
+		    $res++;
+		}
+	    }
+
+	    my $ratio = 0;
+	    $ratio = $count/$res unless $res == 0;
+
+	    # if above threshold put in duplicate hash and push onto
+	    # duplicate array for returning to get_unique
+	    if ( $ratio > $perc ) {
+		print STDERR "duplicate!", $seq2->display_id, "\n" if $self->verbose > 0;
+		$duplicate{$seq2->display_id} = 1;
+		push @dups, $seq2;
+	    }
+	}
+    }
+    foreach my $seq (@dups) {
+	$self->remove_seq($seq);
+    }
+    return @dups;
+}
+
+=head2 sort_alphabetically
+
+ Title     : sort_alphabetically
+ Usage     : $ali->sort_alphabetically
+ Function  : 
+
+             Changes the order of the alignemnt to alphabetical on name
+             followed by numerical by number.
+
+ Returns   : 
+ Argument  : 
+
+=cut
+
+sub sort_alphabetically {
+    my $self = shift;
+    my ($seq,$nse,@arr,%hash,$count);
+
+    foreach $seq ( $self->each_seq() ) {
+	$nse = $seq->get_nse;
+	$hash{$nse} = $seq;
+    }
+
+    $count = 0;
+
+    %{$self->{'_order'}} = (); # reset the hash;
+
+    foreach $nse ( sort _alpha_startend keys %hash) {
+	$self->{'_order'}->{$count} = $nse;
+
+	$count++;
+    }
+    1;
+}
+
+=head1 Sequence selection methods
+
+Methods returning one or more sequences objects.
+
+=head2 each_seq
+
+ Title     : each_seq
+ Usage     : foreach $seq ( $align->each_seq() ) 
+ Function  : Gets an array of Seq objects from the alignment
+ Returns   : an array
+ Argument  : 
+
+=cut
+
+sub eachSeq {
+    my $self = shift;
+    $self->warn(ref($self). "::eachSeq - deprecated method. Use each_seq() instead.");
+    $self->each_seq();
+}
+
+sub each_seq {
+    my $self = shift;
+    my (@arr,$order);
+
+    foreach $order ( sort { $a <=> $b } keys %{$self->{'_order'}} ) {
+	if( exists $self->{'_seq'}->{$self->{'_order'}->{$order}} ) {
+	    push(@arr,$self->{'_seq'}->{$self->{'_order'}->{$order}});
+	}
+    }
+
+    return @arr;
+}
+
+
+=head2 each_alphabetically
+
+ Title     : each_alphabetically
+ Usage     : foreach $seq ( $ali->each_alphabetically() )
+ Function  :
+
+             Returns an array of sequence object sorted alphabetically 
+             by name and then by start point.
+             Does not change the order of the alignment
+
+ Returns   : 
+ Argument  : 
+
+=cut
+
+sub each_alphabetically {
+    my $self = shift;
+    my ($seq,$nse,@arr,%hash,$count);
+
+    foreach $seq ( $self->each_seq() ) {
+	$nse = $seq->get_nse;
+	$hash{$nse} = $seq;
+    }
+
+    foreach $nse ( sort _alpha_startend keys %hash) {
+	push(@arr,$hash{$nse});
+    }
+
+    return @arr;
+
+}
+
+sub _alpha_startend {
+    my ($aname,$astart,$bname,$bstart);
+    ($aname,$astart) = split (/-/,$a);
+    ($bname,$bstart) = split (/-/,$b);
+
+    if( $aname eq $bname ) {
+	return $astart <=> $bstart;
+    }
+    else {
+	return $aname cmp $bname;
+    }
+
+}
+
+=head2 each_seq_with_id
+
+ Title     : each_seq_with_id
+ Usage     : foreach $seq ( $align->each_seq_with_id() )
+ Function  : 
+
+             Gets an array of Seq objects from the
+             alignment, the contents being those sequences
+             with the given name (there may be more than one)
+
+ Returns   : an array
+ Argument  : a seq name
+
+=cut
+
+sub eachSeqWithId {
+    my $self = shift;
+    $self->warn(ref($self). "::eachSeqWithId - deprecated method. Use each_seq_with_id() instead.");
+    $self->each_seq_with_id(@_);
+}
+
+sub each_seq_with_id {
+    my $self = shift;
+    my $id = shift;
+
+    $self->throw("Method each_seq_with_id needs a sequence name argument")
+	unless defined $id;
+
+    my (@arr, $seq);
+
+    if (exists($self->{'_start_end_lists'}->{$id})) {
+	@arr = @{$self->{'_start_end_lists'}->{$id}};
+    }
+    return @arr;
+}
+
+=head2 get_seq_by_pos
+
+ Title     : get_seq_by_pos
+ Usage     : $seq = $aln->get_seq_by_pos(3) # third sequence from the alignment
+ Function  : 
+
+             Gets a sequence based on its position in the alignment.
+             Numbering starts from 1.  Sequence positions larger than
+             no_sequences() will thow an error.
+
+ Returns   : a Bio::LocatableSeq object
+ Args      : positive integer for the sequence osition
+
+=cut
+
+sub get_seq_by_pos {
+
+    my $self = shift;
+    my ($pos) = @_;
+
+    $self->throw("Sequence position has to be a positive integer, not [$pos]")
+	unless $pos =~ /^\d+$/ and $pos > 0;
+    $self->throw("No sequence at position [$pos]")
+	unless $pos <= $self->no_sequences ;
+
+    my $nse = $self->{'_order'}->{--$pos};
+    return $self->{'_seq'}->{$nse};
+}
+
+=head1 Create new alignments
+
+The result of these methods are horizontal or vertical subsets of the
+current MSE.
+
+=head2 select
+
+ Title     : select
+ Usage     : $aln2 = $aln->select(1, 3) # three first sequences
+ Function  : 
+
+             Creates a new alignment from a continuous subset of
+             sequences.  Numbering starts from 1.  Sequence positions
+             larger than no_sequences() will thow an error.
+
+ Returns   : a Bio::SimpleAlign object
+ Args      : positive integer for the first sequence
+             positive integer for the last sequence to include (optional)
+
+=cut
+
+sub select {
+    my $self = shift;
+    my ($start, $end) = @_;
+
+    $self->throw("Select start has to be a positive integer, not [$start]")
+	unless $start =~ /^\d+$/ and $start > 0;
+    $self->throw("Select end has to be a positive integer, not [$end]")
+	unless $end  =~ /^\d+$/ and $end > 0;
+    $self->throw("Select $start [$start] has to be smaller than or equal to end [$end]")
+	unless $start <= $end;
+
+    my $aln = new $self;
+    foreach my $pos ($start .. $end) {
+	$aln->add_seq($self->get_seq_by_pos($pos));
+    }
+    $aln->id($self->id);	
+    return $aln;
+}
+
+=head2 select_noncont
+
+ Title     : select_noncont
+ Usage     : $aln2 = $aln->select_noncont(1, 3) # first and 3rd sequences
+ Function  : 
+
+             Creates a new alignment from a subset of
+             sequences.  Numbering starts from 1.  Sequence positions
+             larger than no_sequences() will thow an error.
+
+ Returns   : a Bio::SimpleAlign object
+ Args      : array of integers for the sequences
+
+=cut
+
+sub select_noncont {
+    my $self = shift;
+    my (@pos) = @_;
+    my $end = $self->no_sequences;
+    foreach ( @pos ) {
+	$self->throw("position must be a positive integer, > 0 and <= $end not [$_]")
+	    unless( /^\d+$/ && $_ > 0 && $_ <= $end );
+    }
+    my $aln = new $self;
+    foreach my $p (@pos) {
+	$aln->add_seq($self->get_seq_by_pos($p));
+    }
+    $aln->id($self->id);
+    return $aln;
+}
+
+=head2 slice
+
+ Title     : slice
+ Usage     : $aln2 = $aln->slice(20, 30)
+ Function  : 
+
+             Creates a slice from the alignment inclusive of start and
+             end columns.  Sequences with no residues in the slice are
+             excluded from the new alignment and a warning is printed.
+             Slice beyond the length of the sequence does not do
+             padding.
+
+ Returns   : a Bio::SimpleAlign object
+ Args      : positive integer for start column
+             positive integer for end column
+
+=cut
+
+sub slice {
+    my $self = shift;
+    my ($start, $end) = @_;
+
+    $self->throw("Slice start has to be a positive integer, not [$start]")
+	unless $start =~ /^\d+$/ and $start > 0;
+    $self->throw("Slice end has to be a positive integer, not [$end]")
+	unless $end =~ /^\d+$/ and $end > 0;
+    $self->throw("Slice $start [$start] has to be smaller than or equal to end [$end]")
+	unless $start <= $end;
+    my $aln_length = $self->length;
+    $self->throw("This alignment has only ". $self->length.
+		  " residues. Slice start [$start] is too bigger.")
+	 if $start > $self->length;
+
+    my $aln = new $self;
+    $aln->id($self->id);
+    foreach my $seq ( $self->each_seq() ) {
+
+	my $new_seq = new Bio::LocatableSeq (-id => $seq->id);
+
+	# seq
+	my $seq_end = $end;
+	$seq_end = $seq->length if $end > $seq->length;
+	my $slice_seq = $seq->subseq($start, $seq_end);
+	$new_seq->seq( $slice_seq );
+
+	# start
+	if ($start > 1) {
+	    my $pre_start_seq = $seq->subseq(1, $start - 1);
+	    $pre_start_seq =~ s/\W//g; #print "$pre_start_seq\n";
+	    $new_seq->start( $seq->start + CORE::length($pre_start_seq)  );
+	} else {
+	    $new_seq->start( $seq->start);
+	}
+
+	# end
+	$slice_seq =~ s/\W//g;
+	$new_seq->end( $new_seq->start + CORE::length($slice_seq) - 1 );
+
+	if ($new_seq->start and $new_seq->end >= $new_seq->start) {
+	    $aln->add_seq($new_seq);
+	} else {
+	    my $nse = $seq->get_nse();
+	    $self->warn("Slice [$start-$end] of sequence [$nse] contains no residues.".
+			" Sequence excluded from the new alignment.");
+	}
+
+    }
+
+    return $aln;
+}
+
+=head2 remove_columns
+
+ Title     : remove_column
+ Usage     : $aln2 = $aln->remove_columns(['mismatch','weak'])
+ Function  :
+             Creates an aligment with columns removed corresponding to
+             the specified criteria.
+ Returns   : a L<Bio::SimpleAlign> object
+ Args      : array ref of types, 'match'|'weak'|'strong'|'mismatch'
+
+=cut
+
+sub remove_columns{
+    my ($self,$type) = @_;
+    my %matchchars = ( 'match'  => '\*',
+                       'weak'   => '\.',
+                       'strong' => ':',
+                       'mismatch'=> ' ',
+               );
+   #get the characters to delete against
+   my $del_char;
+   foreach my $type(@{$type}){
+    $del_char.= $matchchars{$type};
+   }
+
+   my $match_line = $self->match_line;
+   my $aln = new $self;
+
+   my @remove;
+   my $length = 0;
+
+   #do the matching to get the segments to remove
+   while($match_line=~m/[$del_char]/g){
+    my $start = pos($match_line)-1;
+    $match_line=~/\G[$del_char]+/gc;
+    my $end = pos($match_line)-1;
+
+    #have to offset the start and end for subsequent removes
+    $start-=$length;
+    $end  -=$length;
+    $length += ($end-$start+1);
+    push @remove, [$start,$end];
+   }
+
+  #remove the segments
+  $aln = $self->_remove_col($aln,\@remove);
+
+  return $aln;
+}
+
+sub _remove_col {
+    my ($self,$aln,$remove) = @_;
+    my @new;
+
+    #splice out the segments and create new seq
+    foreach my $seq($self->each_seq){
+        my $new_seq = new Bio::LocatableSeq(-id=>$seq->id);
+        my $sequence;
+        foreach my $pair(@{$remove}){
+            my $start = $pair->[0];
+            my $end   = $pair->[1];
+            $sequence = $seq->seq unless $sequence;
+            my $spliced;
+            $spliced .= $start > 0 ? substr($sequence,0,$start) : '';
+            $spliced .= substr($sequence,$end+1,$seq->length-$end+1);
+            $sequence = $spliced;
+            if ($start == 1) {
+              $new_seq->start($end);
+            }
+            else {
+              $new_seq->start( $seq->start);
+            }
+            # end
+            if($end >= $seq->end){
+             $new_seq->end( $start);
+            }
+            else {
+             $new_seq->end($seq->end);
+            }
+        }
+        $new_seq->seq($sequence);
+        push @new, $new_seq;
+    }
+    #add the new seqs to the alignment
+    foreach my $new(@new){
+        $aln->add_seq($new);
+    }
+    return $aln;
+}
+
+=head1 Change sequences within the MSE
+
+These methods affect characters in all sequences without changeing the
+alignment.
+
+
+=head2 map_chars
+
+ Title     : map_chars
+ Usage     : $ali->map_chars('\.','-')
+ Function  : 
+
+             Does a s/$arg1/$arg2/ on the sequences. Useful for gap
+             characters
+
+             Notice that the from (arg1) is interpretted as a regex,
+             so be careful about quoting meta characters (eg
+             $ali->map_chars('.','-') wont do what you want)
+
+ Returns   : 
+ Argument  : 'from' rexexp
+             'to' string
+
+=cut
+
+sub map_chars {
+    my $self = shift;
+    my $from = shift;
+    my $to   = shift;
+    my ($seq,$temp);
+
+    $self->throw("Need exactly two arguments") 
+	unless defined $from and defined $to;
+
+    foreach $seq ( $self->each_seq() ) {
+	$temp = $seq->seq();
+	$temp =~ s/$from/$to/g;
+	$seq->seq($temp);
+    }
+    return 1;
+}
+
+
+=head2 uppercase
+
+ Title     : uppercase()
+ Usage     : $ali->uppercase()
+ Function  : Sets all the sequences to uppercase
+ Returns   : 
+ Argument  : 
+
+=cut
+
+sub uppercase {
+    my $self = shift;
+    my $seq;
+    my $temp;
+
+    foreach $seq ( $self->each_seq() ) {
+      $temp = $seq->seq();
+      $temp =~ tr/[a-z]/[A-Z]/;
+
+      $seq->seq($temp);
+    }
+    return 1;
+}
+
+=head2 cigar_line
+
+ Title    : cigar_line()
+ Usage    : $align->cigar_line()
+ Function : Generates a "cigar" line for each sequence in the alignment
+            The format is simply A-1,60;B-1,1:4,60;C-5,10:12,58
+            where A,B,C,etc. are the sequence identifiers, and the numbers
+            refer to conserved positions within the alignment
+ Args     : none
+
+=cut
+
+sub cigar_line {
+    my ($self) = @_;
+
+    my %cigar;
+    my %clines;
+    my @seqchars;
+    my $seqcount = 0;
+    my $sc;
+    foreach my $seq ( $self->each_seq ) {
+	push @seqchars, [ split(//, uc ($seq->seq)) ];
+	$sc = scalar(@seqchars);
+    }
+
+    foreach my $pos ( 0..$self->length ) {
+	my $i=0;
+	foreach my $seq ( @seqchars ) {
+	    $i++;
+#	    print STDERR "Seq $i at pos $pos: ".$seq->[$pos]."\n";
+	    if ($seq->[$pos] eq '.') {
+		if (defined $cigar{$i} && $clines{$i} !~ $cigar{$i}) {
+		    $clines{$i}.=$cigar{$i};
+		}
+	    }
+	    else {
+		if (! defined $cigar{$i}) {
+		    $clines{$i}.=($pos+1).",";
+		}
+		$cigar{$i}=$pos+1;
+	    }
+	    if ($pos+1 == $self->length && ($clines{$i} =~ /\,$/) ) {
+		$clines{$i}.=$cigar{$i};
+	     }
+	}
+    }
+    for(my $i=1; $i<$sc+1;$i++) {
+	print STDERR "Seq $i cigar line ".$clines{$i}."\n";
+    }
+    return %clines;
+}
+
+=head2 match_line
+
+ Title    : match_line()
+ Usage    : $align->match_line()
+ Function : Generates a match line - much like consensus string
+            except that a line indicating the '*' for a match.
+ Args     : (optional) Match line characters ('*' by default)
+            (optional) Strong match char (':' by default)
+            (optional) Weak match char ('.' by default)
+
+=cut
+
+sub match_line {
+    my ($self,$matchlinechar, $strong, $weak) = @_;
+    my %matchchars = ( 'match'  => $matchlinechar || '*',
+		       'weak'   => $weak          || '.',
+		       'strong' => $strong        || ':',
+		       'mismatch'=> ' ', 
+	       );
+
+
+    my @seqchars;
+    my $seqcount = 0;
+    my $alphabet;
+    foreach my $seq ( $self->each_seq ) {
+	push @seqchars, [ split(//, uc ($seq->seq)) ];
+	$alphabet = $seq->alphabet unless defined $alphabet;
+    }
+    my $refseq = shift @seqchars;
+    # let's just march down the columns
+    my $matchline;
+    POS: foreach my $pos ( 0..$self->length ) {
+	my $refchar = $refseq->[$pos];
+	next unless $refchar; # skip '' 
+	my %col = ($refchar => 1);
+	my $dash = ($refchar eq '-' || $refchar eq '.' || $refchar eq ' ');
+	foreach my $seq ( @seqchars ) {
+	    $dash = 1 if( $seq->[$pos] eq '-' || $seq->[$pos] eq '.' || 
+			  $seq->[$pos] eq ' ' );
+	    $col{$seq->[$pos]}++;
+	}
+	my @colresidues = sort keys %col;
+	my $char = $matchchars{'mismatch'};
+	# if all the values are the same
+	if( $dash ) { $char =  $matchchars{'mismatch'} }
+	elsif( @colresidues == 1 ) { $char = $matchchars{'match'} }
+	elsif( $alphabet eq 'protein' ) { # only try to do weak/strong
+	                                  # matches for protein seqs
+	    TYPE: foreach my $type ( qw(strong weak) ) {
+                # iterate through categories
+		my %groups;
+		# iterate through each of the aa in the col
+		# look to see which groups it is in
+		foreach my $c ( @colresidues ) {
+		    foreach my $f ( grep /\Q$c/, @{$CONSERVATION_GROUPS{$type}} ) {
+			push @{$groups{$f}},$c;
+		    }
+		}
+		GRP: foreach my $cols ( values %groups ) {
+		    @$cols = sort @$cols;
+		    # now we are just testing to see if two arrays
+		    # are identical w/o changing either one
+
+		    # have to be same len
+		    next if( scalar @$cols != scalar @colresidues );
+		    # walk down the length and check each slot
+		    for($_=0;$_ < (scalar @$cols);$_++ ) {
+			next GRP if( $cols->[$_] ne $colresidues[$_] );
+		    }
+		    $char = $matchchars{$type};
+		    last TYPE;
+		}
+	    }
+	  }
+	$matchline .= $char;
+    }
+    return $matchline;
+}
+
+=head2 match
+
+ Title     : match()
+ Usage     : $ali->match()
+ Function  : 
+
+             Goes through all columns and changes residues that are
+             identical to residue in first sequence to match '.'
+             character. Sets match_char.
+
+             USE WITH CARE: Most MSE formats do not support match
+             characters in sequences, so this is mostly for output
+             only. NEXUS format (Bio::AlignIO::nexus) can handle
+             it.
+
+ Returns   : 1
+ Argument  : a match character, optional, defaults to '.'
+
+=cut
+
+sub match {
+    my ($self, $match) = @_;
+
+    $match ||= '.';
+    my ($matching_char) = $match;
+    $matching_char = "\\$match" if $match =~ /[\^.$|()\[\]]/ ;  #'; 
+    $self->map_chars($matching_char, '-');
+
+    my @seqs = $self->each_seq();
+    return 1 unless scalar @seqs > 1;
+
+    my $refseq = shift @seqs ;
+    my @refseq = split //, $refseq->seq;
+    my $gapchar = $self->gap_char;
+
+    foreach my $seq ( @seqs ) {
+	my @varseq = split //, $seq->seq();
+	for ( my $i=0; $i < scalar @varseq; $i++) {
+	    $varseq[$i] = $match if defined $refseq[$i] &&
+		( $refseq[$i] =~ /[A-Za-z\*]/ ||
+		  $refseq[$i] =~ /$gapchar/ )
+		      && $refseq[$i] eq $varseq[$i];
+	}
+	$seq->seq(join '', @varseq);
+    }
+    $self->match_char($match);
+    return 1;
+}
+
+
+=head2 unmatch
+
+ Title     : unmatch()
+ Usage     : $ali->unmatch()
+ Function  : Undoes the effect of method match. Unsets match_char.
+
+ Returns   : 1
+ Argument  : a match character, optional, defaults to '.'
+
+See L<match> and L<match_char>
+
+=cut
+
+sub unmatch {
+    my ($self, $match) = @_;
+
+    $match ||= '.';
+
+    my @seqs = $self->each_seq();
+    return 1 unless scalar @seqs > 1;
+
+    my $refseq = shift @seqs ;
+    my @refseq = split //, $refseq->seq;
+    my $gapchar = $self->gap_char;
+    foreach my $seq ( @seqs ) {
+	my @varseq = split //, $seq->seq();
+	for ( my $i=0; $i < scalar @varseq; $i++) {
+	    $varseq[$i] = $refseq[$i] if defined $refseq[$i] && 
+		( $refseq[$i] =~ /[A-Za-z\*]/ ||
+		  $refseq[$i] =~ /$gapchar/ ) &&
+		      $varseq[$i] eq $match;
+	}
+	$seq->seq(join '', @varseq);
+    }
+    $self->match_char('');
+    return 1;
+}
+
+=head1 MSE attibutes
+
+Methods for setting and reading the MSE attributes.
+
+Note that the methods defining character semantics depend on the user
+to set them sensibly.  They are needed only by certain input/output
+methods. Unset them by setting to an empty string ('').
+
+=head2 id
+
+ Title     : id
+ Usage     : $myalign->id("Ig")
+ Function  : Gets/sets the id field of the alignment
+ Returns   : An id string
+ Argument  : An id string (optional)
+
+=cut
+
+sub id {
+    my ($self, $name) = @_;
+
+    if (defined( $name )) {
+	$self->{'_id'} = $name;
+    }
+
+    return $self->{'_id'};
+}
+
+=head2 missing_char
+
+ Title     : missing_char
+ Usage     : $myalign->missing_char("?")
+ Function  : Gets/sets the missing_char attribute of the alignment
+             It is generally recommended to set it to 'n' or 'N'
+             for nucleotides and to 'X' for protein.
+ Returns   : An missing_char string,
+ Argument  : An missing_char string (optional)
+
+=cut
+
+sub missing_char {
+    my ($self, $char) = @_;
+
+    if (defined $char ) {
+	$self->throw("Single missing character, not [$char]!") if CORE::length($char) > 1;
+	$self->{'_missing_char'} = $char;
+    }
+
+    return $self->{'_missing_char'};
+}
+
+=head2 match_char
+
+ Title     : match_char
+ Usage     : $myalign->match_char('.')
+ Function  : Gets/sets the match_char attribute of the alignment
+ Returns   : An match_char string,
+ Argument  : An match_char string (optional)
+
+=cut
+
+sub match_char {
+    my ($self, $char) = @_;
+
+    if (defined $char ) {
+	$self->throw("Single match character, not [$char]!") if CORE::length($char) > 1;
+	$self->{'_match_char'} = $char;
+    }
+
+    return $self->{'_match_char'};
+}
+
+=head2 gap_char
+
+ Title     : gap_char
+ Usage     : $myalign->gap_char('-')
+ Function  : Gets/sets the gap_char attribute of the alignment
+ Returns   : An gap_char string, defaults to '-'
+ Argument  : An gap_char string (optional)
+
+=cut
+
+sub gap_char {
+    my ($self, $char) = @_;
+
+    if (defined $char || ! defined $self->{'_gap_char'} ) {
+	$char= '-' unless defined $char;
+	$self->throw("Single gap character, not [$char]!") if CORE::length($char) > 1;
+	$self->{'_gap_char'} = $char;
+    }
+    return $self->{'_gap_char'};
+}
+
+=head2 symbol_chars
+
+ Title   : symbol_chars
+ Usage   : my @symbolchars = $aln->symbol_chars;
+ Function: Returns all the seen symbols (other than gaps)
+ Returns : array of characters that are the seen symbols
+ Args    : boolean to include the gap/missing/match characters
+
+=cut
+
+sub symbol_chars{
+   my ($self,$includeextra) = @_;
+   if( ! defined $self->{'_symbols'} ) {
+       $self->warn("Symbol list was not initialized");
+       return ();
+   }
+   my %copy = %{$self->{'_symbols'}};
+   if( ! $includeextra ) {
+       foreach my $char ( $self->gap_char, $self->match_char,
+			  $self->missing_char) {
+	   delete $copy{$char} if( defined $char );
+       }
+   }
+   return keys %copy;
+}
+
+=head1 Alignment descriptors
+
+These read only methods describe the MSE in various ways.
+
+
+=head2 consensus_string
+
+ Title     : consensus_string
+ Usage     : $str = $ali->consensus_string($threshold_percent)
+ Function  : Makes a strict consensus
+ Returns   : 
+ Argument  : Optional treshold ranging from 0 to 100.
+             The consensus residue has to appear at least threshold %
+             of the sequences at a given location, otherwise a '?'
+             character will be placed at that location.
+             (Default value = 0%)
+
+=cut
+
+sub consensus_string {
+    my $self = shift;
+    my $threshold = shift;
+    my $len;
+    my ($out,$count);
+
+    $out = "";
+
+    $len = $self->length - 1;
+
+    foreach $count ( 0 .. $len ) {
+	$out .= $self->_consensus_aa($count,$threshold);
+    }
+    return $out;
+}
+
+sub _consensus_aa {
+    my $self = shift;
+    my $point = shift;
+    my $threshold_percent = shift || -1 ;
+    my ($seq,%hash,$count,$letter,$key);
+
+    foreach $seq ( $self->each_seq() ) {
+	$letter = substr($seq->seq,$point,1);
+	$self->throw("--$point-----------") if $letter eq '';
+	($letter =~ /\./) && next;
+	# print "Looking at $letter\n";
+	$hash{$letter}++;
+    }
+    my $number_of_sequences = $self->no_sequences();
+    my $threshold = $number_of_sequences * $threshold_percent / 100. ;
+    $count = -1;
+    $letter = '?';
+
+    foreach $key ( sort keys %hash ) {
+	# print "Now at $key $hash{$key}\n";
+	if( $hash{$key} > $count && $hash{$key} >= $threshold) {
+	    $letter = $key;
+	    $count = $hash{$key};
+	}
+    }
+    return $letter;
+}
+
+
+=head2 consensus_iupac
+
+ Title     : consensus_iupac
+ Usage     : $str = $ali->consensus_iupac()
+ Function  : 
+
+             Makes a consensus using IUPAC ambiguity codes from DNA
+             and RNA. The output is in upper case except when gaps in
+             a column force output to be in lower case.
+
+             Note that if your alignment sequences contain a lot of
+             IUPAC ambiquity codes you often have to manually set
+             alphabet.  Bio::PrimarySeq::_guess_type thinks they
+             indicate a protein sequence.
+
+ Returns   : consensus string
+ Argument  : none
+ Throws    : on protein sequences
+
+=cut
+
+sub consensus_iupac {
+    my $self = shift;
+    my $out = "";
+    my $len = $self->length-1;
+
+    # only DNA and RNA sequences are valid
+    foreach my $seq ( $self->each_seq() ) {
+	$self->throw("Seq [". $seq->get_nse. "] is a protein")
+	    if $seq->alphabet eq 'protein';
+    }
+    # loop over the alignment columns
+    foreach my $count ( 0 .. $len ) {
+	$out .= $self->_consensus_iupac($count);
+    }
+    return $out;
+}
+
+sub _consensus_iupac {
+    my ($self, $column) = @_;
+    my ($string, $char, $rna);
+
+    #determine all residues in a column
+    foreach my $seq ( $self->each_seq() ) {
+	$string .= substr($seq->seq, $column, 1);
+    }
+    $string = uc $string;
+
+    # quick exit if there's an N in the string
+    if ($string =~ /N/) {	
+	$string =~ /\W/ ? return 'n' : return 'N';
+    }
+    # ... or if there are only gap characters
+    return '-' if $string =~ /^\W+$/;
+
+    # treat RNA as DNA in regexps
+    if ($string =~ /U/) {	
+	$string =~ s/U/T/;
+	$rna = 1;
+    }
+
+    # the following s///'s only need to be done to the _first_ ambiguity code
+    # as we only need to see the _range_ of characters in $string
+
+    if ($string =~ /[VDHB]/) {
+	$string =~ s/V/AGC/;
+	$string =~ s/D/AGT/;
+	$string =~ s/H/ACT/;
+	$string =~ s/B/CTG/;
+    }
+
+    if ($string =~ /[SKYRWM]/) {
+	$string =~ s/S/GC/;
+	$string =~ s/K/GT/;
+	$string =~ s/Y/CT/;
+	$string =~ s/R/AG/;
+	$string =~ s/W/AT/;
+	$string =~ s/M/AC/;
+    }
+
+    # and now the guts of the thing
+
+    if ($string =~ /A/) {
+        $char = 'A';                     # A                      A
+        if ($string =~ /G/) {
+            $char = 'R';                 # A and G (purines)      R
+            if ($string =~ /C/) {
+                $char = 'V';             # A and G and C          V
+                if ($string =~ /T/) {
+                    $char = 'N';         # A and G and C and T    N
+                }
+            } elsif ($string =~ /T/) {
+                $char = 'D';             # A and G and T          D
+            }
+        } elsif ($string =~ /C/) {
+            $char = 'M';                 # A and C                M
+            if ($string =~ /T/) {
+                $char = 'H';             # A and C and T          H
+            }
+        } elsif ($string =~ /T/) {
+            $char = 'W';                 # A and T                W
+        }
+    } elsif ($string =~ /C/) {
+        $char = 'C';                     # C                      C
+        if ($string =~ /T/) {
+            $char = 'Y';                 # C and T (pyrimidines)  Y
+            if ($string =~ /G/) {
+                $char = 'B';             # C and T and G          B
+            }
+        } elsif ($string =~ /G/) {
+            $char = 'S';                 # C and G                S
+        }
+    } elsif ($string =~ /G/) {
+        $char = 'G';                     # G                      G
+        if ($string =~ /C/) {
+            $char = 'S';                 # G and C                S
+        } elsif ($string =~ /T/) {
+            $char = 'K';                 # G and T                K
+        }
+    } elsif ($string =~ /T/) {
+        $char = 'T';                     # T                      T
+    }
+
+    $char = 'U' if $rna and $char eq 'T';
+    $char = lc $char if $string =~ /\W/;
+
+    return $char;
+}
+
+=head2 is_flush
+
+ Title     : is_flush
+ Usage     : if( $ali->is_flush() )
+           : 
+           :
+ Function  : Tells you whether the alignment 
+           : is flush, ie all of the same length
+           : 
+           :
+ Returns   : 1 or 0
+ Argument  : 
+
+=cut
+
+sub is_flush {
+    my ($self,$report) = @_;
+    my $seq;
+    my $length = (-1);
+    my $temp;
+
+    foreach $seq ( $self->each_seq() ) {
+	if( $length == (-1) ) {
+	    $length = CORE::length($seq->seq());
+	    next;
+	}
+
+	$temp = CORE::length($seq->seq());
+	if( $temp != $length ) {
+	    $self->warn("expecting $length not $temp from ".
+			$seq->display_id) if( $report );
+	    $self->debug("expecting $length not $temp from ".
+			 $seq->display_id);
+	    $self->debug($seq->seq(). "\n");
+	    return 0;
+	}
+    }
+
+    return 1;
+}
+
+
+=head2 length
+
+ Title     : length()
+ Usage     : $len = $ali->length()
+ Function  : Returns the maximum length of the alignment.
+             To be sure the alignment is a block, use is_flush
+ Returns   : 
+ Argument  : 
+
+=cut
+
+sub length_aln {
+    my $self = shift;
+    $self->warn(ref($self). "::length_aln - deprecated method. Use length() instead.");
+    $self->length(@_);
+}
+
+sub length {
+    my $self = shift;
+    my $seq;
+    my $length = (-1);
+    my ($temp,$len);
+
+    foreach $seq ( $self->each_seq() ) {
+	$temp = CORE::length($seq->seq());
+	if( $temp > $length ) {
+	    $length = $temp;
+	}
+    }
+
+    return $length;
+}
+
+
+=head2 maxdisplayname_length
+
+ Title     : maxdisplayname_length
+ Usage     : $ali->maxdisplayname_length()
+ Function  : 
+
+             Gets the maximum length of the displayname in the
+             alignment. Used in writing out various MSE formats.
+
+ Returns   : integer
+ Argument  : 
+
+=cut
+
+sub maxname_length {
+    my $self = shift;
+    $self->warn(ref($self). "::maxname_length - deprecated method.".
+		" Use maxdisplayname_length() instead.");
+    $self->maxdisplayname_length();
+}
+
+sub maxnse_length {
+    my $self = shift;
+    $self->warn(ref($self). "::maxnse_length - deprecated method.".
+		" Use maxnse_length() instead.");
+    $self->maxdisplayname_length();
+}
+
+sub maxdisplayname_length {
+    my $self = shift;
+    my $maxname = (-1);
+    my ($seq,$len);
+
+    foreach $seq ( $self->each_seq() ) {
+	$len = CORE::length $self->displayname($seq->get_nse());
+
+	if( $len > $maxname ) {
+	    $maxname = $len;
+	}
+    }
+
+    return $maxname;
+}
+
+=head2 no_residues
+
+ Title     : no_residues
+ Usage     : $no = $ali->no_residues
+ Function  : number of residues in total in the alignment
+ Returns   : integer
+ Argument  : 
+
+=cut
+
+sub no_residues {
+    my $self = shift;
+    my $count = 0;
+
+    foreach my $seq ($self->each_seq) {
+	my $str = $seq->seq();
+
+	$count += ($str =~ s/[^A-Za-z]//g);
+    }
+
+    return $count;
+}
+
+=head2 no_sequences
+
+ Title     : no_sequences
+ Usage     : $depth = $ali->no_sequences
+ Function  : number of sequence in the sequence alignment
+ Returns   : integer
+ Argument  : 
+
+=cut
+
+sub no_sequences {
+    my $self = shift;
+
+    return scalar($self->each_seq);
+}
+
+
+=head2 average_percentage_identity
+
+ Title   : average_percentage_identity
+ Usage   : $id = $align->average_percentage_identity
+ Function: The function uses a fast method to calculate the average
+           percentage identity of the alignment
+ Returns : The average percentage identity of the alignment
+ Args    : None
+ Notes   : This method implemented by Kevin Howe calculates a figure that is
+           designed to be similar to the average pairwise identity of the
+           alignment (identical in the absence of gaps), without having to
+           explicitly calculate pairwise identities proposed by Richard Durbin.
+           Validated by Ewan Birney ad Alex Bateman.
+
+=cut
+
+sub average_percentage_identity{
+   my ($self,@args) = @_;
+
+   my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
+                   'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
+
+   my ($len, $total, $subtotal, $divisor, $subdivisor, @seqs, @countHashes);
+
+   if (! $self->is_flush()) {
+       $self->throw("All sequences in the alignment must be the same length");
+   }
+
+   @seqs = $self->each_seq();
+   $len = $self->length();
+
+   # load the each hash with correct keys for existence checks
+
+   for( my $index=0; $index < $len; $index++) {
+       foreach my $letter (@alphabet) {
+	   $countHashes[$index]->{$letter} = 0;
+       }
+   }
+   foreach my $seq (@seqs)  {
+       my @seqChars = split //, $seq->seq();
+       for( my $column=0; $column < @seqChars; $column++ ) {
+	   my $char = uc($seqChars[$column]);
+	   if (exists $countHashes[$column]->{$char}) {
+	       $countHashes[$column]->{$char}++;
+	   }
+       }
+   }
+
+   $total = 0;
+   $divisor = 0;
+   for(my $column =0; $column < $len; $column++) {
+       my %hash = %{$countHashes[$column]};
+       $subdivisor = 0;
+       foreach my $res (keys %hash) {
+	   $total += $hash{$res}*($hash{$res} - 1);
+	   $subdivisor += $hash{$res};
+       }
+       $divisor += $subdivisor * ($subdivisor - 1);
+   }
+   return $divisor > 0 ? ($total / $divisor )*100.0 : 0;
+}
+
+=head2 percentage_identity
+
+ Title   : percentage_identity
+ Usage   : $id = $align->percentage_identity
+ Function: The function calculates the average percentage identity
+           (aliased for average_percentage_identity)
+ Returns : The average percentage identity
+ Args    : None
+
+=cut
+
+sub percentage_identity {
+    my $self = shift;
+    return $self->average_percentage_identity();
+}
+
+=head2 overall_percentage_identity
+
+ Title   : percentage_identity
+ Usage   : $id = $align->percentage_identity
+ Function: The function calculates the percentage identity of
+           the conserved columns
+ Returns : The percentage identity of the conserved columns
+ Args    : None
+
+=cut
+
+sub overall_percentage_identity{
+   my ($self,@args) = @_;
+
+   my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
+                   'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
+
+   my ($len, $total, @seqs, @countHashes);
+
+   if (! $self->is_flush()) {
+       $self->throw("All sequences in the alignment must be the same length");
+   }
+
+   @seqs = $self->each_seq();
+   $len = $self->length();
+
+   # load the each hash with correct keys for existence checks
+   for( my $index=0; $index < $len; $index++) {
+       foreach my $letter (@alphabet) {
+	   $countHashes[$index]->{$letter} = 0;
+       }
+   }
+   foreach my $seq (@seqs)  {
+       my @seqChars = split //, $seq->seq();
+       for( my $column=0; $column < @seqChars; $column++ ) {
+	   my $char = uc($seqChars[$column]);
+	   if (exists $countHashes[$column]->{$char}) {
+	       $countHashes[$column]->{$char}++;
+	   }
+       }
+   }
+
+   $total = 0;
+   for(my $column =0; $column < $len; $column++) {
+       my %hash = %{$countHashes[$column]};
+       foreach ( values %hash ) {
+	   next if( $_ == 0 );
+	   $total++ if( $_ == scalar @seqs );
+	   last;
+       }
+   }
+   return ($total / $len ) * 100.0;
+}
+
+=head1 Alignment positions
+
+Methods to map a sequence position into an alignment column and back.
+column_from_residue_number() does the former. The latter is really a
+property of the sequence object and can done using
+L<Bio::LocatableSeq::location_from_column>:
+
+    # select somehow a sequence from the alignment, e.g.
+    my $seq = $aln->get_seq_by_pos(1);
+    #$loc is undef or Bio::LocationI object
+    my $loc = $seq->location_from_column(5);
+
+
+=head2 column_from_residue_number
+
+ Title   : column_from_residue_number
+ Usage   : $col = $ali->column_from_residue_number( $seqname, $resnumber)
+ Function:
+
+           This function gives the position in the alignment
+           (i.e. column number) of the given residue number in the
+           sequence with the given name. For example, for the
+           alignment
+
+  	     Seq1/91-97 AC..DEF.GH
+  	     Seq2/24-30 ACGG.RTY..
+  	     Seq3/43-51 AC.DDEFGHI
+
+           column_from_residue_number( "Seq1", 94 ) returns 5.
+           column_from_residue_number( "Seq2", 25 ) returns 2.
+           column_from_residue_number( "Seq3", 50 ) returns 9.
+
+           An exception is thrown if the residue number would lie
+           outside the length of the aligment
+           (e.g. column_from_residue_number( "Seq2", 22 )
+
+	  Note: If the the parent sequence is represented by more than
+	  one alignment sequence and the residue number is present in
+	  them, this method finds only the first one.
+
+ Returns : A column number for the position in the alignment of the
+           given residue in the given sequence (1 = first column)
+ Args    : A sequence id/name (not a name/start-end)
+           A residue number in the whole sequence (not just that
+           segment of it in the alignment)
+
+=cut
+
+sub column_from_residue_number {
+    my ($self, $name, $resnumber) = @_;
+
+    $self->throw("No sequence with name [$name]") unless $self->{'_start_end_lists'}->{$name};
+    $self->throw("Second argument residue number missing") unless $resnumber;
+
+    foreach my $seq ($self->each_seq_with_id($name)) {
+	my $col;
+	eval {
+	    $col = $seq->column_from_residue_number($resnumber);
+	};
+	next if $@;		
+	return $col;
+    }
+
+    $self->throw("Could not find a sequence segment in $name ".
+		 "containing residue number $resnumber");
+
+}
+
+=head1 Sequence names
+
+Methods to manipulate the display name. The default name based on the
+sequence id and subsequence positions can be overridden in various
+ways.
+
+=head2 displayname
+
+ Title     : displayname
+ Usage     : $myalign->displayname("Ig", "IgA")
+ Function  : Gets/sets the display name of a sequence in the alignment
+           :
+ Returns   : A display name string
+ Argument  : name of the sequence
+             displayname of the sequence (optional)
+
+=cut
+
+sub get_displayname {
+    my $self = shift;
+    $self->warn(ref($self). "::get_displayname - deprecated method. Use displayname() instead.");
+    $self->displayname(@_);
+}
+
+sub set_displayname {
+    my $self = shift;
+    $self->warn(ref($self). "::set_displayname - deprecated method. Use displayname() instead.");
+    $self->displayname(@_);
+}
+
+sub displayname {
+    my ($self, $name, $disname) = @_;
+
+    $self->throw("No sequence with name [$name]") unless $self->{'_seq'}->{$name};
+
+    if(  $disname and  $name) {
+	$self->{'_dis_name'}->{$name} = $disname;
+	return $disname;
+    }
+    elsif( defined $self->{'_dis_name'}->{$name} ) {
+	return  $self->{'_dis_name'}->{$name};
+    } else {
+	return $name;
+    }
+}
+
+=head2 set_displayname_count
+
+ Title     : set_displayname_count
+ Usage     : $ali->set_displayname_count
+ Function  : 
+
+             Sets the names to be name_# where # is the number of
+             times this name has been used.
+
+ Returns   : 
+ Argument  : 
+
+=cut
+
+sub set_displayname_count {
+    my $self= shift;
+    my (@arr,$name,$seq,$count,$temp,$nse);
+
+    foreach $seq ( $self->each_alphabetically() ) {
+	$nse = $seq->get_nse();
+
+	#name will be set when this is the second
+	#time (or greater) is has been seen
+
+	if( defined $name and $name eq ($seq->id()) ) {
+	    $temp = sprintf("%s_%s",$name,$count);
+	    $self->displayname($nse,$temp);
+	    $count++;
+	} else {
+	    $count = 1;
+	    $name = $seq->id();
+	    $temp = sprintf("%s_%s",$name,$count);
+	    $self->displayname($nse,$temp);
+	    $count++;
+	}
+    }
+    return 1;
+}
+
+=head2 set_displayname_flat
+
+ Title     : set_displayname_flat
+ Usage     : $ali->set_displayname_flat()
+ Function  : Makes all the sequences be displayed as just their name,
+             not name/start-end
+ Returns   : 1
+ Argument  : 
+
+=cut
+
+sub set_displayname_flat {
+    my $self = shift;
+    my ($nse,$seq);
+
+    foreach $seq ( $self->each_seq() ) {
+	$nse = $seq->get_nse();
+	$self->displayname($nse,$seq->id());
+    }
+    return 1;
+}
+
+=head2 set_displayname_normal
+
+ Title     : set_displayname_normal
+ Usage     : $ali->set_displayname_normal()
+ Function  : Makes all the sequences be displayed as name/start-end
+ Returns   : 
+ Argument  : 
+
+=cut
+
+sub set_displayname_normal {
+    my $self = shift;
+    my ($nse,$seq);
+
+    foreach $seq ( $self->each_seq() ) {
+	$nse = $seq->get_nse();
+	$self->displayname($nse,$nse);
+    }
+    return 1;
+}
+
+=head2 source
+
+ Title   : source
+ Usage   : $obj->source($newval)
+ Function: sets the Alignment source program
+ Example : 
+ Returns : value of source
+ Args    : newvalue (optional)
+
+
+=cut
+
+sub source{
+   my ($self,$value) = @_;
+   if( defined $value) {
+      $self->{'_source'} = $value;
+    }
+    return $self->{'_source'};
+}
+
+1;
author	mahtabm
date	Thu, 11 Apr 2013 06:29:17 -0400
parents
children