Mercurial > repos > mahtabm > ensembl

diff variant_effect_predictor/Bio/Assembly/Contig.pm @ 0:1f6dce3d34e0
Uploaded
author: mahtabm
date: Thu, 11 Apr 2013 02:01:53 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/Assembly/Contig.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,2091 @@
+# $Id: Contig.pm,v 1.1 2002/11/04 11:50:11 heikki Exp $
+#
+# BioPerl module for Bio::Assembly::Contig
+#   Mostly based on Bio::SimpleAlign by Ewan Birney
+#
+# Cared for by Robson francisco de Souza <rfsouza@citri.iq.usp.br>
+#
+# Copyright Robson Francisco de Souza
+#
+# You may distribute this module under the same terms as perl itself
+
+# POD documentation - main docs before the code
+
+=head1 NAME
+
+Bio::Assembly::Contig - Perl module to hold and manipulate
+                     sequence assembly contigs.
+
+=head1 SYNOPSYS
+
+    # Module loading
+    use Bio::Assembly::IO;
+
+    # Assembly loading methods
+    $aio = new Bio::Assembly::IO(-file=>"test.ace.1",
+                               -format=>'phrap');
+
+    $assembly = $aio->next_assembly;
+    foreach $contig ($assembly->all_contigs) {
+      # do something
+    }
+
+    # OR, if you want to build the contig yourself,
+
+    use Bio::Assembly::Contig;
+    $c = Bio::Assembly::Contig->new(-id=>"1"); 
+
+    $ls  = Bio::LocatableSeq->new(-seq=>"ACCG-T",
+				  -id=>"r1",
+				  -alphabet=>'dna');
+    $ls2 = Bio::LocatableSeq->new(-seq=>"ACA-CG-T",
+				  -id=>"r2",
+				  -alphabet=>'dna');
+
+    $ls_coord = Bio::SeqFeature::Generic->new(-start=>3,
+	   				      -end=>8,
+                                              -strand=>1);
+    $ls2_coord = Bio::SeqFeature::Generic->new(-start=>1,
+					       -end=>8,
+					       -strand=>1);
+    $c->add_seq($ls);
+    $c->add_seq($ls2);
+    $c->set_seq_coord($ls_coord,$ls);
+    $c->set_seq_coord($ls2_coord,$ls2);
+
+    $con = Bio::LocatableSeq->new(-seq=>"ACACCG-T",
+				  -alphabet=>'dna');
+    $c->set_consensus_sequence($con);
+
+    $l = $c->change_coord('unaligned r2','ungapped consensus',6);
+    print "6 in unaligned r2 => $l in ungapped consensus\n";
+
+
+=head1 DESCRIPTION
+
+A contig is as a set of sequences, locally aligned to each other, so
+that every sequence has overlapping regions with at least one sequence
+in the contig, such that a continuous of overlapping sequences is
+formed, allowing the deduction of a consensus sequence which may be
+longer than any of the sequences from which it was deduced.
+
+In this documentation we refer to the overlapping sequences used to
+build the contig as "aligned sequences" and to the sequence deduced
+from the overlap of aligned sequences as the "consensus". Methods to
+deduce the consensus sequence from aligned sequences were not yet
+implemented in this module, but its posssible to add a consensus
+sequence deduced by other means, e.g, by the assembly program used to
+build the alignment.
+
+All aligned sequences in a Bio::Assembly::Contig must be Bio::Assembly::Locatable
+objects and have a unique ID. The unique ID restriction is due to the
+nature of the module's internal data structures and is also a request
+of some assembly programs. If two sequences with the same ID are added
+to a contig, the first sequence added is replaced by the second one.
+
+=head2 Coordinate_systems
+
+There are four base coordinate systems in Bio::Assembly::Contig.  When
+you need to access contig elements or data that exists on a certain
+range or location, you may be specifying coordinates in relation to
+different sequences, which may be either the contig consensus or one
+of the aligned sequences that were used to do the assembly.
+
+ =========================================================
+          Name           | Referenced sequence
+ ---------------------------------------------------------
+   "gapped consensus"    | Contig (with gaps)
+   "ungapped consensus"  | Contig (without gaps)
+   "aligned $seqID"      | sequence $seqID (with gaps)
+   "unaligned $seqID"    | sequence $seqID (without gaps)
+ =========================================================
+
+"gapped consensus" refers to positions in the aligned consensus
+sequence, which is the consensus sequence including the gaps inserted
+to align it agains the aligned sequences that were used to assemble
+the contig. So, its limits are [ 1, (consensus length + number of gaps
+in consensus) ]
+
+"ungapped consensus" is a coordinate system based on the consensus
+sequence, but excluding consensus gaps. This is just the coordinate
+system that you have when considering the consensus sequence alone,
+instead of aligned to other sequences.
+
+"aligned $seqID" refers to locations in the sequence $seqID after
+alignment of $seqID against the consensus sequence (reverse
+complementing the original sequence, if needed).  Coordinate 1 in
+"aligned $seqID" is equivalent to the start location (first base) of
+$seqID in the consensus sequence, just like if the aligned sequence
+$seqID was a feature of the consensus sequence.
+
+"unaligned $seqID" is equivalent to a location in the isolated
+sequence, just like you would have when considering the sequence
+alone, out of an alignment.  When changing coordinates from "aligned
+$seq2" to "unaligned $seq2", if $seq2 was reverse complemented when
+included in the alignment, the output coordinates will be reversed to
+fit that fact, i.e. 1 will be changed to length($seq2), 2 will be
+length($seq)-1 and so on.
+
+An important note: when you change gap coordinates from a gapped
+system ("gapped consensus" or "aligned $seqID") to a system that does
+not include gaps ("ungapped consensus" or "unaligned $seqID"), the
+position returned will be the first location before all gaps
+neighboring the input location.
+
+=head2 Feature_collection
+
+Bio::Assembly::Contig stores much information about a contig in a
+Bio::Assembly::SeqFeature::Collection object. Relevant information on the
+alignment is accessed by selecting features based on their primary
+tags (e.g. all features which have a primary tag of the form
+'_aligned_coord:$seqID', where $seqID is an aligned sequence ID, are
+coordinates for sequences in the contig alignment) and, by using
+methods from Bio::Assembly::SeqFeature::Collection, it's possible to select
+features by overlap with other features.
+
+We suggest that you use the primary tags of features as identifiers
+for feature classes. By convention, features with primary tags
+starting with a '_' are generated by modules that populate the contig
+data structure and return the contig object, maybe as part of an
+assembly object, e.g.  drivers from the Bio::Assembly::IO set.
+
+Features in the features collection may be associated with particular
+aligned sequences. To obtain this, you must attach the sequence to the
+feature, using attach() seq from Bio::Assembly::SeqFeatureI, before you add the
+feature to the feature collection. We also suggest to add the sequence
+id to the primary tag, so that is easy to select feature for a
+particular sequence.
+
+There is only one feature class that some methods in
+Bio::Assembly::Contig expect to find in the feature collection: features
+with primary tags of the form '_aligned_coord:$seqID', where $seqID is
+the aligned sequence id (like returned by $seq-E<gt>id()). These features
+describe the position (in "gapped consensus" coordinates) of aligned
+sequences, and the method set_seq_coord() automatically changes a
+feature's primary tag to this form whenever the feature is added to
+the collection by this method. Only two methods in Bio::Assembly::Contig
+will not work unless there are features from this class:
+change_coord() and get_seq_coord().
+
+Other feature classes will be automatically available only when
+Bio::Assembly::Contig objects are created by a specific module. Such
+feature classes are (or should be) documented in the documentation of
+the module which create them, to which the user should refer.
+
+=head1 FEEDBACK
+
+=head2 Mailing Lists
+
+User feedback is an integral part of the evolution of this and other
+Bioperl modules. Send your comments and suggestions preferably to the
+Bioperl mailing lists  Your participation is much appreciated.
+
+  bioperl-l@bioperl.org                 - General discussion
+  http://bio.perl.org/MailList.html     - About the mailing lists
+
+=head2 Reporting Bugs
+
+Report bugs to the Bioperl bug tracking system to help us keep track
+the bugs and their resolution.  Bug reports can be submitted via email
+or the web:
+
+  bioperl-bugs@bio.perl.org
+  http://bugzilla.bioperl.org/
+
+=head1 AUTHOR - Robson Francisco de Souza
+
+rfsouza@citri.iq.usp.br
+
+=head1 APPENDIX
+
+The rest of the documentation details each of the object
+methods. Internal methods are usually preceded with a _
+
+=cut
+
+#'
+package Bio::Assembly::Contig;
+
+use strict;
+use vars qw(@ISA $VERSION);
+
+use Bio::Root::Root;
+use Bio::Align::AlignI;
+use Bio::SeqFeature::Collection;
+use Bio::Seq::PrimaryQual;
+
+@ISA = qw(Bio::Root::Root Bio::Align::AlignI);
+
+=head1 Object creator
+
+=head2 new
+
+ Title     : new
+ Usage     : my $contig = new Bio::Assembly::Contig();
+ Function  : Creates a new contig object
+ Returns   : Bio::Assembly::Contig
+ Args      : -source => string representing the source
+                        program where this contig came
+                        from
+             -id => contig unique ID
+
+=cut
+
+#-----------
+sub new {
+#-----------
+    my ($class,@args) = @_;
+
+    my $self = $class->SUPER::new(@args);
+
+    my ($src, $id) = $self->_rearrange([qw(SOURCE ID)], @args);
+    $src && $self->source($src);
+    ($id && $self->id($id)) || ($self->{'_id'} = 'NoName'); # Alignment (contig) name
+    ($id && $self->id($id)) || ($self->{'_source'} = 'Unknown'); # Program used to build the contig
+    # we need to set up internal hashes first!
+
+    # Bio::SimpleAlign derived fields (check which ones are needed for AlignI compatibility)
+    $self->{'_elem'} = {}; # contig elements: aligned sequence objects (keyed by ID)
+    $self->{'_order'} = {}; # store sequence order
+#    $self->{'start_end_lists'} = {}; # References to entries in {'_seq'}. Keyed by seq ids.
+#    $self->{'_dis_name'} = {}; # Display names for each sequence
+    $self->{'_symbols'} = {}; # List of symbols
+
+    #Contig specific slots
+    $self->{'_consensus_sequence'} = undef;
+    $self->{'_consensus_quality'} = undef;
+    $self->{'_nof_residues'} = 0;
+    $self->{'_nof_seqs'} = 0;
+#    $self->{'_nof_segments'} = 0; # Let's not make it heavier than needed by now...
+    $self->{'_sfc'} = Bio::SeqFeature::Collection->new();
+
+    # Assembly specifcs
+    $self->{'_assembly'} = undef; # Reference to a Bio::Assembly::Scaffold object, if contig belongs to one.
+    $self->{'_strand'} = 0; # Reverse (-1) or forward (1), if contig is in a scaffold. 0 otherwise
+    $self->{'_neighbor_start'} = undef; # Will hold a reference to another contig
+    $self->{'_neighbor_end'} = undef; # Will hold a reference to another contig
+
+    return $self; # success - we hope!
+}
+
+=head1 Assembly related methods
+
+These methods exist to enable adding information about possible
+relations among contigs, e.g. when you already have a scaffold for
+your assembly, describing the ordering of contigs in the final
+assembly, but no sequences covering the gaps between neighboring
+contigs.
+
+=head2 source
+
+ Title     : source
+ Usage     : $contig->source($program);
+ Function  : Get/Set program used to build this contig
+ Returns   : string
+ Argument  : [optional] string
+
+=cut
+
+sub source {
+    my $self = shift;
+    my $source = shift;
+
+    $self->{'_source'} = $source if (defined $source);
+    return $self->{'_source'};
+}
+
+=head2 assembly
+
+ Title     : assembly
+ Usage     : $contig->assembly($assembly);
+ Function  : Get/Set assembly object for this contig
+ Returns   : a Bio::Assembly::Scaffold object
+ Argument  : a Bio::Assembly::Scaffold object
+
+=cut
+
+sub assembly {
+    my $self = shift;
+    my $assembly = shift;
+
+    $self->throw("Using non Bio::Assembly::Scaffold object when assign contig to assembly")
+	if (defined $assembly && ! $assembly->isa("Bio::Assembly::Scaffold"));
+
+    $self->{'_assembly'} = $assembly if (defined $assembly);
+    return $self->{'_assembly'};
+}
+
+=head2 strand
+
+ Title     : strand
+ Usage     : $contig->strand($num);
+ Function  : Get/Set contig orientation in a scaffold/assembly.
+             Its equivalent to the strand property of sequence
+             objects and sets whether the contig consensus should
+             be reversed and complemented before being added to a
+             scaffold or assembly.
+ Returns   : integer
+ Argument  : 1 if orientaion is forward, -1 if reverse and
+             0 if none
+
+=cut
+
+sub strand {
+    my $self = shift;
+    my $ori = shift;
+
+    $self->throw("Contig strand must be either 1, -1 or 0")
+	unless (defined $ori && ($ori == 1 || $ori == 0 || $ori == -1));
+
+    $self->{'_strand'} = $ori;
+    return $self->{'_strand'};
+}
+
+=head2 upstream_neighbor
+
+ Title     : upstream_neighbor
+ Usage     : $contig->upstream_neighbor($contig);
+ Function  : Get/Set a contig neighbor for the current contig when
+             building a scaffold. The upstream neighbor is
+             located before $contig first base 
+ Returns   : nothing
+ Argument  : Bio::Assembly::Contig
+
+=cut
+
+sub upstream_neighbor {
+    my $self = shift;
+    my $ref = shift;
+
+    $self->throw("Trying to assign a non Bio::Assembly::Contig object to upstream contig")
+	if (defined $ref && ! $ref->isa("Bio::Assembly::Contig"));
+
+    $self->{'_neighbor_start'} = $ref if (defined $ref);
+    return $self->{'_neighbor_start'};
+}
+
+=head2 downstream_neighbor
+
+ Title     : downstream_neighbor
+ Usage     : $contig->downstream_neighbor($num);
+ Function  : Get/Set a contig neighbor for the current contig when
+             building a scaffold. The downstream neighbor is
+             located after $contig last base
+ Returns   : nothing
+ Argument  : Bio::Assembly::Contig
+
+=cut
+
+sub downstream_neighbor {
+    my $self = shift;
+    my $ref = shift;
+
+    $self->throw("Trying to assign a non Bio::Assembly::Contig object to downstream contig")
+	if (defined $ref && ! $ref->isa("Bio::Assembly::Contig"));
+    $self->{'_neighbor_end'} = $ref if (defined $ref);
+    return $self->{'_neighbor_end'};
+}
+
+=head1 Contig feature collection methods
+
+=head2 add_features
+
+ Title     : add_features
+ Usage     : $contig->add_features($feat,$flag)
+ Function  : 
+
+             Add an array of features to the contig feature
+             collection. The consensus sequence may be attached to the
+             added feature, if $flag is set to 1. If $flag is 0 and
+             the feature attached to one of the contig aligned
+             sequences, the feature is registered as an aligned
+             sequence feature. If $flag is 0 and the feature is not
+             attched to any sequence in the contig, the feature is
+             simply added to the feature collection and no attachment
+             or registration is made.
+
+             Note: You must attach aligned sequences to their features
+             prior to calling add_features, otherwise you won't be
+             able to access the feature through get_seq_feat_by_tag()
+             method.
+
+ Returns   : number of features added.
+ Argument  : 
+             $feat : A reference to an array of Bio::SeqFeatureI
+             $flag : boolean - true if consensus sequence object
+                     should be attached to this feature, false if
+                     no consensus attachment should be made.
+                     Default: false.
+
+=cut
+
+sub add_features {
+    my ($self, $args, $flag) = @_;
+
+    # Adding shortcuts for aligned sequence features
+    $flag = 0 unless (defined $flag);
+    if ($flag && defined $self->{'_consensus_sequence'}) {
+	foreach my $feat (@$args) {
+	    next if (defined $feat->seq);
+	    $feat->attach_seq($self->{'_consensus_sequence'});
+	}
+    } elsif (!$flag) { # Register aligned sequence features
+	foreach my $feat (@$args) {
+	    if (my $seq = $feat->entire_seq()) {
+		my $seqID = $seq->id() || $seq->display_id || $seq->primary_id;
+		$self->warn("Adding contig feature attached to unknown sequence $seqID!")
+		    unless (exists $self->{'_elem'}{$seqID});
+		my $tag = $feat->primary_tag;
+		$self->{'_elem'}{$seqID}{'_feat'}{$tag} = $feat;
+	    }
+	}
+    }
+
+    # Add feature to feature collection
+    my $nof_added = $self->{'_sfc'}->add_features($args);
+
+    return $nof_added;
+}
+
+=head2 remove_features
+
+ Title     : remove_features
+ Usage     : $contig->remove_features(@feat)
+ Function  : Remove an array of contig features
+ Returns   : number of features removed.
+ Argument  : An array of Bio::SeqFeatureI
+
+=cut
+
+sub remove_features {
+    my ($self, @args) = @_;
+
+    # Removing shortcuts for aligned sequence features
+    foreach my $feat (@args) {
+	if (my $seq = $feat->entire_seq()) {
+	    my $seqID = $seq->id() || $seq->display_id || $seq->primary_id;
+	    my $tag = $feat->primary_tag;
+	    $tag =~ s/:$seqID$/$1/g;
+	    delete( $self->{'_elem'}{$seqID}{'_feat'}{$tag} )
+		if (exists $self->{'_elem'}{$seqID}{'_feat'}{$tag} &&
+		    $self->{'_elem'}{$seqID}{'_feat'}{$tag} eq $feat);
+	}
+    }
+
+    return $self->{'_sfc'}->remove_features(\@args);
+}
+
+=head2 get_features_collection
+
+ Title     : get_features_collection
+ Usage     : $contig->get_features_collection()
+ Function  : Get the collection of all contig features
+ Returns   : Bio::SeqFeature::Collection
+ Argument  : none
+
+=cut
+
+sub get_features_collection {
+    my $self = shift;
+
+    return $self->{'_sfc'};
+}
+
+=head1 Coordinate system's related methods
+
+See L<Coordinate_Systems> above.
+
+=head2 change_coord
+
+ Title     : change_coord
+ Usage     : $contig->change_coord($in,$out,$query)
+ Function  : 
+
+             Change coordinate system for $query.  This method
+             transforms locations between coordinate systems described
+             in section "Coordinate Systems" of this document.
+
+             Note: this method will throw an exception when changing
+             coordinates between "ungapped consensus" and other
+             systems if consensus sequence was not set. It will also
+             throw exceptions when changing coordinates among aligned
+             sequence, either with or without gaps, and other systems
+             if sequence locations were not set with set_seq_coord().
+
+ Returns   : integer
+ Argument  : 
+             $in    : [string]  input coordinate system
+             $out   : [string]  output coordinate system
+             $query : [integer] a position in a sequence
+
+=cut
+
+sub change_coord {
+    my $self     = shift;
+    my $type_in  = shift;
+    my $type_out = shift;
+    my $query    = shift;
+
+    # Parsing arguments
+    # Loading read objects (these calls will throw exceptions whether $read_in or 
+    # $read_out is not found
+    my ($read_in,$read_out) = (undef,undef);
+    my $in_ID  = ( split(' ',$type_in)  )[1];
+    my $out_ID = ( split(' ',$type_out) )[1];
+
+    if ($in_ID  ne 'consensus') {
+	$read_in  = $self->get_seq_coord( $self->get_seq_by_name($in_ID)  );
+	$self->throw("Can't change coordinates without sequence location for $in_ID") 
+	    unless (defined $read_in);
+    }
+    if ($out_ID ne 'consensus') {
+	$read_out = $self->get_seq_coord( $self->get_seq_by_name($out_ID) );
+	$self->throw("Can't change coordinates without sequence location for $out_ID")
+	    unless (defined $read_out);
+    }
+
+    # Performing transformation between coordinates
+  SWITCH1: {
+
+      # Transformations between contig padded and contig unpadded
+      (($type_in eq 'gapped consensus') && ($type_out eq 'ungapped consensus')) && do {
+	  $self->throw("Can't use ungapped consensus coordinates without a consensus sequence")
+	      unless (defined $self->{'_consensus_sequence'});
+	  $query = &_padded_unpadded($self->{'_consensus_gaps'}, $query);
+	  last SWITCH1;
+      };
+      (($type_in eq 'ungapped consensus') && ($type_out eq 'gapped consensus')) && do {
+	  $self->throw("Can't use ungapped consensus coordinates without a consensus sequence")
+	      unless (defined $self->{'_consensus_sequence'});
+	  $query = &_unpadded_padded($self->{'_consensus_gaps'},$query);
+	  last SWITCH1;
+      };
+
+      # Transformations between contig (padded) and read (padded)
+      (($type_in  eq 'gapped consensus') &&
+       ($type_out =~ /^aligned /) && defined($read_out)) && do {
+	   $query = $query - $read_out->start() + 1;
+	   last SWITCH1;
+       };
+      (($type_in =~ /^aligned /) && defined($read_in) &&
+       ($type_out  eq 'gapped consensus')) && do {
+	   $query = $query + $read_in->start() - 1;
+	   last SWITCH1;
+       };
+
+      # Transformations between contig (unpadded) and read (padded)
+      (($type_in eq 'ungapped consensus') &&
+       ($type_out =~ /^aligned /) && defined($read_out)) && do {
+	   $query = $self->change_coord('ungapped consensus','gapped consensus',$query);
+	   $query = $self->change_coord('gapped consensus',"aligned $out_ID",$query);
+	   last SWITCH1;
+       };
+      (($type_in =~ /^aligned /) && defined($read_in) && 
+       ($type_out eq 'ungapped consensus')) && do {
+	   $query = $self->change_coord("aligned $in_ID",'gapped consensus',$query);
+	   $query = $self->change_coord('gapped consensus','ungapped consensus',$query);
+	   last SWITCH1;
+       };
+
+      # Transformations between seq $read_in padded and seq $read_out padded
+      (defined($read_in)  && ($type_in  =~ /^aligned /)  &&
+       defined($read_out) && ($type_out =~ /^aligned /)) && do {
+	   $query = $self->change_coord("aligned $in_ID",'gapped consensus',$query);
+	   $query = $self->change_coord('gapped consensus',"aligned $out_ID",$query);
+	   last SWITCH1;
+       };
+
+      # Transformations between seq $read_in padded and seq $read_out unpadded
+      (defined($read_in)  && ($type_in  =~ /^aligned /)    &&
+       defined($read_out) && ($type_out =~ /^unaligned /)) && do {
+	   if ($read_in ne $read_out) {
+	       $query = $self->change_coord("aligned $in_ID",'gapped consensus',$query);
+	       $query = $self->change_coord('gapped consensus',"aligned $out_ID",$query);
+	   }
+	   my $list_out = $self->{'_elem'}{$out_ID}{'_gaps'};
+	   $query = &_padded_unpadded($list_out,$query);
+	   # Changing read orientation if read was reverse complemented when aligned
+	   if ($read_out->strand == -1) {
+	       my ($length) = $read_out->length();
+	       $length = $length - &_nof_gaps($list_out,$length);
+	       $query  = $length - $query + 1;
+	   }
+	   last SWITCH1;
+       };
+      (defined($read_in)  && ($type_in  =~ /^unaligned /) && 
+       defined($read_out) && ($type_out =~ /^aligned /))  && do {
+	   my $list_in = $self->{'_elem'}{$in_ID}{'_gaps'};
+	   # Changing read orientation if read was reverse complemented when aligned
+	   if ($read_in->strand == -1) {
+	       my ($length) = $read_in->length();
+	       $length = $length - &_nof_gaps($list_in,$length);
+	       $query  = $length - $query + 1;
+	   }
+	   $query = &_unpadded_padded($list_in,$query);
+	   if ($read_in ne $read_out) {
+	       $query = $self->change_coord("aligned $in_ID",'gapped consensus',$query);
+	       $query = $self->change_coord('gapped consensus',"aligned $out_ID",$query);
+	   }
+	   last SWITCH1;
+       };
+
+      # Transformations between seq $read_in unpadded and seq $read_out unpadded
+      (defined($read_in)  && ($type_in  =~ /^unaligned /)    &&
+       defined($read_out) && ($type_out =~ /^unaligned /)) && do {
+	   $query = $self->change_coord("unaligned $in_ID","aligned $out_ID",$query);
+	   $query = $self->change_coord("aligned $out_ID","unaligned $out_ID",$query);
+	   last SWITCH1;
+       };
+
+      # Transformations between contig (padded) and read (unpadded)
+      (($type_in eq 'gapped consensus') &&
+       ($type_out =~ /^unaligned /) && defined($read_out)) && do {
+	   $query = $self->change_coord('gapped consensus',"aligned $out_ID",$query);
+	   $query = $self->change_coord("aligned $out_ID","unaligned $out_ID",$query);
+	   last SWITCH1;
+       };
+      (($type_in =~ /^unaligned /) && defined($read_in) &&
+       ($type_out eq 'gapped consensus')) && do {
+	   $query = $self->change_coord("unaligned $in_ID","aligned $in_ID",$query);
+	   $query = $self->change_coord("aligned $in_ID",'gapped consensus',$query);
+	   last SWITCH1;
+       };
+
+      # Transformations between contig (unpadded) and read (unpadded)
+      (($type_in eq 'ungapped consensus') &&
+       ($type_out =~ /^unaligned /) && defined($read_out)) && do {
+	   $query = $self->change_coord('ungapped consensus','gapped consensus',$query);
+	   $query = $self->change_coord('gapped consensus',"unaligned $out_ID",$query);
+	   last SWITCH1;
+       };
+      (($type_in =~ /^unaligned /) && defined($read_in) &&
+       ($type_out eq 'ungapped consensus')) && do {
+	   $query = $self->change_coord("unaligned $in_ID",'gapped consensus',$query);
+	   $query = $self->change_coord('gapped consensus','ungapped consensus',$query);
+	   last SWITCH1;
+       };
+
+      $self->throw("Unknow coordinate system. Args: $type_in, $type_out.");
+      $query = undef; # If a coordinate systems just requested is unknown
+  }
+
+    return $query;
+}
+
+=head2 get_seq_coord
+
+ Title     : get_seq_coord
+ Usage     : $contig->get_seq_coord($seq);
+ Function  : Get "gapped consensus" location for aligned sequence
+ Returns   : Bio::SeqFeature::Generic for coordinates or undef.
+             A warning is printed if sequence coordinates were not set.
+ Argument  : Bio::LocatabaleSeq object
+
+=cut
+
+sub get_seq_coord {
+    my ($self,$seq) = @_;
+
+    if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
+	$self->throw("$seq is not a Bio::LocatableSeq");
+    }
+    my $seqID = $seq->id() || $seq->display_id || $seq->primary_id;
+
+    unless (exists( $self->{'_elem'}{$seqID} )) {
+	$self->warn("No such sequence ($seqID) in contig ".$self->id);
+	return undef;
+    }
+    unless (exists( $self->{'_elem'}{$seqID}{'_feat'}{"_aligned_coord:$seqID"} )) {
+	$self->warn("Location not set for sequence ($seqID) in contig ".$self->id);
+	return undef;
+    }
+
+    return $self->{'_elem'}{$seqID}{'_feat'}{"_aligned_coord:$seqID"};
+}
+
+=head2 set_seq_coord
+
+ Title     : set_seq_coord
+ Usage     : $contig->set_seq_coord($feat,$seq);
+ Function  : 
+
+             Set "gapped consensus" location for an aligned
+             sequence. If the sequence was previously added using
+             add_seq, its coordinates are changed/set.  Otherwise,
+             add_seq is called and the sequence is added to the
+             contig.
+
+ Returns   : Bio::SeqFeature::Generic for old coordinates or undef.
+ Argument  : 
+             $feat  : a Bio::SeqFeature::Generic object
+                      representing a location for the
+                      aligned sequence, in "gapped 
+                      consensus" coordinates.
+
+             Note: the original feature primary tag will
+                   be lost.
+
+             $seq   : a Bio::LocatabaleSeq object
+
+=cut
+
+sub set_seq_coord {
+    my ($self,$feat,$seq) = @_;
+
+    if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
+	$self->throw("Unable to process non locatable sequences [".ref($seq)."]");
+    }
+
+    # Complaining about inadequate feature object
+     $self->throw("Coordinates must be a Bio::SeqFeature::Generic object!")
+	unless ( $feat->isa("Bio::SeqFeature::Generic") );
+    $self->throw("Sequence coordinates must have an end!")
+	unless (defined $feat->end);
+    $self->throw("Sequence coordinates must have a start!")
+	unless (defined $feat->start);
+
+    my $seqID = $seq->id() || $seq->display_id || $seq->primary_id;
+    if (exists( $self->{'_elem'}{$seqID} ) &&
+	exists( $self->{'_elem'}{$seqID}{'_seq'} ) &&
+	defined( $self->{'_elem'}{$seqID}{'_seq'} ) &&
+	($seq ne $self->{'_elem'}{$seqID}{'_seq'}) ) {
+	$self->warn("Replacing sequence $seqID\n");
+	$self->remove_seq($self->{'_elem'}{$seqID}{'_seq'});
+    }
+    $self->add_seq($seq);
+
+    # Remove previous coordinates, if any
+    $self->remove_features($feat);
+
+    # Add new Bio::Generic::SeqFeature
+    $feat->add_tag_value('contig',$self->id)
+	unless ( $feat->has_tag('contig') );
+    $feat->primary_tag("_aligned_coord:$seqID");
+    $feat->attach_seq($seq);
+    $self->{'_elem'}{$seqID}{'_feat'}{"_aligned_coord:$seqID"} = $feat;
+    $self->add_features([ $feat ]);
+}
+
+=head1 Bio::Assembly::Contig consensus methods
+
+=head2 set_consensus_sequence
+
+ Title     : set_consensus_sequence
+ Usage     : $contig->set_consensus_sequence($seq)
+ Function  : Set the consensus sequence object for this contig
+ Returns   : consensus length
+ Argument  : Bio::LocatableSeq
+
+=cut
+
+sub set_consensus_sequence {
+    my $self = shift;
+    my $seq  = shift;
+
+    $self->throw("Consensus sequence must be a Bio::LocatableSeq!")
+	unless ($seq->isa("Bio::LocatableSeq"));
+
+    my $con_len = $seq->length;
+    $seq->start(1); $seq->end($con_len);
+
+    $self->{'_consensus_gaps'} = []; # Consensus Gap registry
+    $self->_register_gaps($seq->seq,
+			  $self->{'_consensus_gaps'});
+    $self->{'_consensus_sequence'} = $seq;
+
+    return $con_len;
+}
+
+=head2 set_consensus_quality
+
+ Title     : set_consensus_quality
+ Usage     : $contig->set_consensus_quality($qual)
+ Function  : Set the quality object for consensus sequence
+ Returns   : nothing
+ Argument  : Bio::Seq::QualI object
+
+=cut
+
+sub set_consensus_quality {
+    my $self = shift;
+    my $qual  = shift;
+
+    $self->throw("Consensus quality must be a Bio::Seq::QualI object!")
+	unless ( $qual->isa("Bio::Seq::QualI") );
+
+    $self->throw("Consensus quality can't be added before you set the consensus sequence!")
+	unless (defined $self->{'_consensus_sequence'});
+
+    $self->{'_consensus_quality'} = $qual;
+}
+
+=head2 get_consensus_length
+
+ Title     : get_consensus_length
+ Usage     : $contig->get_consensus_length()
+ Function  : Get consensus sequence length
+ Returns   : integer
+ Argument  : none
+
+=cut
+
+sub get_consensus_length {
+    my $self = shift;
+
+    return $self->{'_consensus_sequence'}->length();
+}
+
+=head2 get_consensus_sequence
+
+ Title     : get_consensus_sequence
+ Usage     : $contig->get_consensus_sequence()
+ Function  : Get a reference to the consensus sequence object
+             for this contig
+ Returns   : Bio::SeqI object
+ Argument  : none
+
+=cut
+
+sub get_consensus_sequence {
+    my ($self, @args) = @_;
+
+    return $self->{'_consensus_sequence'};
+}
+
+=head2 get_consensus_quality
+
+ Title     : get_consensus_quality
+ Usage     : $contig->get_consensus_quality()
+ Function  : Get a reference to the consensus quality object
+             for this contig.
+ Returns   : A Bio::QualI object
+ Argument  : none
+
+=cut
+
+sub get_consensus_quality {
+    my ($self, @args) = @_;
+
+    return $self->{'_consensus_quality'};
+}
+
+=head1 Bio::Assembly::Contig aligned sequences methods
+
+=head2 set_seq_qual
+
+ Title     : set_seq_qual
+ Usage     : $contig->set_seq_qual($seq,$qual);
+ Function  : Adds quality to an aligned sequence.
+ Returns   : nothing
+ Argument  : a Bio::LocatableSeq object and
+             a Bio::Seq::QualI object
+
+See L<Bio::LocatableSeq> for more information.
+
+=cut
+
+sub set_seq_qual {
+    my ($self,$seq,$qual) = @_;
+
+    if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
+	$self->throw("Unable to process non locatable sequences [", ref($seq), "]");
+    }
+    my $seqID = $seq->id() || $seq->display_id || $seq->primary_id;
+
+    $self->throw("Consensus quality must be a Bio::Seq::QualI object!")
+	unless ( $qual->isa("Bio::Seq::QualI") );
+    $self->throw("Use add_seq first: aligned sequence qualities can't be added before you load the sequence!")
+	unless (exists $self->{'_elem'}{$seqID}{'_seq'});
+    $self->throw("Use set_seq_coord first: aligned sequence qualities can't be added before you add coordinates for the sequence!") unless (defined( $self->get_seq_coord($seq) ));
+
+    # Adding gaps to quality object
+    my $sequence = $self->{'_elem'}{$seqID}{'_seq'}->seq();
+    my $tmp = $qual->qual();
+    @{$tmp} = reverse(@{$tmp}) if ($self->get_seq_coord($seq)->strand() == -1);
+    my @quality  = ();
+    my $previous = 0;
+    my $next     = 0;
+    my $i = 0; my $j = 0;
+    while ($i<=$#{$tmp}) {
+	# IF base is a gap, quality is the average for neighbouring sites
+	if (substr($sequence,$j,1) eq '-') {
+	    $previous = $tmp->[$i-1] unless ($i == 0);
+	    if ($i < $#{$tmp}) {
+		$next = $tmp->[$i+1];
+	    } else {
+		$next = 0;
+	    }
+	    push(@quality,int( ($previous+$next)/2 ));
+	} else {
+	    push(@quality,$tmp->[$i]);
+	    $i++;
+	}
+	$j++;
+    }
+
+    $self->{'_elem'}{$seqID}{'_qual'} = Bio::Seq::PrimaryQual->new(-qual=>join(" ",@quality),
+								   -id=>$seqID);
+}
+
+=head2 get_seq_ids
+
+ Title     : get_seq_ids
+ Usage     : $contig->get_seq_ids(-start=>$start,
+				  -end=>$end,
+				  -type=>"gapped A0QR67B08.b");
+ Function  : Get list of sequence IDs overlapping inteval [$start, $end]
+             The default interval is [1,$contig->length]
+             Default coordinate system is "gapped contig"
+ Returns   : An array
+ Argument  : A hash with optional elements:
+             -start : consensus subsequence start
+             -end   : consensus subsequence end
+             -type  : the coordinate system type for $start and $end arguments
+                      Coordinate system avaliable are:
+                      "gapped consensus"   : consensus coordinates with gaps
+                      "ungapped consensus" : consensus coordinates without gaps
+                      "aligned $ReadID"    : read $ReadID coordinates with gaps
+                      "unaligned $ReadID"  : read $ReadID coordinates without gaps
+
+
+=cut
+
+sub get_seq_ids {
+    my ($self, @args) = @_;
+
+    my ($type,$start,$end) = 
+	$self->_rearrange([qw(TYPE START END)], @args);
+
+    if (defined($start) && defined($end)) {
+	if (defined($type) && ($type ne 'gapped consensus')) {
+	    $start = $self->change_coord($type,'gapped consensus',$start);
+	    $end   = $self->change_coord($type,'gapped consensus',$end);
+	}
+
+	my @list = grep { $_->isa("Bio::SeqFeature::Generic") &&
+			      ($_->primary_tag =~ /^_aligned_coord:/) }
+	$self->{'_sfc'}->features_in_range(-start=>$start,
+					   -end=>$end,
+					   -contain=>0,
+					   -strandmatch=>'ignore');
+	@list = map { $_->entire_seq->id } @list;
+	return @list;
+    }
+
+    # Entire aligned sequences list
+    return map { $self->{'_order'}{$_} } sort { $a cmp $b } keys %{ $self->{'_order'} };
+}
+
+=head2 get_seq_feat_by_tag
+
+ Title     : get_seq_feat_by_tag
+ Usage     : $seq = $contig->get_seq_feat_by_tag($seq,"_aligned_coord:$seqID")
+ Function  : 
+
+             Get a sequence feature based on its primary_tag.
+             When you add 
+
+ Returns   : a Bio::SeqFeature object
+ Argument  : a Bio::LocatableSeq and a string (feature primary tag)
+
+=cut
+
+sub get_seq_feat_by_tag {
+    my ($self,$seq,$tag) = @_;
+
+    if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
+	$self->throw("Unable to process non locatable sequences [", ref($seq), "]");
+    }
+    my $seqID = $seq->id() || $seq->display_id || $seq->primary_id;
+
+    return $self->{'_elem'}{$seqID}{'_feat'}{$tag};
+}
+
+=head2 get_seq_by_name
+
+ Title     : get_seq_by_name
+ Usage     : $seq = $contig->get_seq_by_name('Seq1')
+ Function  : Gets a sequence based on its id.
+ Returns   : a Bio::LocatableSeq object
+             undef if name is not found
+ Argument  : string
+
+=cut
+
+sub get_seq_by_name {
+    my $self = shift;
+    my ($seqID) = @_;
+
+    unless (exists $self->{'_elem'}{$seqID}{'_seq'}) {
+	$self->throw("Could not find sequence $seqID in contig ".$self->id);
+	return undef;
+    }
+
+    return $self->{'_elem'}{$seqID}{'_seq'};
+}
+
+=head2 get_qual_by_name
+
+ Title     : get_qual_by_name
+ Usage     : $seq = $contig->get_qual_by_name('Seq1')
+ Function  :
+
+             Gets Bio::Seq::QualI object for a sequence
+             through its id ( as given by $qual->id() ).
+
+ Returns   : a Bio::Seq::QualI object.
+             undef if name is not found
+ Argument  : string
+
+=cut
+
+sub get_qual_by_name {
+    my $self = shift;
+    my ($seqID) = @_;
+
+    unless (exists $self->{'_elem'}{$seqID}{'_qual'}) {
+	$self->warn("Could not find quality for $seqID in contig!");
+	return undef;
+    }
+
+    return $self->{'_elem'}{$seqID}{'_qual'};
+}
+
+=head1 Bio::Align::AlignI compatible methods
+
+=head2 Modifier methods
+
+These methods modify the MSE by adding, removing or shuffling complete
+sequences.
+
+=head2 add_seq
+
+ Title     : add_seq
+ Usage     : $contig->add_seq($newseq);
+ Function  :
+
+             Adds a sequence to the contig. *Does* 
+             *not* align it - just adds it to the 
+             hashes.
+
+ Returns   : nothing
+ Argument  : a Bio::LocatableSeq object
+
+See L<Bio::LocatableSeq> for more information.
+
+=cut
+
+sub add_seq {
+    my $self = shift;
+    my $seq = shift;
+
+    if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
+	$self->throw("Unable to process non locatable sequences [", ref($seq), "]");
+    }
+
+    my $seqID = $seq->id() || $seq->display_id || $seq->primary_id;
+    $self->{'_elem'}{$seqID} = {} unless (exists $self->{'elem'}{$seqID});
+
+    if (exists( $self->{'_elem'}{$seqID}{'_seq'} ) &&
+	($seq eq $self->{'_elem'}{$seqID}{'_seq'}) ) {
+	$self->warn("Adding sequence $seqID, which has already been added");
+    }
+
+    # Our locatable sequences are always considered to be complete sequences
+    $seq->start(1); $seq->end($seq->length());
+
+    $self->warn("Adding non-nucleotidic sequence ".$seqID)
+	if (lc($seq->alphabet) ne 'dna' && lc($seq->alphabet) ne 'rna');
+
+    # build the symbol list for this sequence,
+    # will prune out the gap and missing/match chars
+    # when actually asked for the symbol list in the
+    # symbol_chars
+    if (defined $seq->seq) {
+	map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq);
+    } else {
+	$self->{'_symbols'} = {};
+    }
+
+    my $seq_no = ++$self->{'_nof_seqs'};
+
+    if (ref( $self->{'_elem'}{$seqID}{'_seq'} )) {
+	$self->warn("Replacing one sequence [$seqID]\n");
+    } else {
+	#print STDERR "Assigning $seqID to $order\n";
+	$self->{'_order'}->{$seq_no} = $seqID;
+#	$self->{'_start_end_lists'}->{$id} = []
+#	    unless(exists $self->{'_start_end_lists'}->{$id});
+#	push @{$self->{'_start_end_lists'}->{$id}}, $seq;
+    }
+
+    $self->{'_elem'}{$seqID}{'_seq'}  = $seq;
+    $self->{'_elem'}{$seqID}{'_feat'} = {};
+    $self->{'_elem'}{$seqID}{'_gaps'} = [];
+    my $dbref = $self->{'_elem'}{$seqID}{'_gaps'};
+    my $nofgaps = $self->_register_gaps($seq->seq,$dbref);
+
+    # Updating residue count
+    $self->{'_nof_residues'} += $seq->length - $nofgaps;
+
+    return 1;
+}
+
+=head2 remove_seq
+
+ Title     : remove_seq
+ Usage     : $contig->remove_seq($seq);
+ Function  : Removes a single sequence from an alignment
+ Returns   : 1 on success, 0 otherwise
+ Argument  : a Bio::LocatableSeq object
+
+=cut
+
+sub remove_seq {
+    my ($self,$seq) = @_;
+
+    if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
+	$self->throw("Unable to process non locatable sequences [", ref($seq), "]");
+    }
+
+    my $seqID = $seq->id() || $seq->display_id || $seq->primary_id;
+    unless (exists $self->{'_elem'}{$seqID} ) {
+	$self->warn("No sequence named $seqID  [$seq]");
+	return 0;
+    }
+
+    # Updating residue count
+    $self->{'_nof_residues'} -= $seq->length() + 
+	&_nof_gaps( $self->{'_elem'}{$seqID}{'_gaps'}, $seq->length );
+
+    # Remove all references to features of this sequence
+    my @feats = ();
+    foreach my $tag (keys %{ $self->{'_elem'}{$seqID}{'_feat'} }) {
+	push(@feats, $self->{'_elem'}{$seqID}{'_feat'}{$tag});
+    }
+    $self->{'_sfc'}->remove_features(\@feats);
+    delete $self->{'_elem'}{$seqID};
+
+    return 1;
+}
+
+=head2 purge
+
+ Title   : purge
+ Usage   : $contig->purge(0.7);
+ Function:
+
+           Removes sequences above whatever %id.
+
+           This function will grind on large alignments. Beware!
+           (perhaps not ideally implemented)
+
+ Example :
+ Returns : An array of the removed sequences
+ Argument:
+
+
+=cut
+
+sub purge {
+    my ($self) = @_;
+    $self->throw_not_implemented();
+}
+
+=head2 sort_alphabetically
+
+ Title     : sort_alphabetically
+ Usage     : $contig->sort_alphabetically
+ Function  : 
+
+             Changes the order of the alignemnt to alphabetical on name 
+             followed by numerical by number.
+
+ Returns   : 
+ Argument  : 
+
+=cut
+
+sub sort_alphabetically {
+    my ($self) = @_;
+    $self->throw_not_implemented();
+}
+
+=head2 Sequence selection methods
+
+Methods returning one or more sequences objects.
+
+=head2 each_seq
+
+ Title     : each_seq
+ Usage     : foreach $seq ( $contig->each_seq() ) 
+ Function  : Gets an array of Seq objects from the alignment
+ Returns   : an array
+ Argument  : 
+
+=cut
+
+sub each_seq {
+    my ($self) = @_;
+
+    my (@arr,$seqID);
+
+    foreach $seqID ( map { $self->{'_order'}{$_} } sort { $a <=> $b } keys %{$self->{'_order'}} ) {
+	push(@arr,$self->{'_elem'}{$seqID}{'_seq'});
+    }
+
+    return @arr;
+}
+
+=head2 each_alphabetically
+
+ Title     : each_alphabetically
+ Usage     : foreach $seq ( $contig->each_alphabetically() )
+ Function  :
+
+             Returns an array of sequence object sorted alphabetically 
+             by name and then by start point.
+             Does not change the order of the alignment
+
+ Returns   : 
+ Argument  : 
+
+=cut
+
+sub each_alphabetically {
+    my($self) = @_;
+    $self->throw_not_implemented();
+}
+
+=head2 each_seq_with_id
+
+ Title     : each_seq_with_id
+ Usage     : foreach $seq ( $contig->each_seq_with_id() ) 
+ Function  : 
+
+             Gets an array of Seq objects from the
+             alignment, the contents being those sequences
+             with the given name (there may be more than one)
+
+ Returns   : an array
+ Argument  : a seq name
+
+=cut
+
+sub each_seq_with_id {
+    my ($self) = @_;
+    $self->throw_not_implemented();
+}
+
+=head2 get_seq_by_pos
+
+ Title     : get_seq_by_pos
+ Usage     : $seq = $contig->get_seq_by_pos(3)
+ Function  : 
+
+             Gets a sequence based on its position in the alignment.
+             Numbering starts from 1.  Sequence positions larger than
+             no_sequences() will thow an error.
+
+ Returns   : a Bio::LocatableSeq object
+ Argument  : positive integer for the sequence osition
+
+=cut
+
+sub get_seq_by_pos {
+    my $self = shift;
+    my ($pos) = @_;
+
+    $self->throw("Sequence position has to be a positive integer, not [$pos]")
+	unless $pos =~ /^\d+$/ and $pos > 0;
+    $self->throw("No sequence at position [$pos]")
+	unless $pos <= $self->no_sequences ;
+
+    my $seqID = $self->{'_order'}->{--$pos};
+    return $self->{'_elem'}{$seqID}{'_seq'};
+}
+
+=head2 Create new alignments
+
+The result of these methods are horizontal or vertical subsets of the
+current MSE.
+
+=head2 select
+
+ Title     : select
+ Usage     : $contig2 = $contig->select(1, 3) # three first sequences
+ Function  : 
+
+             Creates a new alignment from a continuous subset of
+             sequences.  Numbering starts from 1.  Sequence positions
+             larger than no_sequences() will thow an error.
+
+ Returns   : a Bio::Assembly::Contig object
+ Argument  : positive integer for the first sequence
+             positive integer for the last sequence to include (optional)
+
+=cut
+
+sub select {
+    my ($self) = @_;
+    $self->throw_not_implemented();
+}
+
+
+=head2 select_noncont
+
+ Title     : select_noncont
+ Usage     : $contig2 = $contig->select_noncont(1, 3) # first and 3rd sequences
+ Function  : 
+
+             Creates a new alignment from a subset of
+             sequences.  Numbering starts from 1.  Sequence positions
+             larger than no_sequences() will thow an error.
+
+ Returns   : a Bio::Assembly::Contig object
+ Args      : array of integers for the sequences
+
+=cut
+
+sub select_noncont {
+    my ($self) = @_;
+    $self->throw_not_implemented();
+}
+    
+=head2 slice
+
+ Title     : slice
+ Usage     : $contig2 = $contig->slice(20, 30)
+ Function  : 
+
+             Creates a slice from the alignment inclusive of start and
+             end columns.  Sequences with no residues in the slice are
+             excluded from the new alignment and a warning is printed.
+             Slice beyond the length of the sequence does not do
+             padding.
+
+ Returns   : a Bio::Assembly::Contig object
+ Argument  : positive integer for start column 
+             positive integer for end column 
+
+=cut
+
+sub slice {
+    my ($self) = @_;
+    $self->throw_not_implemented();
+}
+
+=head2 Change sequences within the MSE
+
+These methods affect characters in all sequences without changeing the
+alignment.
+
+
+=head2 map_chars
+
+ Title     : map_chars
+ Usage     : $contig->map_chars('\.','-')
+ Function  : 
+
+             Does a s/$arg1/$arg2/ on the sequences. Useful for gap
+             characters
+
+             Notice that the from (arg1) is interpretted as a regex,
+             so be careful about quoting meta characters (eg
+             $contig->map_chars('.','-') wont do what you want)
+
+ Returns   : 
+ Argument  : 'from' rexexp
+             'to' string
+
+=cut
+
+sub map_chars {
+    my ($self) = @_;
+    $self->throw_not_implemented();
+}
+
+=head2 uppercase
+
+ Title     : uppercase()
+ Usage     : $contig->uppercase()
+ Function  : Sets all the sequences to uppercase
+ Returns   : 
+ Argument  : 
+
+=cut
+
+sub uppercase {
+    my ($self) = @_;
+    $self->throw_not_implemented();
+}
+
+=head2 match_line
+
+ Title    : match_line()
+ Usage    : $contig->match_line()
+ Function : Generates a match line - much like consensus string
+            except that a line indicating the '*' for a match.
+ Argument : (optional) Match line characters ('*' by default)
+            (optional) Strong match char (':' by default)
+            (optional) Weak match char ('.' by default)
+
+=cut
+
+sub match_line {
+    my ($self) = @_;
+    $self->throw_not_implemented();
+}
+
+=head2 match
+
+ Title     : match()
+ Usage     : $contig->match()
+ Function  : 
+
+             Goes through all columns and changes residues that are
+             identical to residue in first sequence to match '.'
+             character. Sets match_char.
+
+             USE WITH CARE: Most MSE formats do not support match
+             characters in sequences, so this is mostly for output
+             only. NEXUS format (Bio::AlignIO::nexus) can handle
+             it.
+
+ Returns   : 1
+ Argument  : a match character, optional, defaults to '.'
+
+=cut
+
+sub match {
+    my ($self) = @_;
+    $self->throw_not_implemented();
+}
+
+=head2 unmatch
+
+ Title     : unmatch()
+ Usage     : $contig->unmatch()
+ Function  : 
+
+             Undoes the effect of method match. Unsets match_char.
+
+ Returns   : 1
+ Argument  : a match character, optional, defaults to '.'
+
+=cut
+
+sub unmatch {
+    my ($self) = @_;
+    $self->throw_not_implemented();
+}
+
+
+=head2 MSE attibutes
+
+Methods for setting and reading the MSE attributes.
+
+Note that the methods defining character semantics depend on the user
+to set them sensibly.  They are needed only by certain input/output
+methods. Unset them by setting to an empty string ('').
+
+=head2 id
+
+ Title     : id
+ Usage     : $contig->id("Ig")
+ Function  : Gets/sets the id field of the alignment
+ Returns   : An id string
+ Argument  : An id string (optional)
+
+=cut
+
+sub id {
+    my ($self, $contig_name) = @_;
+
+    if (defined( $contig_name )) {
+	$self->{'_id'} = $contig_name;
+    }
+
+    return $self->{'_id'};
+}
+
+=head2 missing_char
+
+ Title     : missing_char
+ Usage     : $contig->missing_char("?")
+ Function  : Gets/sets the missing_char attribute of the alignment
+             It is generally recommended to set it to 'n' or 'N' 
+             for nucleotides and to 'X' for protein. 
+ Returns   : An missing_char string,
+ Argument  : An missing_char string (optional)
+
+=cut
+
+sub missing_char {
+    my ($self) = @_;
+    $self->throw_not_implemented();
+}
+
+=head2 match_char
+
+ Title     : match_char
+ Usage     : $contig->match_char('.')
+ Function  : Gets/sets the match_char attribute of the alignment
+ Returns   : An match_char string,
+ Argument  : An match_char string (optional)
+
+=cut
+
+sub match_char {
+    my ($self) = @_;
+    $self->throw_not_implemented();
+}
+
+=head2 gap_char
+
+ Title     : gap_char
+ Usage     : $contig->gap_char('-')
+ Function  : Gets/sets the gap_char attribute of the alignment
+ Returns   : An gap_char string, defaults to '-'
+ Argument  : An gap_char string (optional)
+
+=cut
+
+sub gap_char {
+    my ($self) = @_;
+    $self->throw_not_implemented();
+}
+
+=head2 symbol_chars
+
+ Title   : symbol_chars
+ Usage   : my @symbolchars = $contig->symbol_chars;
+ Function: Returns all the seen symbols (other than gaps)
+ Returns : array of characters that are the seen symbols
+ Argument: boolean to include the gap/missing/match characters
+
+=cut
+
+sub symbol_chars{
+    my ($self) = @_;
+    $self->throw_not_implemented();
+}
+
+=head2 Alignment descriptors
+
+These read only methods describe the MSE in various ways. 
+
+
+=head2 consensus_string
+
+ Title     : consensus_string
+ Usage     : $str = $contig->consensus_string($threshold_percent)
+ Function  : Makes a strict consensus 
+ Returns   : 
+ Argument  : Optional treshold ranging from 0 to 100.
+             The consensus residue has to appear at least threshold %
+             of the sequences at a given location, otherwise a '?'
+             character will be placed at that location.
+             (Default value = 0%)
+
+=cut
+
+sub consensus_string {
+    my ($self) = @_;
+    $self->throw_not_implemented();
+}
+
+=head2 consensus_iupac
+
+ Title     : consensus_iupac
+ Usage     : $str = $contig->consensus_iupac()
+ Function  : 
+
+             Makes a consensus using IUPAC ambiguity codes from DNA
+             and RNA. The output is in upper case except when gaps in
+             a column force output to be in lower case.
+
+             Note that if your alignment sequences contain a lot of
+             IUPAC ambiquity codes you often have to manually set
+             alphabet.  Bio::PrimarySeq::_guess_type thinks they
+             indicate a protein sequence.
+
+ Returns   : consensus string
+ Argument  : none
+ Throws    : on protein sequences
+
+
+=cut
+
+sub consensus_iupac {
+    my ($self) = @_;
+    $self->throw_not_implemented();
+}
+
+=head2 is_flush
+
+ Title     : is_flush
+ Usage     : if( $contig->is_flush() )
+           : 
+           :
+ Function  : Tells you whether the alignment 
+           : is flush, ie all of the same length
+           : 
+           :
+ Returns   : 1 or 0
+ Argument  : 
+
+=cut
+
+sub is_flush {
+    my ($self) = @_;
+    $self->throw_not_implemented();
+}
+
+=head2 length
+
+ Title     : length()
+ Usage     : $len = $contig->length() 
+ Function  : Returns the maximum length of the alignment.
+             To be sure the alignment is a block, use is_flush
+ Returns   : 
+ Argument  : 
+
+=cut
+
+sub length {
+    my ($self) = @_;
+
+    $self->throw_not_implemented();
+}
+
+=head2 maxdisplayname_length
+
+ Title     : maxdisplayname_length
+ Usage     : $contig->maxdisplayname_length()
+ Function  : 
+
+             Gets the maximum length of the displayname in the
+             alignment. Used in writing out various MSE formats.
+
+ Returns   : integer
+ Argument  : 
+
+=cut
+
+sub maxname_length {
+    my ($self) = @_;
+    $self->throw_not_implemented();
+}
+
+=head2 no_residues
+
+ Title     : no_residues
+ Usage     : $no = $contig->no_residues
+ Function  : number of residues in total in the alignment
+ Returns   : integer
+ Argument  : 
+
+=cut
+
+sub no_residues {
+    my ($self) = @_;
+
+    return $self->{'_nof_residues'};
+}
+
+=head2 no_sequences
+
+ Title     : no_sequences
+ Usage     : $depth = $contig->no_sequences
+ Function  : number of sequence in the sequence alignment
+ Returns   : integer
+ Argument  : None
+
+=cut
+
+sub no_sequences {
+    my ($self) = @_;
+
+    return scalar( keys %{ $self->{'_elem'} } );
+}
+
+=head2 percentage_identity
+
+ Title   : percentage_identity
+ Usage   : $id = $contig->percentage_identity
+ Function: The function calculates the percentage identity of the alignment
+ Returns : The percentage identity of the alignment (as defined by the 
+						     implementation)
+ Argument: None
+
+=cut
+
+sub percentage_identity{
+    my ($self) = @_;
+
+    $self->throw_not_implemeneted();
+}
+
+=head2 overall_percentage_identity
+
+ Title   : percentage_identity
+ Usage   : $id = $contig->percentage_identity
+ Function: The function calculates the percentage identity of 
+           the conserved columns
+ Returns : The percentage identity of the conserved columns
+ Args    : None
+
+=cut
+
+sub overall_percentage_identity{
+    my ($self) = @_;
+    $self->throw_not_implemented();
+}
+
+
+=head2 average_percentage_identity
+
+ Title   : average_percentage_identity
+ Usage   : $id = $contig->average_percentage_identity
+ Function: The function uses a fast method to calculate the average 
+           percentage identity of the alignment
+ Returns : The average percentage identity of the alignment
+ Args    : None
+
+=cut
+
+sub average_percentage_identity {
+    my ($self) = @_;
+    $self->throw_not_implemented();
+}
+
+=head2 Alignment positions
+
+Methods to map a sequence position into an alignment column and back.
+column_from_residue_number() does the former. The latter is really a
+property of the sequence object and can done using
+L<Bio::LocatableSeq::location_from_column>:
+
+    # select somehow a sequence from the alignment, e.g.
+    my $seq = $contig->get_seq_by_pos(1);
+    #$loc is undef or Bio::LocationI object
+    my $loc = $seq->location_from_column(5);
+
+
+=head2 column_from_residue_number
+
+ Title   : column_from_residue_number
+ Usage   : $col = $contig->column_from_residue_number( $seqname, $resnumber)
+ Function:
+
+           This function gives the position in the alignment
+           (i.e. column number) of the given residue number in the
+           sequence with the given name. For example, for the
+           alignment
+
+  	     Seq1/91-97 AC..DEF.GH
+  	     Seq2/24-30 ACGG.RTY..
+  	     Seq3/43-51 AC.DDEFGHI
+
+           column_from_residue_number( "Seq1", 94 ) returns 5.
+           column_from_residue_number( "Seq2", 25 ) returns 2.
+           column_from_residue_number( "Seq3", 50 ) returns 9.
+
+           An exception is thrown if the residue number would lie
+           outside the length of the aligment
+           (e.g. column_from_residue_number( "Seq2", 22 )
+
+	  Note: If the the parent sequence is represented by more than
+	  one alignment sequence and the residue number is present in
+	  them, this method finds only the first one.
+
+ Returns : A column number for the position in the alignment of the
+           given residue in the given sequence (1 = first column)
+ Args    : A sequence id/name (not a name/start-end)
+           A residue number in the whole sequence (not just that
+           segment of it in the alignment)
+
+=cut
+
+sub column_from_residue_number {
+    my ($self) = @_;
+    $self->throw_not_implemented();
+}
+
+=head2 Sequence names
+
+Methods to manipulate the display name. The default name based on the
+sequence id and subsequence positions can be overridden in various
+ways.
+
+=head2 displayname
+
+ Title     : displayname
+ Usage     : $contig->displayname("Ig", "IgA")
+ Function  : Gets/sets the display name of a sequence in the alignment
+           :
+ Returns   : A display name string
+ Argument  : name of the sequence
+             displayname of the sequence (optional)
+
+=cut
+
+sub displayname { # Do nothing
+}
+
+=head2 set_displayname_count
+
+ Title     : set_displayname_count
+ Usage     : $contig->set_displayname_count
+ Function  : 
+
+             Sets the names to be name_# where # is the number of
+             times this name has been used.
+
+ Returns   : None 
+ Argument  : None
+
+=cut
+
+sub set_displayname_count {
+    my ($self) = @_;
+    $self->throw_not_implemented();
+}
+
+=head2 set_displayname_flat
+
+ Title     : set_displayname_flat
+ Usage     : $contig->set_displayname_flat()
+ Function  : Makes all the sequences be displayed as just their name,
+             not name/start-end
+ Returns   : 1
+ Argument  : None
+
+=cut
+
+sub set_displayname_flat { # Do nothing!
+}
+
+=head2 set_displayname_normal
+
+ Title     : set_displayname_normal
+ Usage     : $contig->set_displayname_normal() 
+ Function  : Makes all the sequences be displayed as name/start-end
+ Returns   : None
+ Argument  : None
+
+=cut
+
+sub set_displayname_normal { # Do nothing!
+}
+
+=head1 Internal Methods
+
+=head2 _binary_search
+
+ Title     : _binary_search
+ Usage     : _binary_search($list,$query)
+ Function  : 
+
+             Find a number in a sorted list of numbers.  Return values
+             may be on or two integers. One positive integer or zero
+             (>=0) is the index of the element that stores the queried
+             value.  Two positive integers (or zero and another
+             number) are the indexes of elements among which the
+             queried value should be placed. Negative single values
+             mean:
+
+             -1: $query is smaller than smallest element in list
+             -2: $query is greater than greatest element in list
+
+ Returns   : array of integers
+ Argument  : 
+             $list  : array reference
+             $query : integer
+
+=cut
+
+sub _binary_search {
+    my $list   = shift;
+    my $query  = shift;
+    #
+    # If there is only one element in list
+    if (!$#{$list} && ($query == $list->[0])) { return (0) }
+    # If there are others...
+    my $start = 0;
+    my $end   = $#{$list};
+    (&_compare($query,$list->[$start]) == 0) && do { return ($start) };
+    (&_compare($query,$list->[$end])   == 0) && do { return ($end) };
+    (&_compare($query,$list->[$start])  < 0) && do { return (-1) };
+    (&_compare($query,$list->[$end])    > 0) && do { return (-2) };
+    my $middle = 0;
+    while ($end - $start > 1) {
+        $middle = int(($end+$middle)/2);
+	(&_compare($query,$list->[$middle]) == 0) && return ($middle);
+	(&_compare($query,$list->[$middle]) <  0) && do { $end   = $middle ; $middle = 0; next };
+	$start = $middle; # If &_compare() > 0, move region beggining
+    }
+    return ($start,$end);
+}
+
+=head2 _compare
+
+    Title   : _compare
+    Usage   : _compare($arg1,$arg2)
+    Function: Perform numeric or string comparisons
+    Returns : integer (0, 1 or -1)
+    Args    : values to be compared
+
+=cut
+
+sub _compare {
+    my $arg1 = shift;
+    my $arg2 = shift;
+    #
+    if (($arg1 =~ /^\d+$/) && ($arg2 =~ /^\d+$/)) { return $arg1 <=> $arg2 }
+    else { return $arg1 cmp $arg2 }
+}
+
+=head2 _nof_gaps
+
+    Title   : _nof_gaps
+    Usage   : _nof_gaps($array_ref, $query)
+    Function: number of gaps found before position $query
+    Returns : integer
+    Args    : 
+              $array_ref : gap registry reference
+              $query     : [integer] a position in a sequence 
+
+=cut
+
+#' emacs...
+sub _nof_gaps {
+    my $list  = shift;
+    my $query = shift;
+    # If there are no gaps in this contig
+    return 0 unless (defined($list) && scalar(@{$list}));
+    # Locate query index in gap list (if any)
+    my @index = &_binary_search($list,$query);
+    # If after all alignments, correct using total number of align
+    if ($index[0] == -2) { $query = scalar(@{$list}) }
+    # If before any alignment, return 0
+    elsif ($index[0] == -1) { $query = 0 }
+    elsif ($index[0] >= 0) {
+	# If query is between alignments, translate coordinates
+	if ($#index > 0) { $query = $index[0] + 1 }
+	# If query sits upon an alignment, do another correction
+	elsif ($#index == 0) { $query = $index[0] }
+    }
+    #
+    return $query;
+}
+
+=head2 _padded_unpadded
+
+    Title   : _padded_unpadded
+    Usage   : _padded_unpadded($array_ref, $query)
+    Function: 
+
+              Returns a coordinate corresponding to
+              position $query after gaps were 
+              removed from a sequence.
+
+    Returns : integer
+    Args    : 
+              $array_ref : reference to this gap registry
+              $query     : [integer] coordionate to change
+
+=cut
+
+sub _padded_unpadded {
+    my $list  = shift;
+    my $query = shift;
+
+    my $align = &_nof_gaps($list,$query);
+    $query-- if (defined($list->[$align]) && ($list->[$align] == $query));
+    $query = $query - $align;
+    #
+    return $query;
+}
+
+=head2 _unpadded_padded
+
+    Title   : _unpadded_padded
+    Usage   : _unpadded_padded($array_ref, $query)
+    Function: 
+
+              Returns the value corresponding to 
+              ungapped position $query when gaps are
+              counted as valid sites in a sequence
+
+    Returns : 
+    Args    : $array_ref = a reference to this sequence's gap registry
+              $query = [integer] location to change
+
+=cut
+
+#'
+sub _unpadded_padded {
+    my $list  = shift;
+    my $query = shift;
+
+    my $align  = &_nof_gaps($list,$query);
+    $query = $query + $align;
+    my $new_align = &_nof_gaps($list,$query);
+    while ($new_align - $align > 0) {
+	$query = $query + $new_align - $align;
+	$align  = $new_align;
+	$new_align = &_nof_gaps($list,$query);
+    }
+    # If current position is also a align, look for the first upstream base
+    while (defined($list->[$align]) && ($list->[$align] == $query)) {
+	$query++; $align++;
+    }
+    #
+    return $query;
+}
+
+=head2 _register_gaps
+
+    Title   : _register_gaps
+    Usage   : $self->_register_gaps($seq, $array_ref)
+    Function: stores gap locations for a sequence
+    Returns : number of gaps found
+    Args    : 
+              $seq       : sequence string
+              $array_ref : a reference to an array,
+                           where gap locations will
+                           be stored
+
+=cut
+
+sub _register_gaps {
+    my $self     = shift;
+    my $sequence = shift;
+    my $dbref    = shift;
+
+    $self->throw("Not an aligned sequence string to register gaps")
+	if (ref($sequence));
+
+    $self->throw("Not an array reference for gap registry")
+	unless (ref($dbref) eq 'ARRAY');
+
+    # Registering alignments
+    @{$dbref} = (); # Cleaning registry
+    if (defined $sequence) {
+	my $i = -1;
+	while(1) {
+	    $i = index($sequence,"-",$i+1);
+	    last if ($i == -1);
+	    push(@{$dbref},$i+1);
+	}
+    } else {
+#	$self->warn("Found undefined sequence while registering gaps");
+	return 0;
+    }
+
+    return scalar(@{$dbref});
+}
+
+1;
author	mahtabm
date	Thu, 11 Apr 2013 02:01:53 -0400
parents
children