Mercurial > repos > mahtabm > ensemb_rep_gvl
diff variant_effect_predictor/Bio/SimpleAlign.pm @ 0:2bc9b66ada89 draft default tip
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 06:29:17 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SimpleAlign.pm Thu Apr 11 06:29:17 2013 -0400 @@ -0,0 +1,1934 @@ +# $Id: SimpleAlign.pm,v 1.65.2.1 2003/07/02 16:00:19 jason Exp $ +# BioPerl module for SimpleAlign +# +# Cared for by Heikki Lehvaslaiho <heikki@ebi.ac.uk> +# +# Copyright Ewan Birney +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code +# +# History: +# 11/3/00 Added threshold feature to consensus and consensus_aa - PS +# May 2001 major rewrite - Heikki Lehvaslaiho + +=head1 NAME + +Bio::SimpleAlign - Multiple alignments held as a set of sequences + +=head1 SYNOPSIS + + # use Bio::AlignIO to read in the alignment + $str = Bio::AlignIO->new('-file' => 't/data/testaln.pfam'); + $aln = $str->next_aln(); + + # some descriptors + print $aln->length, "\n"; + print $aln->no_residues, "\n"; + print $aln->is_flush, "\n"; + print $aln->no_sequences, "\n"; + print $aln->percentage_identity, "\n"; + print $aln->consensus_string(50), "\n"; + + # find the position in the alignment for a sequence location + $pos = $aln->column_from_residue_number('1433_LYCES', 14); # = 6; + + # extract sequences and check values for the alignment column $pos + foreach $seq ($aln->each_seq) { + $res = $seq->subseq($pos, $pos); + $count{$res}++; + } + foreach $res (keys %count) { + printf "Res: %s Count: %2d\n", $res, $count{$res}; + } + + +=head1 DESCRIPTION + +SimpleAlign handles multiple alignments of sequences. It is very +permissive of types (it won't insist on things being all same length +etc): really it is a SequenceSet explicitly held in memory with a +whole series of built in manipulations and especially file format +systems for read/writing alignments. + +SimpleAlign basically views an alignment as an immutable block of +text. SimpleAlign *is not* the object to be using if you want to +perform complex alignment manipulations. + +However for lightweight display/formatting and minimal manipulation +(e.g. removing all-gaps columns) - this is the one to use. + +SimpleAlign uses a subclass of L<Bio::PrimarySeq> class +L<Bio::LocatableSeq> to store its sequences. These are subsequences +with a start and end positions in the parent reference sequence. + +Tricky concepts. SimpleAlign expects name,start,end to be 'unique' in +the alignment, and this is the key for the internal hashes. +(name,start,end is abbreviated nse in the code). However, in many +cases people don't want the name/start-end to be displayed: either +multiple names in an alignment or names specific to the alignment +(ROA1_HUMAN_1, ROA1_HUMAN_2 etc). These names are called +'displayname', and generally is what is used to print out the +alignment. They default to name/start-end. + +The SimpleAlign Module came from Ewan Birney's Align module. + +=head1 PROGRESS + +SimpleAlign is being slowly converted to bioperl coding standards, +mainly by Ewan. + +=over 3 + +=item Use Bio::Root::Object - done + +=item Use proper exceptions - done + +=item Use hashed constructor - not done! + +=back + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Ewan Birney, birney@sanger.ac.uk + +=head1 CONTRIBUTORS + +Richard Adams, Richard.Adams@ed.ac.uk, +David J. Evans, David.Evans@vir.gla.ac.uk, +Heikki Lehvaslaiho, heikki@ebi.ac.uk, +Allen Smith, allens@cpan.org, +Jason Stajich, jason@bioperl.org, +Anthony Underwood, aunderwood@phls.org.uk, +Xintao Wei & Giri Narasimhan, giri@cs.fiu.edu + + +=head1 SEE ALSO + +L<Bio::LocatableSeq.pm> + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# 'Let the code begin... + +package Bio::SimpleAlign; +use vars qw(@ISA %CONSERVATION_GROUPS); +use strict; + +use Bio::Root::Root; +use Bio::LocatableSeq; # uses Seq's as list +use Bio::Align::AlignI; + +BEGIN { + # This data should probably be in a more centralized module... + # it is taken from Clustalw documentation + # These are all the positively scoring groups that occur in the + # Gonnet Pam250 matrix. The strong and weak groups are + # defined as strong score >0.5 and weak score =<0.5 respectively. + + %CONSERVATION_GROUPS = ( 'strong' => [ qw(STA + NEQK + NHQK + NDEQ + QHRK + MILV + MILF + HY + FYW) + ], + 'weak' => [ qw(CSA + ATV + SAG + STNK + STPA + SGND + SNDEQK + NDEQHK + NEQHRK + FVLIM + HFY) ], + ); + +} + +@ISA = qw(Bio::Root::Root Bio::Align::AlignI); + +=head2 new + + Title : new + Usage : my $aln = new Bio::SimpleAlign(); + Function : Creates a new simple align object + Returns : Bio::SimpleAlign + Args : -source => string representing the source program + where this alignment came from + +=cut + + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + + my ($src) = $self->_rearrange([qw(SOURCE)], @args); + $src && $self->source($src); + # we need to set up internal hashs first! + + $self->{'_seq'} = {}; + $self->{'_order'} = {}; + $self->{'_start_end_lists'} = {}; + $self->{'_dis_name'} = {}; + $self->{'_id'} = 'NoName'; + $self->{'_symbols'} = {}; + # maybe we should automatically read in from args. Hmmm... + + return $self; # success - we hope! +} + +=head1 Modifier methods + +These methods modify the MSE by adding, removing or shuffling complete +sequences. + +=head2 add_seq + + Title : add_seq + Usage : $myalign->add_seq($newseq); + Function : Adds another sequence to the alignment. *Does not* align + it - just adds it to the hashes. + Returns : nothing + Args : a Bio::LocatableSeq object + order (optional) + +See L<Bio::LocatableSeq> for more information + +=cut + +sub addSeq { + my $self = shift; + $self->warn(ref($self). "::addSeq - deprecated method. Use add_seq() instead."); + $self->add_seq(@_); +} + +sub add_seq { + my $self = shift; + my $seq = shift; + my $order = shift; + my ($name,$id,$start,$end); + + if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) { + $self->throw("Unable to process non locatable sequences [", ref($seq), "]"); + } + + $id = $seq->id() ||$seq->display_id || $seq->primary_id; + $start = $seq->start(); + $end = $seq->end(); + + # build the symbol list for this sequence, + # will prune out the gap and missing/match chars + # when actually asked for the symbol list in the + # symbol_chars + map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq); + + if( !defined $order ) { + $order = keys %{$self->{'_seq'}}; + } + $name = sprintf("%s/%d-%d",$id,$start,$end); + + if( $self->{'_seq'}->{$name} ) { + $self->warn("Replacing one sequence [$name]\n"); + } + else { + #print STDERR "Assigning $name to $order\n"; + + $self->{'_order'}->{$order} = $name; + + unless( exists( $self->{'_start_end_lists'}->{$id})) { + $self->{'_start_end_lists'}->{$id} = []; + } + push @{$self->{'_start_end_lists'}->{$id}}, $seq; + } + + $self->{'_seq'}->{$name} = $seq; + +} + + +=head2 remove_seq + + Title : remove_seq + Usage : $aln->remove_seq($seq); + Function : Removes a single sequence from an alignment + Returns : + Argument : a Bio::LocatableSeq object + +=cut + +sub removeSeq { + my $self = shift; + $self->warn(ref($self). "::removeSeq - deprecated method. Use remove_seq() instead."); + $self->remove_seq(@_); +} + +sub remove_seq { + my $self = shift; + my $seq = shift; + my ($name,$id,$start,$end); + + $self->throw("Need Bio::Locatable seq argument ") + unless ref $seq && $seq->isa('Bio::LocatableSeq'); + + $id = $seq->id(); + $start = $seq->start(); + $end = $seq->end(); + $name = sprintf("%s/%d-%d",$id,$start,$end); + + if( !exists $self->{'_seq'}->{$name} ) { + $self->throw("Sequence $name does not exist in the alignment to remove!"); + } + + delete $self->{'_seq'}->{$name}; + + # we need to remove this seq from the start_end_lists hash + + if (exists $self->{'_start_end_lists'}->{$id}) { + # we need to find the sequence in the array. + + my ($i, $found);; + for ($i=0; $i < @{$self->{'_start_end_lists'}->{$id}}; $i++) { + if (${$self->{'_start_end_lists'}->{$id}}[$i] eq $seq) { + $found = 1; + last; + } + } + if ($found) { + splice @{$self->{'_start_end_lists'}->{$id}}, $i, 1; + } + else { + $self->throw("Could not find the sequence to remoce from the start-end list"); + } + } + else { + $self->throw("There is no seq list for the name $id"); + } + return 1; + # we can't do anything about the order hash but that is ok + # because each_seq will handle it +} + + +=head2 purge + + Title : purge + Usage : $aln->purge(0.7); + Function: + + Removes sequences above given sequence similarity + This function will grind on large alignments. Beware! + + Example : + Returns : An array of the removed sequences + Args : float, threshold for similarity + +=cut + +sub purge { + my ($self,$perc) = @_; + my (%duplicate, @dups); + + my @seqs = $self->each_seq(); + + for (my $i=0;$i< @seqs - 1;$i++ ) { #for each seq in alignment + my $seq = $seqs[$i]; + + #skip if already in duplicate hash + next if exists $duplicate{$seq->display_id} ; + my $one = $seq->seq(); + + my @one = split '', $one; #split to get 1aa per array element + + for (my $j=$i+1;$j < @seqs;$j++) { + my $seq2 = $seqs[$j]; + + #skip if already in duplicate hash + next if exists $duplicate{$seq2->display_id} ; + + my $two = $seq2->seq(); + my @two = split '', $two; + + my $count = 0; + my $res = 0; + for (my $k=0;$k<@one;$k++) { + if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) && + $one[$k] eq $two[$k]) { + $count++; + } + if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) && + $two[$k] ne '.' && $two[$k] ne '-' ) { + $res++; + } + } + + my $ratio = 0; + $ratio = $count/$res unless $res == 0; + + # if above threshold put in duplicate hash and push onto + # duplicate array for returning to get_unique + if ( $ratio > $perc ) { + print STDERR "duplicate!", $seq2->display_id, "\n" if $self->verbose > 0; + $duplicate{$seq2->display_id} = 1; + push @dups, $seq2; + } + } + } + foreach my $seq (@dups) { + $self->remove_seq($seq); + } + return @dups; +} + +=head2 sort_alphabetically + + Title : sort_alphabetically + Usage : $ali->sort_alphabetically + Function : + + Changes the order of the alignemnt to alphabetical on name + followed by numerical by number. + + Returns : + Argument : + +=cut + +sub sort_alphabetically { + my $self = shift; + my ($seq,$nse,@arr,%hash,$count); + + foreach $seq ( $self->each_seq() ) { + $nse = $seq->get_nse; + $hash{$nse} = $seq; + } + + $count = 0; + + %{$self->{'_order'}} = (); # reset the hash; + + foreach $nse ( sort _alpha_startend keys %hash) { + $self->{'_order'}->{$count} = $nse; + + $count++; + } + 1; +} + +=head1 Sequence selection methods + +Methods returning one or more sequences objects. + +=head2 each_seq + + Title : each_seq + Usage : foreach $seq ( $align->each_seq() ) + Function : Gets an array of Seq objects from the alignment + Returns : an array + Argument : + +=cut + +sub eachSeq { + my $self = shift; + $self->warn(ref($self). "::eachSeq - deprecated method. Use each_seq() instead."); + $self->each_seq(); +} + +sub each_seq { + my $self = shift; + my (@arr,$order); + + foreach $order ( sort { $a <=> $b } keys %{$self->{'_order'}} ) { + if( exists $self->{'_seq'}->{$self->{'_order'}->{$order}} ) { + push(@arr,$self->{'_seq'}->{$self->{'_order'}->{$order}}); + } + } + + return @arr; +} + + +=head2 each_alphabetically + + Title : each_alphabetically + Usage : foreach $seq ( $ali->each_alphabetically() ) + Function : + + Returns an array of sequence object sorted alphabetically + by name and then by start point. + Does not change the order of the alignment + + Returns : + Argument : + +=cut + +sub each_alphabetically { + my $self = shift; + my ($seq,$nse,@arr,%hash,$count); + + foreach $seq ( $self->each_seq() ) { + $nse = $seq->get_nse; + $hash{$nse} = $seq; + } + + foreach $nse ( sort _alpha_startend keys %hash) { + push(@arr,$hash{$nse}); + } + + return @arr; + +} + +sub _alpha_startend { + my ($aname,$astart,$bname,$bstart); + ($aname,$astart) = split (/-/,$a); + ($bname,$bstart) = split (/-/,$b); + + if( $aname eq $bname ) { + return $astart <=> $bstart; + } + else { + return $aname cmp $bname; + } + +} + +=head2 each_seq_with_id + + Title : each_seq_with_id + Usage : foreach $seq ( $align->each_seq_with_id() ) + Function : + + Gets an array of Seq objects from the + alignment, the contents being those sequences + with the given name (there may be more than one) + + Returns : an array + Argument : a seq name + +=cut + +sub eachSeqWithId { + my $self = shift; + $self->warn(ref($self). "::eachSeqWithId - deprecated method. Use each_seq_with_id() instead."); + $self->each_seq_with_id(@_); +} + +sub each_seq_with_id { + my $self = shift; + my $id = shift; + + $self->throw("Method each_seq_with_id needs a sequence name argument") + unless defined $id; + + my (@arr, $seq); + + if (exists($self->{'_start_end_lists'}->{$id})) { + @arr = @{$self->{'_start_end_lists'}->{$id}}; + } + return @arr; +} + +=head2 get_seq_by_pos + + Title : get_seq_by_pos + Usage : $seq = $aln->get_seq_by_pos(3) # third sequence from the alignment + Function : + + Gets a sequence based on its position in the alignment. + Numbering starts from 1. Sequence positions larger than + no_sequences() will thow an error. + + Returns : a Bio::LocatableSeq object + Args : positive integer for the sequence osition + +=cut + +sub get_seq_by_pos { + + my $self = shift; + my ($pos) = @_; + + $self->throw("Sequence position has to be a positive integer, not [$pos]") + unless $pos =~ /^\d+$/ and $pos > 0; + $self->throw("No sequence at position [$pos]") + unless $pos <= $self->no_sequences ; + + my $nse = $self->{'_order'}->{--$pos}; + return $self->{'_seq'}->{$nse}; +} + +=head1 Create new alignments + +The result of these methods are horizontal or vertical subsets of the +current MSE. + +=head2 select + + Title : select + Usage : $aln2 = $aln->select(1, 3) # three first sequences + Function : + + Creates a new alignment from a continuous subset of + sequences. Numbering starts from 1. Sequence positions + larger than no_sequences() will thow an error. + + Returns : a Bio::SimpleAlign object + Args : positive integer for the first sequence + positive integer for the last sequence to include (optional) + +=cut + +sub select { + my $self = shift; + my ($start, $end) = @_; + + $self->throw("Select start has to be a positive integer, not [$start]") + unless $start =~ /^\d+$/ and $start > 0; + $self->throw("Select end has to be a positive integer, not [$end]") + unless $end =~ /^\d+$/ and $end > 0; + $self->throw("Select $start [$start] has to be smaller than or equal to end [$end]") + unless $start <= $end; + + my $aln = new $self; + foreach my $pos ($start .. $end) { + $aln->add_seq($self->get_seq_by_pos($pos)); + } + $aln->id($self->id); + return $aln; +} + +=head2 select_noncont + + Title : select_noncont + Usage : $aln2 = $aln->select_noncont(1, 3) # first and 3rd sequences + Function : + + Creates a new alignment from a subset of + sequences. Numbering starts from 1. Sequence positions + larger than no_sequences() will thow an error. + + Returns : a Bio::SimpleAlign object + Args : array of integers for the sequences + +=cut + +sub select_noncont { + my $self = shift; + my (@pos) = @_; + my $end = $self->no_sequences; + foreach ( @pos ) { + $self->throw("position must be a positive integer, > 0 and <= $end not [$_]") + unless( /^\d+$/ && $_ > 0 && $_ <= $end ); + } + my $aln = new $self; + foreach my $p (@pos) { + $aln->add_seq($self->get_seq_by_pos($p)); + } + $aln->id($self->id); + return $aln; +} + +=head2 slice + + Title : slice + Usage : $aln2 = $aln->slice(20, 30) + Function : + + Creates a slice from the alignment inclusive of start and + end columns. Sequences with no residues in the slice are + excluded from the new alignment and a warning is printed. + Slice beyond the length of the sequence does not do + padding. + + Returns : a Bio::SimpleAlign object + Args : positive integer for start column + positive integer for end column + +=cut + +sub slice { + my $self = shift; + my ($start, $end) = @_; + + $self->throw("Slice start has to be a positive integer, not [$start]") + unless $start =~ /^\d+$/ and $start > 0; + $self->throw("Slice end has to be a positive integer, not [$end]") + unless $end =~ /^\d+$/ and $end > 0; + $self->throw("Slice $start [$start] has to be smaller than or equal to end [$end]") + unless $start <= $end; + my $aln_length = $self->length; + $self->throw("This alignment has only ". $self->length. + " residues. Slice start [$start] is too bigger.") + if $start > $self->length; + + my $aln = new $self; + $aln->id($self->id); + foreach my $seq ( $self->each_seq() ) { + + my $new_seq = new Bio::LocatableSeq (-id => $seq->id); + + # seq + my $seq_end = $end; + $seq_end = $seq->length if $end > $seq->length; + my $slice_seq = $seq->subseq($start, $seq_end); + $new_seq->seq( $slice_seq ); + + # start + if ($start > 1) { + my $pre_start_seq = $seq->subseq(1, $start - 1); + $pre_start_seq =~ s/\W//g; #print "$pre_start_seq\n"; + $new_seq->start( $seq->start + CORE::length($pre_start_seq) ); + } else { + $new_seq->start( $seq->start); + } + + # end + $slice_seq =~ s/\W//g; + $new_seq->end( $new_seq->start + CORE::length($slice_seq) - 1 ); + + if ($new_seq->start and $new_seq->end >= $new_seq->start) { + $aln->add_seq($new_seq); + } else { + my $nse = $seq->get_nse(); + $self->warn("Slice [$start-$end] of sequence [$nse] contains no residues.". + " Sequence excluded from the new alignment."); + } + + } + + return $aln; +} + +=head2 remove_columns + + Title : remove_column + Usage : $aln2 = $aln->remove_columns(['mismatch','weak']) + Function : + Creates an aligment with columns removed corresponding to + the specified criteria. + Returns : a L<Bio::SimpleAlign> object + Args : array ref of types, 'match'|'weak'|'strong'|'mismatch' + +=cut + +sub remove_columns{ + my ($self,$type) = @_; + my %matchchars = ( 'match' => '\*', + 'weak' => '\.', + 'strong' => ':', + 'mismatch'=> ' ', + ); + #get the characters to delete against + my $del_char; + foreach my $type(@{$type}){ + $del_char.= $matchchars{$type}; + } + + my $match_line = $self->match_line; + my $aln = new $self; + + my @remove; + my $length = 0; + + #do the matching to get the segments to remove + while($match_line=~m/[$del_char]/g){ + my $start = pos($match_line)-1; + $match_line=~/\G[$del_char]+/gc; + my $end = pos($match_line)-1; + + #have to offset the start and end for subsequent removes + $start-=$length; + $end -=$length; + $length += ($end-$start+1); + push @remove, [$start,$end]; + } + + #remove the segments + $aln = $self->_remove_col($aln,\@remove); + + return $aln; +} + +sub _remove_col { + my ($self,$aln,$remove) = @_; + my @new; + + #splice out the segments and create new seq + foreach my $seq($self->each_seq){ + my $new_seq = new Bio::LocatableSeq(-id=>$seq->id); + my $sequence; + foreach my $pair(@{$remove}){ + my $start = $pair->[0]; + my $end = $pair->[1]; + $sequence = $seq->seq unless $sequence; + my $spliced; + $spliced .= $start > 0 ? substr($sequence,0,$start) : ''; + $spliced .= substr($sequence,$end+1,$seq->length-$end+1); + $sequence = $spliced; + if ($start == 1) { + $new_seq->start($end); + } + else { + $new_seq->start( $seq->start); + } + # end + if($end >= $seq->end){ + $new_seq->end( $start); + } + else { + $new_seq->end($seq->end); + } + } + $new_seq->seq($sequence); + push @new, $new_seq; + } + #add the new seqs to the alignment + foreach my $new(@new){ + $aln->add_seq($new); + } + return $aln; +} + +=head1 Change sequences within the MSE + +These methods affect characters in all sequences without changeing the +alignment. + + +=head2 map_chars + + Title : map_chars + Usage : $ali->map_chars('\.','-') + Function : + + Does a s/$arg1/$arg2/ on the sequences. Useful for gap + characters + + Notice that the from (arg1) is interpretted as a regex, + so be careful about quoting meta characters (eg + $ali->map_chars('.','-') wont do what you want) + + Returns : + Argument : 'from' rexexp + 'to' string + +=cut + +sub map_chars { + my $self = shift; + my $from = shift; + my $to = shift; + my ($seq,$temp); + + $self->throw("Need exactly two arguments") + unless defined $from and defined $to; + + foreach $seq ( $self->each_seq() ) { + $temp = $seq->seq(); + $temp =~ s/$from/$to/g; + $seq->seq($temp); + } + return 1; +} + + +=head2 uppercase + + Title : uppercase() + Usage : $ali->uppercase() + Function : Sets all the sequences to uppercase + Returns : + Argument : + +=cut + +sub uppercase { + my $self = shift; + my $seq; + my $temp; + + foreach $seq ( $self->each_seq() ) { + $temp = $seq->seq(); + $temp =~ tr/[a-z]/[A-Z]/; + + $seq->seq($temp); + } + return 1; +} + +=head2 cigar_line + + Title : cigar_line() + Usage : $align->cigar_line() + Function : Generates a "cigar" line for each sequence in the alignment + The format is simply A-1,60;B-1,1:4,60;C-5,10:12,58 + where A,B,C,etc. are the sequence identifiers, and the numbers + refer to conserved positions within the alignment + Args : none + +=cut + +sub cigar_line { + my ($self) = @_; + + my %cigar; + my %clines; + my @seqchars; + my $seqcount = 0; + my $sc; + foreach my $seq ( $self->each_seq ) { + push @seqchars, [ split(//, uc ($seq->seq)) ]; + $sc = scalar(@seqchars); + } + + foreach my $pos ( 0..$self->length ) { + my $i=0; + foreach my $seq ( @seqchars ) { + $i++; +# print STDERR "Seq $i at pos $pos: ".$seq->[$pos]."\n"; + if ($seq->[$pos] eq '.') { + if (defined $cigar{$i} && $clines{$i} !~ $cigar{$i}) { + $clines{$i}.=$cigar{$i}; + } + } + else { + if (! defined $cigar{$i}) { + $clines{$i}.=($pos+1).","; + } + $cigar{$i}=$pos+1; + } + if ($pos+1 == $self->length && ($clines{$i} =~ /\,$/) ) { + $clines{$i}.=$cigar{$i}; + } + } + } + for(my $i=1; $i<$sc+1;$i++) { + print STDERR "Seq $i cigar line ".$clines{$i}."\n"; + } + return %clines; +} + +=head2 match_line + + Title : match_line() + Usage : $align->match_line() + Function : Generates a match line - much like consensus string + except that a line indicating the '*' for a match. + Args : (optional) Match line characters ('*' by default) + (optional) Strong match char (':' by default) + (optional) Weak match char ('.' by default) + +=cut + +sub match_line { + my ($self,$matchlinechar, $strong, $weak) = @_; + my %matchchars = ( 'match' => $matchlinechar || '*', + 'weak' => $weak || '.', + 'strong' => $strong || ':', + 'mismatch'=> ' ', + ); + + + my @seqchars; + my $seqcount = 0; + my $alphabet; + foreach my $seq ( $self->each_seq ) { + push @seqchars, [ split(//, uc ($seq->seq)) ]; + $alphabet = $seq->alphabet unless defined $alphabet; + } + my $refseq = shift @seqchars; + # let's just march down the columns + my $matchline; + POS: foreach my $pos ( 0..$self->length ) { + my $refchar = $refseq->[$pos]; + next unless $refchar; # skip '' + my %col = ($refchar => 1); + my $dash = ($refchar eq '-' || $refchar eq '.' || $refchar eq ' '); + foreach my $seq ( @seqchars ) { + $dash = 1 if( $seq->[$pos] eq '-' || $seq->[$pos] eq '.' || + $seq->[$pos] eq ' ' ); + $col{$seq->[$pos]}++; + } + my @colresidues = sort keys %col; + my $char = $matchchars{'mismatch'}; + # if all the values are the same + if( $dash ) { $char = $matchchars{'mismatch'} } + elsif( @colresidues == 1 ) { $char = $matchchars{'match'} } + elsif( $alphabet eq 'protein' ) { # only try to do weak/strong + # matches for protein seqs + TYPE: foreach my $type ( qw(strong weak) ) { + # iterate through categories + my %groups; + # iterate through each of the aa in the col + # look to see which groups it is in + foreach my $c ( @colresidues ) { + foreach my $f ( grep /\Q$c/, @{$CONSERVATION_GROUPS{$type}} ) { + push @{$groups{$f}},$c; + } + } + GRP: foreach my $cols ( values %groups ) { + @$cols = sort @$cols; + # now we are just testing to see if two arrays + # are identical w/o changing either one + + # have to be same len + next if( scalar @$cols != scalar @colresidues ); + # walk down the length and check each slot + for($_=0;$_ < (scalar @$cols);$_++ ) { + next GRP if( $cols->[$_] ne $colresidues[$_] ); + } + $char = $matchchars{$type}; + last TYPE; + } + } + } + $matchline .= $char; + } + return $matchline; +} + +=head2 match + + Title : match() + Usage : $ali->match() + Function : + + Goes through all columns and changes residues that are + identical to residue in first sequence to match '.' + character. Sets match_char. + + USE WITH CARE: Most MSE formats do not support match + characters in sequences, so this is mostly for output + only. NEXUS format (Bio::AlignIO::nexus) can handle + it. + + Returns : 1 + Argument : a match character, optional, defaults to '.' + +=cut + +sub match { + my ($self, $match) = @_; + + $match ||= '.'; + my ($matching_char) = $match; + $matching_char = "\\$match" if $match =~ /[\^.$|()\[\]]/ ; #'; + $self->map_chars($matching_char, '-'); + + my @seqs = $self->each_seq(); + return 1 unless scalar @seqs > 1; + + my $refseq = shift @seqs ; + my @refseq = split //, $refseq->seq; + my $gapchar = $self->gap_char; + + foreach my $seq ( @seqs ) { + my @varseq = split //, $seq->seq(); + for ( my $i=0; $i < scalar @varseq; $i++) { + $varseq[$i] = $match if defined $refseq[$i] && + ( $refseq[$i] =~ /[A-Za-z\*]/ || + $refseq[$i] =~ /$gapchar/ ) + && $refseq[$i] eq $varseq[$i]; + } + $seq->seq(join '', @varseq); + } + $self->match_char($match); + return 1; +} + + +=head2 unmatch + + Title : unmatch() + Usage : $ali->unmatch() + Function : Undoes the effect of method match. Unsets match_char. + + Returns : 1 + Argument : a match character, optional, defaults to '.' + +See L<match> and L<match_char> + +=cut + +sub unmatch { + my ($self, $match) = @_; + + $match ||= '.'; + + my @seqs = $self->each_seq(); + return 1 unless scalar @seqs > 1; + + my $refseq = shift @seqs ; + my @refseq = split //, $refseq->seq; + my $gapchar = $self->gap_char; + foreach my $seq ( @seqs ) { + my @varseq = split //, $seq->seq(); + for ( my $i=0; $i < scalar @varseq; $i++) { + $varseq[$i] = $refseq[$i] if defined $refseq[$i] && + ( $refseq[$i] =~ /[A-Za-z\*]/ || + $refseq[$i] =~ /$gapchar/ ) && + $varseq[$i] eq $match; + } + $seq->seq(join '', @varseq); + } + $self->match_char(''); + return 1; +} + +=head1 MSE attibutes + +Methods for setting and reading the MSE attributes. + +Note that the methods defining character semantics depend on the user +to set them sensibly. They are needed only by certain input/output +methods. Unset them by setting to an empty string (''). + +=head2 id + + Title : id + Usage : $myalign->id("Ig") + Function : Gets/sets the id field of the alignment + Returns : An id string + Argument : An id string (optional) + +=cut + +sub id { + my ($self, $name) = @_; + + if (defined( $name )) { + $self->{'_id'} = $name; + } + + return $self->{'_id'}; +} + +=head2 missing_char + + Title : missing_char + Usage : $myalign->missing_char("?") + Function : Gets/sets the missing_char attribute of the alignment + It is generally recommended to set it to 'n' or 'N' + for nucleotides and to 'X' for protein. + Returns : An missing_char string, + Argument : An missing_char string (optional) + +=cut + +sub missing_char { + my ($self, $char) = @_; + + if (defined $char ) { + $self->throw("Single missing character, not [$char]!") if CORE::length($char) > 1; + $self->{'_missing_char'} = $char; + } + + return $self->{'_missing_char'}; +} + +=head2 match_char + + Title : match_char + Usage : $myalign->match_char('.') + Function : Gets/sets the match_char attribute of the alignment + Returns : An match_char string, + Argument : An match_char string (optional) + +=cut + +sub match_char { + my ($self, $char) = @_; + + if (defined $char ) { + $self->throw("Single match character, not [$char]!") if CORE::length($char) > 1; + $self->{'_match_char'} = $char; + } + + return $self->{'_match_char'}; +} + +=head2 gap_char + + Title : gap_char + Usage : $myalign->gap_char('-') + Function : Gets/sets the gap_char attribute of the alignment + Returns : An gap_char string, defaults to '-' + Argument : An gap_char string (optional) + +=cut + +sub gap_char { + my ($self, $char) = @_; + + if (defined $char || ! defined $self->{'_gap_char'} ) { + $char= '-' unless defined $char; + $self->throw("Single gap character, not [$char]!") if CORE::length($char) > 1; + $self->{'_gap_char'} = $char; + } + return $self->{'_gap_char'}; +} + +=head2 symbol_chars + + Title : symbol_chars + Usage : my @symbolchars = $aln->symbol_chars; + Function: Returns all the seen symbols (other than gaps) + Returns : array of characters that are the seen symbols + Args : boolean to include the gap/missing/match characters + +=cut + +sub symbol_chars{ + my ($self,$includeextra) = @_; + if( ! defined $self->{'_symbols'} ) { + $self->warn("Symbol list was not initialized"); + return (); + } + my %copy = %{$self->{'_symbols'}}; + if( ! $includeextra ) { + foreach my $char ( $self->gap_char, $self->match_char, + $self->missing_char) { + delete $copy{$char} if( defined $char ); + } + } + return keys %copy; +} + +=head1 Alignment descriptors + +These read only methods describe the MSE in various ways. + + +=head2 consensus_string + + Title : consensus_string + Usage : $str = $ali->consensus_string($threshold_percent) + Function : Makes a strict consensus + Returns : + Argument : Optional treshold ranging from 0 to 100. + The consensus residue has to appear at least threshold % + of the sequences at a given location, otherwise a '?' + character will be placed at that location. + (Default value = 0%) + +=cut + +sub consensus_string { + my $self = shift; + my $threshold = shift; + my $len; + my ($out,$count); + + $out = ""; + + $len = $self->length - 1; + + foreach $count ( 0 .. $len ) { + $out .= $self->_consensus_aa($count,$threshold); + } + return $out; +} + +sub _consensus_aa { + my $self = shift; + my $point = shift; + my $threshold_percent = shift || -1 ; + my ($seq,%hash,$count,$letter,$key); + + foreach $seq ( $self->each_seq() ) { + $letter = substr($seq->seq,$point,1); + $self->throw("--$point-----------") if $letter eq ''; + ($letter =~ /\./) && next; + # print "Looking at $letter\n"; + $hash{$letter}++; + } + my $number_of_sequences = $self->no_sequences(); + my $threshold = $number_of_sequences * $threshold_percent / 100. ; + $count = -1; + $letter = '?'; + + foreach $key ( sort keys %hash ) { + # print "Now at $key $hash{$key}\n"; + if( $hash{$key} > $count && $hash{$key} >= $threshold) { + $letter = $key; + $count = $hash{$key}; + } + } + return $letter; +} + + +=head2 consensus_iupac + + Title : consensus_iupac + Usage : $str = $ali->consensus_iupac() + Function : + + Makes a consensus using IUPAC ambiguity codes from DNA + and RNA. The output is in upper case except when gaps in + a column force output to be in lower case. + + Note that if your alignment sequences contain a lot of + IUPAC ambiquity codes you often have to manually set + alphabet. Bio::PrimarySeq::_guess_type thinks they + indicate a protein sequence. + + Returns : consensus string + Argument : none + Throws : on protein sequences + +=cut + +sub consensus_iupac { + my $self = shift; + my $out = ""; + my $len = $self->length-1; + + # only DNA and RNA sequences are valid + foreach my $seq ( $self->each_seq() ) { + $self->throw("Seq [". $seq->get_nse. "] is a protein") + if $seq->alphabet eq 'protein'; + } + # loop over the alignment columns + foreach my $count ( 0 .. $len ) { + $out .= $self->_consensus_iupac($count); + } + return $out; +} + +sub _consensus_iupac { + my ($self, $column) = @_; + my ($string, $char, $rna); + + #determine all residues in a column + foreach my $seq ( $self->each_seq() ) { + $string .= substr($seq->seq, $column, 1); + } + $string = uc $string; + + # quick exit if there's an N in the string + if ($string =~ /N/) { + $string =~ /\W/ ? return 'n' : return 'N'; + } + # ... or if there are only gap characters + return '-' if $string =~ /^\W+$/; + + # treat RNA as DNA in regexps + if ($string =~ /U/) { + $string =~ s/U/T/; + $rna = 1; + } + + # the following s///'s only need to be done to the _first_ ambiguity code + # as we only need to see the _range_ of characters in $string + + if ($string =~ /[VDHB]/) { + $string =~ s/V/AGC/; + $string =~ s/D/AGT/; + $string =~ s/H/ACT/; + $string =~ s/B/CTG/; + } + + if ($string =~ /[SKYRWM]/) { + $string =~ s/S/GC/; + $string =~ s/K/GT/; + $string =~ s/Y/CT/; + $string =~ s/R/AG/; + $string =~ s/W/AT/; + $string =~ s/M/AC/; + } + + # and now the guts of the thing + + if ($string =~ /A/) { + $char = 'A'; # A A + if ($string =~ /G/) { + $char = 'R'; # A and G (purines) R + if ($string =~ /C/) { + $char = 'V'; # A and G and C V + if ($string =~ /T/) { + $char = 'N'; # A and G and C and T N + } + } elsif ($string =~ /T/) { + $char = 'D'; # A and G and T D + } + } elsif ($string =~ /C/) { + $char = 'M'; # A and C M + if ($string =~ /T/) { + $char = 'H'; # A and C and T H + } + } elsif ($string =~ /T/) { + $char = 'W'; # A and T W + } + } elsif ($string =~ /C/) { + $char = 'C'; # C C + if ($string =~ /T/) { + $char = 'Y'; # C and T (pyrimidines) Y + if ($string =~ /G/) { + $char = 'B'; # C and T and G B + } + } elsif ($string =~ /G/) { + $char = 'S'; # C and G S + } + } elsif ($string =~ /G/) { + $char = 'G'; # G G + if ($string =~ /C/) { + $char = 'S'; # G and C S + } elsif ($string =~ /T/) { + $char = 'K'; # G and T K + } + } elsif ($string =~ /T/) { + $char = 'T'; # T T + } + + $char = 'U' if $rna and $char eq 'T'; + $char = lc $char if $string =~ /\W/; + + return $char; +} + +=head2 is_flush + + Title : is_flush + Usage : if( $ali->is_flush() ) + : + : + Function : Tells you whether the alignment + : is flush, ie all of the same length + : + : + Returns : 1 or 0 + Argument : + +=cut + +sub is_flush { + my ($self,$report) = @_; + my $seq; + my $length = (-1); + my $temp; + + foreach $seq ( $self->each_seq() ) { + if( $length == (-1) ) { + $length = CORE::length($seq->seq()); + next; + } + + $temp = CORE::length($seq->seq()); + if( $temp != $length ) { + $self->warn("expecting $length not $temp from ". + $seq->display_id) if( $report ); + $self->debug("expecting $length not $temp from ". + $seq->display_id); + $self->debug($seq->seq(). "\n"); + return 0; + } + } + + return 1; +} + + +=head2 length + + Title : length() + Usage : $len = $ali->length() + Function : Returns the maximum length of the alignment. + To be sure the alignment is a block, use is_flush + Returns : + Argument : + +=cut + +sub length_aln { + my $self = shift; + $self->warn(ref($self). "::length_aln - deprecated method. Use length() instead."); + $self->length(@_); +} + +sub length { + my $self = shift; + my $seq; + my $length = (-1); + my ($temp,$len); + + foreach $seq ( $self->each_seq() ) { + $temp = CORE::length($seq->seq()); + if( $temp > $length ) { + $length = $temp; + } + } + + return $length; +} + + +=head2 maxdisplayname_length + + Title : maxdisplayname_length + Usage : $ali->maxdisplayname_length() + Function : + + Gets the maximum length of the displayname in the + alignment. Used in writing out various MSE formats. + + Returns : integer + Argument : + +=cut + +sub maxname_length { + my $self = shift; + $self->warn(ref($self). "::maxname_length - deprecated method.". + " Use maxdisplayname_length() instead."); + $self->maxdisplayname_length(); +} + +sub maxnse_length { + my $self = shift; + $self->warn(ref($self). "::maxnse_length - deprecated method.". + " Use maxnse_length() instead."); + $self->maxdisplayname_length(); +} + +sub maxdisplayname_length { + my $self = shift; + my $maxname = (-1); + my ($seq,$len); + + foreach $seq ( $self->each_seq() ) { + $len = CORE::length $self->displayname($seq->get_nse()); + + if( $len > $maxname ) { + $maxname = $len; + } + } + + return $maxname; +} + +=head2 no_residues + + Title : no_residues + Usage : $no = $ali->no_residues + Function : number of residues in total in the alignment + Returns : integer + Argument : + +=cut + +sub no_residues { + my $self = shift; + my $count = 0; + + foreach my $seq ($self->each_seq) { + my $str = $seq->seq(); + + $count += ($str =~ s/[^A-Za-z]//g); + } + + return $count; +} + +=head2 no_sequences + + Title : no_sequences + Usage : $depth = $ali->no_sequences + Function : number of sequence in the sequence alignment + Returns : integer + Argument : + +=cut + +sub no_sequences { + my $self = shift; + + return scalar($self->each_seq); +} + + +=head2 average_percentage_identity + + Title : average_percentage_identity + Usage : $id = $align->average_percentage_identity + Function: The function uses a fast method to calculate the average + percentage identity of the alignment + Returns : The average percentage identity of the alignment + Args : None + Notes : This method implemented by Kevin Howe calculates a figure that is + designed to be similar to the average pairwise identity of the + alignment (identical in the absence of gaps), without having to + explicitly calculate pairwise identities proposed by Richard Durbin. + Validated by Ewan Birney ad Alex Bateman. + +=cut + +sub average_percentage_identity{ + my ($self,@args) = @_; + + my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M', + 'N','O','P','Q','R','S','T','U','V','W','X','Y','Z'); + + my ($len, $total, $subtotal, $divisor, $subdivisor, @seqs, @countHashes); + + if (! $self->is_flush()) { + $self->throw("All sequences in the alignment must be the same length"); + } + + @seqs = $self->each_seq(); + $len = $self->length(); + + # load the each hash with correct keys for existence checks + + for( my $index=0; $index < $len; $index++) { + foreach my $letter (@alphabet) { + $countHashes[$index]->{$letter} = 0; + } + } + foreach my $seq (@seqs) { + my @seqChars = split //, $seq->seq(); + for( my $column=0; $column < @seqChars; $column++ ) { + my $char = uc($seqChars[$column]); + if (exists $countHashes[$column]->{$char}) { + $countHashes[$column]->{$char}++; + } + } + } + + $total = 0; + $divisor = 0; + for(my $column =0; $column < $len; $column++) { + my %hash = %{$countHashes[$column]}; + $subdivisor = 0; + foreach my $res (keys %hash) { + $total += $hash{$res}*($hash{$res} - 1); + $subdivisor += $hash{$res}; + } + $divisor += $subdivisor * ($subdivisor - 1); + } + return $divisor > 0 ? ($total / $divisor )*100.0 : 0; +} + +=head2 percentage_identity + + Title : percentage_identity + Usage : $id = $align->percentage_identity + Function: The function calculates the average percentage identity + (aliased for average_percentage_identity) + Returns : The average percentage identity + Args : None + +=cut + +sub percentage_identity { + my $self = shift; + return $self->average_percentage_identity(); +} + +=head2 overall_percentage_identity + + Title : percentage_identity + Usage : $id = $align->percentage_identity + Function: The function calculates the percentage identity of + the conserved columns + Returns : The percentage identity of the conserved columns + Args : None + +=cut + +sub overall_percentage_identity{ + my ($self,@args) = @_; + + my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M', + 'N','O','P','Q','R','S','T','U','V','W','X','Y','Z'); + + my ($len, $total, @seqs, @countHashes); + + if (! $self->is_flush()) { + $self->throw("All sequences in the alignment must be the same length"); + } + + @seqs = $self->each_seq(); + $len = $self->length(); + + # load the each hash with correct keys for existence checks + for( my $index=0; $index < $len; $index++) { + foreach my $letter (@alphabet) { + $countHashes[$index]->{$letter} = 0; + } + } + foreach my $seq (@seqs) { + my @seqChars = split //, $seq->seq(); + for( my $column=0; $column < @seqChars; $column++ ) { + my $char = uc($seqChars[$column]); + if (exists $countHashes[$column]->{$char}) { + $countHashes[$column]->{$char}++; + } + } + } + + $total = 0; + for(my $column =0; $column < $len; $column++) { + my %hash = %{$countHashes[$column]}; + foreach ( values %hash ) { + next if( $_ == 0 ); + $total++ if( $_ == scalar @seqs ); + last; + } + } + return ($total / $len ) * 100.0; +} + +=head1 Alignment positions + +Methods to map a sequence position into an alignment column and back. +column_from_residue_number() does the former. The latter is really a +property of the sequence object and can done using +L<Bio::LocatableSeq::location_from_column>: + + # select somehow a sequence from the alignment, e.g. + my $seq = $aln->get_seq_by_pos(1); + #$loc is undef or Bio::LocationI object + my $loc = $seq->location_from_column(5); + + +=head2 column_from_residue_number + + Title : column_from_residue_number + Usage : $col = $ali->column_from_residue_number( $seqname, $resnumber) + Function: + + This function gives the position in the alignment + (i.e. column number) of the given residue number in the + sequence with the given name. For example, for the + alignment + + Seq1/91-97 AC..DEF.GH + Seq2/24-30 ACGG.RTY.. + Seq3/43-51 AC.DDEFGHI + + column_from_residue_number( "Seq1", 94 ) returns 5. + column_from_residue_number( "Seq2", 25 ) returns 2. + column_from_residue_number( "Seq3", 50 ) returns 9. + + An exception is thrown if the residue number would lie + outside the length of the aligment + (e.g. column_from_residue_number( "Seq2", 22 ) + + Note: If the the parent sequence is represented by more than + one alignment sequence and the residue number is present in + them, this method finds only the first one. + + Returns : A column number for the position in the alignment of the + given residue in the given sequence (1 = first column) + Args : A sequence id/name (not a name/start-end) + A residue number in the whole sequence (not just that + segment of it in the alignment) + +=cut + +sub column_from_residue_number { + my ($self, $name, $resnumber) = @_; + + $self->throw("No sequence with name [$name]") unless $self->{'_start_end_lists'}->{$name}; + $self->throw("Second argument residue number missing") unless $resnumber; + + foreach my $seq ($self->each_seq_with_id($name)) { + my $col; + eval { + $col = $seq->column_from_residue_number($resnumber); + }; + next if $@; + return $col; + } + + $self->throw("Could not find a sequence segment in $name ". + "containing residue number $resnumber"); + +} + +=head1 Sequence names + +Methods to manipulate the display name. The default name based on the +sequence id and subsequence positions can be overridden in various +ways. + +=head2 displayname + + Title : displayname + Usage : $myalign->displayname("Ig", "IgA") + Function : Gets/sets the display name of a sequence in the alignment + : + Returns : A display name string + Argument : name of the sequence + displayname of the sequence (optional) + +=cut + +sub get_displayname { + my $self = shift; + $self->warn(ref($self). "::get_displayname - deprecated method. Use displayname() instead."); + $self->displayname(@_); +} + +sub set_displayname { + my $self = shift; + $self->warn(ref($self). "::set_displayname - deprecated method. Use displayname() instead."); + $self->displayname(@_); +} + +sub displayname { + my ($self, $name, $disname) = @_; + + $self->throw("No sequence with name [$name]") unless $self->{'_seq'}->{$name}; + + if( $disname and $name) { + $self->{'_dis_name'}->{$name} = $disname; + return $disname; + } + elsif( defined $self->{'_dis_name'}->{$name} ) { + return $self->{'_dis_name'}->{$name}; + } else { + return $name; + } +} + +=head2 set_displayname_count + + Title : set_displayname_count + Usage : $ali->set_displayname_count + Function : + + Sets the names to be name_# where # is the number of + times this name has been used. + + Returns : + Argument : + +=cut + +sub set_displayname_count { + my $self= shift; + my (@arr,$name,$seq,$count,$temp,$nse); + + foreach $seq ( $self->each_alphabetically() ) { + $nse = $seq->get_nse(); + + #name will be set when this is the second + #time (or greater) is has been seen + + if( defined $name and $name eq ($seq->id()) ) { + $temp = sprintf("%s_%s",$name,$count); + $self->displayname($nse,$temp); + $count++; + } else { + $count = 1; + $name = $seq->id(); + $temp = sprintf("%s_%s",$name,$count); + $self->displayname($nse,$temp); + $count++; + } + } + return 1; +} + +=head2 set_displayname_flat + + Title : set_displayname_flat + Usage : $ali->set_displayname_flat() + Function : Makes all the sequences be displayed as just their name, + not name/start-end + Returns : 1 + Argument : + +=cut + +sub set_displayname_flat { + my $self = shift; + my ($nse,$seq); + + foreach $seq ( $self->each_seq() ) { + $nse = $seq->get_nse(); + $self->displayname($nse,$seq->id()); + } + return 1; +} + +=head2 set_displayname_normal + + Title : set_displayname_normal + Usage : $ali->set_displayname_normal() + Function : Makes all the sequences be displayed as name/start-end + Returns : + Argument : + +=cut + +sub set_displayname_normal { + my $self = shift; + my ($nse,$seq); + + foreach $seq ( $self->each_seq() ) { + $nse = $seq->get_nse(); + $self->displayname($nse,$nse); + } + return 1; +} + +=head2 source + + Title : source + Usage : $obj->source($newval) + Function: sets the Alignment source program + Example : + Returns : value of source + Args : newvalue (optional) + + +=cut + +sub source{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'_source'} = $value; + } + return $self->{'_source'}; +} + +1;