comparison variant_effect_predictor/Bio/SimpleAlign.pm @ 0:2bc9b66ada89 draft default tip

Uploaded
author mahtabm
date Thu, 11 Apr 2013 06:29:17 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:2bc9b66ada89
1 # $Id: SimpleAlign.pm,v 1.65.2.1 2003/07/02 16:00:19 jason Exp $
2 # BioPerl module for SimpleAlign
3 #
4 # Cared for by Heikki Lehvaslaiho <heikki@ebi.ac.uk>
5 #
6 # Copyright Ewan Birney
7 #
8 # You may distribute this module under the same terms as perl itself
9
10 # POD documentation - main docs before the code
11 #
12 # History:
13 # 11/3/00 Added threshold feature to consensus and consensus_aa - PS
14 # May 2001 major rewrite - Heikki Lehvaslaiho
15
16 =head1 NAME
17
18 Bio::SimpleAlign - Multiple alignments held as a set of sequences
19
20 =head1 SYNOPSIS
21
22 # use Bio::AlignIO to read in the alignment
23 $str = Bio::AlignIO->new('-file' => 't/data/testaln.pfam');
24 $aln = $str->next_aln();
25
26 # some descriptors
27 print $aln->length, "\n";
28 print $aln->no_residues, "\n";
29 print $aln->is_flush, "\n";
30 print $aln->no_sequences, "\n";
31 print $aln->percentage_identity, "\n";
32 print $aln->consensus_string(50), "\n";
33
34 # find the position in the alignment for a sequence location
35 $pos = $aln->column_from_residue_number('1433_LYCES', 14); # = 6;
36
37 # extract sequences and check values for the alignment column $pos
38 foreach $seq ($aln->each_seq) {
39 $res = $seq->subseq($pos, $pos);
40 $count{$res}++;
41 }
42 foreach $res (keys %count) {
43 printf "Res: %s Count: %2d\n", $res, $count{$res};
44 }
45
46
47 =head1 DESCRIPTION
48
49 SimpleAlign handles multiple alignments of sequences. It is very
50 permissive of types (it won't insist on things being all same length
51 etc): really it is a SequenceSet explicitly held in memory with a
52 whole series of built in manipulations and especially file format
53 systems for read/writing alignments.
54
55 SimpleAlign basically views an alignment as an immutable block of
56 text. SimpleAlign *is not* the object to be using if you want to
57 perform complex alignment manipulations.
58
59 However for lightweight display/formatting and minimal manipulation
60 (e.g. removing all-gaps columns) - this is the one to use.
61
62 SimpleAlign uses a subclass of L<Bio::PrimarySeq> class
63 L<Bio::LocatableSeq> to store its sequences. These are subsequences
64 with a start and end positions in the parent reference sequence.
65
66 Tricky concepts. SimpleAlign expects name,start,end to be 'unique' in
67 the alignment, and this is the key for the internal hashes.
68 (name,start,end is abbreviated nse in the code). However, in many
69 cases people don't want the name/start-end to be displayed: either
70 multiple names in an alignment or names specific to the alignment
71 (ROA1_HUMAN_1, ROA1_HUMAN_2 etc). These names are called
72 'displayname', and generally is what is used to print out the
73 alignment. They default to name/start-end.
74
75 The SimpleAlign Module came from Ewan Birney's Align module.
76
77 =head1 PROGRESS
78
79 SimpleAlign is being slowly converted to bioperl coding standards,
80 mainly by Ewan.
81
82 =over 3
83
84 =item Use Bio::Root::Object - done
85
86 =item Use proper exceptions - done
87
88 =item Use hashed constructor - not done!
89
90 =back
91
92 =head1 FEEDBACK
93
94 =head2 Mailing Lists
95
96 User feedback is an integral part of the evolution of this and other
97 Bioperl modules. Send your comments and suggestions preferably to one
98 of the Bioperl mailing lists. Your participation is much appreciated.
99
100 bioperl-l@bioperl.org - General discussion
101 http://bioperl.org/MailList.shtml - About the mailing lists
102
103 =head2 Reporting Bugs
104
105 Report bugs to the Bioperl bug tracking system to help us keep track
106 the bugs and their resolution. Bug reports can be submitted via email
107 or the web:
108
109 bioperl-bugs@bio.perl.org
110 http://bugzilla.bioperl.org/
111
112 =head1 AUTHOR
113
114 Ewan Birney, birney@sanger.ac.uk
115
116 =head1 CONTRIBUTORS
117
118 Richard Adams, Richard.Adams@ed.ac.uk,
119 David J. Evans, David.Evans@vir.gla.ac.uk,
120 Heikki Lehvaslaiho, heikki@ebi.ac.uk,
121 Allen Smith, allens@cpan.org,
122 Jason Stajich, jason@bioperl.org,
123 Anthony Underwood, aunderwood@phls.org.uk,
124 Xintao Wei & Giri Narasimhan, giri@cs.fiu.edu
125
126
127 =head1 SEE ALSO
128
129 L<Bio::LocatableSeq.pm>
130
131 =head1 APPENDIX
132
133 The rest of the documentation details each of the object
134 methods. Internal methods are usually preceded with a _
135
136 =cut
137
138 # 'Let the code begin...
139
140 package Bio::SimpleAlign;
141 use vars qw(@ISA %CONSERVATION_GROUPS);
142 use strict;
143
144 use Bio::Root::Root;
145 use Bio::LocatableSeq; # uses Seq's as list
146 use Bio::Align::AlignI;
147
148 BEGIN {
149 # This data should probably be in a more centralized module...
150 # it is taken from Clustalw documentation
151 # These are all the positively scoring groups that occur in the
152 # Gonnet Pam250 matrix. The strong and weak groups are
153 # defined as strong score >0.5 and weak score =<0.5 respectively.
154
155 %CONSERVATION_GROUPS = ( 'strong' => [ qw(STA
156 NEQK
157 NHQK
158 NDEQ
159 QHRK
160 MILV
161 MILF
162 HY
163 FYW)
164 ],
165 'weak' => [ qw(CSA
166 ATV
167 SAG
168 STNK
169 STPA
170 SGND
171 SNDEQK
172 NDEQHK
173 NEQHRK
174 FVLIM
175 HFY) ],
176 );
177
178 }
179
180 @ISA = qw(Bio::Root::Root Bio::Align::AlignI);
181
182 =head2 new
183
184 Title : new
185 Usage : my $aln = new Bio::SimpleAlign();
186 Function : Creates a new simple align object
187 Returns : Bio::SimpleAlign
188 Args : -source => string representing the source program
189 where this alignment came from
190
191 =cut
192
193
194 sub new {
195 my($class,@args) = @_;
196
197 my $self = $class->SUPER::new(@args);
198
199 my ($src) = $self->_rearrange([qw(SOURCE)], @args);
200 $src && $self->source($src);
201 # we need to set up internal hashs first!
202
203 $self->{'_seq'} = {};
204 $self->{'_order'} = {};
205 $self->{'_start_end_lists'} = {};
206 $self->{'_dis_name'} = {};
207 $self->{'_id'} = 'NoName';
208 $self->{'_symbols'} = {};
209 # maybe we should automatically read in from args. Hmmm...
210
211 return $self; # success - we hope!
212 }
213
214 =head1 Modifier methods
215
216 These methods modify the MSE by adding, removing or shuffling complete
217 sequences.
218
219 =head2 add_seq
220
221 Title : add_seq
222 Usage : $myalign->add_seq($newseq);
223 Function : Adds another sequence to the alignment. *Does not* align
224 it - just adds it to the hashes.
225 Returns : nothing
226 Args : a Bio::LocatableSeq object
227 order (optional)
228
229 See L<Bio::LocatableSeq> for more information
230
231 =cut
232
233 sub addSeq {
234 my $self = shift;
235 $self->warn(ref($self). "::addSeq - deprecated method. Use add_seq() instead.");
236 $self->add_seq(@_);
237 }
238
239 sub add_seq {
240 my $self = shift;
241 my $seq = shift;
242 my $order = shift;
243 my ($name,$id,$start,$end);
244
245 if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
246 $self->throw("Unable to process non locatable sequences [", ref($seq), "]");
247 }
248
249 $id = $seq->id() ||$seq->display_id || $seq->primary_id;
250 $start = $seq->start();
251 $end = $seq->end();
252
253 # build the symbol list for this sequence,
254 # will prune out the gap and missing/match chars
255 # when actually asked for the symbol list in the
256 # symbol_chars
257 map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq);
258
259 if( !defined $order ) {
260 $order = keys %{$self->{'_seq'}};
261 }
262 $name = sprintf("%s/%d-%d",$id,$start,$end);
263
264 if( $self->{'_seq'}->{$name} ) {
265 $self->warn("Replacing one sequence [$name]\n");
266 }
267 else {
268 #print STDERR "Assigning $name to $order\n";
269
270 $self->{'_order'}->{$order} = $name;
271
272 unless( exists( $self->{'_start_end_lists'}->{$id})) {
273 $self->{'_start_end_lists'}->{$id} = [];
274 }
275 push @{$self->{'_start_end_lists'}->{$id}}, $seq;
276 }
277
278 $self->{'_seq'}->{$name} = $seq;
279
280 }
281
282
283 =head2 remove_seq
284
285 Title : remove_seq
286 Usage : $aln->remove_seq($seq);
287 Function : Removes a single sequence from an alignment
288 Returns :
289 Argument : a Bio::LocatableSeq object
290
291 =cut
292
293 sub removeSeq {
294 my $self = shift;
295 $self->warn(ref($self). "::removeSeq - deprecated method. Use remove_seq() instead.");
296 $self->remove_seq(@_);
297 }
298
299 sub remove_seq {
300 my $self = shift;
301 my $seq = shift;
302 my ($name,$id,$start,$end);
303
304 $self->throw("Need Bio::Locatable seq argument ")
305 unless ref $seq && $seq->isa('Bio::LocatableSeq');
306
307 $id = $seq->id();
308 $start = $seq->start();
309 $end = $seq->end();
310 $name = sprintf("%s/%d-%d",$id,$start,$end);
311
312 if( !exists $self->{'_seq'}->{$name} ) {
313 $self->throw("Sequence $name does not exist in the alignment to remove!");
314 }
315
316 delete $self->{'_seq'}->{$name};
317
318 # we need to remove this seq from the start_end_lists hash
319
320 if (exists $self->{'_start_end_lists'}->{$id}) {
321 # we need to find the sequence in the array.
322
323 my ($i, $found);;
324 for ($i=0; $i < @{$self->{'_start_end_lists'}->{$id}}; $i++) {
325 if (${$self->{'_start_end_lists'}->{$id}}[$i] eq $seq) {
326 $found = 1;
327 last;
328 }
329 }
330 if ($found) {
331 splice @{$self->{'_start_end_lists'}->{$id}}, $i, 1;
332 }
333 else {
334 $self->throw("Could not find the sequence to remoce from the start-end list");
335 }
336 }
337 else {
338 $self->throw("There is no seq list for the name $id");
339 }
340 return 1;
341 # we can't do anything about the order hash but that is ok
342 # because each_seq will handle it
343 }
344
345
346 =head2 purge
347
348 Title : purge
349 Usage : $aln->purge(0.7);
350 Function:
351
352 Removes sequences above given sequence similarity
353 This function will grind on large alignments. Beware!
354
355 Example :
356 Returns : An array of the removed sequences
357 Args : float, threshold for similarity
358
359 =cut
360
361 sub purge {
362 my ($self,$perc) = @_;
363 my (%duplicate, @dups);
364
365 my @seqs = $self->each_seq();
366
367 for (my $i=0;$i< @seqs - 1;$i++ ) { #for each seq in alignment
368 my $seq = $seqs[$i];
369
370 #skip if already in duplicate hash
371 next if exists $duplicate{$seq->display_id} ;
372 my $one = $seq->seq();
373
374 my @one = split '', $one; #split to get 1aa per array element
375
376 for (my $j=$i+1;$j < @seqs;$j++) {
377 my $seq2 = $seqs[$j];
378
379 #skip if already in duplicate hash
380 next if exists $duplicate{$seq2->display_id} ;
381
382 my $two = $seq2->seq();
383 my @two = split '', $two;
384
385 my $count = 0;
386 my $res = 0;
387 for (my $k=0;$k<@one;$k++) {
388 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
389 $one[$k] eq $two[$k]) {
390 $count++;
391 }
392 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
393 $two[$k] ne '.' && $two[$k] ne '-' ) {
394 $res++;
395 }
396 }
397
398 my $ratio = 0;
399 $ratio = $count/$res unless $res == 0;
400
401 # if above threshold put in duplicate hash and push onto
402 # duplicate array for returning to get_unique
403 if ( $ratio > $perc ) {
404 print STDERR "duplicate!", $seq2->display_id, "\n" if $self->verbose > 0;
405 $duplicate{$seq2->display_id} = 1;
406 push @dups, $seq2;
407 }
408 }
409 }
410 foreach my $seq (@dups) {
411 $self->remove_seq($seq);
412 }
413 return @dups;
414 }
415
416 =head2 sort_alphabetically
417
418 Title : sort_alphabetically
419 Usage : $ali->sort_alphabetically
420 Function :
421
422 Changes the order of the alignemnt to alphabetical on name
423 followed by numerical by number.
424
425 Returns :
426 Argument :
427
428 =cut
429
430 sub sort_alphabetically {
431 my $self = shift;
432 my ($seq,$nse,@arr,%hash,$count);
433
434 foreach $seq ( $self->each_seq() ) {
435 $nse = $seq->get_nse;
436 $hash{$nse} = $seq;
437 }
438
439 $count = 0;
440
441 %{$self->{'_order'}} = (); # reset the hash;
442
443 foreach $nse ( sort _alpha_startend keys %hash) {
444 $self->{'_order'}->{$count} = $nse;
445
446 $count++;
447 }
448 1;
449 }
450
451 =head1 Sequence selection methods
452
453 Methods returning one or more sequences objects.
454
455 =head2 each_seq
456
457 Title : each_seq
458 Usage : foreach $seq ( $align->each_seq() )
459 Function : Gets an array of Seq objects from the alignment
460 Returns : an array
461 Argument :
462
463 =cut
464
465 sub eachSeq {
466 my $self = shift;
467 $self->warn(ref($self). "::eachSeq - deprecated method. Use each_seq() instead.");
468 $self->each_seq();
469 }
470
471 sub each_seq {
472 my $self = shift;
473 my (@arr,$order);
474
475 foreach $order ( sort { $a <=> $b } keys %{$self->{'_order'}} ) {
476 if( exists $self->{'_seq'}->{$self->{'_order'}->{$order}} ) {
477 push(@arr,$self->{'_seq'}->{$self->{'_order'}->{$order}});
478 }
479 }
480
481 return @arr;
482 }
483
484
485 =head2 each_alphabetically
486
487 Title : each_alphabetically
488 Usage : foreach $seq ( $ali->each_alphabetically() )
489 Function :
490
491 Returns an array of sequence object sorted alphabetically
492 by name and then by start point.
493 Does not change the order of the alignment
494
495 Returns :
496 Argument :
497
498 =cut
499
500 sub each_alphabetically {
501 my $self = shift;
502 my ($seq,$nse,@arr,%hash,$count);
503
504 foreach $seq ( $self->each_seq() ) {
505 $nse = $seq->get_nse;
506 $hash{$nse} = $seq;
507 }
508
509 foreach $nse ( sort _alpha_startend keys %hash) {
510 push(@arr,$hash{$nse});
511 }
512
513 return @arr;
514
515 }
516
517 sub _alpha_startend {
518 my ($aname,$astart,$bname,$bstart);
519 ($aname,$astart) = split (/-/,$a);
520 ($bname,$bstart) = split (/-/,$b);
521
522 if( $aname eq $bname ) {
523 return $astart <=> $bstart;
524 }
525 else {
526 return $aname cmp $bname;
527 }
528
529 }
530
531 =head2 each_seq_with_id
532
533 Title : each_seq_with_id
534 Usage : foreach $seq ( $align->each_seq_with_id() )
535 Function :
536
537 Gets an array of Seq objects from the
538 alignment, the contents being those sequences
539 with the given name (there may be more than one)
540
541 Returns : an array
542 Argument : a seq name
543
544 =cut
545
546 sub eachSeqWithId {
547 my $self = shift;
548 $self->warn(ref($self). "::eachSeqWithId - deprecated method. Use each_seq_with_id() instead.");
549 $self->each_seq_with_id(@_);
550 }
551
552 sub each_seq_with_id {
553 my $self = shift;
554 my $id = shift;
555
556 $self->throw("Method each_seq_with_id needs a sequence name argument")
557 unless defined $id;
558
559 my (@arr, $seq);
560
561 if (exists($self->{'_start_end_lists'}->{$id})) {
562 @arr = @{$self->{'_start_end_lists'}->{$id}};
563 }
564 return @arr;
565 }
566
567 =head2 get_seq_by_pos
568
569 Title : get_seq_by_pos
570 Usage : $seq = $aln->get_seq_by_pos(3) # third sequence from the alignment
571 Function :
572
573 Gets a sequence based on its position in the alignment.
574 Numbering starts from 1. Sequence positions larger than
575 no_sequences() will thow an error.
576
577 Returns : a Bio::LocatableSeq object
578 Args : positive integer for the sequence osition
579
580 =cut
581
582 sub get_seq_by_pos {
583
584 my $self = shift;
585 my ($pos) = @_;
586
587 $self->throw("Sequence position has to be a positive integer, not [$pos]")
588 unless $pos =~ /^\d+$/ and $pos > 0;
589 $self->throw("No sequence at position [$pos]")
590 unless $pos <= $self->no_sequences ;
591
592 my $nse = $self->{'_order'}->{--$pos};
593 return $self->{'_seq'}->{$nse};
594 }
595
596 =head1 Create new alignments
597
598 The result of these methods are horizontal or vertical subsets of the
599 current MSE.
600
601 =head2 select
602
603 Title : select
604 Usage : $aln2 = $aln->select(1, 3) # three first sequences
605 Function :
606
607 Creates a new alignment from a continuous subset of
608 sequences. Numbering starts from 1. Sequence positions
609 larger than no_sequences() will thow an error.
610
611 Returns : a Bio::SimpleAlign object
612 Args : positive integer for the first sequence
613 positive integer for the last sequence to include (optional)
614
615 =cut
616
617 sub select {
618 my $self = shift;
619 my ($start, $end) = @_;
620
621 $self->throw("Select start has to be a positive integer, not [$start]")
622 unless $start =~ /^\d+$/ and $start > 0;
623 $self->throw("Select end has to be a positive integer, not [$end]")
624 unless $end =~ /^\d+$/ and $end > 0;
625 $self->throw("Select $start [$start] has to be smaller than or equal to end [$end]")
626 unless $start <= $end;
627
628 my $aln = new $self;
629 foreach my $pos ($start .. $end) {
630 $aln->add_seq($self->get_seq_by_pos($pos));
631 }
632 $aln->id($self->id);
633 return $aln;
634 }
635
636 =head2 select_noncont
637
638 Title : select_noncont
639 Usage : $aln2 = $aln->select_noncont(1, 3) # first and 3rd sequences
640 Function :
641
642 Creates a new alignment from a subset of
643 sequences. Numbering starts from 1. Sequence positions
644 larger than no_sequences() will thow an error.
645
646 Returns : a Bio::SimpleAlign object
647 Args : array of integers for the sequences
648
649 =cut
650
651 sub select_noncont {
652 my $self = shift;
653 my (@pos) = @_;
654 my $end = $self->no_sequences;
655 foreach ( @pos ) {
656 $self->throw("position must be a positive integer, > 0 and <= $end not [$_]")
657 unless( /^\d+$/ && $_ > 0 && $_ <= $end );
658 }
659 my $aln = new $self;
660 foreach my $p (@pos) {
661 $aln->add_seq($self->get_seq_by_pos($p));
662 }
663 $aln->id($self->id);
664 return $aln;
665 }
666
667 =head2 slice
668
669 Title : slice
670 Usage : $aln2 = $aln->slice(20, 30)
671 Function :
672
673 Creates a slice from the alignment inclusive of start and
674 end columns. Sequences with no residues in the slice are
675 excluded from the new alignment and a warning is printed.
676 Slice beyond the length of the sequence does not do
677 padding.
678
679 Returns : a Bio::SimpleAlign object
680 Args : positive integer for start column
681 positive integer for end column
682
683 =cut
684
685 sub slice {
686 my $self = shift;
687 my ($start, $end) = @_;
688
689 $self->throw("Slice start has to be a positive integer, not [$start]")
690 unless $start =~ /^\d+$/ and $start > 0;
691 $self->throw("Slice end has to be a positive integer, not [$end]")
692 unless $end =~ /^\d+$/ and $end > 0;
693 $self->throw("Slice $start [$start] has to be smaller than or equal to end [$end]")
694 unless $start <= $end;
695 my $aln_length = $self->length;
696 $self->throw("This alignment has only ". $self->length.
697 " residues. Slice start [$start] is too bigger.")
698 if $start > $self->length;
699
700 my $aln = new $self;
701 $aln->id($self->id);
702 foreach my $seq ( $self->each_seq() ) {
703
704 my $new_seq = new Bio::LocatableSeq (-id => $seq->id);
705
706 # seq
707 my $seq_end = $end;
708 $seq_end = $seq->length if $end > $seq->length;
709 my $slice_seq = $seq->subseq($start, $seq_end);
710 $new_seq->seq( $slice_seq );
711
712 # start
713 if ($start > 1) {
714 my $pre_start_seq = $seq->subseq(1, $start - 1);
715 $pre_start_seq =~ s/\W//g; #print "$pre_start_seq\n";
716 $new_seq->start( $seq->start + CORE::length($pre_start_seq) );
717 } else {
718 $new_seq->start( $seq->start);
719 }
720
721 # end
722 $slice_seq =~ s/\W//g;
723 $new_seq->end( $new_seq->start + CORE::length($slice_seq) - 1 );
724
725 if ($new_seq->start and $new_seq->end >= $new_seq->start) {
726 $aln->add_seq($new_seq);
727 } else {
728 my $nse = $seq->get_nse();
729 $self->warn("Slice [$start-$end] of sequence [$nse] contains no residues.".
730 " Sequence excluded from the new alignment.");
731 }
732
733 }
734
735 return $aln;
736 }
737
738 =head2 remove_columns
739
740 Title : remove_column
741 Usage : $aln2 = $aln->remove_columns(['mismatch','weak'])
742 Function :
743 Creates an aligment with columns removed corresponding to
744 the specified criteria.
745 Returns : a L<Bio::SimpleAlign> object
746 Args : array ref of types, 'match'|'weak'|'strong'|'mismatch'
747
748 =cut
749
750 sub remove_columns{
751 my ($self,$type) = @_;
752 my %matchchars = ( 'match' => '\*',
753 'weak' => '\.',
754 'strong' => ':',
755 'mismatch'=> ' ',
756 );
757 #get the characters to delete against
758 my $del_char;
759 foreach my $type(@{$type}){
760 $del_char.= $matchchars{$type};
761 }
762
763 my $match_line = $self->match_line;
764 my $aln = new $self;
765
766 my @remove;
767 my $length = 0;
768
769 #do the matching to get the segments to remove
770 while($match_line=~m/[$del_char]/g){
771 my $start = pos($match_line)-1;
772 $match_line=~/\G[$del_char]+/gc;
773 my $end = pos($match_line)-1;
774
775 #have to offset the start and end for subsequent removes
776 $start-=$length;
777 $end -=$length;
778 $length += ($end-$start+1);
779 push @remove, [$start,$end];
780 }
781
782 #remove the segments
783 $aln = $self->_remove_col($aln,\@remove);
784
785 return $aln;
786 }
787
788 sub _remove_col {
789 my ($self,$aln,$remove) = @_;
790 my @new;
791
792 #splice out the segments and create new seq
793 foreach my $seq($self->each_seq){
794 my $new_seq = new Bio::LocatableSeq(-id=>$seq->id);
795 my $sequence;
796 foreach my $pair(@{$remove}){
797 my $start = $pair->[0];
798 my $end = $pair->[1];
799 $sequence = $seq->seq unless $sequence;
800 my $spliced;
801 $spliced .= $start > 0 ? substr($sequence,0,$start) : '';
802 $spliced .= substr($sequence,$end+1,$seq->length-$end+1);
803 $sequence = $spliced;
804 if ($start == 1) {
805 $new_seq->start($end);
806 }
807 else {
808 $new_seq->start( $seq->start);
809 }
810 # end
811 if($end >= $seq->end){
812 $new_seq->end( $start);
813 }
814 else {
815 $new_seq->end($seq->end);
816 }
817 }
818 $new_seq->seq($sequence);
819 push @new, $new_seq;
820 }
821 #add the new seqs to the alignment
822 foreach my $new(@new){
823 $aln->add_seq($new);
824 }
825 return $aln;
826 }
827
828 =head1 Change sequences within the MSE
829
830 These methods affect characters in all sequences without changeing the
831 alignment.
832
833
834 =head2 map_chars
835
836 Title : map_chars
837 Usage : $ali->map_chars('\.','-')
838 Function :
839
840 Does a s/$arg1/$arg2/ on the sequences. Useful for gap
841 characters
842
843 Notice that the from (arg1) is interpretted as a regex,
844 so be careful about quoting meta characters (eg
845 $ali->map_chars('.','-') wont do what you want)
846
847 Returns :
848 Argument : 'from' rexexp
849 'to' string
850
851 =cut
852
853 sub map_chars {
854 my $self = shift;
855 my $from = shift;
856 my $to = shift;
857 my ($seq,$temp);
858
859 $self->throw("Need exactly two arguments")
860 unless defined $from and defined $to;
861
862 foreach $seq ( $self->each_seq() ) {
863 $temp = $seq->seq();
864 $temp =~ s/$from/$to/g;
865 $seq->seq($temp);
866 }
867 return 1;
868 }
869
870
871 =head2 uppercase
872
873 Title : uppercase()
874 Usage : $ali->uppercase()
875 Function : Sets all the sequences to uppercase
876 Returns :
877 Argument :
878
879 =cut
880
881 sub uppercase {
882 my $self = shift;
883 my $seq;
884 my $temp;
885
886 foreach $seq ( $self->each_seq() ) {
887 $temp = $seq->seq();
888 $temp =~ tr/[a-z]/[A-Z]/;
889
890 $seq->seq($temp);
891 }
892 return 1;
893 }
894
895 =head2 cigar_line
896
897 Title : cigar_line()
898 Usage : $align->cigar_line()
899 Function : Generates a "cigar" line for each sequence in the alignment
900 The format is simply A-1,60;B-1,1:4,60;C-5,10:12,58
901 where A,B,C,etc. are the sequence identifiers, and the numbers
902 refer to conserved positions within the alignment
903 Args : none
904
905 =cut
906
907 sub cigar_line {
908 my ($self) = @_;
909
910 my %cigar;
911 my %clines;
912 my @seqchars;
913 my $seqcount = 0;
914 my $sc;
915 foreach my $seq ( $self->each_seq ) {
916 push @seqchars, [ split(//, uc ($seq->seq)) ];
917 $sc = scalar(@seqchars);
918 }
919
920 foreach my $pos ( 0..$self->length ) {
921 my $i=0;
922 foreach my $seq ( @seqchars ) {
923 $i++;
924 # print STDERR "Seq $i at pos $pos: ".$seq->[$pos]."\n";
925 if ($seq->[$pos] eq '.') {
926 if (defined $cigar{$i} && $clines{$i} !~ $cigar{$i}) {
927 $clines{$i}.=$cigar{$i};
928 }
929 }
930 else {
931 if (! defined $cigar{$i}) {
932 $clines{$i}.=($pos+1).",";
933 }
934 $cigar{$i}=$pos+1;
935 }
936 if ($pos+1 == $self->length && ($clines{$i} =~ /\,$/) ) {
937 $clines{$i}.=$cigar{$i};
938 }
939 }
940 }
941 for(my $i=1; $i<$sc+1;$i++) {
942 print STDERR "Seq $i cigar line ".$clines{$i}."\n";
943 }
944 return %clines;
945 }
946
947 =head2 match_line
948
949 Title : match_line()
950 Usage : $align->match_line()
951 Function : Generates a match line - much like consensus string
952 except that a line indicating the '*' for a match.
953 Args : (optional) Match line characters ('*' by default)
954 (optional) Strong match char (':' by default)
955 (optional) Weak match char ('.' by default)
956
957 =cut
958
959 sub match_line {
960 my ($self,$matchlinechar, $strong, $weak) = @_;
961 my %matchchars = ( 'match' => $matchlinechar || '*',
962 'weak' => $weak || '.',
963 'strong' => $strong || ':',
964 'mismatch'=> ' ',
965 );
966
967
968 my @seqchars;
969 my $seqcount = 0;
970 my $alphabet;
971 foreach my $seq ( $self->each_seq ) {
972 push @seqchars, [ split(//, uc ($seq->seq)) ];
973 $alphabet = $seq->alphabet unless defined $alphabet;
974 }
975 my $refseq = shift @seqchars;
976 # let's just march down the columns
977 my $matchline;
978 POS: foreach my $pos ( 0..$self->length ) {
979 my $refchar = $refseq->[$pos];
980 next unless $refchar; # skip ''
981 my %col = ($refchar => 1);
982 my $dash = ($refchar eq '-' || $refchar eq '.' || $refchar eq ' ');
983 foreach my $seq ( @seqchars ) {
984 $dash = 1 if( $seq->[$pos] eq '-' || $seq->[$pos] eq '.' ||
985 $seq->[$pos] eq ' ' );
986 $col{$seq->[$pos]}++;
987 }
988 my @colresidues = sort keys %col;
989 my $char = $matchchars{'mismatch'};
990 # if all the values are the same
991 if( $dash ) { $char = $matchchars{'mismatch'} }
992 elsif( @colresidues == 1 ) { $char = $matchchars{'match'} }
993 elsif( $alphabet eq 'protein' ) { # only try to do weak/strong
994 # matches for protein seqs
995 TYPE: foreach my $type ( qw(strong weak) ) {
996 # iterate through categories
997 my %groups;
998 # iterate through each of the aa in the col
999 # look to see which groups it is in
1000 foreach my $c ( @colresidues ) {
1001 foreach my $f ( grep /\Q$c/, @{$CONSERVATION_GROUPS{$type}} ) {
1002 push @{$groups{$f}},$c;
1003 }
1004 }
1005 GRP: foreach my $cols ( values %groups ) {
1006 @$cols = sort @$cols;
1007 # now we are just testing to see if two arrays
1008 # are identical w/o changing either one
1009
1010 # have to be same len
1011 next if( scalar @$cols != scalar @colresidues );
1012 # walk down the length and check each slot
1013 for($_=0;$_ < (scalar @$cols);$_++ ) {
1014 next GRP if( $cols->[$_] ne $colresidues[$_] );
1015 }
1016 $char = $matchchars{$type};
1017 last TYPE;
1018 }
1019 }
1020 }
1021 $matchline .= $char;
1022 }
1023 return $matchline;
1024 }
1025
1026 =head2 match
1027
1028 Title : match()
1029 Usage : $ali->match()
1030 Function :
1031
1032 Goes through all columns and changes residues that are
1033 identical to residue in first sequence to match '.'
1034 character. Sets match_char.
1035
1036 USE WITH CARE: Most MSE formats do not support match
1037 characters in sequences, so this is mostly for output
1038 only. NEXUS format (Bio::AlignIO::nexus) can handle
1039 it.
1040
1041 Returns : 1
1042 Argument : a match character, optional, defaults to '.'
1043
1044 =cut
1045
1046 sub match {
1047 my ($self, $match) = @_;
1048
1049 $match ||= '.';
1050 my ($matching_char) = $match;
1051 $matching_char = "\\$match" if $match =~ /[\^.$|()\[\]]/ ; #';
1052 $self->map_chars($matching_char, '-');
1053
1054 my @seqs = $self->each_seq();
1055 return 1 unless scalar @seqs > 1;
1056
1057 my $refseq = shift @seqs ;
1058 my @refseq = split //, $refseq->seq;
1059 my $gapchar = $self->gap_char;
1060
1061 foreach my $seq ( @seqs ) {
1062 my @varseq = split //, $seq->seq();
1063 for ( my $i=0; $i < scalar @varseq; $i++) {
1064 $varseq[$i] = $match if defined $refseq[$i] &&
1065 ( $refseq[$i] =~ /[A-Za-z\*]/ ||
1066 $refseq[$i] =~ /$gapchar/ )
1067 && $refseq[$i] eq $varseq[$i];
1068 }
1069 $seq->seq(join '', @varseq);
1070 }
1071 $self->match_char($match);
1072 return 1;
1073 }
1074
1075
1076 =head2 unmatch
1077
1078 Title : unmatch()
1079 Usage : $ali->unmatch()
1080 Function : Undoes the effect of method match. Unsets match_char.
1081
1082 Returns : 1
1083 Argument : a match character, optional, defaults to '.'
1084
1085 See L<match> and L<match_char>
1086
1087 =cut
1088
1089 sub unmatch {
1090 my ($self, $match) = @_;
1091
1092 $match ||= '.';
1093
1094 my @seqs = $self->each_seq();
1095 return 1 unless scalar @seqs > 1;
1096
1097 my $refseq = shift @seqs ;
1098 my @refseq = split //, $refseq->seq;
1099 my $gapchar = $self->gap_char;
1100 foreach my $seq ( @seqs ) {
1101 my @varseq = split //, $seq->seq();
1102 for ( my $i=0; $i < scalar @varseq; $i++) {
1103 $varseq[$i] = $refseq[$i] if defined $refseq[$i] &&
1104 ( $refseq[$i] =~ /[A-Za-z\*]/ ||
1105 $refseq[$i] =~ /$gapchar/ ) &&
1106 $varseq[$i] eq $match;
1107 }
1108 $seq->seq(join '', @varseq);
1109 }
1110 $self->match_char('');
1111 return 1;
1112 }
1113
1114 =head1 MSE attibutes
1115
1116 Methods for setting and reading the MSE attributes.
1117
1118 Note that the methods defining character semantics depend on the user
1119 to set them sensibly. They are needed only by certain input/output
1120 methods. Unset them by setting to an empty string ('').
1121
1122 =head2 id
1123
1124 Title : id
1125 Usage : $myalign->id("Ig")
1126 Function : Gets/sets the id field of the alignment
1127 Returns : An id string
1128 Argument : An id string (optional)
1129
1130 =cut
1131
1132 sub id {
1133 my ($self, $name) = @_;
1134
1135 if (defined( $name )) {
1136 $self->{'_id'} = $name;
1137 }
1138
1139 return $self->{'_id'};
1140 }
1141
1142 =head2 missing_char
1143
1144 Title : missing_char
1145 Usage : $myalign->missing_char("?")
1146 Function : Gets/sets the missing_char attribute of the alignment
1147 It is generally recommended to set it to 'n' or 'N'
1148 for nucleotides and to 'X' for protein.
1149 Returns : An missing_char string,
1150 Argument : An missing_char string (optional)
1151
1152 =cut
1153
1154 sub missing_char {
1155 my ($self, $char) = @_;
1156
1157 if (defined $char ) {
1158 $self->throw("Single missing character, not [$char]!") if CORE::length($char) > 1;
1159 $self->{'_missing_char'} = $char;
1160 }
1161
1162 return $self->{'_missing_char'};
1163 }
1164
1165 =head2 match_char
1166
1167 Title : match_char
1168 Usage : $myalign->match_char('.')
1169 Function : Gets/sets the match_char attribute of the alignment
1170 Returns : An match_char string,
1171 Argument : An match_char string (optional)
1172
1173 =cut
1174
1175 sub match_char {
1176 my ($self, $char) = @_;
1177
1178 if (defined $char ) {
1179 $self->throw("Single match character, not [$char]!") if CORE::length($char) > 1;
1180 $self->{'_match_char'} = $char;
1181 }
1182
1183 return $self->{'_match_char'};
1184 }
1185
1186 =head2 gap_char
1187
1188 Title : gap_char
1189 Usage : $myalign->gap_char('-')
1190 Function : Gets/sets the gap_char attribute of the alignment
1191 Returns : An gap_char string, defaults to '-'
1192 Argument : An gap_char string (optional)
1193
1194 =cut
1195
1196 sub gap_char {
1197 my ($self, $char) = @_;
1198
1199 if (defined $char || ! defined $self->{'_gap_char'} ) {
1200 $char= '-' unless defined $char;
1201 $self->throw("Single gap character, not [$char]!") if CORE::length($char) > 1;
1202 $self->{'_gap_char'} = $char;
1203 }
1204 return $self->{'_gap_char'};
1205 }
1206
1207 =head2 symbol_chars
1208
1209 Title : symbol_chars
1210 Usage : my @symbolchars = $aln->symbol_chars;
1211 Function: Returns all the seen symbols (other than gaps)
1212 Returns : array of characters that are the seen symbols
1213 Args : boolean to include the gap/missing/match characters
1214
1215 =cut
1216
1217 sub symbol_chars{
1218 my ($self,$includeextra) = @_;
1219 if( ! defined $self->{'_symbols'} ) {
1220 $self->warn("Symbol list was not initialized");
1221 return ();
1222 }
1223 my %copy = %{$self->{'_symbols'}};
1224 if( ! $includeextra ) {
1225 foreach my $char ( $self->gap_char, $self->match_char,
1226 $self->missing_char) {
1227 delete $copy{$char} if( defined $char );
1228 }
1229 }
1230 return keys %copy;
1231 }
1232
1233 =head1 Alignment descriptors
1234
1235 These read only methods describe the MSE in various ways.
1236
1237
1238 =head2 consensus_string
1239
1240 Title : consensus_string
1241 Usage : $str = $ali->consensus_string($threshold_percent)
1242 Function : Makes a strict consensus
1243 Returns :
1244 Argument : Optional treshold ranging from 0 to 100.
1245 The consensus residue has to appear at least threshold %
1246 of the sequences at a given location, otherwise a '?'
1247 character will be placed at that location.
1248 (Default value = 0%)
1249
1250 =cut
1251
1252 sub consensus_string {
1253 my $self = shift;
1254 my $threshold = shift;
1255 my $len;
1256 my ($out,$count);
1257
1258 $out = "";
1259
1260 $len = $self->length - 1;
1261
1262 foreach $count ( 0 .. $len ) {
1263 $out .= $self->_consensus_aa($count,$threshold);
1264 }
1265 return $out;
1266 }
1267
1268 sub _consensus_aa {
1269 my $self = shift;
1270 my $point = shift;
1271 my $threshold_percent = shift || -1 ;
1272 my ($seq,%hash,$count,$letter,$key);
1273
1274 foreach $seq ( $self->each_seq() ) {
1275 $letter = substr($seq->seq,$point,1);
1276 $self->throw("--$point-----------") if $letter eq '';
1277 ($letter =~ /\./) && next;
1278 # print "Looking at $letter\n";
1279 $hash{$letter}++;
1280 }
1281 my $number_of_sequences = $self->no_sequences();
1282 my $threshold = $number_of_sequences * $threshold_percent / 100. ;
1283 $count = -1;
1284 $letter = '?';
1285
1286 foreach $key ( sort keys %hash ) {
1287 # print "Now at $key $hash{$key}\n";
1288 if( $hash{$key} > $count && $hash{$key} >= $threshold) {
1289 $letter = $key;
1290 $count = $hash{$key};
1291 }
1292 }
1293 return $letter;
1294 }
1295
1296
1297 =head2 consensus_iupac
1298
1299 Title : consensus_iupac
1300 Usage : $str = $ali->consensus_iupac()
1301 Function :
1302
1303 Makes a consensus using IUPAC ambiguity codes from DNA
1304 and RNA. The output is in upper case except when gaps in
1305 a column force output to be in lower case.
1306
1307 Note that if your alignment sequences contain a lot of
1308 IUPAC ambiquity codes you often have to manually set
1309 alphabet. Bio::PrimarySeq::_guess_type thinks they
1310 indicate a protein sequence.
1311
1312 Returns : consensus string
1313 Argument : none
1314 Throws : on protein sequences
1315
1316 =cut
1317
1318 sub consensus_iupac {
1319 my $self = shift;
1320 my $out = "";
1321 my $len = $self->length-1;
1322
1323 # only DNA and RNA sequences are valid
1324 foreach my $seq ( $self->each_seq() ) {
1325 $self->throw("Seq [". $seq->get_nse. "] is a protein")
1326 if $seq->alphabet eq 'protein';
1327 }
1328 # loop over the alignment columns
1329 foreach my $count ( 0 .. $len ) {
1330 $out .= $self->_consensus_iupac($count);
1331 }
1332 return $out;
1333 }
1334
1335 sub _consensus_iupac {
1336 my ($self, $column) = @_;
1337 my ($string, $char, $rna);
1338
1339 #determine all residues in a column
1340 foreach my $seq ( $self->each_seq() ) {
1341 $string .= substr($seq->seq, $column, 1);
1342 }
1343 $string = uc $string;
1344
1345 # quick exit if there's an N in the string
1346 if ($string =~ /N/) {
1347 $string =~ /\W/ ? return 'n' : return 'N';
1348 }
1349 # ... or if there are only gap characters
1350 return '-' if $string =~ /^\W+$/;
1351
1352 # treat RNA as DNA in regexps
1353 if ($string =~ /U/) {
1354 $string =~ s/U/T/;
1355 $rna = 1;
1356 }
1357
1358 # the following s///'s only need to be done to the _first_ ambiguity code
1359 # as we only need to see the _range_ of characters in $string
1360
1361 if ($string =~ /[VDHB]/) {
1362 $string =~ s/V/AGC/;
1363 $string =~ s/D/AGT/;
1364 $string =~ s/H/ACT/;
1365 $string =~ s/B/CTG/;
1366 }
1367
1368 if ($string =~ /[SKYRWM]/) {
1369 $string =~ s/S/GC/;
1370 $string =~ s/K/GT/;
1371 $string =~ s/Y/CT/;
1372 $string =~ s/R/AG/;
1373 $string =~ s/W/AT/;
1374 $string =~ s/M/AC/;
1375 }
1376
1377 # and now the guts of the thing
1378
1379 if ($string =~ /A/) {
1380 $char = 'A'; # A A
1381 if ($string =~ /G/) {
1382 $char = 'R'; # A and G (purines) R
1383 if ($string =~ /C/) {
1384 $char = 'V'; # A and G and C V
1385 if ($string =~ /T/) {
1386 $char = 'N'; # A and G and C and T N
1387 }
1388 } elsif ($string =~ /T/) {
1389 $char = 'D'; # A and G and T D
1390 }
1391 } elsif ($string =~ /C/) {
1392 $char = 'M'; # A and C M
1393 if ($string =~ /T/) {
1394 $char = 'H'; # A and C and T H
1395 }
1396 } elsif ($string =~ /T/) {
1397 $char = 'W'; # A and T W
1398 }
1399 } elsif ($string =~ /C/) {
1400 $char = 'C'; # C C
1401 if ($string =~ /T/) {
1402 $char = 'Y'; # C and T (pyrimidines) Y
1403 if ($string =~ /G/) {
1404 $char = 'B'; # C and T and G B
1405 }
1406 } elsif ($string =~ /G/) {
1407 $char = 'S'; # C and G S
1408 }
1409 } elsif ($string =~ /G/) {
1410 $char = 'G'; # G G
1411 if ($string =~ /C/) {
1412 $char = 'S'; # G and C S
1413 } elsif ($string =~ /T/) {
1414 $char = 'K'; # G and T K
1415 }
1416 } elsif ($string =~ /T/) {
1417 $char = 'T'; # T T
1418 }
1419
1420 $char = 'U' if $rna and $char eq 'T';
1421 $char = lc $char if $string =~ /\W/;
1422
1423 return $char;
1424 }
1425
1426 =head2 is_flush
1427
1428 Title : is_flush
1429 Usage : if( $ali->is_flush() )
1430 :
1431 :
1432 Function : Tells you whether the alignment
1433 : is flush, ie all of the same length
1434 :
1435 :
1436 Returns : 1 or 0
1437 Argument :
1438
1439 =cut
1440
1441 sub is_flush {
1442 my ($self,$report) = @_;
1443 my $seq;
1444 my $length = (-1);
1445 my $temp;
1446
1447 foreach $seq ( $self->each_seq() ) {
1448 if( $length == (-1) ) {
1449 $length = CORE::length($seq->seq());
1450 next;
1451 }
1452
1453 $temp = CORE::length($seq->seq());
1454 if( $temp != $length ) {
1455 $self->warn("expecting $length not $temp from ".
1456 $seq->display_id) if( $report );
1457 $self->debug("expecting $length not $temp from ".
1458 $seq->display_id);
1459 $self->debug($seq->seq(). "\n");
1460 return 0;
1461 }
1462 }
1463
1464 return 1;
1465 }
1466
1467
1468 =head2 length
1469
1470 Title : length()
1471 Usage : $len = $ali->length()
1472 Function : Returns the maximum length of the alignment.
1473 To be sure the alignment is a block, use is_flush
1474 Returns :
1475 Argument :
1476
1477 =cut
1478
1479 sub length_aln {
1480 my $self = shift;
1481 $self->warn(ref($self). "::length_aln - deprecated method. Use length() instead.");
1482 $self->length(@_);
1483 }
1484
1485 sub length {
1486 my $self = shift;
1487 my $seq;
1488 my $length = (-1);
1489 my ($temp,$len);
1490
1491 foreach $seq ( $self->each_seq() ) {
1492 $temp = CORE::length($seq->seq());
1493 if( $temp > $length ) {
1494 $length = $temp;
1495 }
1496 }
1497
1498 return $length;
1499 }
1500
1501
1502 =head2 maxdisplayname_length
1503
1504 Title : maxdisplayname_length
1505 Usage : $ali->maxdisplayname_length()
1506 Function :
1507
1508 Gets the maximum length of the displayname in the
1509 alignment. Used in writing out various MSE formats.
1510
1511 Returns : integer
1512 Argument :
1513
1514 =cut
1515
1516 sub maxname_length {
1517 my $self = shift;
1518 $self->warn(ref($self). "::maxname_length - deprecated method.".
1519 " Use maxdisplayname_length() instead.");
1520 $self->maxdisplayname_length();
1521 }
1522
1523 sub maxnse_length {
1524 my $self = shift;
1525 $self->warn(ref($self). "::maxnse_length - deprecated method.".
1526 " Use maxnse_length() instead.");
1527 $self->maxdisplayname_length();
1528 }
1529
1530 sub maxdisplayname_length {
1531 my $self = shift;
1532 my $maxname = (-1);
1533 my ($seq,$len);
1534
1535 foreach $seq ( $self->each_seq() ) {
1536 $len = CORE::length $self->displayname($seq->get_nse());
1537
1538 if( $len > $maxname ) {
1539 $maxname = $len;
1540 }
1541 }
1542
1543 return $maxname;
1544 }
1545
1546 =head2 no_residues
1547
1548 Title : no_residues
1549 Usage : $no = $ali->no_residues
1550 Function : number of residues in total in the alignment
1551 Returns : integer
1552 Argument :
1553
1554 =cut
1555
1556 sub no_residues {
1557 my $self = shift;
1558 my $count = 0;
1559
1560 foreach my $seq ($self->each_seq) {
1561 my $str = $seq->seq();
1562
1563 $count += ($str =~ s/[^A-Za-z]//g);
1564 }
1565
1566 return $count;
1567 }
1568
1569 =head2 no_sequences
1570
1571 Title : no_sequences
1572 Usage : $depth = $ali->no_sequences
1573 Function : number of sequence in the sequence alignment
1574 Returns : integer
1575 Argument :
1576
1577 =cut
1578
1579 sub no_sequences {
1580 my $self = shift;
1581
1582 return scalar($self->each_seq);
1583 }
1584
1585
1586 =head2 average_percentage_identity
1587
1588 Title : average_percentage_identity
1589 Usage : $id = $align->average_percentage_identity
1590 Function: The function uses a fast method to calculate the average
1591 percentage identity of the alignment
1592 Returns : The average percentage identity of the alignment
1593 Args : None
1594 Notes : This method implemented by Kevin Howe calculates a figure that is
1595 designed to be similar to the average pairwise identity of the
1596 alignment (identical in the absence of gaps), without having to
1597 explicitly calculate pairwise identities proposed by Richard Durbin.
1598 Validated by Ewan Birney ad Alex Bateman.
1599
1600 =cut
1601
1602 sub average_percentage_identity{
1603 my ($self,@args) = @_;
1604
1605 my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
1606 'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
1607
1608 my ($len, $total, $subtotal, $divisor, $subdivisor, @seqs, @countHashes);
1609
1610 if (! $self->is_flush()) {
1611 $self->throw("All sequences in the alignment must be the same length");
1612 }
1613
1614 @seqs = $self->each_seq();
1615 $len = $self->length();
1616
1617 # load the each hash with correct keys for existence checks
1618
1619 for( my $index=0; $index < $len; $index++) {
1620 foreach my $letter (@alphabet) {
1621 $countHashes[$index]->{$letter} = 0;
1622 }
1623 }
1624 foreach my $seq (@seqs) {
1625 my @seqChars = split //, $seq->seq();
1626 for( my $column=0; $column < @seqChars; $column++ ) {
1627 my $char = uc($seqChars[$column]);
1628 if (exists $countHashes[$column]->{$char}) {
1629 $countHashes[$column]->{$char}++;
1630 }
1631 }
1632 }
1633
1634 $total = 0;
1635 $divisor = 0;
1636 for(my $column =0; $column < $len; $column++) {
1637 my %hash = %{$countHashes[$column]};
1638 $subdivisor = 0;
1639 foreach my $res (keys %hash) {
1640 $total += $hash{$res}*($hash{$res} - 1);
1641 $subdivisor += $hash{$res};
1642 }
1643 $divisor += $subdivisor * ($subdivisor - 1);
1644 }
1645 return $divisor > 0 ? ($total / $divisor )*100.0 : 0;
1646 }
1647
1648 =head2 percentage_identity
1649
1650 Title : percentage_identity
1651 Usage : $id = $align->percentage_identity
1652 Function: The function calculates the average percentage identity
1653 (aliased for average_percentage_identity)
1654 Returns : The average percentage identity
1655 Args : None
1656
1657 =cut
1658
1659 sub percentage_identity {
1660 my $self = shift;
1661 return $self->average_percentage_identity();
1662 }
1663
1664 =head2 overall_percentage_identity
1665
1666 Title : percentage_identity
1667 Usage : $id = $align->percentage_identity
1668 Function: The function calculates the percentage identity of
1669 the conserved columns
1670 Returns : The percentage identity of the conserved columns
1671 Args : None
1672
1673 =cut
1674
1675 sub overall_percentage_identity{
1676 my ($self,@args) = @_;
1677
1678 my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
1679 'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
1680
1681 my ($len, $total, @seqs, @countHashes);
1682
1683 if (! $self->is_flush()) {
1684 $self->throw("All sequences in the alignment must be the same length");
1685 }
1686
1687 @seqs = $self->each_seq();
1688 $len = $self->length();
1689
1690 # load the each hash with correct keys for existence checks
1691 for( my $index=0; $index < $len; $index++) {
1692 foreach my $letter (@alphabet) {
1693 $countHashes[$index]->{$letter} = 0;
1694 }
1695 }
1696 foreach my $seq (@seqs) {
1697 my @seqChars = split //, $seq->seq();
1698 for( my $column=0; $column < @seqChars; $column++ ) {
1699 my $char = uc($seqChars[$column]);
1700 if (exists $countHashes[$column]->{$char}) {
1701 $countHashes[$column]->{$char}++;
1702 }
1703 }
1704 }
1705
1706 $total = 0;
1707 for(my $column =0; $column < $len; $column++) {
1708 my %hash = %{$countHashes[$column]};
1709 foreach ( values %hash ) {
1710 next if( $_ == 0 );
1711 $total++ if( $_ == scalar @seqs );
1712 last;
1713 }
1714 }
1715 return ($total / $len ) * 100.0;
1716 }
1717
1718 =head1 Alignment positions
1719
1720 Methods to map a sequence position into an alignment column and back.
1721 column_from_residue_number() does the former. The latter is really a
1722 property of the sequence object and can done using
1723 L<Bio::LocatableSeq::location_from_column>:
1724
1725 # select somehow a sequence from the alignment, e.g.
1726 my $seq = $aln->get_seq_by_pos(1);
1727 #$loc is undef or Bio::LocationI object
1728 my $loc = $seq->location_from_column(5);
1729
1730
1731 =head2 column_from_residue_number
1732
1733 Title : column_from_residue_number
1734 Usage : $col = $ali->column_from_residue_number( $seqname, $resnumber)
1735 Function:
1736
1737 This function gives the position in the alignment
1738 (i.e. column number) of the given residue number in the
1739 sequence with the given name. For example, for the
1740 alignment
1741
1742 Seq1/91-97 AC..DEF.GH
1743 Seq2/24-30 ACGG.RTY..
1744 Seq3/43-51 AC.DDEFGHI
1745
1746 column_from_residue_number( "Seq1", 94 ) returns 5.
1747 column_from_residue_number( "Seq2", 25 ) returns 2.
1748 column_from_residue_number( "Seq3", 50 ) returns 9.
1749
1750 An exception is thrown if the residue number would lie
1751 outside the length of the aligment
1752 (e.g. column_from_residue_number( "Seq2", 22 )
1753
1754 Note: If the the parent sequence is represented by more than
1755 one alignment sequence and the residue number is present in
1756 them, this method finds only the first one.
1757
1758 Returns : A column number for the position in the alignment of the
1759 given residue in the given sequence (1 = first column)
1760 Args : A sequence id/name (not a name/start-end)
1761 A residue number in the whole sequence (not just that
1762 segment of it in the alignment)
1763
1764 =cut
1765
1766 sub column_from_residue_number {
1767 my ($self, $name, $resnumber) = @_;
1768
1769 $self->throw("No sequence with name [$name]") unless $self->{'_start_end_lists'}->{$name};
1770 $self->throw("Second argument residue number missing") unless $resnumber;
1771
1772 foreach my $seq ($self->each_seq_with_id($name)) {
1773 my $col;
1774 eval {
1775 $col = $seq->column_from_residue_number($resnumber);
1776 };
1777 next if $@;
1778 return $col;
1779 }
1780
1781 $self->throw("Could not find a sequence segment in $name ".
1782 "containing residue number $resnumber");
1783
1784 }
1785
1786 =head1 Sequence names
1787
1788 Methods to manipulate the display name. The default name based on the
1789 sequence id and subsequence positions can be overridden in various
1790 ways.
1791
1792 =head2 displayname
1793
1794 Title : displayname
1795 Usage : $myalign->displayname("Ig", "IgA")
1796 Function : Gets/sets the display name of a sequence in the alignment
1797 :
1798 Returns : A display name string
1799 Argument : name of the sequence
1800 displayname of the sequence (optional)
1801
1802 =cut
1803
1804 sub get_displayname {
1805 my $self = shift;
1806 $self->warn(ref($self). "::get_displayname - deprecated method. Use displayname() instead.");
1807 $self->displayname(@_);
1808 }
1809
1810 sub set_displayname {
1811 my $self = shift;
1812 $self->warn(ref($self). "::set_displayname - deprecated method. Use displayname() instead.");
1813 $self->displayname(@_);
1814 }
1815
1816 sub displayname {
1817 my ($self, $name, $disname) = @_;
1818
1819 $self->throw("No sequence with name [$name]") unless $self->{'_seq'}->{$name};
1820
1821 if( $disname and $name) {
1822 $self->{'_dis_name'}->{$name} = $disname;
1823 return $disname;
1824 }
1825 elsif( defined $self->{'_dis_name'}->{$name} ) {
1826 return $self->{'_dis_name'}->{$name};
1827 } else {
1828 return $name;
1829 }
1830 }
1831
1832 =head2 set_displayname_count
1833
1834 Title : set_displayname_count
1835 Usage : $ali->set_displayname_count
1836 Function :
1837
1838 Sets the names to be name_# where # is the number of
1839 times this name has been used.
1840
1841 Returns :
1842 Argument :
1843
1844 =cut
1845
1846 sub set_displayname_count {
1847 my $self= shift;
1848 my (@arr,$name,$seq,$count,$temp,$nse);
1849
1850 foreach $seq ( $self->each_alphabetically() ) {
1851 $nse = $seq->get_nse();
1852
1853 #name will be set when this is the second
1854 #time (or greater) is has been seen
1855
1856 if( defined $name and $name eq ($seq->id()) ) {
1857 $temp = sprintf("%s_%s",$name,$count);
1858 $self->displayname($nse,$temp);
1859 $count++;
1860 } else {
1861 $count = 1;
1862 $name = $seq->id();
1863 $temp = sprintf("%s_%s",$name,$count);
1864 $self->displayname($nse,$temp);
1865 $count++;
1866 }
1867 }
1868 return 1;
1869 }
1870
1871 =head2 set_displayname_flat
1872
1873 Title : set_displayname_flat
1874 Usage : $ali->set_displayname_flat()
1875 Function : Makes all the sequences be displayed as just their name,
1876 not name/start-end
1877 Returns : 1
1878 Argument :
1879
1880 =cut
1881
1882 sub set_displayname_flat {
1883 my $self = shift;
1884 my ($nse,$seq);
1885
1886 foreach $seq ( $self->each_seq() ) {
1887 $nse = $seq->get_nse();
1888 $self->displayname($nse,$seq->id());
1889 }
1890 return 1;
1891 }
1892
1893 =head2 set_displayname_normal
1894
1895 Title : set_displayname_normal
1896 Usage : $ali->set_displayname_normal()
1897 Function : Makes all the sequences be displayed as name/start-end
1898 Returns :
1899 Argument :
1900
1901 =cut
1902
1903 sub set_displayname_normal {
1904 my $self = shift;
1905 my ($nse,$seq);
1906
1907 foreach $seq ( $self->each_seq() ) {
1908 $nse = $seq->get_nse();
1909 $self->displayname($nse,$nse);
1910 }
1911 return 1;
1912 }
1913
1914 =head2 source
1915
1916 Title : source
1917 Usage : $obj->source($newval)
1918 Function: sets the Alignment source program
1919 Example :
1920 Returns : value of source
1921 Args : newvalue (optional)
1922
1923
1924 =cut
1925
1926 sub source{
1927 my ($self,$value) = @_;
1928 if( defined $value) {
1929 $self->{'_source'} = $value;
1930 }
1931 return $self->{'_source'};
1932 }
1933
1934 1;