0
|
1 # $Id: SimpleAlign.pm,v 1.65.2.1 2003/07/02 16:00:19 jason Exp $
|
|
2 # BioPerl module for SimpleAlign
|
|
3 #
|
|
4 # Cared for by Heikki Lehvaslaiho <heikki@ebi.ac.uk>
|
|
5 #
|
|
6 # Copyright Ewan Birney
|
|
7 #
|
|
8 # You may distribute this module under the same terms as perl itself
|
|
9
|
|
10 # POD documentation - main docs before the code
|
|
11 #
|
|
12 # History:
|
|
13 # 11/3/00 Added threshold feature to consensus and consensus_aa - PS
|
|
14 # May 2001 major rewrite - Heikki Lehvaslaiho
|
|
15
|
|
16 =head1 NAME
|
|
17
|
|
18 Bio::SimpleAlign - Multiple alignments held as a set of sequences
|
|
19
|
|
20 =head1 SYNOPSIS
|
|
21
|
|
22 # use Bio::AlignIO to read in the alignment
|
|
23 $str = Bio::AlignIO->new('-file' => 't/data/testaln.pfam');
|
|
24 $aln = $str->next_aln();
|
|
25
|
|
26 # some descriptors
|
|
27 print $aln->length, "\n";
|
|
28 print $aln->no_residues, "\n";
|
|
29 print $aln->is_flush, "\n";
|
|
30 print $aln->no_sequences, "\n";
|
|
31 print $aln->percentage_identity, "\n";
|
|
32 print $aln->consensus_string(50), "\n";
|
|
33
|
|
34 # find the position in the alignment for a sequence location
|
|
35 $pos = $aln->column_from_residue_number('1433_LYCES', 14); # = 6;
|
|
36
|
|
37 # extract sequences and check values for the alignment column $pos
|
|
38 foreach $seq ($aln->each_seq) {
|
|
39 $res = $seq->subseq($pos, $pos);
|
|
40 $count{$res}++;
|
|
41 }
|
|
42 foreach $res (keys %count) {
|
|
43 printf "Res: %s Count: %2d\n", $res, $count{$res};
|
|
44 }
|
|
45
|
|
46
|
|
47 =head1 DESCRIPTION
|
|
48
|
|
49 SimpleAlign handles multiple alignments of sequences. It is very
|
|
50 permissive of types (it won't insist on things being all same length
|
|
51 etc): really it is a SequenceSet explicitly held in memory with a
|
|
52 whole series of built in manipulations and especially file format
|
|
53 systems for read/writing alignments.
|
|
54
|
|
55 SimpleAlign basically views an alignment as an immutable block of
|
|
56 text. SimpleAlign *is not* the object to be using if you want to
|
|
57 perform complex alignment manipulations.
|
|
58
|
|
59 However for lightweight display/formatting and minimal manipulation
|
|
60 (e.g. removing all-gaps columns) - this is the one to use.
|
|
61
|
|
62 SimpleAlign uses a subclass of L<Bio::PrimarySeq> class
|
|
63 L<Bio::LocatableSeq> to store its sequences. These are subsequences
|
|
64 with a start and end positions in the parent reference sequence.
|
|
65
|
|
66 Tricky concepts. SimpleAlign expects name,start,end to be 'unique' in
|
|
67 the alignment, and this is the key for the internal hashes.
|
|
68 (name,start,end is abbreviated nse in the code). However, in many
|
|
69 cases people don't want the name/start-end to be displayed: either
|
|
70 multiple names in an alignment or names specific to the alignment
|
|
71 (ROA1_HUMAN_1, ROA1_HUMAN_2 etc). These names are called
|
|
72 'displayname', and generally is what is used to print out the
|
|
73 alignment. They default to name/start-end.
|
|
74
|
|
75 The SimpleAlign Module came from Ewan Birney's Align module.
|
|
76
|
|
77 =head1 PROGRESS
|
|
78
|
|
79 SimpleAlign is being slowly converted to bioperl coding standards,
|
|
80 mainly by Ewan.
|
|
81
|
|
82 =over 3
|
|
83
|
|
84 =item Use Bio::Root::Object - done
|
|
85
|
|
86 =item Use proper exceptions - done
|
|
87
|
|
88 =item Use hashed constructor - not done!
|
|
89
|
|
90 =back
|
|
91
|
|
92 =head1 FEEDBACK
|
|
93
|
|
94 =head2 Mailing Lists
|
|
95
|
|
96 User feedback is an integral part of the evolution of this and other
|
|
97 Bioperl modules. Send your comments and suggestions preferably to one
|
|
98 of the Bioperl mailing lists. Your participation is much appreciated.
|
|
99
|
|
100 bioperl-l@bioperl.org - General discussion
|
|
101 http://bioperl.org/MailList.shtml - About the mailing lists
|
|
102
|
|
103 =head2 Reporting Bugs
|
|
104
|
|
105 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
106 the bugs and their resolution. Bug reports can be submitted via email
|
|
107 or the web:
|
|
108
|
|
109 bioperl-bugs@bio.perl.org
|
|
110 http://bugzilla.bioperl.org/
|
|
111
|
|
112 =head1 AUTHOR
|
|
113
|
|
114 Ewan Birney, birney@sanger.ac.uk
|
|
115
|
|
116 =head1 CONTRIBUTORS
|
|
117
|
|
118 Richard Adams, Richard.Adams@ed.ac.uk,
|
|
119 David J. Evans, David.Evans@vir.gla.ac.uk,
|
|
120 Heikki Lehvaslaiho, heikki@ebi.ac.uk,
|
|
121 Allen Smith, allens@cpan.org,
|
|
122 Jason Stajich, jason@bioperl.org,
|
|
123 Anthony Underwood, aunderwood@phls.org.uk,
|
|
124 Xintao Wei & Giri Narasimhan, giri@cs.fiu.edu
|
|
125
|
|
126
|
|
127 =head1 SEE ALSO
|
|
128
|
|
129 L<Bio::LocatableSeq.pm>
|
|
130
|
|
131 =head1 APPENDIX
|
|
132
|
|
133 The rest of the documentation details each of the object
|
|
134 methods. Internal methods are usually preceded with a _
|
|
135
|
|
136 =cut
|
|
137
|
|
138 # 'Let the code begin...
|
|
139
|
|
140 package Bio::SimpleAlign;
|
|
141 use vars qw(@ISA %CONSERVATION_GROUPS);
|
|
142 use strict;
|
|
143
|
|
144 use Bio::Root::Root;
|
|
145 use Bio::LocatableSeq; # uses Seq's as list
|
|
146 use Bio::Align::AlignI;
|
|
147
|
|
148 BEGIN {
|
|
149 # This data should probably be in a more centralized module...
|
|
150 # it is taken from Clustalw documentation
|
|
151 # These are all the positively scoring groups that occur in the
|
|
152 # Gonnet Pam250 matrix. The strong and weak groups are
|
|
153 # defined as strong score >0.5 and weak score =<0.5 respectively.
|
|
154
|
|
155 %CONSERVATION_GROUPS = ( 'strong' => [ qw(STA
|
|
156 NEQK
|
|
157 NHQK
|
|
158 NDEQ
|
|
159 QHRK
|
|
160 MILV
|
|
161 MILF
|
|
162 HY
|
|
163 FYW)
|
|
164 ],
|
|
165 'weak' => [ qw(CSA
|
|
166 ATV
|
|
167 SAG
|
|
168 STNK
|
|
169 STPA
|
|
170 SGND
|
|
171 SNDEQK
|
|
172 NDEQHK
|
|
173 NEQHRK
|
|
174 FVLIM
|
|
175 HFY) ],
|
|
176 );
|
|
177
|
|
178 }
|
|
179
|
|
180 @ISA = qw(Bio::Root::Root Bio::Align::AlignI);
|
|
181
|
|
182 =head2 new
|
|
183
|
|
184 Title : new
|
|
185 Usage : my $aln = new Bio::SimpleAlign();
|
|
186 Function : Creates a new simple align object
|
|
187 Returns : Bio::SimpleAlign
|
|
188 Args : -source => string representing the source program
|
|
189 where this alignment came from
|
|
190
|
|
191 =cut
|
|
192
|
|
193
|
|
194 sub new {
|
|
195 my($class,@args) = @_;
|
|
196
|
|
197 my $self = $class->SUPER::new(@args);
|
|
198
|
|
199 my ($src) = $self->_rearrange([qw(SOURCE)], @args);
|
|
200 $src && $self->source($src);
|
|
201 # we need to set up internal hashs first!
|
|
202
|
|
203 $self->{'_seq'} = {};
|
|
204 $self->{'_order'} = {};
|
|
205 $self->{'_start_end_lists'} = {};
|
|
206 $self->{'_dis_name'} = {};
|
|
207 $self->{'_id'} = 'NoName';
|
|
208 $self->{'_symbols'} = {};
|
|
209 # maybe we should automatically read in from args. Hmmm...
|
|
210
|
|
211 return $self; # success - we hope!
|
|
212 }
|
|
213
|
|
214 =head1 Modifier methods
|
|
215
|
|
216 These methods modify the MSE by adding, removing or shuffling complete
|
|
217 sequences.
|
|
218
|
|
219 =head2 add_seq
|
|
220
|
|
221 Title : add_seq
|
|
222 Usage : $myalign->add_seq($newseq);
|
|
223 Function : Adds another sequence to the alignment. *Does not* align
|
|
224 it - just adds it to the hashes.
|
|
225 Returns : nothing
|
|
226 Args : a Bio::LocatableSeq object
|
|
227 order (optional)
|
|
228
|
|
229 See L<Bio::LocatableSeq> for more information
|
|
230
|
|
231 =cut
|
|
232
|
|
233 sub addSeq {
|
|
234 my $self = shift;
|
|
235 $self->warn(ref($self). "::addSeq - deprecated method. Use add_seq() instead.");
|
|
236 $self->add_seq(@_);
|
|
237 }
|
|
238
|
|
239 sub add_seq {
|
|
240 my $self = shift;
|
|
241 my $seq = shift;
|
|
242 my $order = shift;
|
|
243 my ($name,$id,$start,$end);
|
|
244
|
|
245 if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) {
|
|
246 $self->throw("Unable to process non locatable sequences [", ref($seq), "]");
|
|
247 }
|
|
248
|
|
249 $id = $seq->id() ||$seq->display_id || $seq->primary_id;
|
|
250 $start = $seq->start();
|
|
251 $end = $seq->end();
|
|
252
|
|
253 # build the symbol list for this sequence,
|
|
254 # will prune out the gap and missing/match chars
|
|
255 # when actually asked for the symbol list in the
|
|
256 # symbol_chars
|
|
257 map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq);
|
|
258
|
|
259 if( !defined $order ) {
|
|
260 $order = keys %{$self->{'_seq'}};
|
|
261 }
|
|
262 $name = sprintf("%s/%d-%d",$id,$start,$end);
|
|
263
|
|
264 if( $self->{'_seq'}->{$name} ) {
|
|
265 $self->warn("Replacing one sequence [$name]\n");
|
|
266 }
|
|
267 else {
|
|
268 #print STDERR "Assigning $name to $order\n";
|
|
269
|
|
270 $self->{'_order'}->{$order} = $name;
|
|
271
|
|
272 unless( exists( $self->{'_start_end_lists'}->{$id})) {
|
|
273 $self->{'_start_end_lists'}->{$id} = [];
|
|
274 }
|
|
275 push @{$self->{'_start_end_lists'}->{$id}}, $seq;
|
|
276 }
|
|
277
|
|
278 $self->{'_seq'}->{$name} = $seq;
|
|
279
|
|
280 }
|
|
281
|
|
282
|
|
283 =head2 remove_seq
|
|
284
|
|
285 Title : remove_seq
|
|
286 Usage : $aln->remove_seq($seq);
|
|
287 Function : Removes a single sequence from an alignment
|
|
288 Returns :
|
|
289 Argument : a Bio::LocatableSeq object
|
|
290
|
|
291 =cut
|
|
292
|
|
293 sub removeSeq {
|
|
294 my $self = shift;
|
|
295 $self->warn(ref($self). "::removeSeq - deprecated method. Use remove_seq() instead.");
|
|
296 $self->remove_seq(@_);
|
|
297 }
|
|
298
|
|
299 sub remove_seq {
|
|
300 my $self = shift;
|
|
301 my $seq = shift;
|
|
302 my ($name,$id,$start,$end);
|
|
303
|
|
304 $self->throw("Need Bio::Locatable seq argument ")
|
|
305 unless ref $seq && $seq->isa('Bio::LocatableSeq');
|
|
306
|
|
307 $id = $seq->id();
|
|
308 $start = $seq->start();
|
|
309 $end = $seq->end();
|
|
310 $name = sprintf("%s/%d-%d",$id,$start,$end);
|
|
311
|
|
312 if( !exists $self->{'_seq'}->{$name} ) {
|
|
313 $self->throw("Sequence $name does not exist in the alignment to remove!");
|
|
314 }
|
|
315
|
|
316 delete $self->{'_seq'}->{$name};
|
|
317
|
|
318 # we need to remove this seq from the start_end_lists hash
|
|
319
|
|
320 if (exists $self->{'_start_end_lists'}->{$id}) {
|
|
321 # we need to find the sequence in the array.
|
|
322
|
|
323 my ($i, $found);;
|
|
324 for ($i=0; $i < @{$self->{'_start_end_lists'}->{$id}}; $i++) {
|
|
325 if (${$self->{'_start_end_lists'}->{$id}}[$i] eq $seq) {
|
|
326 $found = 1;
|
|
327 last;
|
|
328 }
|
|
329 }
|
|
330 if ($found) {
|
|
331 splice @{$self->{'_start_end_lists'}->{$id}}, $i, 1;
|
|
332 }
|
|
333 else {
|
|
334 $self->throw("Could not find the sequence to remoce from the start-end list");
|
|
335 }
|
|
336 }
|
|
337 else {
|
|
338 $self->throw("There is no seq list for the name $id");
|
|
339 }
|
|
340 return 1;
|
|
341 # we can't do anything about the order hash but that is ok
|
|
342 # because each_seq will handle it
|
|
343 }
|
|
344
|
|
345
|
|
346 =head2 purge
|
|
347
|
|
348 Title : purge
|
|
349 Usage : $aln->purge(0.7);
|
|
350 Function:
|
|
351
|
|
352 Removes sequences above given sequence similarity
|
|
353 This function will grind on large alignments. Beware!
|
|
354
|
|
355 Example :
|
|
356 Returns : An array of the removed sequences
|
|
357 Args : float, threshold for similarity
|
|
358
|
|
359 =cut
|
|
360
|
|
361 sub purge {
|
|
362 my ($self,$perc) = @_;
|
|
363 my (%duplicate, @dups);
|
|
364
|
|
365 my @seqs = $self->each_seq();
|
|
366
|
|
367 for (my $i=0;$i< @seqs - 1;$i++ ) { #for each seq in alignment
|
|
368 my $seq = $seqs[$i];
|
|
369
|
|
370 #skip if already in duplicate hash
|
|
371 next if exists $duplicate{$seq->display_id} ;
|
|
372 my $one = $seq->seq();
|
|
373
|
|
374 my @one = split '', $one; #split to get 1aa per array element
|
|
375
|
|
376 for (my $j=$i+1;$j < @seqs;$j++) {
|
|
377 my $seq2 = $seqs[$j];
|
|
378
|
|
379 #skip if already in duplicate hash
|
|
380 next if exists $duplicate{$seq2->display_id} ;
|
|
381
|
|
382 my $two = $seq2->seq();
|
|
383 my @two = split '', $two;
|
|
384
|
|
385 my $count = 0;
|
|
386 my $res = 0;
|
|
387 for (my $k=0;$k<@one;$k++) {
|
|
388 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
|
|
389 $one[$k] eq $two[$k]) {
|
|
390 $count++;
|
|
391 }
|
|
392 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) &&
|
|
393 $two[$k] ne '.' && $two[$k] ne '-' ) {
|
|
394 $res++;
|
|
395 }
|
|
396 }
|
|
397
|
|
398 my $ratio = 0;
|
|
399 $ratio = $count/$res unless $res == 0;
|
|
400
|
|
401 # if above threshold put in duplicate hash and push onto
|
|
402 # duplicate array for returning to get_unique
|
|
403 if ( $ratio > $perc ) {
|
|
404 print STDERR "duplicate!", $seq2->display_id, "\n" if $self->verbose > 0;
|
|
405 $duplicate{$seq2->display_id} = 1;
|
|
406 push @dups, $seq2;
|
|
407 }
|
|
408 }
|
|
409 }
|
|
410 foreach my $seq (@dups) {
|
|
411 $self->remove_seq($seq);
|
|
412 }
|
|
413 return @dups;
|
|
414 }
|
|
415
|
|
416 =head2 sort_alphabetically
|
|
417
|
|
418 Title : sort_alphabetically
|
|
419 Usage : $ali->sort_alphabetically
|
|
420 Function :
|
|
421
|
|
422 Changes the order of the alignemnt to alphabetical on name
|
|
423 followed by numerical by number.
|
|
424
|
|
425 Returns :
|
|
426 Argument :
|
|
427
|
|
428 =cut
|
|
429
|
|
430 sub sort_alphabetically {
|
|
431 my $self = shift;
|
|
432 my ($seq,$nse,@arr,%hash,$count);
|
|
433
|
|
434 foreach $seq ( $self->each_seq() ) {
|
|
435 $nse = $seq->get_nse;
|
|
436 $hash{$nse} = $seq;
|
|
437 }
|
|
438
|
|
439 $count = 0;
|
|
440
|
|
441 %{$self->{'_order'}} = (); # reset the hash;
|
|
442
|
|
443 foreach $nse ( sort _alpha_startend keys %hash) {
|
|
444 $self->{'_order'}->{$count} = $nse;
|
|
445
|
|
446 $count++;
|
|
447 }
|
|
448 1;
|
|
449 }
|
|
450
|
|
451 =head1 Sequence selection methods
|
|
452
|
|
453 Methods returning one or more sequences objects.
|
|
454
|
|
455 =head2 each_seq
|
|
456
|
|
457 Title : each_seq
|
|
458 Usage : foreach $seq ( $align->each_seq() )
|
|
459 Function : Gets an array of Seq objects from the alignment
|
|
460 Returns : an array
|
|
461 Argument :
|
|
462
|
|
463 =cut
|
|
464
|
|
465 sub eachSeq {
|
|
466 my $self = shift;
|
|
467 $self->warn(ref($self). "::eachSeq - deprecated method. Use each_seq() instead.");
|
|
468 $self->each_seq();
|
|
469 }
|
|
470
|
|
471 sub each_seq {
|
|
472 my $self = shift;
|
|
473 my (@arr,$order);
|
|
474
|
|
475 foreach $order ( sort { $a <=> $b } keys %{$self->{'_order'}} ) {
|
|
476 if( exists $self->{'_seq'}->{$self->{'_order'}->{$order}} ) {
|
|
477 push(@arr,$self->{'_seq'}->{$self->{'_order'}->{$order}});
|
|
478 }
|
|
479 }
|
|
480
|
|
481 return @arr;
|
|
482 }
|
|
483
|
|
484
|
|
485 =head2 each_alphabetically
|
|
486
|
|
487 Title : each_alphabetically
|
|
488 Usage : foreach $seq ( $ali->each_alphabetically() )
|
|
489 Function :
|
|
490
|
|
491 Returns an array of sequence object sorted alphabetically
|
|
492 by name and then by start point.
|
|
493 Does not change the order of the alignment
|
|
494
|
|
495 Returns :
|
|
496 Argument :
|
|
497
|
|
498 =cut
|
|
499
|
|
500 sub each_alphabetically {
|
|
501 my $self = shift;
|
|
502 my ($seq,$nse,@arr,%hash,$count);
|
|
503
|
|
504 foreach $seq ( $self->each_seq() ) {
|
|
505 $nse = $seq->get_nse;
|
|
506 $hash{$nse} = $seq;
|
|
507 }
|
|
508
|
|
509 foreach $nse ( sort _alpha_startend keys %hash) {
|
|
510 push(@arr,$hash{$nse});
|
|
511 }
|
|
512
|
|
513 return @arr;
|
|
514
|
|
515 }
|
|
516
|
|
517 sub _alpha_startend {
|
|
518 my ($aname,$astart,$bname,$bstart);
|
|
519 ($aname,$astart) = split (/-/,$a);
|
|
520 ($bname,$bstart) = split (/-/,$b);
|
|
521
|
|
522 if( $aname eq $bname ) {
|
|
523 return $astart <=> $bstart;
|
|
524 }
|
|
525 else {
|
|
526 return $aname cmp $bname;
|
|
527 }
|
|
528
|
|
529 }
|
|
530
|
|
531 =head2 each_seq_with_id
|
|
532
|
|
533 Title : each_seq_with_id
|
|
534 Usage : foreach $seq ( $align->each_seq_with_id() )
|
|
535 Function :
|
|
536
|
|
537 Gets an array of Seq objects from the
|
|
538 alignment, the contents being those sequences
|
|
539 with the given name (there may be more than one)
|
|
540
|
|
541 Returns : an array
|
|
542 Argument : a seq name
|
|
543
|
|
544 =cut
|
|
545
|
|
546 sub eachSeqWithId {
|
|
547 my $self = shift;
|
|
548 $self->warn(ref($self). "::eachSeqWithId - deprecated method. Use each_seq_with_id() instead.");
|
|
549 $self->each_seq_with_id(@_);
|
|
550 }
|
|
551
|
|
552 sub each_seq_with_id {
|
|
553 my $self = shift;
|
|
554 my $id = shift;
|
|
555
|
|
556 $self->throw("Method each_seq_with_id needs a sequence name argument")
|
|
557 unless defined $id;
|
|
558
|
|
559 my (@arr, $seq);
|
|
560
|
|
561 if (exists($self->{'_start_end_lists'}->{$id})) {
|
|
562 @arr = @{$self->{'_start_end_lists'}->{$id}};
|
|
563 }
|
|
564 return @arr;
|
|
565 }
|
|
566
|
|
567 =head2 get_seq_by_pos
|
|
568
|
|
569 Title : get_seq_by_pos
|
|
570 Usage : $seq = $aln->get_seq_by_pos(3) # third sequence from the alignment
|
|
571 Function :
|
|
572
|
|
573 Gets a sequence based on its position in the alignment.
|
|
574 Numbering starts from 1. Sequence positions larger than
|
|
575 no_sequences() will thow an error.
|
|
576
|
|
577 Returns : a Bio::LocatableSeq object
|
|
578 Args : positive integer for the sequence osition
|
|
579
|
|
580 =cut
|
|
581
|
|
582 sub get_seq_by_pos {
|
|
583
|
|
584 my $self = shift;
|
|
585 my ($pos) = @_;
|
|
586
|
|
587 $self->throw("Sequence position has to be a positive integer, not [$pos]")
|
|
588 unless $pos =~ /^\d+$/ and $pos > 0;
|
|
589 $self->throw("No sequence at position [$pos]")
|
|
590 unless $pos <= $self->no_sequences ;
|
|
591
|
|
592 my $nse = $self->{'_order'}->{--$pos};
|
|
593 return $self->{'_seq'}->{$nse};
|
|
594 }
|
|
595
|
|
596 =head1 Create new alignments
|
|
597
|
|
598 The result of these methods are horizontal or vertical subsets of the
|
|
599 current MSE.
|
|
600
|
|
601 =head2 select
|
|
602
|
|
603 Title : select
|
|
604 Usage : $aln2 = $aln->select(1, 3) # three first sequences
|
|
605 Function :
|
|
606
|
|
607 Creates a new alignment from a continuous subset of
|
|
608 sequences. Numbering starts from 1. Sequence positions
|
|
609 larger than no_sequences() will thow an error.
|
|
610
|
|
611 Returns : a Bio::SimpleAlign object
|
|
612 Args : positive integer for the first sequence
|
|
613 positive integer for the last sequence to include (optional)
|
|
614
|
|
615 =cut
|
|
616
|
|
617 sub select {
|
|
618 my $self = shift;
|
|
619 my ($start, $end) = @_;
|
|
620
|
|
621 $self->throw("Select start has to be a positive integer, not [$start]")
|
|
622 unless $start =~ /^\d+$/ and $start > 0;
|
|
623 $self->throw("Select end has to be a positive integer, not [$end]")
|
|
624 unless $end =~ /^\d+$/ and $end > 0;
|
|
625 $self->throw("Select $start [$start] has to be smaller than or equal to end [$end]")
|
|
626 unless $start <= $end;
|
|
627
|
|
628 my $aln = new $self;
|
|
629 foreach my $pos ($start .. $end) {
|
|
630 $aln->add_seq($self->get_seq_by_pos($pos));
|
|
631 }
|
|
632 $aln->id($self->id);
|
|
633 return $aln;
|
|
634 }
|
|
635
|
|
636 =head2 select_noncont
|
|
637
|
|
638 Title : select_noncont
|
|
639 Usage : $aln2 = $aln->select_noncont(1, 3) # first and 3rd sequences
|
|
640 Function :
|
|
641
|
|
642 Creates a new alignment from a subset of
|
|
643 sequences. Numbering starts from 1. Sequence positions
|
|
644 larger than no_sequences() will thow an error.
|
|
645
|
|
646 Returns : a Bio::SimpleAlign object
|
|
647 Args : array of integers for the sequences
|
|
648
|
|
649 =cut
|
|
650
|
|
651 sub select_noncont {
|
|
652 my $self = shift;
|
|
653 my (@pos) = @_;
|
|
654 my $end = $self->no_sequences;
|
|
655 foreach ( @pos ) {
|
|
656 $self->throw("position must be a positive integer, > 0 and <= $end not [$_]")
|
|
657 unless( /^\d+$/ && $_ > 0 && $_ <= $end );
|
|
658 }
|
|
659 my $aln = new $self;
|
|
660 foreach my $p (@pos) {
|
|
661 $aln->add_seq($self->get_seq_by_pos($p));
|
|
662 }
|
|
663 $aln->id($self->id);
|
|
664 return $aln;
|
|
665 }
|
|
666
|
|
667 =head2 slice
|
|
668
|
|
669 Title : slice
|
|
670 Usage : $aln2 = $aln->slice(20, 30)
|
|
671 Function :
|
|
672
|
|
673 Creates a slice from the alignment inclusive of start and
|
|
674 end columns. Sequences with no residues in the slice are
|
|
675 excluded from the new alignment and a warning is printed.
|
|
676 Slice beyond the length of the sequence does not do
|
|
677 padding.
|
|
678
|
|
679 Returns : a Bio::SimpleAlign object
|
|
680 Args : positive integer for start column
|
|
681 positive integer for end column
|
|
682
|
|
683 =cut
|
|
684
|
|
685 sub slice {
|
|
686 my $self = shift;
|
|
687 my ($start, $end) = @_;
|
|
688
|
|
689 $self->throw("Slice start has to be a positive integer, not [$start]")
|
|
690 unless $start =~ /^\d+$/ and $start > 0;
|
|
691 $self->throw("Slice end has to be a positive integer, not [$end]")
|
|
692 unless $end =~ /^\d+$/ and $end > 0;
|
|
693 $self->throw("Slice $start [$start] has to be smaller than or equal to end [$end]")
|
|
694 unless $start <= $end;
|
|
695 my $aln_length = $self->length;
|
|
696 $self->throw("This alignment has only ". $self->length.
|
|
697 " residues. Slice start [$start] is too bigger.")
|
|
698 if $start > $self->length;
|
|
699
|
|
700 my $aln = new $self;
|
|
701 $aln->id($self->id);
|
|
702 foreach my $seq ( $self->each_seq() ) {
|
|
703
|
|
704 my $new_seq = new Bio::LocatableSeq (-id => $seq->id);
|
|
705
|
|
706 # seq
|
|
707 my $seq_end = $end;
|
|
708 $seq_end = $seq->length if $end > $seq->length;
|
|
709 my $slice_seq = $seq->subseq($start, $seq_end);
|
|
710 $new_seq->seq( $slice_seq );
|
|
711
|
|
712 # start
|
|
713 if ($start > 1) {
|
|
714 my $pre_start_seq = $seq->subseq(1, $start - 1);
|
|
715 $pre_start_seq =~ s/\W//g; #print "$pre_start_seq\n";
|
|
716 $new_seq->start( $seq->start + CORE::length($pre_start_seq) );
|
|
717 } else {
|
|
718 $new_seq->start( $seq->start);
|
|
719 }
|
|
720
|
|
721 # end
|
|
722 $slice_seq =~ s/\W//g;
|
|
723 $new_seq->end( $new_seq->start + CORE::length($slice_seq) - 1 );
|
|
724
|
|
725 if ($new_seq->start and $new_seq->end >= $new_seq->start) {
|
|
726 $aln->add_seq($new_seq);
|
|
727 } else {
|
|
728 my $nse = $seq->get_nse();
|
|
729 $self->warn("Slice [$start-$end] of sequence [$nse] contains no residues.".
|
|
730 " Sequence excluded from the new alignment.");
|
|
731 }
|
|
732
|
|
733 }
|
|
734
|
|
735 return $aln;
|
|
736 }
|
|
737
|
|
738 =head2 remove_columns
|
|
739
|
|
740 Title : remove_column
|
|
741 Usage : $aln2 = $aln->remove_columns(['mismatch','weak'])
|
|
742 Function :
|
|
743 Creates an aligment with columns removed corresponding to
|
|
744 the specified criteria.
|
|
745 Returns : a L<Bio::SimpleAlign> object
|
|
746 Args : array ref of types, 'match'|'weak'|'strong'|'mismatch'
|
|
747
|
|
748 =cut
|
|
749
|
|
750 sub remove_columns{
|
|
751 my ($self,$type) = @_;
|
|
752 my %matchchars = ( 'match' => '\*',
|
|
753 'weak' => '\.',
|
|
754 'strong' => ':',
|
|
755 'mismatch'=> ' ',
|
|
756 );
|
|
757 #get the characters to delete against
|
|
758 my $del_char;
|
|
759 foreach my $type(@{$type}){
|
|
760 $del_char.= $matchchars{$type};
|
|
761 }
|
|
762
|
|
763 my $match_line = $self->match_line;
|
|
764 my $aln = new $self;
|
|
765
|
|
766 my @remove;
|
|
767 my $length = 0;
|
|
768
|
|
769 #do the matching to get the segments to remove
|
|
770 while($match_line=~m/[$del_char]/g){
|
|
771 my $start = pos($match_line)-1;
|
|
772 $match_line=~/\G[$del_char]+/gc;
|
|
773 my $end = pos($match_line)-1;
|
|
774
|
|
775 #have to offset the start and end for subsequent removes
|
|
776 $start-=$length;
|
|
777 $end -=$length;
|
|
778 $length += ($end-$start+1);
|
|
779 push @remove, [$start,$end];
|
|
780 }
|
|
781
|
|
782 #remove the segments
|
|
783 $aln = $self->_remove_col($aln,\@remove);
|
|
784
|
|
785 return $aln;
|
|
786 }
|
|
787
|
|
788 sub _remove_col {
|
|
789 my ($self,$aln,$remove) = @_;
|
|
790 my @new;
|
|
791
|
|
792 #splice out the segments and create new seq
|
|
793 foreach my $seq($self->each_seq){
|
|
794 my $new_seq = new Bio::LocatableSeq(-id=>$seq->id);
|
|
795 my $sequence;
|
|
796 foreach my $pair(@{$remove}){
|
|
797 my $start = $pair->[0];
|
|
798 my $end = $pair->[1];
|
|
799 $sequence = $seq->seq unless $sequence;
|
|
800 my $spliced;
|
|
801 $spliced .= $start > 0 ? substr($sequence,0,$start) : '';
|
|
802 $spliced .= substr($sequence,$end+1,$seq->length-$end+1);
|
|
803 $sequence = $spliced;
|
|
804 if ($start == 1) {
|
|
805 $new_seq->start($end);
|
|
806 }
|
|
807 else {
|
|
808 $new_seq->start( $seq->start);
|
|
809 }
|
|
810 # end
|
|
811 if($end >= $seq->end){
|
|
812 $new_seq->end( $start);
|
|
813 }
|
|
814 else {
|
|
815 $new_seq->end($seq->end);
|
|
816 }
|
|
817 }
|
|
818 $new_seq->seq($sequence);
|
|
819 push @new, $new_seq;
|
|
820 }
|
|
821 #add the new seqs to the alignment
|
|
822 foreach my $new(@new){
|
|
823 $aln->add_seq($new);
|
|
824 }
|
|
825 return $aln;
|
|
826 }
|
|
827
|
|
828 =head1 Change sequences within the MSE
|
|
829
|
|
830 These methods affect characters in all sequences without changeing the
|
|
831 alignment.
|
|
832
|
|
833
|
|
834 =head2 map_chars
|
|
835
|
|
836 Title : map_chars
|
|
837 Usage : $ali->map_chars('\.','-')
|
|
838 Function :
|
|
839
|
|
840 Does a s/$arg1/$arg2/ on the sequences. Useful for gap
|
|
841 characters
|
|
842
|
|
843 Notice that the from (arg1) is interpretted as a regex,
|
|
844 so be careful about quoting meta characters (eg
|
|
845 $ali->map_chars('.','-') wont do what you want)
|
|
846
|
|
847 Returns :
|
|
848 Argument : 'from' rexexp
|
|
849 'to' string
|
|
850
|
|
851 =cut
|
|
852
|
|
853 sub map_chars {
|
|
854 my $self = shift;
|
|
855 my $from = shift;
|
|
856 my $to = shift;
|
|
857 my ($seq,$temp);
|
|
858
|
|
859 $self->throw("Need exactly two arguments")
|
|
860 unless defined $from and defined $to;
|
|
861
|
|
862 foreach $seq ( $self->each_seq() ) {
|
|
863 $temp = $seq->seq();
|
|
864 $temp =~ s/$from/$to/g;
|
|
865 $seq->seq($temp);
|
|
866 }
|
|
867 return 1;
|
|
868 }
|
|
869
|
|
870
|
|
871 =head2 uppercase
|
|
872
|
|
873 Title : uppercase()
|
|
874 Usage : $ali->uppercase()
|
|
875 Function : Sets all the sequences to uppercase
|
|
876 Returns :
|
|
877 Argument :
|
|
878
|
|
879 =cut
|
|
880
|
|
881 sub uppercase {
|
|
882 my $self = shift;
|
|
883 my $seq;
|
|
884 my $temp;
|
|
885
|
|
886 foreach $seq ( $self->each_seq() ) {
|
|
887 $temp = $seq->seq();
|
|
888 $temp =~ tr/[a-z]/[A-Z]/;
|
|
889
|
|
890 $seq->seq($temp);
|
|
891 }
|
|
892 return 1;
|
|
893 }
|
|
894
|
|
895 =head2 cigar_line
|
|
896
|
|
897 Title : cigar_line()
|
|
898 Usage : $align->cigar_line()
|
|
899 Function : Generates a "cigar" line for each sequence in the alignment
|
|
900 The format is simply A-1,60;B-1,1:4,60;C-5,10:12,58
|
|
901 where A,B,C,etc. are the sequence identifiers, and the numbers
|
|
902 refer to conserved positions within the alignment
|
|
903 Args : none
|
|
904
|
|
905 =cut
|
|
906
|
|
907 sub cigar_line {
|
|
908 my ($self) = @_;
|
|
909
|
|
910 my %cigar;
|
|
911 my %clines;
|
|
912 my @seqchars;
|
|
913 my $seqcount = 0;
|
|
914 my $sc;
|
|
915 foreach my $seq ( $self->each_seq ) {
|
|
916 push @seqchars, [ split(//, uc ($seq->seq)) ];
|
|
917 $sc = scalar(@seqchars);
|
|
918 }
|
|
919
|
|
920 foreach my $pos ( 0..$self->length ) {
|
|
921 my $i=0;
|
|
922 foreach my $seq ( @seqchars ) {
|
|
923 $i++;
|
|
924 # print STDERR "Seq $i at pos $pos: ".$seq->[$pos]."\n";
|
|
925 if ($seq->[$pos] eq '.') {
|
|
926 if (defined $cigar{$i} && $clines{$i} !~ $cigar{$i}) {
|
|
927 $clines{$i}.=$cigar{$i};
|
|
928 }
|
|
929 }
|
|
930 else {
|
|
931 if (! defined $cigar{$i}) {
|
|
932 $clines{$i}.=($pos+1).",";
|
|
933 }
|
|
934 $cigar{$i}=$pos+1;
|
|
935 }
|
|
936 if ($pos+1 == $self->length && ($clines{$i} =~ /\,$/) ) {
|
|
937 $clines{$i}.=$cigar{$i};
|
|
938 }
|
|
939 }
|
|
940 }
|
|
941 for(my $i=1; $i<$sc+1;$i++) {
|
|
942 print STDERR "Seq $i cigar line ".$clines{$i}."\n";
|
|
943 }
|
|
944 return %clines;
|
|
945 }
|
|
946
|
|
947 =head2 match_line
|
|
948
|
|
949 Title : match_line()
|
|
950 Usage : $align->match_line()
|
|
951 Function : Generates a match line - much like consensus string
|
|
952 except that a line indicating the '*' for a match.
|
|
953 Args : (optional) Match line characters ('*' by default)
|
|
954 (optional) Strong match char (':' by default)
|
|
955 (optional) Weak match char ('.' by default)
|
|
956
|
|
957 =cut
|
|
958
|
|
959 sub match_line {
|
|
960 my ($self,$matchlinechar, $strong, $weak) = @_;
|
|
961 my %matchchars = ( 'match' => $matchlinechar || '*',
|
|
962 'weak' => $weak || '.',
|
|
963 'strong' => $strong || ':',
|
|
964 'mismatch'=> ' ',
|
|
965 );
|
|
966
|
|
967
|
|
968 my @seqchars;
|
|
969 my $seqcount = 0;
|
|
970 my $alphabet;
|
|
971 foreach my $seq ( $self->each_seq ) {
|
|
972 push @seqchars, [ split(//, uc ($seq->seq)) ];
|
|
973 $alphabet = $seq->alphabet unless defined $alphabet;
|
|
974 }
|
|
975 my $refseq = shift @seqchars;
|
|
976 # let's just march down the columns
|
|
977 my $matchline;
|
|
978 POS: foreach my $pos ( 0..$self->length ) {
|
|
979 my $refchar = $refseq->[$pos];
|
|
980 next unless $refchar; # skip ''
|
|
981 my %col = ($refchar => 1);
|
|
982 my $dash = ($refchar eq '-' || $refchar eq '.' || $refchar eq ' ');
|
|
983 foreach my $seq ( @seqchars ) {
|
|
984 $dash = 1 if( $seq->[$pos] eq '-' || $seq->[$pos] eq '.' ||
|
|
985 $seq->[$pos] eq ' ' );
|
|
986 $col{$seq->[$pos]}++;
|
|
987 }
|
|
988 my @colresidues = sort keys %col;
|
|
989 my $char = $matchchars{'mismatch'};
|
|
990 # if all the values are the same
|
|
991 if( $dash ) { $char = $matchchars{'mismatch'} }
|
|
992 elsif( @colresidues == 1 ) { $char = $matchchars{'match'} }
|
|
993 elsif( $alphabet eq 'protein' ) { # only try to do weak/strong
|
|
994 # matches for protein seqs
|
|
995 TYPE: foreach my $type ( qw(strong weak) ) {
|
|
996 # iterate through categories
|
|
997 my %groups;
|
|
998 # iterate through each of the aa in the col
|
|
999 # look to see which groups it is in
|
|
1000 foreach my $c ( @colresidues ) {
|
|
1001 foreach my $f ( grep /\Q$c/, @{$CONSERVATION_GROUPS{$type}} ) {
|
|
1002 push @{$groups{$f}},$c;
|
|
1003 }
|
|
1004 }
|
|
1005 GRP: foreach my $cols ( values %groups ) {
|
|
1006 @$cols = sort @$cols;
|
|
1007 # now we are just testing to see if two arrays
|
|
1008 # are identical w/o changing either one
|
|
1009
|
|
1010 # have to be same len
|
|
1011 next if( scalar @$cols != scalar @colresidues );
|
|
1012 # walk down the length and check each slot
|
|
1013 for($_=0;$_ < (scalar @$cols);$_++ ) {
|
|
1014 next GRP if( $cols->[$_] ne $colresidues[$_] );
|
|
1015 }
|
|
1016 $char = $matchchars{$type};
|
|
1017 last TYPE;
|
|
1018 }
|
|
1019 }
|
|
1020 }
|
|
1021 $matchline .= $char;
|
|
1022 }
|
|
1023 return $matchline;
|
|
1024 }
|
|
1025
|
|
1026 =head2 match
|
|
1027
|
|
1028 Title : match()
|
|
1029 Usage : $ali->match()
|
|
1030 Function :
|
|
1031
|
|
1032 Goes through all columns and changes residues that are
|
|
1033 identical to residue in first sequence to match '.'
|
|
1034 character. Sets match_char.
|
|
1035
|
|
1036 USE WITH CARE: Most MSE formats do not support match
|
|
1037 characters in sequences, so this is mostly for output
|
|
1038 only. NEXUS format (Bio::AlignIO::nexus) can handle
|
|
1039 it.
|
|
1040
|
|
1041 Returns : 1
|
|
1042 Argument : a match character, optional, defaults to '.'
|
|
1043
|
|
1044 =cut
|
|
1045
|
|
1046 sub match {
|
|
1047 my ($self, $match) = @_;
|
|
1048
|
|
1049 $match ||= '.';
|
|
1050 my ($matching_char) = $match;
|
|
1051 $matching_char = "\\$match" if $match =~ /[\^.$|()\[\]]/ ; #';
|
|
1052 $self->map_chars($matching_char, '-');
|
|
1053
|
|
1054 my @seqs = $self->each_seq();
|
|
1055 return 1 unless scalar @seqs > 1;
|
|
1056
|
|
1057 my $refseq = shift @seqs ;
|
|
1058 my @refseq = split //, $refseq->seq;
|
|
1059 my $gapchar = $self->gap_char;
|
|
1060
|
|
1061 foreach my $seq ( @seqs ) {
|
|
1062 my @varseq = split //, $seq->seq();
|
|
1063 for ( my $i=0; $i < scalar @varseq; $i++) {
|
|
1064 $varseq[$i] = $match if defined $refseq[$i] &&
|
|
1065 ( $refseq[$i] =~ /[A-Za-z\*]/ ||
|
|
1066 $refseq[$i] =~ /$gapchar/ )
|
|
1067 && $refseq[$i] eq $varseq[$i];
|
|
1068 }
|
|
1069 $seq->seq(join '', @varseq);
|
|
1070 }
|
|
1071 $self->match_char($match);
|
|
1072 return 1;
|
|
1073 }
|
|
1074
|
|
1075
|
|
1076 =head2 unmatch
|
|
1077
|
|
1078 Title : unmatch()
|
|
1079 Usage : $ali->unmatch()
|
|
1080 Function : Undoes the effect of method match. Unsets match_char.
|
|
1081
|
|
1082 Returns : 1
|
|
1083 Argument : a match character, optional, defaults to '.'
|
|
1084
|
|
1085 See L<match> and L<match_char>
|
|
1086
|
|
1087 =cut
|
|
1088
|
|
1089 sub unmatch {
|
|
1090 my ($self, $match) = @_;
|
|
1091
|
|
1092 $match ||= '.';
|
|
1093
|
|
1094 my @seqs = $self->each_seq();
|
|
1095 return 1 unless scalar @seqs > 1;
|
|
1096
|
|
1097 my $refseq = shift @seqs ;
|
|
1098 my @refseq = split //, $refseq->seq;
|
|
1099 my $gapchar = $self->gap_char;
|
|
1100 foreach my $seq ( @seqs ) {
|
|
1101 my @varseq = split //, $seq->seq();
|
|
1102 for ( my $i=0; $i < scalar @varseq; $i++) {
|
|
1103 $varseq[$i] = $refseq[$i] if defined $refseq[$i] &&
|
|
1104 ( $refseq[$i] =~ /[A-Za-z\*]/ ||
|
|
1105 $refseq[$i] =~ /$gapchar/ ) &&
|
|
1106 $varseq[$i] eq $match;
|
|
1107 }
|
|
1108 $seq->seq(join '', @varseq);
|
|
1109 }
|
|
1110 $self->match_char('');
|
|
1111 return 1;
|
|
1112 }
|
|
1113
|
|
1114 =head1 MSE attibutes
|
|
1115
|
|
1116 Methods for setting and reading the MSE attributes.
|
|
1117
|
|
1118 Note that the methods defining character semantics depend on the user
|
|
1119 to set them sensibly. They are needed only by certain input/output
|
|
1120 methods. Unset them by setting to an empty string ('').
|
|
1121
|
|
1122 =head2 id
|
|
1123
|
|
1124 Title : id
|
|
1125 Usage : $myalign->id("Ig")
|
|
1126 Function : Gets/sets the id field of the alignment
|
|
1127 Returns : An id string
|
|
1128 Argument : An id string (optional)
|
|
1129
|
|
1130 =cut
|
|
1131
|
|
1132 sub id {
|
|
1133 my ($self, $name) = @_;
|
|
1134
|
|
1135 if (defined( $name )) {
|
|
1136 $self->{'_id'} = $name;
|
|
1137 }
|
|
1138
|
|
1139 return $self->{'_id'};
|
|
1140 }
|
|
1141
|
|
1142 =head2 missing_char
|
|
1143
|
|
1144 Title : missing_char
|
|
1145 Usage : $myalign->missing_char("?")
|
|
1146 Function : Gets/sets the missing_char attribute of the alignment
|
|
1147 It is generally recommended to set it to 'n' or 'N'
|
|
1148 for nucleotides and to 'X' for protein.
|
|
1149 Returns : An missing_char string,
|
|
1150 Argument : An missing_char string (optional)
|
|
1151
|
|
1152 =cut
|
|
1153
|
|
1154 sub missing_char {
|
|
1155 my ($self, $char) = @_;
|
|
1156
|
|
1157 if (defined $char ) {
|
|
1158 $self->throw("Single missing character, not [$char]!") if CORE::length($char) > 1;
|
|
1159 $self->{'_missing_char'} = $char;
|
|
1160 }
|
|
1161
|
|
1162 return $self->{'_missing_char'};
|
|
1163 }
|
|
1164
|
|
1165 =head2 match_char
|
|
1166
|
|
1167 Title : match_char
|
|
1168 Usage : $myalign->match_char('.')
|
|
1169 Function : Gets/sets the match_char attribute of the alignment
|
|
1170 Returns : An match_char string,
|
|
1171 Argument : An match_char string (optional)
|
|
1172
|
|
1173 =cut
|
|
1174
|
|
1175 sub match_char {
|
|
1176 my ($self, $char) = @_;
|
|
1177
|
|
1178 if (defined $char ) {
|
|
1179 $self->throw("Single match character, not [$char]!") if CORE::length($char) > 1;
|
|
1180 $self->{'_match_char'} = $char;
|
|
1181 }
|
|
1182
|
|
1183 return $self->{'_match_char'};
|
|
1184 }
|
|
1185
|
|
1186 =head2 gap_char
|
|
1187
|
|
1188 Title : gap_char
|
|
1189 Usage : $myalign->gap_char('-')
|
|
1190 Function : Gets/sets the gap_char attribute of the alignment
|
|
1191 Returns : An gap_char string, defaults to '-'
|
|
1192 Argument : An gap_char string (optional)
|
|
1193
|
|
1194 =cut
|
|
1195
|
|
1196 sub gap_char {
|
|
1197 my ($self, $char) = @_;
|
|
1198
|
|
1199 if (defined $char || ! defined $self->{'_gap_char'} ) {
|
|
1200 $char= '-' unless defined $char;
|
|
1201 $self->throw("Single gap character, not [$char]!") if CORE::length($char) > 1;
|
|
1202 $self->{'_gap_char'} = $char;
|
|
1203 }
|
|
1204 return $self->{'_gap_char'};
|
|
1205 }
|
|
1206
|
|
1207 =head2 symbol_chars
|
|
1208
|
|
1209 Title : symbol_chars
|
|
1210 Usage : my @symbolchars = $aln->symbol_chars;
|
|
1211 Function: Returns all the seen symbols (other than gaps)
|
|
1212 Returns : array of characters that are the seen symbols
|
|
1213 Args : boolean to include the gap/missing/match characters
|
|
1214
|
|
1215 =cut
|
|
1216
|
|
1217 sub symbol_chars{
|
|
1218 my ($self,$includeextra) = @_;
|
|
1219 if( ! defined $self->{'_symbols'} ) {
|
|
1220 $self->warn("Symbol list was not initialized");
|
|
1221 return ();
|
|
1222 }
|
|
1223 my %copy = %{$self->{'_symbols'}};
|
|
1224 if( ! $includeextra ) {
|
|
1225 foreach my $char ( $self->gap_char, $self->match_char,
|
|
1226 $self->missing_char) {
|
|
1227 delete $copy{$char} if( defined $char );
|
|
1228 }
|
|
1229 }
|
|
1230 return keys %copy;
|
|
1231 }
|
|
1232
|
|
1233 =head1 Alignment descriptors
|
|
1234
|
|
1235 These read only methods describe the MSE in various ways.
|
|
1236
|
|
1237
|
|
1238 =head2 consensus_string
|
|
1239
|
|
1240 Title : consensus_string
|
|
1241 Usage : $str = $ali->consensus_string($threshold_percent)
|
|
1242 Function : Makes a strict consensus
|
|
1243 Returns :
|
|
1244 Argument : Optional treshold ranging from 0 to 100.
|
|
1245 The consensus residue has to appear at least threshold %
|
|
1246 of the sequences at a given location, otherwise a '?'
|
|
1247 character will be placed at that location.
|
|
1248 (Default value = 0%)
|
|
1249
|
|
1250 =cut
|
|
1251
|
|
1252 sub consensus_string {
|
|
1253 my $self = shift;
|
|
1254 my $threshold = shift;
|
|
1255 my $len;
|
|
1256 my ($out,$count);
|
|
1257
|
|
1258 $out = "";
|
|
1259
|
|
1260 $len = $self->length - 1;
|
|
1261
|
|
1262 foreach $count ( 0 .. $len ) {
|
|
1263 $out .= $self->_consensus_aa($count,$threshold);
|
|
1264 }
|
|
1265 return $out;
|
|
1266 }
|
|
1267
|
|
1268 sub _consensus_aa {
|
|
1269 my $self = shift;
|
|
1270 my $point = shift;
|
|
1271 my $threshold_percent = shift || -1 ;
|
|
1272 my ($seq,%hash,$count,$letter,$key);
|
|
1273
|
|
1274 foreach $seq ( $self->each_seq() ) {
|
|
1275 $letter = substr($seq->seq,$point,1);
|
|
1276 $self->throw("--$point-----------") if $letter eq '';
|
|
1277 ($letter =~ /\./) && next;
|
|
1278 # print "Looking at $letter\n";
|
|
1279 $hash{$letter}++;
|
|
1280 }
|
|
1281 my $number_of_sequences = $self->no_sequences();
|
|
1282 my $threshold = $number_of_sequences * $threshold_percent / 100. ;
|
|
1283 $count = -1;
|
|
1284 $letter = '?';
|
|
1285
|
|
1286 foreach $key ( sort keys %hash ) {
|
|
1287 # print "Now at $key $hash{$key}\n";
|
|
1288 if( $hash{$key} > $count && $hash{$key} >= $threshold) {
|
|
1289 $letter = $key;
|
|
1290 $count = $hash{$key};
|
|
1291 }
|
|
1292 }
|
|
1293 return $letter;
|
|
1294 }
|
|
1295
|
|
1296
|
|
1297 =head2 consensus_iupac
|
|
1298
|
|
1299 Title : consensus_iupac
|
|
1300 Usage : $str = $ali->consensus_iupac()
|
|
1301 Function :
|
|
1302
|
|
1303 Makes a consensus using IUPAC ambiguity codes from DNA
|
|
1304 and RNA. The output is in upper case except when gaps in
|
|
1305 a column force output to be in lower case.
|
|
1306
|
|
1307 Note that if your alignment sequences contain a lot of
|
|
1308 IUPAC ambiquity codes you often have to manually set
|
|
1309 alphabet. Bio::PrimarySeq::_guess_type thinks they
|
|
1310 indicate a protein sequence.
|
|
1311
|
|
1312 Returns : consensus string
|
|
1313 Argument : none
|
|
1314 Throws : on protein sequences
|
|
1315
|
|
1316 =cut
|
|
1317
|
|
1318 sub consensus_iupac {
|
|
1319 my $self = shift;
|
|
1320 my $out = "";
|
|
1321 my $len = $self->length-1;
|
|
1322
|
|
1323 # only DNA and RNA sequences are valid
|
|
1324 foreach my $seq ( $self->each_seq() ) {
|
|
1325 $self->throw("Seq [". $seq->get_nse. "] is a protein")
|
|
1326 if $seq->alphabet eq 'protein';
|
|
1327 }
|
|
1328 # loop over the alignment columns
|
|
1329 foreach my $count ( 0 .. $len ) {
|
|
1330 $out .= $self->_consensus_iupac($count);
|
|
1331 }
|
|
1332 return $out;
|
|
1333 }
|
|
1334
|
|
1335 sub _consensus_iupac {
|
|
1336 my ($self, $column) = @_;
|
|
1337 my ($string, $char, $rna);
|
|
1338
|
|
1339 #determine all residues in a column
|
|
1340 foreach my $seq ( $self->each_seq() ) {
|
|
1341 $string .= substr($seq->seq, $column, 1);
|
|
1342 }
|
|
1343 $string = uc $string;
|
|
1344
|
|
1345 # quick exit if there's an N in the string
|
|
1346 if ($string =~ /N/) {
|
|
1347 $string =~ /\W/ ? return 'n' : return 'N';
|
|
1348 }
|
|
1349 # ... or if there are only gap characters
|
|
1350 return '-' if $string =~ /^\W+$/;
|
|
1351
|
|
1352 # treat RNA as DNA in regexps
|
|
1353 if ($string =~ /U/) {
|
|
1354 $string =~ s/U/T/;
|
|
1355 $rna = 1;
|
|
1356 }
|
|
1357
|
|
1358 # the following s///'s only need to be done to the _first_ ambiguity code
|
|
1359 # as we only need to see the _range_ of characters in $string
|
|
1360
|
|
1361 if ($string =~ /[VDHB]/) {
|
|
1362 $string =~ s/V/AGC/;
|
|
1363 $string =~ s/D/AGT/;
|
|
1364 $string =~ s/H/ACT/;
|
|
1365 $string =~ s/B/CTG/;
|
|
1366 }
|
|
1367
|
|
1368 if ($string =~ /[SKYRWM]/) {
|
|
1369 $string =~ s/S/GC/;
|
|
1370 $string =~ s/K/GT/;
|
|
1371 $string =~ s/Y/CT/;
|
|
1372 $string =~ s/R/AG/;
|
|
1373 $string =~ s/W/AT/;
|
|
1374 $string =~ s/M/AC/;
|
|
1375 }
|
|
1376
|
|
1377 # and now the guts of the thing
|
|
1378
|
|
1379 if ($string =~ /A/) {
|
|
1380 $char = 'A'; # A A
|
|
1381 if ($string =~ /G/) {
|
|
1382 $char = 'R'; # A and G (purines) R
|
|
1383 if ($string =~ /C/) {
|
|
1384 $char = 'V'; # A and G and C V
|
|
1385 if ($string =~ /T/) {
|
|
1386 $char = 'N'; # A and G and C and T N
|
|
1387 }
|
|
1388 } elsif ($string =~ /T/) {
|
|
1389 $char = 'D'; # A and G and T D
|
|
1390 }
|
|
1391 } elsif ($string =~ /C/) {
|
|
1392 $char = 'M'; # A and C M
|
|
1393 if ($string =~ /T/) {
|
|
1394 $char = 'H'; # A and C and T H
|
|
1395 }
|
|
1396 } elsif ($string =~ /T/) {
|
|
1397 $char = 'W'; # A and T W
|
|
1398 }
|
|
1399 } elsif ($string =~ /C/) {
|
|
1400 $char = 'C'; # C C
|
|
1401 if ($string =~ /T/) {
|
|
1402 $char = 'Y'; # C and T (pyrimidines) Y
|
|
1403 if ($string =~ /G/) {
|
|
1404 $char = 'B'; # C and T and G B
|
|
1405 }
|
|
1406 } elsif ($string =~ /G/) {
|
|
1407 $char = 'S'; # C and G S
|
|
1408 }
|
|
1409 } elsif ($string =~ /G/) {
|
|
1410 $char = 'G'; # G G
|
|
1411 if ($string =~ /C/) {
|
|
1412 $char = 'S'; # G and C S
|
|
1413 } elsif ($string =~ /T/) {
|
|
1414 $char = 'K'; # G and T K
|
|
1415 }
|
|
1416 } elsif ($string =~ /T/) {
|
|
1417 $char = 'T'; # T T
|
|
1418 }
|
|
1419
|
|
1420 $char = 'U' if $rna and $char eq 'T';
|
|
1421 $char = lc $char if $string =~ /\W/;
|
|
1422
|
|
1423 return $char;
|
|
1424 }
|
|
1425
|
|
1426 =head2 is_flush
|
|
1427
|
|
1428 Title : is_flush
|
|
1429 Usage : if( $ali->is_flush() )
|
|
1430 :
|
|
1431 :
|
|
1432 Function : Tells you whether the alignment
|
|
1433 : is flush, ie all of the same length
|
|
1434 :
|
|
1435 :
|
|
1436 Returns : 1 or 0
|
|
1437 Argument :
|
|
1438
|
|
1439 =cut
|
|
1440
|
|
1441 sub is_flush {
|
|
1442 my ($self,$report) = @_;
|
|
1443 my $seq;
|
|
1444 my $length = (-1);
|
|
1445 my $temp;
|
|
1446
|
|
1447 foreach $seq ( $self->each_seq() ) {
|
|
1448 if( $length == (-1) ) {
|
|
1449 $length = CORE::length($seq->seq());
|
|
1450 next;
|
|
1451 }
|
|
1452
|
|
1453 $temp = CORE::length($seq->seq());
|
|
1454 if( $temp != $length ) {
|
|
1455 $self->warn("expecting $length not $temp from ".
|
|
1456 $seq->display_id) if( $report );
|
|
1457 $self->debug("expecting $length not $temp from ".
|
|
1458 $seq->display_id);
|
|
1459 $self->debug($seq->seq(). "\n");
|
|
1460 return 0;
|
|
1461 }
|
|
1462 }
|
|
1463
|
|
1464 return 1;
|
|
1465 }
|
|
1466
|
|
1467
|
|
1468 =head2 length
|
|
1469
|
|
1470 Title : length()
|
|
1471 Usage : $len = $ali->length()
|
|
1472 Function : Returns the maximum length of the alignment.
|
|
1473 To be sure the alignment is a block, use is_flush
|
|
1474 Returns :
|
|
1475 Argument :
|
|
1476
|
|
1477 =cut
|
|
1478
|
|
1479 sub length_aln {
|
|
1480 my $self = shift;
|
|
1481 $self->warn(ref($self). "::length_aln - deprecated method. Use length() instead.");
|
|
1482 $self->length(@_);
|
|
1483 }
|
|
1484
|
|
1485 sub length {
|
|
1486 my $self = shift;
|
|
1487 my $seq;
|
|
1488 my $length = (-1);
|
|
1489 my ($temp,$len);
|
|
1490
|
|
1491 foreach $seq ( $self->each_seq() ) {
|
|
1492 $temp = CORE::length($seq->seq());
|
|
1493 if( $temp > $length ) {
|
|
1494 $length = $temp;
|
|
1495 }
|
|
1496 }
|
|
1497
|
|
1498 return $length;
|
|
1499 }
|
|
1500
|
|
1501
|
|
1502 =head2 maxdisplayname_length
|
|
1503
|
|
1504 Title : maxdisplayname_length
|
|
1505 Usage : $ali->maxdisplayname_length()
|
|
1506 Function :
|
|
1507
|
|
1508 Gets the maximum length of the displayname in the
|
|
1509 alignment. Used in writing out various MSE formats.
|
|
1510
|
|
1511 Returns : integer
|
|
1512 Argument :
|
|
1513
|
|
1514 =cut
|
|
1515
|
|
1516 sub maxname_length {
|
|
1517 my $self = shift;
|
|
1518 $self->warn(ref($self). "::maxname_length - deprecated method.".
|
|
1519 " Use maxdisplayname_length() instead.");
|
|
1520 $self->maxdisplayname_length();
|
|
1521 }
|
|
1522
|
|
1523 sub maxnse_length {
|
|
1524 my $self = shift;
|
|
1525 $self->warn(ref($self). "::maxnse_length - deprecated method.".
|
|
1526 " Use maxnse_length() instead.");
|
|
1527 $self->maxdisplayname_length();
|
|
1528 }
|
|
1529
|
|
1530 sub maxdisplayname_length {
|
|
1531 my $self = shift;
|
|
1532 my $maxname = (-1);
|
|
1533 my ($seq,$len);
|
|
1534
|
|
1535 foreach $seq ( $self->each_seq() ) {
|
|
1536 $len = CORE::length $self->displayname($seq->get_nse());
|
|
1537
|
|
1538 if( $len > $maxname ) {
|
|
1539 $maxname = $len;
|
|
1540 }
|
|
1541 }
|
|
1542
|
|
1543 return $maxname;
|
|
1544 }
|
|
1545
|
|
1546 =head2 no_residues
|
|
1547
|
|
1548 Title : no_residues
|
|
1549 Usage : $no = $ali->no_residues
|
|
1550 Function : number of residues in total in the alignment
|
|
1551 Returns : integer
|
|
1552 Argument :
|
|
1553
|
|
1554 =cut
|
|
1555
|
|
1556 sub no_residues {
|
|
1557 my $self = shift;
|
|
1558 my $count = 0;
|
|
1559
|
|
1560 foreach my $seq ($self->each_seq) {
|
|
1561 my $str = $seq->seq();
|
|
1562
|
|
1563 $count += ($str =~ s/[^A-Za-z]//g);
|
|
1564 }
|
|
1565
|
|
1566 return $count;
|
|
1567 }
|
|
1568
|
|
1569 =head2 no_sequences
|
|
1570
|
|
1571 Title : no_sequences
|
|
1572 Usage : $depth = $ali->no_sequences
|
|
1573 Function : number of sequence in the sequence alignment
|
|
1574 Returns : integer
|
|
1575 Argument :
|
|
1576
|
|
1577 =cut
|
|
1578
|
|
1579 sub no_sequences {
|
|
1580 my $self = shift;
|
|
1581
|
|
1582 return scalar($self->each_seq);
|
|
1583 }
|
|
1584
|
|
1585
|
|
1586 =head2 average_percentage_identity
|
|
1587
|
|
1588 Title : average_percentage_identity
|
|
1589 Usage : $id = $align->average_percentage_identity
|
|
1590 Function: The function uses a fast method to calculate the average
|
|
1591 percentage identity of the alignment
|
|
1592 Returns : The average percentage identity of the alignment
|
|
1593 Args : None
|
|
1594 Notes : This method implemented by Kevin Howe calculates a figure that is
|
|
1595 designed to be similar to the average pairwise identity of the
|
|
1596 alignment (identical in the absence of gaps), without having to
|
|
1597 explicitly calculate pairwise identities proposed by Richard Durbin.
|
|
1598 Validated by Ewan Birney ad Alex Bateman.
|
|
1599
|
|
1600 =cut
|
|
1601
|
|
1602 sub average_percentage_identity{
|
|
1603 my ($self,@args) = @_;
|
|
1604
|
|
1605 my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
|
|
1606 'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
|
|
1607
|
|
1608 my ($len, $total, $subtotal, $divisor, $subdivisor, @seqs, @countHashes);
|
|
1609
|
|
1610 if (! $self->is_flush()) {
|
|
1611 $self->throw("All sequences in the alignment must be the same length");
|
|
1612 }
|
|
1613
|
|
1614 @seqs = $self->each_seq();
|
|
1615 $len = $self->length();
|
|
1616
|
|
1617 # load the each hash with correct keys for existence checks
|
|
1618
|
|
1619 for( my $index=0; $index < $len; $index++) {
|
|
1620 foreach my $letter (@alphabet) {
|
|
1621 $countHashes[$index]->{$letter} = 0;
|
|
1622 }
|
|
1623 }
|
|
1624 foreach my $seq (@seqs) {
|
|
1625 my @seqChars = split //, $seq->seq();
|
|
1626 for( my $column=0; $column < @seqChars; $column++ ) {
|
|
1627 my $char = uc($seqChars[$column]);
|
|
1628 if (exists $countHashes[$column]->{$char}) {
|
|
1629 $countHashes[$column]->{$char}++;
|
|
1630 }
|
|
1631 }
|
|
1632 }
|
|
1633
|
|
1634 $total = 0;
|
|
1635 $divisor = 0;
|
|
1636 for(my $column =0; $column < $len; $column++) {
|
|
1637 my %hash = %{$countHashes[$column]};
|
|
1638 $subdivisor = 0;
|
|
1639 foreach my $res (keys %hash) {
|
|
1640 $total += $hash{$res}*($hash{$res} - 1);
|
|
1641 $subdivisor += $hash{$res};
|
|
1642 }
|
|
1643 $divisor += $subdivisor * ($subdivisor - 1);
|
|
1644 }
|
|
1645 return $divisor > 0 ? ($total / $divisor )*100.0 : 0;
|
|
1646 }
|
|
1647
|
|
1648 =head2 percentage_identity
|
|
1649
|
|
1650 Title : percentage_identity
|
|
1651 Usage : $id = $align->percentage_identity
|
|
1652 Function: The function calculates the average percentage identity
|
|
1653 (aliased for average_percentage_identity)
|
|
1654 Returns : The average percentage identity
|
|
1655 Args : None
|
|
1656
|
|
1657 =cut
|
|
1658
|
|
1659 sub percentage_identity {
|
|
1660 my $self = shift;
|
|
1661 return $self->average_percentage_identity();
|
|
1662 }
|
|
1663
|
|
1664 =head2 overall_percentage_identity
|
|
1665
|
|
1666 Title : percentage_identity
|
|
1667 Usage : $id = $align->percentage_identity
|
|
1668 Function: The function calculates the percentage identity of
|
|
1669 the conserved columns
|
|
1670 Returns : The percentage identity of the conserved columns
|
|
1671 Args : None
|
|
1672
|
|
1673 =cut
|
|
1674
|
|
1675 sub overall_percentage_identity{
|
|
1676 my ($self,@args) = @_;
|
|
1677
|
|
1678 my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M',
|
|
1679 'N','O','P','Q','R','S','T','U','V','W','X','Y','Z');
|
|
1680
|
|
1681 my ($len, $total, @seqs, @countHashes);
|
|
1682
|
|
1683 if (! $self->is_flush()) {
|
|
1684 $self->throw("All sequences in the alignment must be the same length");
|
|
1685 }
|
|
1686
|
|
1687 @seqs = $self->each_seq();
|
|
1688 $len = $self->length();
|
|
1689
|
|
1690 # load the each hash with correct keys for existence checks
|
|
1691 for( my $index=0; $index < $len; $index++) {
|
|
1692 foreach my $letter (@alphabet) {
|
|
1693 $countHashes[$index]->{$letter} = 0;
|
|
1694 }
|
|
1695 }
|
|
1696 foreach my $seq (@seqs) {
|
|
1697 my @seqChars = split //, $seq->seq();
|
|
1698 for( my $column=0; $column < @seqChars; $column++ ) {
|
|
1699 my $char = uc($seqChars[$column]);
|
|
1700 if (exists $countHashes[$column]->{$char}) {
|
|
1701 $countHashes[$column]->{$char}++;
|
|
1702 }
|
|
1703 }
|
|
1704 }
|
|
1705
|
|
1706 $total = 0;
|
|
1707 for(my $column =0; $column < $len; $column++) {
|
|
1708 my %hash = %{$countHashes[$column]};
|
|
1709 foreach ( values %hash ) {
|
|
1710 next if( $_ == 0 );
|
|
1711 $total++ if( $_ == scalar @seqs );
|
|
1712 last;
|
|
1713 }
|
|
1714 }
|
|
1715 return ($total / $len ) * 100.0;
|
|
1716 }
|
|
1717
|
|
1718 =head1 Alignment positions
|
|
1719
|
|
1720 Methods to map a sequence position into an alignment column and back.
|
|
1721 column_from_residue_number() does the former. The latter is really a
|
|
1722 property of the sequence object and can done using
|
|
1723 L<Bio::LocatableSeq::location_from_column>:
|
|
1724
|
|
1725 # select somehow a sequence from the alignment, e.g.
|
|
1726 my $seq = $aln->get_seq_by_pos(1);
|
|
1727 #$loc is undef or Bio::LocationI object
|
|
1728 my $loc = $seq->location_from_column(5);
|
|
1729
|
|
1730
|
|
1731 =head2 column_from_residue_number
|
|
1732
|
|
1733 Title : column_from_residue_number
|
|
1734 Usage : $col = $ali->column_from_residue_number( $seqname, $resnumber)
|
|
1735 Function:
|
|
1736
|
|
1737 This function gives the position in the alignment
|
|
1738 (i.e. column number) of the given residue number in the
|
|
1739 sequence with the given name. For example, for the
|
|
1740 alignment
|
|
1741
|
|
1742 Seq1/91-97 AC..DEF.GH
|
|
1743 Seq2/24-30 ACGG.RTY..
|
|
1744 Seq3/43-51 AC.DDEFGHI
|
|
1745
|
|
1746 column_from_residue_number( "Seq1", 94 ) returns 5.
|
|
1747 column_from_residue_number( "Seq2", 25 ) returns 2.
|
|
1748 column_from_residue_number( "Seq3", 50 ) returns 9.
|
|
1749
|
|
1750 An exception is thrown if the residue number would lie
|
|
1751 outside the length of the aligment
|
|
1752 (e.g. column_from_residue_number( "Seq2", 22 )
|
|
1753
|
|
1754 Note: If the the parent sequence is represented by more than
|
|
1755 one alignment sequence and the residue number is present in
|
|
1756 them, this method finds only the first one.
|
|
1757
|
|
1758 Returns : A column number for the position in the alignment of the
|
|
1759 given residue in the given sequence (1 = first column)
|
|
1760 Args : A sequence id/name (not a name/start-end)
|
|
1761 A residue number in the whole sequence (not just that
|
|
1762 segment of it in the alignment)
|
|
1763
|
|
1764 =cut
|
|
1765
|
|
1766 sub column_from_residue_number {
|
|
1767 my ($self, $name, $resnumber) = @_;
|
|
1768
|
|
1769 $self->throw("No sequence with name [$name]") unless $self->{'_start_end_lists'}->{$name};
|
|
1770 $self->throw("Second argument residue number missing") unless $resnumber;
|
|
1771
|
|
1772 foreach my $seq ($self->each_seq_with_id($name)) {
|
|
1773 my $col;
|
|
1774 eval {
|
|
1775 $col = $seq->column_from_residue_number($resnumber);
|
|
1776 };
|
|
1777 next if $@;
|
|
1778 return $col;
|
|
1779 }
|
|
1780
|
|
1781 $self->throw("Could not find a sequence segment in $name ".
|
|
1782 "containing residue number $resnumber");
|
|
1783
|
|
1784 }
|
|
1785
|
|
1786 =head1 Sequence names
|
|
1787
|
|
1788 Methods to manipulate the display name. The default name based on the
|
|
1789 sequence id and subsequence positions can be overridden in various
|
|
1790 ways.
|
|
1791
|
|
1792 =head2 displayname
|
|
1793
|
|
1794 Title : displayname
|
|
1795 Usage : $myalign->displayname("Ig", "IgA")
|
|
1796 Function : Gets/sets the display name of a sequence in the alignment
|
|
1797 :
|
|
1798 Returns : A display name string
|
|
1799 Argument : name of the sequence
|
|
1800 displayname of the sequence (optional)
|
|
1801
|
|
1802 =cut
|
|
1803
|
|
1804 sub get_displayname {
|
|
1805 my $self = shift;
|
|
1806 $self->warn(ref($self). "::get_displayname - deprecated method. Use displayname() instead.");
|
|
1807 $self->displayname(@_);
|
|
1808 }
|
|
1809
|
|
1810 sub set_displayname {
|
|
1811 my $self = shift;
|
|
1812 $self->warn(ref($self). "::set_displayname - deprecated method. Use displayname() instead.");
|
|
1813 $self->displayname(@_);
|
|
1814 }
|
|
1815
|
|
1816 sub displayname {
|
|
1817 my ($self, $name, $disname) = @_;
|
|
1818
|
|
1819 $self->throw("No sequence with name [$name]") unless $self->{'_seq'}->{$name};
|
|
1820
|
|
1821 if( $disname and $name) {
|
|
1822 $self->{'_dis_name'}->{$name} = $disname;
|
|
1823 return $disname;
|
|
1824 }
|
|
1825 elsif( defined $self->{'_dis_name'}->{$name} ) {
|
|
1826 return $self->{'_dis_name'}->{$name};
|
|
1827 } else {
|
|
1828 return $name;
|
|
1829 }
|
|
1830 }
|
|
1831
|
|
1832 =head2 set_displayname_count
|
|
1833
|
|
1834 Title : set_displayname_count
|
|
1835 Usage : $ali->set_displayname_count
|
|
1836 Function :
|
|
1837
|
|
1838 Sets the names to be name_# where # is the number of
|
|
1839 times this name has been used.
|
|
1840
|
|
1841 Returns :
|
|
1842 Argument :
|
|
1843
|
|
1844 =cut
|
|
1845
|
|
1846 sub set_displayname_count {
|
|
1847 my $self= shift;
|
|
1848 my (@arr,$name,$seq,$count,$temp,$nse);
|
|
1849
|
|
1850 foreach $seq ( $self->each_alphabetically() ) {
|
|
1851 $nse = $seq->get_nse();
|
|
1852
|
|
1853 #name will be set when this is the second
|
|
1854 #time (or greater) is has been seen
|
|
1855
|
|
1856 if( defined $name and $name eq ($seq->id()) ) {
|
|
1857 $temp = sprintf("%s_%s",$name,$count);
|
|
1858 $self->displayname($nse,$temp);
|
|
1859 $count++;
|
|
1860 } else {
|
|
1861 $count = 1;
|
|
1862 $name = $seq->id();
|
|
1863 $temp = sprintf("%s_%s",$name,$count);
|
|
1864 $self->displayname($nse,$temp);
|
|
1865 $count++;
|
|
1866 }
|
|
1867 }
|
|
1868 return 1;
|
|
1869 }
|
|
1870
|
|
1871 =head2 set_displayname_flat
|
|
1872
|
|
1873 Title : set_displayname_flat
|
|
1874 Usage : $ali->set_displayname_flat()
|
|
1875 Function : Makes all the sequences be displayed as just their name,
|
|
1876 not name/start-end
|
|
1877 Returns : 1
|
|
1878 Argument :
|
|
1879
|
|
1880 =cut
|
|
1881
|
|
1882 sub set_displayname_flat {
|
|
1883 my $self = shift;
|
|
1884 my ($nse,$seq);
|
|
1885
|
|
1886 foreach $seq ( $self->each_seq() ) {
|
|
1887 $nse = $seq->get_nse();
|
|
1888 $self->displayname($nse,$seq->id());
|
|
1889 }
|
|
1890 return 1;
|
|
1891 }
|
|
1892
|
|
1893 =head2 set_displayname_normal
|
|
1894
|
|
1895 Title : set_displayname_normal
|
|
1896 Usage : $ali->set_displayname_normal()
|
|
1897 Function : Makes all the sequences be displayed as name/start-end
|
|
1898 Returns :
|
|
1899 Argument :
|
|
1900
|
|
1901 =cut
|
|
1902
|
|
1903 sub set_displayname_normal {
|
|
1904 my $self = shift;
|
|
1905 my ($nse,$seq);
|
|
1906
|
|
1907 foreach $seq ( $self->each_seq() ) {
|
|
1908 $nse = $seq->get_nse();
|
|
1909 $self->displayname($nse,$nse);
|
|
1910 }
|
|
1911 return 1;
|
|
1912 }
|
|
1913
|
|
1914 =head2 source
|
|
1915
|
|
1916 Title : source
|
|
1917 Usage : $obj->source($newval)
|
|
1918 Function: sets the Alignment source program
|
|
1919 Example :
|
|
1920 Returns : value of source
|
|
1921 Args : newvalue (optional)
|
|
1922
|
|
1923
|
|
1924 =cut
|
|
1925
|
|
1926 sub source{
|
|
1927 my ($self,$value) = @_;
|
|
1928 if( defined $value) {
|
|
1929 $self->{'_source'} = $value;
|
|
1930 }
|
|
1931 return $self->{'_source'};
|
|
1932 }
|
|
1933
|
|
1934 1;
|