Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/SimpleAlign.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 # $Id: SimpleAlign.pm,v 1.65.2.1 2003/07/02 16:00:19 jason Exp $ | |
| 2 # BioPerl module for SimpleAlign | |
| 3 # | |
| 4 # Cared for by Heikki Lehvaslaiho <heikki@ebi.ac.uk> | |
| 5 # | |
| 6 # Copyright Ewan Birney | |
| 7 # | |
| 8 # You may distribute this module under the same terms as perl itself | |
| 9 | |
| 10 # POD documentation - main docs before the code | |
| 11 # | |
| 12 # History: | |
| 13 # 11/3/00 Added threshold feature to consensus and consensus_aa - PS | |
| 14 # May 2001 major rewrite - Heikki Lehvaslaiho | |
| 15 | |
| 16 =head1 NAME | |
| 17 | |
| 18 Bio::SimpleAlign - Multiple alignments held as a set of sequences | |
| 19 | |
| 20 =head1 SYNOPSIS | |
| 21 | |
| 22 # use Bio::AlignIO to read in the alignment | |
| 23 $str = Bio::AlignIO->new('-file' => 't/data/testaln.pfam'); | |
| 24 $aln = $str->next_aln(); | |
| 25 | |
| 26 # some descriptors | |
| 27 print $aln->length, "\n"; | |
| 28 print $aln->no_residues, "\n"; | |
| 29 print $aln->is_flush, "\n"; | |
| 30 print $aln->no_sequences, "\n"; | |
| 31 print $aln->percentage_identity, "\n"; | |
| 32 print $aln->consensus_string(50), "\n"; | |
| 33 | |
| 34 # find the position in the alignment for a sequence location | |
| 35 $pos = $aln->column_from_residue_number('1433_LYCES', 14); # = 6; | |
| 36 | |
| 37 # extract sequences and check values for the alignment column $pos | |
| 38 foreach $seq ($aln->each_seq) { | |
| 39 $res = $seq->subseq($pos, $pos); | |
| 40 $count{$res}++; | |
| 41 } | |
| 42 foreach $res (keys %count) { | |
| 43 printf "Res: %s Count: %2d\n", $res, $count{$res}; | |
| 44 } | |
| 45 | |
| 46 | |
| 47 =head1 DESCRIPTION | |
| 48 | |
| 49 SimpleAlign handles multiple alignments of sequences. It is very | |
| 50 permissive of types (it won't insist on things being all same length | |
| 51 etc): really it is a SequenceSet explicitly held in memory with a | |
| 52 whole series of built in manipulations and especially file format | |
| 53 systems for read/writing alignments. | |
| 54 | |
| 55 SimpleAlign basically views an alignment as an immutable block of | |
| 56 text. SimpleAlign *is not* the object to be using if you want to | |
| 57 perform complex alignment manipulations. | |
| 58 | |
| 59 However for lightweight display/formatting and minimal manipulation | |
| 60 (e.g. removing all-gaps columns) - this is the one to use. | |
| 61 | |
| 62 SimpleAlign uses a subclass of L<Bio::PrimarySeq> class | |
| 63 L<Bio::LocatableSeq> to store its sequences. These are subsequences | |
| 64 with a start and end positions in the parent reference sequence. | |
| 65 | |
| 66 Tricky concepts. SimpleAlign expects name,start,end to be 'unique' in | |
| 67 the alignment, and this is the key for the internal hashes. | |
| 68 (name,start,end is abbreviated nse in the code). However, in many | |
| 69 cases people don't want the name/start-end to be displayed: either | |
| 70 multiple names in an alignment or names specific to the alignment | |
| 71 (ROA1_HUMAN_1, ROA1_HUMAN_2 etc). These names are called | |
| 72 'displayname', and generally is what is used to print out the | |
| 73 alignment. They default to name/start-end. | |
| 74 | |
| 75 The SimpleAlign Module came from Ewan Birney's Align module. | |
| 76 | |
| 77 =head1 PROGRESS | |
| 78 | |
| 79 SimpleAlign is being slowly converted to bioperl coding standards, | |
| 80 mainly by Ewan. | |
| 81 | |
| 82 =over 3 | |
| 83 | |
| 84 =item Use Bio::Root::Object - done | |
| 85 | |
| 86 =item Use proper exceptions - done | |
| 87 | |
| 88 =item Use hashed constructor - not done! | |
| 89 | |
| 90 =back | |
| 91 | |
| 92 =head1 FEEDBACK | |
| 93 | |
| 94 =head2 Mailing Lists | |
| 95 | |
| 96 User feedback is an integral part of the evolution of this and other | |
| 97 Bioperl modules. Send your comments and suggestions preferably to one | |
| 98 of the Bioperl mailing lists. Your participation is much appreciated. | |
| 99 | |
| 100 bioperl-l@bioperl.org - General discussion | |
| 101 http://bioperl.org/MailList.shtml - About the mailing lists | |
| 102 | |
| 103 =head2 Reporting Bugs | |
| 104 | |
| 105 Report bugs to the Bioperl bug tracking system to help us keep track | |
| 106 the bugs and their resolution. Bug reports can be submitted via email | |
| 107 or the web: | |
| 108 | |
| 109 bioperl-bugs@bio.perl.org | |
| 110 http://bugzilla.bioperl.org/ | |
| 111 | |
| 112 =head1 AUTHOR | |
| 113 | |
| 114 Ewan Birney, birney@sanger.ac.uk | |
| 115 | |
| 116 =head1 CONTRIBUTORS | |
| 117 | |
| 118 Richard Adams, Richard.Adams@ed.ac.uk, | |
| 119 David J. Evans, David.Evans@vir.gla.ac.uk, | |
| 120 Heikki Lehvaslaiho, heikki@ebi.ac.uk, | |
| 121 Allen Smith, allens@cpan.org, | |
| 122 Jason Stajich, jason@bioperl.org, | |
| 123 Anthony Underwood, aunderwood@phls.org.uk, | |
| 124 Xintao Wei & Giri Narasimhan, giri@cs.fiu.edu | |
| 125 | |
| 126 | |
| 127 =head1 SEE ALSO | |
| 128 | |
| 129 L<Bio::LocatableSeq.pm> | |
| 130 | |
| 131 =head1 APPENDIX | |
| 132 | |
| 133 The rest of the documentation details each of the object | |
| 134 methods. Internal methods are usually preceded with a _ | |
| 135 | |
| 136 =cut | |
| 137 | |
| 138 # 'Let the code begin... | |
| 139 | |
| 140 package Bio::SimpleAlign; | |
| 141 use vars qw(@ISA %CONSERVATION_GROUPS); | |
| 142 use strict; | |
| 143 | |
| 144 use Bio::Root::Root; | |
| 145 use Bio::LocatableSeq; # uses Seq's as list | |
| 146 use Bio::Align::AlignI; | |
| 147 | |
| 148 BEGIN { | |
| 149 # This data should probably be in a more centralized module... | |
| 150 # it is taken from Clustalw documentation | |
| 151 # These are all the positively scoring groups that occur in the | |
| 152 # Gonnet Pam250 matrix. The strong and weak groups are | |
| 153 # defined as strong score >0.5 and weak score =<0.5 respectively. | |
| 154 | |
| 155 %CONSERVATION_GROUPS = ( 'strong' => [ qw(STA | |
| 156 NEQK | |
| 157 NHQK | |
| 158 NDEQ | |
| 159 QHRK | |
| 160 MILV | |
| 161 MILF | |
| 162 HY | |
| 163 FYW) | |
| 164 ], | |
| 165 'weak' => [ qw(CSA | |
| 166 ATV | |
| 167 SAG | |
| 168 STNK | |
| 169 STPA | |
| 170 SGND | |
| 171 SNDEQK | |
| 172 NDEQHK | |
| 173 NEQHRK | |
| 174 FVLIM | |
| 175 HFY) ], | |
| 176 ); | |
| 177 | |
| 178 } | |
| 179 | |
| 180 @ISA = qw(Bio::Root::Root Bio::Align::AlignI); | |
| 181 | |
| 182 =head2 new | |
| 183 | |
| 184 Title : new | |
| 185 Usage : my $aln = new Bio::SimpleAlign(); | |
| 186 Function : Creates a new simple align object | |
| 187 Returns : Bio::SimpleAlign | |
| 188 Args : -source => string representing the source program | |
| 189 where this alignment came from | |
| 190 | |
| 191 =cut | |
| 192 | |
| 193 | |
| 194 sub new { | |
| 195 my($class,@args) = @_; | |
| 196 | |
| 197 my $self = $class->SUPER::new(@args); | |
| 198 | |
| 199 my ($src) = $self->_rearrange([qw(SOURCE)], @args); | |
| 200 $src && $self->source($src); | |
| 201 # we need to set up internal hashs first! | |
| 202 | |
| 203 $self->{'_seq'} = {}; | |
| 204 $self->{'_order'} = {}; | |
| 205 $self->{'_start_end_lists'} = {}; | |
| 206 $self->{'_dis_name'} = {}; | |
| 207 $self->{'_id'} = 'NoName'; | |
| 208 $self->{'_symbols'} = {}; | |
| 209 # maybe we should automatically read in from args. Hmmm... | |
| 210 | |
| 211 return $self; # success - we hope! | |
| 212 } | |
| 213 | |
| 214 =head1 Modifier methods | |
| 215 | |
| 216 These methods modify the MSE by adding, removing or shuffling complete | |
| 217 sequences. | |
| 218 | |
| 219 =head2 add_seq | |
| 220 | |
| 221 Title : add_seq | |
| 222 Usage : $myalign->add_seq($newseq); | |
| 223 Function : Adds another sequence to the alignment. *Does not* align | |
| 224 it - just adds it to the hashes. | |
| 225 Returns : nothing | |
| 226 Args : a Bio::LocatableSeq object | |
| 227 order (optional) | |
| 228 | |
| 229 See L<Bio::LocatableSeq> for more information | |
| 230 | |
| 231 =cut | |
| 232 | |
| 233 sub addSeq { | |
| 234 my $self = shift; | |
| 235 $self->warn(ref($self). "::addSeq - deprecated method. Use add_seq() instead."); | |
| 236 $self->add_seq(@_); | |
| 237 } | |
| 238 | |
| 239 sub add_seq { | |
| 240 my $self = shift; | |
| 241 my $seq = shift; | |
| 242 my $order = shift; | |
| 243 my ($name,$id,$start,$end); | |
| 244 | |
| 245 if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) { | |
| 246 $self->throw("Unable to process non locatable sequences [", ref($seq), "]"); | |
| 247 } | |
| 248 | |
| 249 $id = $seq->id() ||$seq->display_id || $seq->primary_id; | |
| 250 $start = $seq->start(); | |
| 251 $end = $seq->end(); | |
| 252 | |
| 253 # build the symbol list for this sequence, | |
| 254 # will prune out the gap and missing/match chars | |
| 255 # when actually asked for the symbol list in the | |
| 256 # symbol_chars | |
| 257 map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq); | |
| 258 | |
| 259 if( !defined $order ) { | |
| 260 $order = keys %{$self->{'_seq'}}; | |
| 261 } | |
| 262 $name = sprintf("%s/%d-%d",$id,$start,$end); | |
| 263 | |
| 264 if( $self->{'_seq'}->{$name} ) { | |
| 265 $self->warn("Replacing one sequence [$name]\n"); | |
| 266 } | |
| 267 else { | |
| 268 #print STDERR "Assigning $name to $order\n"; | |
| 269 | |
| 270 $self->{'_order'}->{$order} = $name; | |
| 271 | |
| 272 unless( exists( $self->{'_start_end_lists'}->{$id})) { | |
| 273 $self->{'_start_end_lists'}->{$id} = []; | |
| 274 } | |
| 275 push @{$self->{'_start_end_lists'}->{$id}}, $seq; | |
| 276 } | |
| 277 | |
| 278 $self->{'_seq'}->{$name} = $seq; | |
| 279 | |
| 280 } | |
| 281 | |
| 282 | |
| 283 =head2 remove_seq | |
| 284 | |
| 285 Title : remove_seq | |
| 286 Usage : $aln->remove_seq($seq); | |
| 287 Function : Removes a single sequence from an alignment | |
| 288 Returns : | |
| 289 Argument : a Bio::LocatableSeq object | |
| 290 | |
| 291 =cut | |
| 292 | |
| 293 sub removeSeq { | |
| 294 my $self = shift; | |
| 295 $self->warn(ref($self). "::removeSeq - deprecated method. Use remove_seq() instead."); | |
| 296 $self->remove_seq(@_); | |
| 297 } | |
| 298 | |
| 299 sub remove_seq { | |
| 300 my $self = shift; | |
| 301 my $seq = shift; | |
| 302 my ($name,$id,$start,$end); | |
| 303 | |
| 304 $self->throw("Need Bio::Locatable seq argument ") | |
| 305 unless ref $seq && $seq->isa('Bio::LocatableSeq'); | |
| 306 | |
| 307 $id = $seq->id(); | |
| 308 $start = $seq->start(); | |
| 309 $end = $seq->end(); | |
| 310 $name = sprintf("%s/%d-%d",$id,$start,$end); | |
| 311 | |
| 312 if( !exists $self->{'_seq'}->{$name} ) { | |
| 313 $self->throw("Sequence $name does not exist in the alignment to remove!"); | |
| 314 } | |
| 315 | |
| 316 delete $self->{'_seq'}->{$name}; | |
| 317 | |
| 318 # we need to remove this seq from the start_end_lists hash | |
| 319 | |
| 320 if (exists $self->{'_start_end_lists'}->{$id}) { | |
| 321 # we need to find the sequence in the array. | |
| 322 | |
| 323 my ($i, $found);; | |
| 324 for ($i=0; $i < @{$self->{'_start_end_lists'}->{$id}}; $i++) { | |
| 325 if (${$self->{'_start_end_lists'}->{$id}}[$i] eq $seq) { | |
| 326 $found = 1; | |
| 327 last; | |
| 328 } | |
| 329 } | |
| 330 if ($found) { | |
| 331 splice @{$self->{'_start_end_lists'}->{$id}}, $i, 1; | |
| 332 } | |
| 333 else { | |
| 334 $self->throw("Could not find the sequence to remoce from the start-end list"); | |
| 335 } | |
| 336 } | |
| 337 else { | |
| 338 $self->throw("There is no seq list for the name $id"); | |
| 339 } | |
| 340 return 1; | |
| 341 # we can't do anything about the order hash but that is ok | |
| 342 # because each_seq will handle it | |
| 343 } | |
| 344 | |
| 345 | |
| 346 =head2 purge | |
| 347 | |
| 348 Title : purge | |
| 349 Usage : $aln->purge(0.7); | |
| 350 Function: | |
| 351 | |
| 352 Removes sequences above given sequence similarity | |
| 353 This function will grind on large alignments. Beware! | |
| 354 | |
| 355 Example : | |
| 356 Returns : An array of the removed sequences | |
| 357 Args : float, threshold for similarity | |
| 358 | |
| 359 =cut | |
| 360 | |
| 361 sub purge { | |
| 362 my ($self,$perc) = @_; | |
| 363 my (%duplicate, @dups); | |
| 364 | |
| 365 my @seqs = $self->each_seq(); | |
| 366 | |
| 367 for (my $i=0;$i< @seqs - 1;$i++ ) { #for each seq in alignment | |
| 368 my $seq = $seqs[$i]; | |
| 369 | |
| 370 #skip if already in duplicate hash | |
| 371 next if exists $duplicate{$seq->display_id} ; | |
| 372 my $one = $seq->seq(); | |
| 373 | |
| 374 my @one = split '', $one; #split to get 1aa per array element | |
| 375 | |
| 376 for (my $j=$i+1;$j < @seqs;$j++) { | |
| 377 my $seq2 = $seqs[$j]; | |
| 378 | |
| 379 #skip if already in duplicate hash | |
| 380 next if exists $duplicate{$seq2->display_id} ; | |
| 381 | |
| 382 my $two = $seq2->seq(); | |
| 383 my @two = split '', $two; | |
| 384 | |
| 385 my $count = 0; | |
| 386 my $res = 0; | |
| 387 for (my $k=0;$k<@one;$k++) { | |
| 388 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) && | |
| 389 $one[$k] eq $two[$k]) { | |
| 390 $count++; | |
| 391 } | |
| 392 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) && | |
| 393 $two[$k] ne '.' && $two[$k] ne '-' ) { | |
| 394 $res++; | |
| 395 } | |
| 396 } | |
| 397 | |
| 398 my $ratio = 0; | |
| 399 $ratio = $count/$res unless $res == 0; | |
| 400 | |
| 401 # if above threshold put in duplicate hash and push onto | |
| 402 # duplicate array for returning to get_unique | |
| 403 if ( $ratio > $perc ) { | |
| 404 print STDERR "duplicate!", $seq2->display_id, "\n" if $self->verbose > 0; | |
| 405 $duplicate{$seq2->display_id} = 1; | |
| 406 push @dups, $seq2; | |
| 407 } | |
| 408 } | |
| 409 } | |
| 410 foreach my $seq (@dups) { | |
| 411 $self->remove_seq($seq); | |
| 412 } | |
| 413 return @dups; | |
| 414 } | |
| 415 | |
| 416 =head2 sort_alphabetically | |
| 417 | |
| 418 Title : sort_alphabetically | |
| 419 Usage : $ali->sort_alphabetically | |
| 420 Function : | |
| 421 | |
| 422 Changes the order of the alignemnt to alphabetical on name | |
| 423 followed by numerical by number. | |
| 424 | |
| 425 Returns : | |
| 426 Argument : | |
| 427 | |
| 428 =cut | |
| 429 | |
| 430 sub sort_alphabetically { | |
| 431 my $self = shift; | |
| 432 my ($seq,$nse,@arr,%hash,$count); | |
| 433 | |
| 434 foreach $seq ( $self->each_seq() ) { | |
| 435 $nse = $seq->get_nse; | |
| 436 $hash{$nse} = $seq; | |
| 437 } | |
| 438 | |
| 439 $count = 0; | |
| 440 | |
| 441 %{$self->{'_order'}} = (); # reset the hash; | |
| 442 | |
| 443 foreach $nse ( sort _alpha_startend keys %hash) { | |
| 444 $self->{'_order'}->{$count} = $nse; | |
| 445 | |
| 446 $count++; | |
| 447 } | |
| 448 1; | |
| 449 } | |
| 450 | |
| 451 =head1 Sequence selection methods | |
| 452 | |
| 453 Methods returning one or more sequences objects. | |
| 454 | |
| 455 =head2 each_seq | |
| 456 | |
| 457 Title : each_seq | |
| 458 Usage : foreach $seq ( $align->each_seq() ) | |
| 459 Function : Gets an array of Seq objects from the alignment | |
| 460 Returns : an array | |
| 461 Argument : | |
| 462 | |
| 463 =cut | |
| 464 | |
| 465 sub eachSeq { | |
| 466 my $self = shift; | |
| 467 $self->warn(ref($self). "::eachSeq - deprecated method. Use each_seq() instead."); | |
| 468 $self->each_seq(); | |
| 469 } | |
| 470 | |
| 471 sub each_seq { | |
| 472 my $self = shift; | |
| 473 my (@arr,$order); | |
| 474 | |
| 475 foreach $order ( sort { $a <=> $b } keys %{$self->{'_order'}} ) { | |
| 476 if( exists $self->{'_seq'}->{$self->{'_order'}->{$order}} ) { | |
| 477 push(@arr,$self->{'_seq'}->{$self->{'_order'}->{$order}}); | |
| 478 } | |
| 479 } | |
| 480 | |
| 481 return @arr; | |
| 482 } | |
| 483 | |
| 484 | |
| 485 =head2 each_alphabetically | |
| 486 | |
| 487 Title : each_alphabetically | |
| 488 Usage : foreach $seq ( $ali->each_alphabetically() ) | |
| 489 Function : | |
| 490 | |
| 491 Returns an array of sequence object sorted alphabetically | |
| 492 by name and then by start point. | |
| 493 Does not change the order of the alignment | |
| 494 | |
| 495 Returns : | |
| 496 Argument : | |
| 497 | |
| 498 =cut | |
| 499 | |
| 500 sub each_alphabetically { | |
| 501 my $self = shift; | |
| 502 my ($seq,$nse,@arr,%hash,$count); | |
| 503 | |
| 504 foreach $seq ( $self->each_seq() ) { | |
| 505 $nse = $seq->get_nse; | |
| 506 $hash{$nse} = $seq; | |
| 507 } | |
| 508 | |
| 509 foreach $nse ( sort _alpha_startend keys %hash) { | |
| 510 push(@arr,$hash{$nse}); | |
| 511 } | |
| 512 | |
| 513 return @arr; | |
| 514 | |
| 515 } | |
| 516 | |
| 517 sub _alpha_startend { | |
| 518 my ($aname,$astart,$bname,$bstart); | |
| 519 ($aname,$astart) = split (/-/,$a); | |
| 520 ($bname,$bstart) = split (/-/,$b); | |
| 521 | |
| 522 if( $aname eq $bname ) { | |
| 523 return $astart <=> $bstart; | |
| 524 } | |
| 525 else { | |
| 526 return $aname cmp $bname; | |
| 527 } | |
| 528 | |
| 529 } | |
| 530 | |
| 531 =head2 each_seq_with_id | |
| 532 | |
| 533 Title : each_seq_with_id | |
| 534 Usage : foreach $seq ( $align->each_seq_with_id() ) | |
| 535 Function : | |
| 536 | |
| 537 Gets an array of Seq objects from the | |
| 538 alignment, the contents being those sequences | |
| 539 with the given name (there may be more than one) | |
| 540 | |
| 541 Returns : an array | |
| 542 Argument : a seq name | |
| 543 | |
| 544 =cut | |
| 545 | |
| 546 sub eachSeqWithId { | |
| 547 my $self = shift; | |
| 548 $self->warn(ref($self). "::eachSeqWithId - deprecated method. Use each_seq_with_id() instead."); | |
| 549 $self->each_seq_with_id(@_); | |
| 550 } | |
| 551 | |
| 552 sub each_seq_with_id { | |
| 553 my $self = shift; | |
| 554 my $id = shift; | |
| 555 | |
| 556 $self->throw("Method each_seq_with_id needs a sequence name argument") | |
| 557 unless defined $id; | |
| 558 | |
| 559 my (@arr, $seq); | |
| 560 | |
| 561 if (exists($self->{'_start_end_lists'}->{$id})) { | |
| 562 @arr = @{$self->{'_start_end_lists'}->{$id}}; | |
| 563 } | |
| 564 return @arr; | |
| 565 } | |
| 566 | |
| 567 =head2 get_seq_by_pos | |
| 568 | |
| 569 Title : get_seq_by_pos | |
| 570 Usage : $seq = $aln->get_seq_by_pos(3) # third sequence from the alignment | |
| 571 Function : | |
| 572 | |
| 573 Gets a sequence based on its position in the alignment. | |
| 574 Numbering starts from 1. Sequence positions larger than | |
| 575 no_sequences() will thow an error. | |
| 576 | |
| 577 Returns : a Bio::LocatableSeq object | |
| 578 Args : positive integer for the sequence osition | |
| 579 | |
| 580 =cut | |
| 581 | |
| 582 sub get_seq_by_pos { | |
| 583 | |
| 584 my $self = shift; | |
| 585 my ($pos) = @_; | |
| 586 | |
| 587 $self->throw("Sequence position has to be a positive integer, not [$pos]") | |
| 588 unless $pos =~ /^\d+$/ and $pos > 0; | |
| 589 $self->throw("No sequence at position [$pos]") | |
| 590 unless $pos <= $self->no_sequences ; | |
| 591 | |
| 592 my $nse = $self->{'_order'}->{--$pos}; | |
| 593 return $self->{'_seq'}->{$nse}; | |
| 594 } | |
| 595 | |
| 596 =head1 Create new alignments | |
| 597 | |
| 598 The result of these methods are horizontal or vertical subsets of the | |
| 599 current MSE. | |
| 600 | |
| 601 =head2 select | |
| 602 | |
| 603 Title : select | |
| 604 Usage : $aln2 = $aln->select(1, 3) # three first sequences | |
| 605 Function : | |
| 606 | |
| 607 Creates a new alignment from a continuous subset of | |
| 608 sequences. Numbering starts from 1. Sequence positions | |
| 609 larger than no_sequences() will thow an error. | |
| 610 | |
| 611 Returns : a Bio::SimpleAlign object | |
| 612 Args : positive integer for the first sequence | |
| 613 positive integer for the last sequence to include (optional) | |
| 614 | |
| 615 =cut | |
| 616 | |
| 617 sub select { | |
| 618 my $self = shift; | |
| 619 my ($start, $end) = @_; | |
| 620 | |
| 621 $self->throw("Select start has to be a positive integer, not [$start]") | |
| 622 unless $start =~ /^\d+$/ and $start > 0; | |
| 623 $self->throw("Select end has to be a positive integer, not [$end]") | |
| 624 unless $end =~ /^\d+$/ and $end > 0; | |
| 625 $self->throw("Select $start [$start] has to be smaller than or equal to end [$end]") | |
| 626 unless $start <= $end; | |
| 627 | |
| 628 my $aln = new $self; | |
| 629 foreach my $pos ($start .. $end) { | |
| 630 $aln->add_seq($self->get_seq_by_pos($pos)); | |
| 631 } | |
| 632 $aln->id($self->id); | |
| 633 return $aln; | |
| 634 } | |
| 635 | |
| 636 =head2 select_noncont | |
| 637 | |
| 638 Title : select_noncont | |
| 639 Usage : $aln2 = $aln->select_noncont(1, 3) # first and 3rd sequences | |
| 640 Function : | |
| 641 | |
| 642 Creates a new alignment from a subset of | |
| 643 sequences. Numbering starts from 1. Sequence positions | |
| 644 larger than no_sequences() will thow an error. | |
| 645 | |
| 646 Returns : a Bio::SimpleAlign object | |
| 647 Args : array of integers for the sequences | |
| 648 | |
| 649 =cut | |
| 650 | |
| 651 sub select_noncont { | |
| 652 my $self = shift; | |
| 653 my (@pos) = @_; | |
| 654 my $end = $self->no_sequences; | |
| 655 foreach ( @pos ) { | |
| 656 $self->throw("position must be a positive integer, > 0 and <= $end not [$_]") | |
| 657 unless( /^\d+$/ && $_ > 0 && $_ <= $end ); | |
| 658 } | |
| 659 my $aln = new $self; | |
| 660 foreach my $p (@pos) { | |
| 661 $aln->add_seq($self->get_seq_by_pos($p)); | |
| 662 } | |
| 663 $aln->id($self->id); | |
| 664 return $aln; | |
| 665 } | |
| 666 | |
| 667 =head2 slice | |
| 668 | |
| 669 Title : slice | |
| 670 Usage : $aln2 = $aln->slice(20, 30) | |
| 671 Function : | |
| 672 | |
| 673 Creates a slice from the alignment inclusive of start and | |
| 674 end columns. Sequences with no residues in the slice are | |
| 675 excluded from the new alignment and a warning is printed. | |
| 676 Slice beyond the length of the sequence does not do | |
| 677 padding. | |
| 678 | |
| 679 Returns : a Bio::SimpleAlign object | |
| 680 Args : positive integer for start column | |
| 681 positive integer for end column | |
| 682 | |
| 683 =cut | |
| 684 | |
| 685 sub slice { | |
| 686 my $self = shift; | |
| 687 my ($start, $end) = @_; | |
| 688 | |
| 689 $self->throw("Slice start has to be a positive integer, not [$start]") | |
| 690 unless $start =~ /^\d+$/ and $start > 0; | |
| 691 $self->throw("Slice end has to be a positive integer, not [$end]") | |
| 692 unless $end =~ /^\d+$/ and $end > 0; | |
| 693 $self->throw("Slice $start [$start] has to be smaller than or equal to end [$end]") | |
| 694 unless $start <= $end; | |
| 695 my $aln_length = $self->length; | |
| 696 $self->throw("This alignment has only ". $self->length. | |
| 697 " residues. Slice start [$start] is too bigger.") | |
| 698 if $start > $self->length; | |
| 699 | |
| 700 my $aln = new $self; | |
| 701 $aln->id($self->id); | |
| 702 foreach my $seq ( $self->each_seq() ) { | |
| 703 | |
| 704 my $new_seq = new Bio::LocatableSeq (-id => $seq->id); | |
| 705 | |
| 706 # seq | |
| 707 my $seq_end = $end; | |
| 708 $seq_end = $seq->length if $end > $seq->length; | |
| 709 my $slice_seq = $seq->subseq($start, $seq_end); | |
| 710 $new_seq->seq( $slice_seq ); | |
| 711 | |
| 712 # start | |
| 713 if ($start > 1) { | |
| 714 my $pre_start_seq = $seq->subseq(1, $start - 1); | |
| 715 $pre_start_seq =~ s/\W//g; #print "$pre_start_seq\n"; | |
| 716 $new_seq->start( $seq->start + CORE::length($pre_start_seq) ); | |
| 717 } else { | |
| 718 $new_seq->start( $seq->start); | |
| 719 } | |
| 720 | |
| 721 # end | |
| 722 $slice_seq =~ s/\W//g; | |
| 723 $new_seq->end( $new_seq->start + CORE::length($slice_seq) - 1 ); | |
| 724 | |
| 725 if ($new_seq->start and $new_seq->end >= $new_seq->start) { | |
| 726 $aln->add_seq($new_seq); | |
| 727 } else { | |
| 728 my $nse = $seq->get_nse(); | |
| 729 $self->warn("Slice [$start-$end] of sequence [$nse] contains no residues.". | |
| 730 " Sequence excluded from the new alignment."); | |
| 731 } | |
| 732 | |
| 733 } | |
| 734 | |
| 735 return $aln; | |
| 736 } | |
| 737 | |
| 738 =head2 remove_columns | |
| 739 | |
| 740 Title : remove_column | |
| 741 Usage : $aln2 = $aln->remove_columns(['mismatch','weak']) | |
| 742 Function : | |
| 743 Creates an aligment with columns removed corresponding to | |
| 744 the specified criteria. | |
| 745 Returns : a L<Bio::SimpleAlign> object | |
| 746 Args : array ref of types, 'match'|'weak'|'strong'|'mismatch' | |
| 747 | |
| 748 =cut | |
| 749 | |
| 750 sub remove_columns{ | |
| 751 my ($self,$type) = @_; | |
| 752 my %matchchars = ( 'match' => '\*', | |
| 753 'weak' => '\.', | |
| 754 'strong' => ':', | |
| 755 'mismatch'=> ' ', | |
| 756 ); | |
| 757 #get the characters to delete against | |
| 758 my $del_char; | |
| 759 foreach my $type(@{$type}){ | |
| 760 $del_char.= $matchchars{$type}; | |
| 761 } | |
| 762 | |
| 763 my $match_line = $self->match_line; | |
| 764 my $aln = new $self; | |
| 765 | |
| 766 my @remove; | |
| 767 my $length = 0; | |
| 768 | |
| 769 #do the matching to get the segments to remove | |
| 770 while($match_line=~m/[$del_char]/g){ | |
| 771 my $start = pos($match_line)-1; | |
| 772 $match_line=~/\G[$del_char]+/gc; | |
| 773 my $end = pos($match_line)-1; | |
| 774 | |
| 775 #have to offset the start and end for subsequent removes | |
| 776 $start-=$length; | |
| 777 $end -=$length; | |
| 778 $length += ($end-$start+1); | |
| 779 push @remove, [$start,$end]; | |
| 780 } | |
| 781 | |
| 782 #remove the segments | |
| 783 $aln = $self->_remove_col($aln,\@remove); | |
| 784 | |
| 785 return $aln; | |
| 786 } | |
| 787 | |
| 788 sub _remove_col { | |
| 789 my ($self,$aln,$remove) = @_; | |
| 790 my @new; | |
| 791 | |
| 792 #splice out the segments and create new seq | |
| 793 foreach my $seq($self->each_seq){ | |
| 794 my $new_seq = new Bio::LocatableSeq(-id=>$seq->id); | |
| 795 my $sequence; | |
| 796 foreach my $pair(@{$remove}){ | |
| 797 my $start = $pair->[0]; | |
| 798 my $end = $pair->[1]; | |
| 799 $sequence = $seq->seq unless $sequence; | |
| 800 my $spliced; | |
| 801 $spliced .= $start > 0 ? substr($sequence,0,$start) : ''; | |
| 802 $spliced .= substr($sequence,$end+1,$seq->length-$end+1); | |
| 803 $sequence = $spliced; | |
| 804 if ($start == 1) { | |
| 805 $new_seq->start($end); | |
| 806 } | |
| 807 else { | |
| 808 $new_seq->start( $seq->start); | |
| 809 } | |
| 810 # end | |
| 811 if($end >= $seq->end){ | |
| 812 $new_seq->end( $start); | |
| 813 } | |
| 814 else { | |
| 815 $new_seq->end($seq->end); | |
| 816 } | |
| 817 } | |
| 818 $new_seq->seq($sequence); | |
| 819 push @new, $new_seq; | |
| 820 } | |
| 821 #add the new seqs to the alignment | |
| 822 foreach my $new(@new){ | |
| 823 $aln->add_seq($new); | |
| 824 } | |
| 825 return $aln; | |
| 826 } | |
| 827 | |
| 828 =head1 Change sequences within the MSE | |
| 829 | |
| 830 These methods affect characters in all sequences without changeing the | |
| 831 alignment. | |
| 832 | |
| 833 | |
| 834 =head2 map_chars | |
| 835 | |
| 836 Title : map_chars | |
| 837 Usage : $ali->map_chars('\.','-') | |
| 838 Function : | |
| 839 | |
| 840 Does a s/$arg1/$arg2/ on the sequences. Useful for gap | |
| 841 characters | |
| 842 | |
| 843 Notice that the from (arg1) is interpretted as a regex, | |
| 844 so be careful about quoting meta characters (eg | |
| 845 $ali->map_chars('.','-') wont do what you want) | |
| 846 | |
| 847 Returns : | |
| 848 Argument : 'from' rexexp | |
| 849 'to' string | |
| 850 | |
| 851 =cut | |
| 852 | |
| 853 sub map_chars { | |
| 854 my $self = shift; | |
| 855 my $from = shift; | |
| 856 my $to = shift; | |
| 857 my ($seq,$temp); | |
| 858 | |
| 859 $self->throw("Need exactly two arguments") | |
| 860 unless defined $from and defined $to; | |
| 861 | |
| 862 foreach $seq ( $self->each_seq() ) { | |
| 863 $temp = $seq->seq(); | |
| 864 $temp =~ s/$from/$to/g; | |
| 865 $seq->seq($temp); | |
| 866 } | |
| 867 return 1; | |
| 868 } | |
| 869 | |
| 870 | |
| 871 =head2 uppercase | |
| 872 | |
| 873 Title : uppercase() | |
| 874 Usage : $ali->uppercase() | |
| 875 Function : Sets all the sequences to uppercase | |
| 876 Returns : | |
| 877 Argument : | |
| 878 | |
| 879 =cut | |
| 880 | |
| 881 sub uppercase { | |
| 882 my $self = shift; | |
| 883 my $seq; | |
| 884 my $temp; | |
| 885 | |
| 886 foreach $seq ( $self->each_seq() ) { | |
| 887 $temp = $seq->seq(); | |
| 888 $temp =~ tr/[a-z]/[A-Z]/; | |
| 889 | |
| 890 $seq->seq($temp); | |
| 891 } | |
| 892 return 1; | |
| 893 } | |
| 894 | |
| 895 =head2 cigar_line | |
| 896 | |
| 897 Title : cigar_line() | |
| 898 Usage : $align->cigar_line() | |
| 899 Function : Generates a "cigar" line for each sequence in the alignment | |
| 900 The format is simply A-1,60;B-1,1:4,60;C-5,10:12,58 | |
| 901 where A,B,C,etc. are the sequence identifiers, and the numbers | |
| 902 refer to conserved positions within the alignment | |
| 903 Args : none | |
| 904 | |
| 905 =cut | |
| 906 | |
| 907 sub cigar_line { | |
| 908 my ($self) = @_; | |
| 909 | |
| 910 my %cigar; | |
| 911 my %clines; | |
| 912 my @seqchars; | |
| 913 my $seqcount = 0; | |
| 914 my $sc; | |
| 915 foreach my $seq ( $self->each_seq ) { | |
| 916 push @seqchars, [ split(//, uc ($seq->seq)) ]; | |
| 917 $sc = scalar(@seqchars); | |
| 918 } | |
| 919 | |
| 920 foreach my $pos ( 0..$self->length ) { | |
| 921 my $i=0; | |
| 922 foreach my $seq ( @seqchars ) { | |
| 923 $i++; | |
| 924 # print STDERR "Seq $i at pos $pos: ".$seq->[$pos]."\n"; | |
| 925 if ($seq->[$pos] eq '.') { | |
| 926 if (defined $cigar{$i} && $clines{$i} !~ $cigar{$i}) { | |
| 927 $clines{$i}.=$cigar{$i}; | |
| 928 } | |
| 929 } | |
| 930 else { | |
| 931 if (! defined $cigar{$i}) { | |
| 932 $clines{$i}.=($pos+1).","; | |
| 933 } | |
| 934 $cigar{$i}=$pos+1; | |
| 935 } | |
| 936 if ($pos+1 == $self->length && ($clines{$i} =~ /\,$/) ) { | |
| 937 $clines{$i}.=$cigar{$i}; | |
| 938 } | |
| 939 } | |
| 940 } | |
| 941 for(my $i=1; $i<$sc+1;$i++) { | |
| 942 print STDERR "Seq $i cigar line ".$clines{$i}."\n"; | |
| 943 } | |
| 944 return %clines; | |
| 945 } | |
| 946 | |
| 947 =head2 match_line | |
| 948 | |
| 949 Title : match_line() | |
| 950 Usage : $align->match_line() | |
| 951 Function : Generates a match line - much like consensus string | |
| 952 except that a line indicating the '*' for a match. | |
| 953 Args : (optional) Match line characters ('*' by default) | |
| 954 (optional) Strong match char (':' by default) | |
| 955 (optional) Weak match char ('.' by default) | |
| 956 | |
| 957 =cut | |
| 958 | |
| 959 sub match_line { | |
| 960 my ($self,$matchlinechar, $strong, $weak) = @_; | |
| 961 my %matchchars = ( 'match' => $matchlinechar || '*', | |
| 962 'weak' => $weak || '.', | |
| 963 'strong' => $strong || ':', | |
| 964 'mismatch'=> ' ', | |
| 965 ); | |
| 966 | |
| 967 | |
| 968 my @seqchars; | |
| 969 my $seqcount = 0; | |
| 970 my $alphabet; | |
| 971 foreach my $seq ( $self->each_seq ) { | |
| 972 push @seqchars, [ split(//, uc ($seq->seq)) ]; | |
| 973 $alphabet = $seq->alphabet unless defined $alphabet; | |
| 974 } | |
| 975 my $refseq = shift @seqchars; | |
| 976 # let's just march down the columns | |
| 977 my $matchline; | |
| 978 POS: foreach my $pos ( 0..$self->length ) { | |
| 979 my $refchar = $refseq->[$pos]; | |
| 980 next unless $refchar; # skip '' | |
| 981 my %col = ($refchar => 1); | |
| 982 my $dash = ($refchar eq '-' || $refchar eq '.' || $refchar eq ' '); | |
| 983 foreach my $seq ( @seqchars ) { | |
| 984 $dash = 1 if( $seq->[$pos] eq '-' || $seq->[$pos] eq '.' || | |
| 985 $seq->[$pos] eq ' ' ); | |
| 986 $col{$seq->[$pos]}++; | |
| 987 } | |
| 988 my @colresidues = sort keys %col; | |
| 989 my $char = $matchchars{'mismatch'}; | |
| 990 # if all the values are the same | |
| 991 if( $dash ) { $char = $matchchars{'mismatch'} } | |
| 992 elsif( @colresidues == 1 ) { $char = $matchchars{'match'} } | |
| 993 elsif( $alphabet eq 'protein' ) { # only try to do weak/strong | |
| 994 # matches for protein seqs | |
| 995 TYPE: foreach my $type ( qw(strong weak) ) { | |
| 996 # iterate through categories | |
| 997 my %groups; | |
| 998 # iterate through each of the aa in the col | |
| 999 # look to see which groups it is in | |
| 1000 foreach my $c ( @colresidues ) { | |
| 1001 foreach my $f ( grep /\Q$c/, @{$CONSERVATION_GROUPS{$type}} ) { | |
| 1002 push @{$groups{$f}},$c; | |
| 1003 } | |
| 1004 } | |
| 1005 GRP: foreach my $cols ( values %groups ) { | |
| 1006 @$cols = sort @$cols; | |
| 1007 # now we are just testing to see if two arrays | |
| 1008 # are identical w/o changing either one | |
| 1009 | |
| 1010 # have to be same len | |
| 1011 next if( scalar @$cols != scalar @colresidues ); | |
| 1012 # walk down the length and check each slot | |
| 1013 for($_=0;$_ < (scalar @$cols);$_++ ) { | |
| 1014 next GRP if( $cols->[$_] ne $colresidues[$_] ); | |
| 1015 } | |
| 1016 $char = $matchchars{$type}; | |
| 1017 last TYPE; | |
| 1018 } | |
| 1019 } | |
| 1020 } | |
| 1021 $matchline .= $char; | |
| 1022 } | |
| 1023 return $matchline; | |
| 1024 } | |
| 1025 | |
| 1026 =head2 match | |
| 1027 | |
| 1028 Title : match() | |
| 1029 Usage : $ali->match() | |
| 1030 Function : | |
| 1031 | |
| 1032 Goes through all columns and changes residues that are | |
| 1033 identical to residue in first sequence to match '.' | |
| 1034 character. Sets match_char. | |
| 1035 | |
| 1036 USE WITH CARE: Most MSE formats do not support match | |
| 1037 characters in sequences, so this is mostly for output | |
| 1038 only. NEXUS format (Bio::AlignIO::nexus) can handle | |
| 1039 it. | |
| 1040 | |
| 1041 Returns : 1 | |
| 1042 Argument : a match character, optional, defaults to '.' | |
| 1043 | |
| 1044 =cut | |
| 1045 | |
| 1046 sub match { | |
| 1047 my ($self, $match) = @_; | |
| 1048 | |
| 1049 $match ||= '.'; | |
| 1050 my ($matching_char) = $match; | |
| 1051 $matching_char = "\\$match" if $match =~ /[\^.$|()\[\]]/ ; #'; | |
| 1052 $self->map_chars($matching_char, '-'); | |
| 1053 | |
| 1054 my @seqs = $self->each_seq(); | |
| 1055 return 1 unless scalar @seqs > 1; | |
| 1056 | |
| 1057 my $refseq = shift @seqs ; | |
| 1058 my @refseq = split //, $refseq->seq; | |
| 1059 my $gapchar = $self->gap_char; | |
| 1060 | |
| 1061 foreach my $seq ( @seqs ) { | |
| 1062 my @varseq = split //, $seq->seq(); | |
| 1063 for ( my $i=0; $i < scalar @varseq; $i++) { | |
| 1064 $varseq[$i] = $match if defined $refseq[$i] && | |
| 1065 ( $refseq[$i] =~ /[A-Za-z\*]/ || | |
| 1066 $refseq[$i] =~ /$gapchar/ ) | |
| 1067 && $refseq[$i] eq $varseq[$i]; | |
| 1068 } | |
| 1069 $seq->seq(join '', @varseq); | |
| 1070 } | |
| 1071 $self->match_char($match); | |
| 1072 return 1; | |
| 1073 } | |
| 1074 | |
| 1075 | |
| 1076 =head2 unmatch | |
| 1077 | |
| 1078 Title : unmatch() | |
| 1079 Usage : $ali->unmatch() | |
| 1080 Function : Undoes the effect of method match. Unsets match_char. | |
| 1081 | |
| 1082 Returns : 1 | |
| 1083 Argument : a match character, optional, defaults to '.' | |
| 1084 | |
| 1085 See L<match> and L<match_char> | |
| 1086 | |
| 1087 =cut | |
| 1088 | |
| 1089 sub unmatch { | |
| 1090 my ($self, $match) = @_; | |
| 1091 | |
| 1092 $match ||= '.'; | |
| 1093 | |
| 1094 my @seqs = $self->each_seq(); | |
| 1095 return 1 unless scalar @seqs > 1; | |
| 1096 | |
| 1097 my $refseq = shift @seqs ; | |
| 1098 my @refseq = split //, $refseq->seq; | |
| 1099 my $gapchar = $self->gap_char; | |
| 1100 foreach my $seq ( @seqs ) { | |
| 1101 my @varseq = split //, $seq->seq(); | |
| 1102 for ( my $i=0; $i < scalar @varseq; $i++) { | |
| 1103 $varseq[$i] = $refseq[$i] if defined $refseq[$i] && | |
| 1104 ( $refseq[$i] =~ /[A-Za-z\*]/ || | |
| 1105 $refseq[$i] =~ /$gapchar/ ) && | |
| 1106 $varseq[$i] eq $match; | |
| 1107 } | |
| 1108 $seq->seq(join '', @varseq); | |
| 1109 } | |
| 1110 $self->match_char(''); | |
| 1111 return 1; | |
| 1112 } | |
| 1113 | |
| 1114 =head1 MSE attibutes | |
| 1115 | |
| 1116 Methods for setting and reading the MSE attributes. | |
| 1117 | |
| 1118 Note that the methods defining character semantics depend on the user | |
| 1119 to set them sensibly. They are needed only by certain input/output | |
| 1120 methods. Unset them by setting to an empty string (''). | |
| 1121 | |
| 1122 =head2 id | |
| 1123 | |
| 1124 Title : id | |
| 1125 Usage : $myalign->id("Ig") | |
| 1126 Function : Gets/sets the id field of the alignment | |
| 1127 Returns : An id string | |
| 1128 Argument : An id string (optional) | |
| 1129 | |
| 1130 =cut | |
| 1131 | |
| 1132 sub id { | |
| 1133 my ($self, $name) = @_; | |
| 1134 | |
| 1135 if (defined( $name )) { | |
| 1136 $self->{'_id'} = $name; | |
| 1137 } | |
| 1138 | |
| 1139 return $self->{'_id'}; | |
| 1140 } | |
| 1141 | |
| 1142 =head2 missing_char | |
| 1143 | |
| 1144 Title : missing_char | |
| 1145 Usage : $myalign->missing_char("?") | |
| 1146 Function : Gets/sets the missing_char attribute of the alignment | |
| 1147 It is generally recommended to set it to 'n' or 'N' | |
| 1148 for nucleotides and to 'X' for protein. | |
| 1149 Returns : An missing_char string, | |
| 1150 Argument : An missing_char string (optional) | |
| 1151 | |
| 1152 =cut | |
| 1153 | |
| 1154 sub missing_char { | |
| 1155 my ($self, $char) = @_; | |
| 1156 | |
| 1157 if (defined $char ) { | |
| 1158 $self->throw("Single missing character, not [$char]!") if CORE::length($char) > 1; | |
| 1159 $self->{'_missing_char'} = $char; | |
| 1160 } | |
| 1161 | |
| 1162 return $self->{'_missing_char'}; | |
| 1163 } | |
| 1164 | |
| 1165 =head2 match_char | |
| 1166 | |
| 1167 Title : match_char | |
| 1168 Usage : $myalign->match_char('.') | |
| 1169 Function : Gets/sets the match_char attribute of the alignment | |
| 1170 Returns : An match_char string, | |
| 1171 Argument : An match_char string (optional) | |
| 1172 | |
| 1173 =cut | |
| 1174 | |
| 1175 sub match_char { | |
| 1176 my ($self, $char) = @_; | |
| 1177 | |
| 1178 if (defined $char ) { | |
| 1179 $self->throw("Single match character, not [$char]!") if CORE::length($char) > 1; | |
| 1180 $self->{'_match_char'} = $char; | |
| 1181 } | |
| 1182 | |
| 1183 return $self->{'_match_char'}; | |
| 1184 } | |
| 1185 | |
| 1186 =head2 gap_char | |
| 1187 | |
| 1188 Title : gap_char | |
| 1189 Usage : $myalign->gap_char('-') | |
| 1190 Function : Gets/sets the gap_char attribute of the alignment | |
| 1191 Returns : An gap_char string, defaults to '-' | |
| 1192 Argument : An gap_char string (optional) | |
| 1193 | |
| 1194 =cut | |
| 1195 | |
| 1196 sub gap_char { | |
| 1197 my ($self, $char) = @_; | |
| 1198 | |
| 1199 if (defined $char || ! defined $self->{'_gap_char'} ) { | |
| 1200 $char= '-' unless defined $char; | |
| 1201 $self->throw("Single gap character, not [$char]!") if CORE::length($char) > 1; | |
| 1202 $self->{'_gap_char'} = $char; | |
| 1203 } | |
| 1204 return $self->{'_gap_char'}; | |
| 1205 } | |
| 1206 | |
| 1207 =head2 symbol_chars | |
| 1208 | |
| 1209 Title : symbol_chars | |
| 1210 Usage : my @symbolchars = $aln->symbol_chars; | |
| 1211 Function: Returns all the seen symbols (other than gaps) | |
| 1212 Returns : array of characters that are the seen symbols | |
| 1213 Args : boolean to include the gap/missing/match characters | |
| 1214 | |
| 1215 =cut | |
| 1216 | |
| 1217 sub symbol_chars{ | |
| 1218 my ($self,$includeextra) = @_; | |
| 1219 if( ! defined $self->{'_symbols'} ) { | |
| 1220 $self->warn("Symbol list was not initialized"); | |
| 1221 return (); | |
| 1222 } | |
| 1223 my %copy = %{$self->{'_symbols'}}; | |
| 1224 if( ! $includeextra ) { | |
| 1225 foreach my $char ( $self->gap_char, $self->match_char, | |
| 1226 $self->missing_char) { | |
| 1227 delete $copy{$char} if( defined $char ); | |
| 1228 } | |
| 1229 } | |
| 1230 return keys %copy; | |
| 1231 } | |
| 1232 | |
| 1233 =head1 Alignment descriptors | |
| 1234 | |
| 1235 These read only methods describe the MSE in various ways. | |
| 1236 | |
| 1237 | |
| 1238 =head2 consensus_string | |
| 1239 | |
| 1240 Title : consensus_string | |
| 1241 Usage : $str = $ali->consensus_string($threshold_percent) | |
| 1242 Function : Makes a strict consensus | |
| 1243 Returns : | |
| 1244 Argument : Optional treshold ranging from 0 to 100. | |
| 1245 The consensus residue has to appear at least threshold % | |
| 1246 of the sequences at a given location, otherwise a '?' | |
| 1247 character will be placed at that location. | |
| 1248 (Default value = 0%) | |
| 1249 | |
| 1250 =cut | |
| 1251 | |
| 1252 sub consensus_string { | |
| 1253 my $self = shift; | |
| 1254 my $threshold = shift; | |
| 1255 my $len; | |
| 1256 my ($out,$count); | |
| 1257 | |
| 1258 $out = ""; | |
| 1259 | |
| 1260 $len = $self->length - 1; | |
| 1261 | |
| 1262 foreach $count ( 0 .. $len ) { | |
| 1263 $out .= $self->_consensus_aa($count,$threshold); | |
| 1264 } | |
| 1265 return $out; | |
| 1266 } | |
| 1267 | |
| 1268 sub _consensus_aa { | |
| 1269 my $self = shift; | |
| 1270 my $point = shift; | |
| 1271 my $threshold_percent = shift || -1 ; | |
| 1272 my ($seq,%hash,$count,$letter,$key); | |
| 1273 | |
| 1274 foreach $seq ( $self->each_seq() ) { | |
| 1275 $letter = substr($seq->seq,$point,1); | |
| 1276 $self->throw("--$point-----------") if $letter eq ''; | |
| 1277 ($letter =~ /\./) && next; | |
| 1278 # print "Looking at $letter\n"; | |
| 1279 $hash{$letter}++; | |
| 1280 } | |
| 1281 my $number_of_sequences = $self->no_sequences(); | |
| 1282 my $threshold = $number_of_sequences * $threshold_percent / 100. ; | |
| 1283 $count = -1; | |
| 1284 $letter = '?'; | |
| 1285 | |
| 1286 foreach $key ( sort keys %hash ) { | |
| 1287 # print "Now at $key $hash{$key}\n"; | |
| 1288 if( $hash{$key} > $count && $hash{$key} >= $threshold) { | |
| 1289 $letter = $key; | |
| 1290 $count = $hash{$key}; | |
| 1291 } | |
| 1292 } | |
| 1293 return $letter; | |
| 1294 } | |
| 1295 | |
| 1296 | |
| 1297 =head2 consensus_iupac | |
| 1298 | |
| 1299 Title : consensus_iupac | |
| 1300 Usage : $str = $ali->consensus_iupac() | |
| 1301 Function : | |
| 1302 | |
| 1303 Makes a consensus using IUPAC ambiguity codes from DNA | |
| 1304 and RNA. The output is in upper case except when gaps in | |
| 1305 a column force output to be in lower case. | |
| 1306 | |
| 1307 Note that if your alignment sequences contain a lot of | |
| 1308 IUPAC ambiquity codes you often have to manually set | |
| 1309 alphabet. Bio::PrimarySeq::_guess_type thinks they | |
| 1310 indicate a protein sequence. | |
| 1311 | |
| 1312 Returns : consensus string | |
| 1313 Argument : none | |
| 1314 Throws : on protein sequences | |
| 1315 | |
| 1316 =cut | |
| 1317 | |
| 1318 sub consensus_iupac { | |
| 1319 my $self = shift; | |
| 1320 my $out = ""; | |
| 1321 my $len = $self->length-1; | |
| 1322 | |
| 1323 # only DNA and RNA sequences are valid | |
| 1324 foreach my $seq ( $self->each_seq() ) { | |
| 1325 $self->throw("Seq [". $seq->get_nse. "] is a protein") | |
| 1326 if $seq->alphabet eq 'protein'; | |
| 1327 } | |
| 1328 # loop over the alignment columns | |
| 1329 foreach my $count ( 0 .. $len ) { | |
| 1330 $out .= $self->_consensus_iupac($count); | |
| 1331 } | |
| 1332 return $out; | |
| 1333 } | |
| 1334 | |
| 1335 sub _consensus_iupac { | |
| 1336 my ($self, $column) = @_; | |
| 1337 my ($string, $char, $rna); | |
| 1338 | |
| 1339 #determine all residues in a column | |
| 1340 foreach my $seq ( $self->each_seq() ) { | |
| 1341 $string .= substr($seq->seq, $column, 1); | |
| 1342 } | |
| 1343 $string = uc $string; | |
| 1344 | |
| 1345 # quick exit if there's an N in the string | |
| 1346 if ($string =~ /N/) { | |
| 1347 $string =~ /\W/ ? return 'n' : return 'N'; | |
| 1348 } | |
| 1349 # ... or if there are only gap characters | |
| 1350 return '-' if $string =~ /^\W+$/; | |
| 1351 | |
| 1352 # treat RNA as DNA in regexps | |
| 1353 if ($string =~ /U/) { | |
| 1354 $string =~ s/U/T/; | |
| 1355 $rna = 1; | |
| 1356 } | |
| 1357 | |
| 1358 # the following s///'s only need to be done to the _first_ ambiguity code | |
| 1359 # as we only need to see the _range_ of characters in $string | |
| 1360 | |
| 1361 if ($string =~ /[VDHB]/) { | |
| 1362 $string =~ s/V/AGC/; | |
| 1363 $string =~ s/D/AGT/; | |
| 1364 $string =~ s/H/ACT/; | |
| 1365 $string =~ s/B/CTG/; | |
| 1366 } | |
| 1367 | |
| 1368 if ($string =~ /[SKYRWM]/) { | |
| 1369 $string =~ s/S/GC/; | |
| 1370 $string =~ s/K/GT/; | |
| 1371 $string =~ s/Y/CT/; | |
| 1372 $string =~ s/R/AG/; | |
| 1373 $string =~ s/W/AT/; | |
| 1374 $string =~ s/M/AC/; | |
| 1375 } | |
| 1376 | |
| 1377 # and now the guts of the thing | |
| 1378 | |
| 1379 if ($string =~ /A/) { | |
| 1380 $char = 'A'; # A A | |
| 1381 if ($string =~ /G/) { | |
| 1382 $char = 'R'; # A and G (purines) R | |
| 1383 if ($string =~ /C/) { | |
| 1384 $char = 'V'; # A and G and C V | |
| 1385 if ($string =~ /T/) { | |
| 1386 $char = 'N'; # A and G and C and T N | |
| 1387 } | |
| 1388 } elsif ($string =~ /T/) { | |
| 1389 $char = 'D'; # A and G and T D | |
| 1390 } | |
| 1391 } elsif ($string =~ /C/) { | |
| 1392 $char = 'M'; # A and C M | |
| 1393 if ($string =~ /T/) { | |
| 1394 $char = 'H'; # A and C and T H | |
| 1395 } | |
| 1396 } elsif ($string =~ /T/) { | |
| 1397 $char = 'W'; # A and T W | |
| 1398 } | |
| 1399 } elsif ($string =~ /C/) { | |
| 1400 $char = 'C'; # C C | |
| 1401 if ($string =~ /T/) { | |
| 1402 $char = 'Y'; # C and T (pyrimidines) Y | |
| 1403 if ($string =~ /G/) { | |
| 1404 $char = 'B'; # C and T and G B | |
| 1405 } | |
| 1406 } elsif ($string =~ /G/) { | |
| 1407 $char = 'S'; # C and G S | |
| 1408 } | |
| 1409 } elsif ($string =~ /G/) { | |
| 1410 $char = 'G'; # G G | |
| 1411 if ($string =~ /C/) { | |
| 1412 $char = 'S'; # G and C S | |
| 1413 } elsif ($string =~ /T/) { | |
| 1414 $char = 'K'; # G and T K | |
| 1415 } | |
| 1416 } elsif ($string =~ /T/) { | |
| 1417 $char = 'T'; # T T | |
| 1418 } | |
| 1419 | |
| 1420 $char = 'U' if $rna and $char eq 'T'; | |
| 1421 $char = lc $char if $string =~ /\W/; | |
| 1422 | |
| 1423 return $char; | |
| 1424 } | |
| 1425 | |
| 1426 =head2 is_flush | |
| 1427 | |
| 1428 Title : is_flush | |
| 1429 Usage : if( $ali->is_flush() ) | |
| 1430 : | |
| 1431 : | |
| 1432 Function : Tells you whether the alignment | |
| 1433 : is flush, ie all of the same length | |
| 1434 : | |
| 1435 : | |
| 1436 Returns : 1 or 0 | |
| 1437 Argument : | |
| 1438 | |
| 1439 =cut | |
| 1440 | |
| 1441 sub is_flush { | |
| 1442 my ($self,$report) = @_; | |
| 1443 my $seq; | |
| 1444 my $length = (-1); | |
| 1445 my $temp; | |
| 1446 | |
| 1447 foreach $seq ( $self->each_seq() ) { | |
| 1448 if( $length == (-1) ) { | |
| 1449 $length = CORE::length($seq->seq()); | |
| 1450 next; | |
| 1451 } | |
| 1452 | |
| 1453 $temp = CORE::length($seq->seq()); | |
| 1454 if( $temp != $length ) { | |
| 1455 $self->warn("expecting $length not $temp from ". | |
| 1456 $seq->display_id) if( $report ); | |
| 1457 $self->debug("expecting $length not $temp from ". | |
| 1458 $seq->display_id); | |
| 1459 $self->debug($seq->seq(). "\n"); | |
| 1460 return 0; | |
| 1461 } | |
| 1462 } | |
| 1463 | |
| 1464 return 1; | |
| 1465 } | |
| 1466 | |
| 1467 | |
| 1468 =head2 length | |
| 1469 | |
| 1470 Title : length() | |
| 1471 Usage : $len = $ali->length() | |
| 1472 Function : Returns the maximum length of the alignment. | |
| 1473 To be sure the alignment is a block, use is_flush | |
| 1474 Returns : | |
| 1475 Argument : | |
| 1476 | |
| 1477 =cut | |
| 1478 | |
| 1479 sub length_aln { | |
| 1480 my $self = shift; | |
| 1481 $self->warn(ref($self). "::length_aln - deprecated method. Use length() instead."); | |
| 1482 $self->length(@_); | |
| 1483 } | |
| 1484 | |
| 1485 sub length { | |
| 1486 my $self = shift; | |
| 1487 my $seq; | |
| 1488 my $length = (-1); | |
| 1489 my ($temp,$len); | |
| 1490 | |
| 1491 foreach $seq ( $self->each_seq() ) { | |
| 1492 $temp = CORE::length($seq->seq()); | |
| 1493 if( $temp > $length ) { | |
| 1494 $length = $temp; | |
| 1495 } | |
| 1496 } | |
| 1497 | |
| 1498 return $length; | |
| 1499 } | |
| 1500 | |
| 1501 | |
| 1502 =head2 maxdisplayname_length | |
| 1503 | |
| 1504 Title : maxdisplayname_length | |
| 1505 Usage : $ali->maxdisplayname_length() | |
| 1506 Function : | |
| 1507 | |
| 1508 Gets the maximum length of the displayname in the | |
| 1509 alignment. Used in writing out various MSE formats. | |
| 1510 | |
| 1511 Returns : integer | |
| 1512 Argument : | |
| 1513 | |
| 1514 =cut | |
| 1515 | |
| 1516 sub maxname_length { | |
| 1517 my $self = shift; | |
| 1518 $self->warn(ref($self). "::maxname_length - deprecated method.". | |
| 1519 " Use maxdisplayname_length() instead."); | |
| 1520 $self->maxdisplayname_length(); | |
| 1521 } | |
| 1522 | |
| 1523 sub maxnse_length { | |
| 1524 my $self = shift; | |
| 1525 $self->warn(ref($self). "::maxnse_length - deprecated method.". | |
| 1526 " Use maxnse_length() instead."); | |
| 1527 $self->maxdisplayname_length(); | |
| 1528 } | |
| 1529 | |
| 1530 sub maxdisplayname_length { | |
| 1531 my $self = shift; | |
| 1532 my $maxname = (-1); | |
| 1533 my ($seq,$len); | |
| 1534 | |
| 1535 foreach $seq ( $self->each_seq() ) { | |
| 1536 $len = CORE::length $self->displayname($seq->get_nse()); | |
| 1537 | |
| 1538 if( $len > $maxname ) { | |
| 1539 $maxname = $len; | |
| 1540 } | |
| 1541 } | |
| 1542 | |
| 1543 return $maxname; | |
| 1544 } | |
| 1545 | |
| 1546 =head2 no_residues | |
| 1547 | |
| 1548 Title : no_residues | |
| 1549 Usage : $no = $ali->no_residues | |
| 1550 Function : number of residues in total in the alignment | |
| 1551 Returns : integer | |
| 1552 Argument : | |
| 1553 | |
| 1554 =cut | |
| 1555 | |
| 1556 sub no_residues { | |
| 1557 my $self = shift; | |
| 1558 my $count = 0; | |
| 1559 | |
| 1560 foreach my $seq ($self->each_seq) { | |
| 1561 my $str = $seq->seq(); | |
| 1562 | |
| 1563 $count += ($str =~ s/[^A-Za-z]//g); | |
| 1564 } | |
| 1565 | |
| 1566 return $count; | |
| 1567 } | |
| 1568 | |
| 1569 =head2 no_sequences | |
| 1570 | |
| 1571 Title : no_sequences | |
| 1572 Usage : $depth = $ali->no_sequences | |
| 1573 Function : number of sequence in the sequence alignment | |
| 1574 Returns : integer | |
| 1575 Argument : | |
| 1576 | |
| 1577 =cut | |
| 1578 | |
| 1579 sub no_sequences { | |
| 1580 my $self = shift; | |
| 1581 | |
| 1582 return scalar($self->each_seq); | |
| 1583 } | |
| 1584 | |
| 1585 | |
| 1586 =head2 average_percentage_identity | |
| 1587 | |
| 1588 Title : average_percentage_identity | |
| 1589 Usage : $id = $align->average_percentage_identity | |
| 1590 Function: The function uses a fast method to calculate the average | |
| 1591 percentage identity of the alignment | |
| 1592 Returns : The average percentage identity of the alignment | |
| 1593 Args : None | |
| 1594 Notes : This method implemented by Kevin Howe calculates a figure that is | |
| 1595 designed to be similar to the average pairwise identity of the | |
| 1596 alignment (identical in the absence of gaps), without having to | |
| 1597 explicitly calculate pairwise identities proposed by Richard Durbin. | |
| 1598 Validated by Ewan Birney ad Alex Bateman. | |
| 1599 | |
| 1600 =cut | |
| 1601 | |
| 1602 sub average_percentage_identity{ | |
| 1603 my ($self,@args) = @_; | |
| 1604 | |
| 1605 my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M', | |
| 1606 'N','O','P','Q','R','S','T','U','V','W','X','Y','Z'); | |
| 1607 | |
| 1608 my ($len, $total, $subtotal, $divisor, $subdivisor, @seqs, @countHashes); | |
| 1609 | |
| 1610 if (! $self->is_flush()) { | |
| 1611 $self->throw("All sequences in the alignment must be the same length"); | |
| 1612 } | |
| 1613 | |
| 1614 @seqs = $self->each_seq(); | |
| 1615 $len = $self->length(); | |
| 1616 | |
| 1617 # load the each hash with correct keys for existence checks | |
| 1618 | |
| 1619 for( my $index=0; $index < $len; $index++) { | |
| 1620 foreach my $letter (@alphabet) { | |
| 1621 $countHashes[$index]->{$letter} = 0; | |
| 1622 } | |
| 1623 } | |
| 1624 foreach my $seq (@seqs) { | |
| 1625 my @seqChars = split //, $seq->seq(); | |
| 1626 for( my $column=0; $column < @seqChars; $column++ ) { | |
| 1627 my $char = uc($seqChars[$column]); | |
| 1628 if (exists $countHashes[$column]->{$char}) { | |
| 1629 $countHashes[$column]->{$char}++; | |
| 1630 } | |
| 1631 } | |
| 1632 } | |
| 1633 | |
| 1634 $total = 0; | |
| 1635 $divisor = 0; | |
| 1636 for(my $column =0; $column < $len; $column++) { | |
| 1637 my %hash = %{$countHashes[$column]}; | |
| 1638 $subdivisor = 0; | |
| 1639 foreach my $res (keys %hash) { | |
| 1640 $total += $hash{$res}*($hash{$res} - 1); | |
| 1641 $subdivisor += $hash{$res}; | |
| 1642 } | |
| 1643 $divisor += $subdivisor * ($subdivisor - 1); | |
| 1644 } | |
| 1645 return $divisor > 0 ? ($total / $divisor )*100.0 : 0; | |
| 1646 } | |
| 1647 | |
| 1648 =head2 percentage_identity | |
| 1649 | |
| 1650 Title : percentage_identity | |
| 1651 Usage : $id = $align->percentage_identity | |
| 1652 Function: The function calculates the average percentage identity | |
| 1653 (aliased for average_percentage_identity) | |
| 1654 Returns : The average percentage identity | |
| 1655 Args : None | |
| 1656 | |
| 1657 =cut | |
| 1658 | |
| 1659 sub percentage_identity { | |
| 1660 my $self = shift; | |
| 1661 return $self->average_percentage_identity(); | |
| 1662 } | |
| 1663 | |
| 1664 =head2 overall_percentage_identity | |
| 1665 | |
| 1666 Title : percentage_identity | |
| 1667 Usage : $id = $align->percentage_identity | |
| 1668 Function: The function calculates the percentage identity of | |
| 1669 the conserved columns | |
| 1670 Returns : The percentage identity of the conserved columns | |
| 1671 Args : None | |
| 1672 | |
| 1673 =cut | |
| 1674 | |
| 1675 sub overall_percentage_identity{ | |
| 1676 my ($self,@args) = @_; | |
| 1677 | |
| 1678 my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M', | |
| 1679 'N','O','P','Q','R','S','T','U','V','W','X','Y','Z'); | |
| 1680 | |
| 1681 my ($len, $total, @seqs, @countHashes); | |
| 1682 | |
| 1683 if (! $self->is_flush()) { | |
| 1684 $self->throw("All sequences in the alignment must be the same length"); | |
| 1685 } | |
| 1686 | |
| 1687 @seqs = $self->each_seq(); | |
| 1688 $len = $self->length(); | |
| 1689 | |
| 1690 # load the each hash with correct keys for existence checks | |
| 1691 for( my $index=0; $index < $len; $index++) { | |
| 1692 foreach my $letter (@alphabet) { | |
| 1693 $countHashes[$index]->{$letter} = 0; | |
| 1694 } | |
| 1695 } | |
| 1696 foreach my $seq (@seqs) { | |
| 1697 my @seqChars = split //, $seq->seq(); | |
| 1698 for( my $column=0; $column < @seqChars; $column++ ) { | |
| 1699 my $char = uc($seqChars[$column]); | |
| 1700 if (exists $countHashes[$column]->{$char}) { | |
| 1701 $countHashes[$column]->{$char}++; | |
| 1702 } | |
| 1703 } | |
| 1704 } | |
| 1705 | |
| 1706 $total = 0; | |
| 1707 for(my $column =0; $column < $len; $column++) { | |
| 1708 my %hash = %{$countHashes[$column]}; | |
| 1709 foreach ( values %hash ) { | |
| 1710 next if( $_ == 0 ); | |
| 1711 $total++ if( $_ == scalar @seqs ); | |
| 1712 last; | |
| 1713 } | |
| 1714 } | |
| 1715 return ($total / $len ) * 100.0; | |
| 1716 } | |
| 1717 | |
| 1718 =head1 Alignment positions | |
| 1719 | |
| 1720 Methods to map a sequence position into an alignment column and back. | |
| 1721 column_from_residue_number() does the former. The latter is really a | |
| 1722 property of the sequence object and can done using | |
| 1723 L<Bio::LocatableSeq::location_from_column>: | |
| 1724 | |
| 1725 # select somehow a sequence from the alignment, e.g. | |
| 1726 my $seq = $aln->get_seq_by_pos(1); | |
| 1727 #$loc is undef or Bio::LocationI object | |
| 1728 my $loc = $seq->location_from_column(5); | |
| 1729 | |
| 1730 | |
| 1731 =head2 column_from_residue_number | |
| 1732 | |
| 1733 Title : column_from_residue_number | |
| 1734 Usage : $col = $ali->column_from_residue_number( $seqname, $resnumber) | |
| 1735 Function: | |
| 1736 | |
| 1737 This function gives the position in the alignment | |
| 1738 (i.e. column number) of the given residue number in the | |
| 1739 sequence with the given name. For example, for the | |
| 1740 alignment | |
| 1741 | |
| 1742 Seq1/91-97 AC..DEF.GH | |
| 1743 Seq2/24-30 ACGG.RTY.. | |
| 1744 Seq3/43-51 AC.DDEFGHI | |
| 1745 | |
| 1746 column_from_residue_number( "Seq1", 94 ) returns 5. | |
| 1747 column_from_residue_number( "Seq2", 25 ) returns 2. | |
| 1748 column_from_residue_number( "Seq3", 50 ) returns 9. | |
| 1749 | |
| 1750 An exception is thrown if the residue number would lie | |
| 1751 outside the length of the aligment | |
| 1752 (e.g. column_from_residue_number( "Seq2", 22 ) | |
| 1753 | |
| 1754 Note: If the the parent sequence is represented by more than | |
| 1755 one alignment sequence and the residue number is present in | |
| 1756 them, this method finds only the first one. | |
| 1757 | |
| 1758 Returns : A column number for the position in the alignment of the | |
| 1759 given residue in the given sequence (1 = first column) | |
| 1760 Args : A sequence id/name (not a name/start-end) | |
| 1761 A residue number in the whole sequence (not just that | |
| 1762 segment of it in the alignment) | |
| 1763 | |
| 1764 =cut | |
| 1765 | |
| 1766 sub column_from_residue_number { | |
| 1767 my ($self, $name, $resnumber) = @_; | |
| 1768 | |
| 1769 $self->throw("No sequence with name [$name]") unless $self->{'_start_end_lists'}->{$name}; | |
| 1770 $self->throw("Second argument residue number missing") unless $resnumber; | |
| 1771 | |
| 1772 foreach my $seq ($self->each_seq_with_id($name)) { | |
| 1773 my $col; | |
| 1774 eval { | |
| 1775 $col = $seq->column_from_residue_number($resnumber); | |
| 1776 }; | |
| 1777 next if $@; | |
| 1778 return $col; | |
| 1779 } | |
| 1780 | |
| 1781 $self->throw("Could not find a sequence segment in $name ". | |
| 1782 "containing residue number $resnumber"); | |
| 1783 | |
| 1784 } | |
| 1785 | |
| 1786 =head1 Sequence names | |
| 1787 | |
| 1788 Methods to manipulate the display name. The default name based on the | |
| 1789 sequence id and subsequence positions can be overridden in various | |
| 1790 ways. | |
| 1791 | |
| 1792 =head2 displayname | |
| 1793 | |
| 1794 Title : displayname | |
| 1795 Usage : $myalign->displayname("Ig", "IgA") | |
| 1796 Function : Gets/sets the display name of a sequence in the alignment | |
| 1797 : | |
| 1798 Returns : A display name string | |
| 1799 Argument : name of the sequence | |
| 1800 displayname of the sequence (optional) | |
| 1801 | |
| 1802 =cut | |
| 1803 | |
| 1804 sub get_displayname { | |
| 1805 my $self = shift; | |
| 1806 $self->warn(ref($self). "::get_displayname - deprecated method. Use displayname() instead."); | |
| 1807 $self->displayname(@_); | |
| 1808 } | |
| 1809 | |
| 1810 sub set_displayname { | |
| 1811 my $self = shift; | |
| 1812 $self->warn(ref($self). "::set_displayname - deprecated method. Use displayname() instead."); | |
| 1813 $self->displayname(@_); | |
| 1814 } | |
| 1815 | |
| 1816 sub displayname { | |
| 1817 my ($self, $name, $disname) = @_; | |
| 1818 | |
| 1819 $self->throw("No sequence with name [$name]") unless $self->{'_seq'}->{$name}; | |
| 1820 | |
| 1821 if( $disname and $name) { | |
| 1822 $self->{'_dis_name'}->{$name} = $disname; | |
| 1823 return $disname; | |
| 1824 } | |
| 1825 elsif( defined $self->{'_dis_name'}->{$name} ) { | |
| 1826 return $self->{'_dis_name'}->{$name}; | |
| 1827 } else { | |
| 1828 return $name; | |
| 1829 } | |
| 1830 } | |
| 1831 | |
| 1832 =head2 set_displayname_count | |
| 1833 | |
| 1834 Title : set_displayname_count | |
| 1835 Usage : $ali->set_displayname_count | |
| 1836 Function : | |
| 1837 | |
| 1838 Sets the names to be name_# where # is the number of | |
| 1839 times this name has been used. | |
| 1840 | |
| 1841 Returns : | |
| 1842 Argument : | |
| 1843 | |
| 1844 =cut | |
| 1845 | |
| 1846 sub set_displayname_count { | |
| 1847 my $self= shift; | |
| 1848 my (@arr,$name,$seq,$count,$temp,$nse); | |
| 1849 | |
| 1850 foreach $seq ( $self->each_alphabetically() ) { | |
| 1851 $nse = $seq->get_nse(); | |
| 1852 | |
| 1853 #name will be set when this is the second | |
| 1854 #time (or greater) is has been seen | |
| 1855 | |
| 1856 if( defined $name and $name eq ($seq->id()) ) { | |
| 1857 $temp = sprintf("%s_%s",$name,$count); | |
| 1858 $self->displayname($nse,$temp); | |
| 1859 $count++; | |
| 1860 } else { | |
| 1861 $count = 1; | |
| 1862 $name = $seq->id(); | |
| 1863 $temp = sprintf("%s_%s",$name,$count); | |
| 1864 $self->displayname($nse,$temp); | |
| 1865 $count++; | |
| 1866 } | |
| 1867 } | |
| 1868 return 1; | |
| 1869 } | |
| 1870 | |
| 1871 =head2 set_displayname_flat | |
| 1872 | |
| 1873 Title : set_displayname_flat | |
| 1874 Usage : $ali->set_displayname_flat() | |
| 1875 Function : Makes all the sequences be displayed as just their name, | |
| 1876 not name/start-end | |
| 1877 Returns : 1 | |
| 1878 Argument : | |
| 1879 | |
| 1880 =cut | |
| 1881 | |
| 1882 sub set_displayname_flat { | |
| 1883 my $self = shift; | |
| 1884 my ($nse,$seq); | |
| 1885 | |
| 1886 foreach $seq ( $self->each_seq() ) { | |
| 1887 $nse = $seq->get_nse(); | |
| 1888 $self->displayname($nse,$seq->id()); | |
| 1889 } | |
| 1890 return 1; | |
| 1891 } | |
| 1892 | |
| 1893 =head2 set_displayname_normal | |
| 1894 | |
| 1895 Title : set_displayname_normal | |
| 1896 Usage : $ali->set_displayname_normal() | |
| 1897 Function : Makes all the sequences be displayed as name/start-end | |
| 1898 Returns : | |
| 1899 Argument : | |
| 1900 | |
| 1901 =cut | |
| 1902 | |
| 1903 sub set_displayname_normal { | |
| 1904 my $self = shift; | |
| 1905 my ($nse,$seq); | |
| 1906 | |
| 1907 foreach $seq ( $self->each_seq() ) { | |
| 1908 $nse = $seq->get_nse(); | |
| 1909 $self->displayname($nse,$nse); | |
| 1910 } | |
| 1911 return 1; | |
| 1912 } | |
| 1913 | |
| 1914 =head2 source | |
| 1915 | |
| 1916 Title : source | |
| 1917 Usage : $obj->source($newval) | |
| 1918 Function: sets the Alignment source program | |
| 1919 Example : | |
| 1920 Returns : value of source | |
| 1921 Args : newvalue (optional) | |
| 1922 | |
| 1923 | |
| 1924 =cut | |
| 1925 | |
| 1926 sub source{ | |
| 1927 my ($self,$value) = @_; | |
| 1928 if( defined $value) { | |
| 1929 $self->{'_source'} = $value; | |
| 1930 } | |
| 1931 return $self->{'_source'}; | |
| 1932 } | |
| 1933 | |
| 1934 1; |
