Mercurial > repos > mahtabm > ensemb_rep_gvl
comparison variant_effect_predictor/Bio/SimpleAlign.pm @ 0:2bc9b66ada89 draft default tip
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 06:29:17 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:2bc9b66ada89 |
---|---|
1 # $Id: SimpleAlign.pm,v 1.65.2.1 2003/07/02 16:00:19 jason Exp $ | |
2 # BioPerl module for SimpleAlign | |
3 # | |
4 # Cared for by Heikki Lehvaslaiho <heikki@ebi.ac.uk> | |
5 # | |
6 # Copyright Ewan Birney | |
7 # | |
8 # You may distribute this module under the same terms as perl itself | |
9 | |
10 # POD documentation - main docs before the code | |
11 # | |
12 # History: | |
13 # 11/3/00 Added threshold feature to consensus and consensus_aa - PS | |
14 # May 2001 major rewrite - Heikki Lehvaslaiho | |
15 | |
16 =head1 NAME | |
17 | |
18 Bio::SimpleAlign - Multiple alignments held as a set of sequences | |
19 | |
20 =head1 SYNOPSIS | |
21 | |
22 # use Bio::AlignIO to read in the alignment | |
23 $str = Bio::AlignIO->new('-file' => 't/data/testaln.pfam'); | |
24 $aln = $str->next_aln(); | |
25 | |
26 # some descriptors | |
27 print $aln->length, "\n"; | |
28 print $aln->no_residues, "\n"; | |
29 print $aln->is_flush, "\n"; | |
30 print $aln->no_sequences, "\n"; | |
31 print $aln->percentage_identity, "\n"; | |
32 print $aln->consensus_string(50), "\n"; | |
33 | |
34 # find the position in the alignment for a sequence location | |
35 $pos = $aln->column_from_residue_number('1433_LYCES', 14); # = 6; | |
36 | |
37 # extract sequences and check values for the alignment column $pos | |
38 foreach $seq ($aln->each_seq) { | |
39 $res = $seq->subseq($pos, $pos); | |
40 $count{$res}++; | |
41 } | |
42 foreach $res (keys %count) { | |
43 printf "Res: %s Count: %2d\n", $res, $count{$res}; | |
44 } | |
45 | |
46 | |
47 =head1 DESCRIPTION | |
48 | |
49 SimpleAlign handles multiple alignments of sequences. It is very | |
50 permissive of types (it won't insist on things being all same length | |
51 etc): really it is a SequenceSet explicitly held in memory with a | |
52 whole series of built in manipulations and especially file format | |
53 systems for read/writing alignments. | |
54 | |
55 SimpleAlign basically views an alignment as an immutable block of | |
56 text. SimpleAlign *is not* the object to be using if you want to | |
57 perform complex alignment manipulations. | |
58 | |
59 However for lightweight display/formatting and minimal manipulation | |
60 (e.g. removing all-gaps columns) - this is the one to use. | |
61 | |
62 SimpleAlign uses a subclass of L<Bio::PrimarySeq> class | |
63 L<Bio::LocatableSeq> to store its sequences. These are subsequences | |
64 with a start and end positions in the parent reference sequence. | |
65 | |
66 Tricky concepts. SimpleAlign expects name,start,end to be 'unique' in | |
67 the alignment, and this is the key for the internal hashes. | |
68 (name,start,end is abbreviated nse in the code). However, in many | |
69 cases people don't want the name/start-end to be displayed: either | |
70 multiple names in an alignment or names specific to the alignment | |
71 (ROA1_HUMAN_1, ROA1_HUMAN_2 etc). These names are called | |
72 'displayname', and generally is what is used to print out the | |
73 alignment. They default to name/start-end. | |
74 | |
75 The SimpleAlign Module came from Ewan Birney's Align module. | |
76 | |
77 =head1 PROGRESS | |
78 | |
79 SimpleAlign is being slowly converted to bioperl coding standards, | |
80 mainly by Ewan. | |
81 | |
82 =over 3 | |
83 | |
84 =item Use Bio::Root::Object - done | |
85 | |
86 =item Use proper exceptions - done | |
87 | |
88 =item Use hashed constructor - not done! | |
89 | |
90 =back | |
91 | |
92 =head1 FEEDBACK | |
93 | |
94 =head2 Mailing Lists | |
95 | |
96 User feedback is an integral part of the evolution of this and other | |
97 Bioperl modules. Send your comments and suggestions preferably to one | |
98 of the Bioperl mailing lists. Your participation is much appreciated. | |
99 | |
100 bioperl-l@bioperl.org - General discussion | |
101 http://bioperl.org/MailList.shtml - About the mailing lists | |
102 | |
103 =head2 Reporting Bugs | |
104 | |
105 Report bugs to the Bioperl bug tracking system to help us keep track | |
106 the bugs and their resolution. Bug reports can be submitted via email | |
107 or the web: | |
108 | |
109 bioperl-bugs@bio.perl.org | |
110 http://bugzilla.bioperl.org/ | |
111 | |
112 =head1 AUTHOR | |
113 | |
114 Ewan Birney, birney@sanger.ac.uk | |
115 | |
116 =head1 CONTRIBUTORS | |
117 | |
118 Richard Adams, Richard.Adams@ed.ac.uk, | |
119 David J. Evans, David.Evans@vir.gla.ac.uk, | |
120 Heikki Lehvaslaiho, heikki@ebi.ac.uk, | |
121 Allen Smith, allens@cpan.org, | |
122 Jason Stajich, jason@bioperl.org, | |
123 Anthony Underwood, aunderwood@phls.org.uk, | |
124 Xintao Wei & Giri Narasimhan, giri@cs.fiu.edu | |
125 | |
126 | |
127 =head1 SEE ALSO | |
128 | |
129 L<Bio::LocatableSeq.pm> | |
130 | |
131 =head1 APPENDIX | |
132 | |
133 The rest of the documentation details each of the object | |
134 methods. Internal methods are usually preceded with a _ | |
135 | |
136 =cut | |
137 | |
138 # 'Let the code begin... | |
139 | |
140 package Bio::SimpleAlign; | |
141 use vars qw(@ISA %CONSERVATION_GROUPS); | |
142 use strict; | |
143 | |
144 use Bio::Root::Root; | |
145 use Bio::LocatableSeq; # uses Seq's as list | |
146 use Bio::Align::AlignI; | |
147 | |
148 BEGIN { | |
149 # This data should probably be in a more centralized module... | |
150 # it is taken from Clustalw documentation | |
151 # These are all the positively scoring groups that occur in the | |
152 # Gonnet Pam250 matrix. The strong and weak groups are | |
153 # defined as strong score >0.5 and weak score =<0.5 respectively. | |
154 | |
155 %CONSERVATION_GROUPS = ( 'strong' => [ qw(STA | |
156 NEQK | |
157 NHQK | |
158 NDEQ | |
159 QHRK | |
160 MILV | |
161 MILF | |
162 HY | |
163 FYW) | |
164 ], | |
165 'weak' => [ qw(CSA | |
166 ATV | |
167 SAG | |
168 STNK | |
169 STPA | |
170 SGND | |
171 SNDEQK | |
172 NDEQHK | |
173 NEQHRK | |
174 FVLIM | |
175 HFY) ], | |
176 ); | |
177 | |
178 } | |
179 | |
180 @ISA = qw(Bio::Root::Root Bio::Align::AlignI); | |
181 | |
182 =head2 new | |
183 | |
184 Title : new | |
185 Usage : my $aln = new Bio::SimpleAlign(); | |
186 Function : Creates a new simple align object | |
187 Returns : Bio::SimpleAlign | |
188 Args : -source => string representing the source program | |
189 where this alignment came from | |
190 | |
191 =cut | |
192 | |
193 | |
194 sub new { | |
195 my($class,@args) = @_; | |
196 | |
197 my $self = $class->SUPER::new(@args); | |
198 | |
199 my ($src) = $self->_rearrange([qw(SOURCE)], @args); | |
200 $src && $self->source($src); | |
201 # we need to set up internal hashs first! | |
202 | |
203 $self->{'_seq'} = {}; | |
204 $self->{'_order'} = {}; | |
205 $self->{'_start_end_lists'} = {}; | |
206 $self->{'_dis_name'} = {}; | |
207 $self->{'_id'} = 'NoName'; | |
208 $self->{'_symbols'} = {}; | |
209 # maybe we should automatically read in from args. Hmmm... | |
210 | |
211 return $self; # success - we hope! | |
212 } | |
213 | |
214 =head1 Modifier methods | |
215 | |
216 These methods modify the MSE by adding, removing or shuffling complete | |
217 sequences. | |
218 | |
219 =head2 add_seq | |
220 | |
221 Title : add_seq | |
222 Usage : $myalign->add_seq($newseq); | |
223 Function : Adds another sequence to the alignment. *Does not* align | |
224 it - just adds it to the hashes. | |
225 Returns : nothing | |
226 Args : a Bio::LocatableSeq object | |
227 order (optional) | |
228 | |
229 See L<Bio::LocatableSeq> for more information | |
230 | |
231 =cut | |
232 | |
233 sub addSeq { | |
234 my $self = shift; | |
235 $self->warn(ref($self). "::addSeq - deprecated method. Use add_seq() instead."); | |
236 $self->add_seq(@_); | |
237 } | |
238 | |
239 sub add_seq { | |
240 my $self = shift; | |
241 my $seq = shift; | |
242 my $order = shift; | |
243 my ($name,$id,$start,$end); | |
244 | |
245 if( !ref $seq || ! $seq->isa('Bio::LocatableSeq') ) { | |
246 $self->throw("Unable to process non locatable sequences [", ref($seq), "]"); | |
247 } | |
248 | |
249 $id = $seq->id() ||$seq->display_id || $seq->primary_id; | |
250 $start = $seq->start(); | |
251 $end = $seq->end(); | |
252 | |
253 # build the symbol list for this sequence, | |
254 # will prune out the gap and missing/match chars | |
255 # when actually asked for the symbol list in the | |
256 # symbol_chars | |
257 map { $self->{'_symbols'}->{$_} = 1; } split(//,$seq->seq); | |
258 | |
259 if( !defined $order ) { | |
260 $order = keys %{$self->{'_seq'}}; | |
261 } | |
262 $name = sprintf("%s/%d-%d",$id,$start,$end); | |
263 | |
264 if( $self->{'_seq'}->{$name} ) { | |
265 $self->warn("Replacing one sequence [$name]\n"); | |
266 } | |
267 else { | |
268 #print STDERR "Assigning $name to $order\n"; | |
269 | |
270 $self->{'_order'}->{$order} = $name; | |
271 | |
272 unless( exists( $self->{'_start_end_lists'}->{$id})) { | |
273 $self->{'_start_end_lists'}->{$id} = []; | |
274 } | |
275 push @{$self->{'_start_end_lists'}->{$id}}, $seq; | |
276 } | |
277 | |
278 $self->{'_seq'}->{$name} = $seq; | |
279 | |
280 } | |
281 | |
282 | |
283 =head2 remove_seq | |
284 | |
285 Title : remove_seq | |
286 Usage : $aln->remove_seq($seq); | |
287 Function : Removes a single sequence from an alignment | |
288 Returns : | |
289 Argument : a Bio::LocatableSeq object | |
290 | |
291 =cut | |
292 | |
293 sub removeSeq { | |
294 my $self = shift; | |
295 $self->warn(ref($self). "::removeSeq - deprecated method. Use remove_seq() instead."); | |
296 $self->remove_seq(@_); | |
297 } | |
298 | |
299 sub remove_seq { | |
300 my $self = shift; | |
301 my $seq = shift; | |
302 my ($name,$id,$start,$end); | |
303 | |
304 $self->throw("Need Bio::Locatable seq argument ") | |
305 unless ref $seq && $seq->isa('Bio::LocatableSeq'); | |
306 | |
307 $id = $seq->id(); | |
308 $start = $seq->start(); | |
309 $end = $seq->end(); | |
310 $name = sprintf("%s/%d-%d",$id,$start,$end); | |
311 | |
312 if( !exists $self->{'_seq'}->{$name} ) { | |
313 $self->throw("Sequence $name does not exist in the alignment to remove!"); | |
314 } | |
315 | |
316 delete $self->{'_seq'}->{$name}; | |
317 | |
318 # we need to remove this seq from the start_end_lists hash | |
319 | |
320 if (exists $self->{'_start_end_lists'}->{$id}) { | |
321 # we need to find the sequence in the array. | |
322 | |
323 my ($i, $found);; | |
324 for ($i=0; $i < @{$self->{'_start_end_lists'}->{$id}}; $i++) { | |
325 if (${$self->{'_start_end_lists'}->{$id}}[$i] eq $seq) { | |
326 $found = 1; | |
327 last; | |
328 } | |
329 } | |
330 if ($found) { | |
331 splice @{$self->{'_start_end_lists'}->{$id}}, $i, 1; | |
332 } | |
333 else { | |
334 $self->throw("Could not find the sequence to remoce from the start-end list"); | |
335 } | |
336 } | |
337 else { | |
338 $self->throw("There is no seq list for the name $id"); | |
339 } | |
340 return 1; | |
341 # we can't do anything about the order hash but that is ok | |
342 # because each_seq will handle it | |
343 } | |
344 | |
345 | |
346 =head2 purge | |
347 | |
348 Title : purge | |
349 Usage : $aln->purge(0.7); | |
350 Function: | |
351 | |
352 Removes sequences above given sequence similarity | |
353 This function will grind on large alignments. Beware! | |
354 | |
355 Example : | |
356 Returns : An array of the removed sequences | |
357 Args : float, threshold for similarity | |
358 | |
359 =cut | |
360 | |
361 sub purge { | |
362 my ($self,$perc) = @_; | |
363 my (%duplicate, @dups); | |
364 | |
365 my @seqs = $self->each_seq(); | |
366 | |
367 for (my $i=0;$i< @seqs - 1;$i++ ) { #for each seq in alignment | |
368 my $seq = $seqs[$i]; | |
369 | |
370 #skip if already in duplicate hash | |
371 next if exists $duplicate{$seq->display_id} ; | |
372 my $one = $seq->seq(); | |
373 | |
374 my @one = split '', $one; #split to get 1aa per array element | |
375 | |
376 for (my $j=$i+1;$j < @seqs;$j++) { | |
377 my $seq2 = $seqs[$j]; | |
378 | |
379 #skip if already in duplicate hash | |
380 next if exists $duplicate{$seq2->display_id} ; | |
381 | |
382 my $two = $seq2->seq(); | |
383 my @two = split '', $two; | |
384 | |
385 my $count = 0; | |
386 my $res = 0; | |
387 for (my $k=0;$k<@one;$k++) { | |
388 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) && | |
389 $one[$k] eq $two[$k]) { | |
390 $count++; | |
391 } | |
392 if ( $one[$k] ne '.' && $one[$k] ne '-' && defined($two[$k]) && | |
393 $two[$k] ne '.' && $two[$k] ne '-' ) { | |
394 $res++; | |
395 } | |
396 } | |
397 | |
398 my $ratio = 0; | |
399 $ratio = $count/$res unless $res == 0; | |
400 | |
401 # if above threshold put in duplicate hash and push onto | |
402 # duplicate array for returning to get_unique | |
403 if ( $ratio > $perc ) { | |
404 print STDERR "duplicate!", $seq2->display_id, "\n" if $self->verbose > 0; | |
405 $duplicate{$seq2->display_id} = 1; | |
406 push @dups, $seq2; | |
407 } | |
408 } | |
409 } | |
410 foreach my $seq (@dups) { | |
411 $self->remove_seq($seq); | |
412 } | |
413 return @dups; | |
414 } | |
415 | |
416 =head2 sort_alphabetically | |
417 | |
418 Title : sort_alphabetically | |
419 Usage : $ali->sort_alphabetically | |
420 Function : | |
421 | |
422 Changes the order of the alignemnt to alphabetical on name | |
423 followed by numerical by number. | |
424 | |
425 Returns : | |
426 Argument : | |
427 | |
428 =cut | |
429 | |
430 sub sort_alphabetically { | |
431 my $self = shift; | |
432 my ($seq,$nse,@arr,%hash,$count); | |
433 | |
434 foreach $seq ( $self->each_seq() ) { | |
435 $nse = $seq->get_nse; | |
436 $hash{$nse} = $seq; | |
437 } | |
438 | |
439 $count = 0; | |
440 | |
441 %{$self->{'_order'}} = (); # reset the hash; | |
442 | |
443 foreach $nse ( sort _alpha_startend keys %hash) { | |
444 $self->{'_order'}->{$count} = $nse; | |
445 | |
446 $count++; | |
447 } | |
448 1; | |
449 } | |
450 | |
451 =head1 Sequence selection methods | |
452 | |
453 Methods returning one or more sequences objects. | |
454 | |
455 =head2 each_seq | |
456 | |
457 Title : each_seq | |
458 Usage : foreach $seq ( $align->each_seq() ) | |
459 Function : Gets an array of Seq objects from the alignment | |
460 Returns : an array | |
461 Argument : | |
462 | |
463 =cut | |
464 | |
465 sub eachSeq { | |
466 my $self = shift; | |
467 $self->warn(ref($self). "::eachSeq - deprecated method. Use each_seq() instead."); | |
468 $self->each_seq(); | |
469 } | |
470 | |
471 sub each_seq { | |
472 my $self = shift; | |
473 my (@arr,$order); | |
474 | |
475 foreach $order ( sort { $a <=> $b } keys %{$self->{'_order'}} ) { | |
476 if( exists $self->{'_seq'}->{$self->{'_order'}->{$order}} ) { | |
477 push(@arr,$self->{'_seq'}->{$self->{'_order'}->{$order}}); | |
478 } | |
479 } | |
480 | |
481 return @arr; | |
482 } | |
483 | |
484 | |
485 =head2 each_alphabetically | |
486 | |
487 Title : each_alphabetically | |
488 Usage : foreach $seq ( $ali->each_alphabetically() ) | |
489 Function : | |
490 | |
491 Returns an array of sequence object sorted alphabetically | |
492 by name and then by start point. | |
493 Does not change the order of the alignment | |
494 | |
495 Returns : | |
496 Argument : | |
497 | |
498 =cut | |
499 | |
500 sub each_alphabetically { | |
501 my $self = shift; | |
502 my ($seq,$nse,@arr,%hash,$count); | |
503 | |
504 foreach $seq ( $self->each_seq() ) { | |
505 $nse = $seq->get_nse; | |
506 $hash{$nse} = $seq; | |
507 } | |
508 | |
509 foreach $nse ( sort _alpha_startend keys %hash) { | |
510 push(@arr,$hash{$nse}); | |
511 } | |
512 | |
513 return @arr; | |
514 | |
515 } | |
516 | |
517 sub _alpha_startend { | |
518 my ($aname,$astart,$bname,$bstart); | |
519 ($aname,$astart) = split (/-/,$a); | |
520 ($bname,$bstart) = split (/-/,$b); | |
521 | |
522 if( $aname eq $bname ) { | |
523 return $astart <=> $bstart; | |
524 } | |
525 else { | |
526 return $aname cmp $bname; | |
527 } | |
528 | |
529 } | |
530 | |
531 =head2 each_seq_with_id | |
532 | |
533 Title : each_seq_with_id | |
534 Usage : foreach $seq ( $align->each_seq_with_id() ) | |
535 Function : | |
536 | |
537 Gets an array of Seq objects from the | |
538 alignment, the contents being those sequences | |
539 with the given name (there may be more than one) | |
540 | |
541 Returns : an array | |
542 Argument : a seq name | |
543 | |
544 =cut | |
545 | |
546 sub eachSeqWithId { | |
547 my $self = shift; | |
548 $self->warn(ref($self). "::eachSeqWithId - deprecated method. Use each_seq_with_id() instead."); | |
549 $self->each_seq_with_id(@_); | |
550 } | |
551 | |
552 sub each_seq_with_id { | |
553 my $self = shift; | |
554 my $id = shift; | |
555 | |
556 $self->throw("Method each_seq_with_id needs a sequence name argument") | |
557 unless defined $id; | |
558 | |
559 my (@arr, $seq); | |
560 | |
561 if (exists($self->{'_start_end_lists'}->{$id})) { | |
562 @arr = @{$self->{'_start_end_lists'}->{$id}}; | |
563 } | |
564 return @arr; | |
565 } | |
566 | |
567 =head2 get_seq_by_pos | |
568 | |
569 Title : get_seq_by_pos | |
570 Usage : $seq = $aln->get_seq_by_pos(3) # third sequence from the alignment | |
571 Function : | |
572 | |
573 Gets a sequence based on its position in the alignment. | |
574 Numbering starts from 1. Sequence positions larger than | |
575 no_sequences() will thow an error. | |
576 | |
577 Returns : a Bio::LocatableSeq object | |
578 Args : positive integer for the sequence osition | |
579 | |
580 =cut | |
581 | |
582 sub get_seq_by_pos { | |
583 | |
584 my $self = shift; | |
585 my ($pos) = @_; | |
586 | |
587 $self->throw("Sequence position has to be a positive integer, not [$pos]") | |
588 unless $pos =~ /^\d+$/ and $pos > 0; | |
589 $self->throw("No sequence at position [$pos]") | |
590 unless $pos <= $self->no_sequences ; | |
591 | |
592 my $nse = $self->{'_order'}->{--$pos}; | |
593 return $self->{'_seq'}->{$nse}; | |
594 } | |
595 | |
596 =head1 Create new alignments | |
597 | |
598 The result of these methods are horizontal or vertical subsets of the | |
599 current MSE. | |
600 | |
601 =head2 select | |
602 | |
603 Title : select | |
604 Usage : $aln2 = $aln->select(1, 3) # three first sequences | |
605 Function : | |
606 | |
607 Creates a new alignment from a continuous subset of | |
608 sequences. Numbering starts from 1. Sequence positions | |
609 larger than no_sequences() will thow an error. | |
610 | |
611 Returns : a Bio::SimpleAlign object | |
612 Args : positive integer for the first sequence | |
613 positive integer for the last sequence to include (optional) | |
614 | |
615 =cut | |
616 | |
617 sub select { | |
618 my $self = shift; | |
619 my ($start, $end) = @_; | |
620 | |
621 $self->throw("Select start has to be a positive integer, not [$start]") | |
622 unless $start =~ /^\d+$/ and $start > 0; | |
623 $self->throw("Select end has to be a positive integer, not [$end]") | |
624 unless $end =~ /^\d+$/ and $end > 0; | |
625 $self->throw("Select $start [$start] has to be smaller than or equal to end [$end]") | |
626 unless $start <= $end; | |
627 | |
628 my $aln = new $self; | |
629 foreach my $pos ($start .. $end) { | |
630 $aln->add_seq($self->get_seq_by_pos($pos)); | |
631 } | |
632 $aln->id($self->id); | |
633 return $aln; | |
634 } | |
635 | |
636 =head2 select_noncont | |
637 | |
638 Title : select_noncont | |
639 Usage : $aln2 = $aln->select_noncont(1, 3) # first and 3rd sequences | |
640 Function : | |
641 | |
642 Creates a new alignment from a subset of | |
643 sequences. Numbering starts from 1. Sequence positions | |
644 larger than no_sequences() will thow an error. | |
645 | |
646 Returns : a Bio::SimpleAlign object | |
647 Args : array of integers for the sequences | |
648 | |
649 =cut | |
650 | |
651 sub select_noncont { | |
652 my $self = shift; | |
653 my (@pos) = @_; | |
654 my $end = $self->no_sequences; | |
655 foreach ( @pos ) { | |
656 $self->throw("position must be a positive integer, > 0 and <= $end not [$_]") | |
657 unless( /^\d+$/ && $_ > 0 && $_ <= $end ); | |
658 } | |
659 my $aln = new $self; | |
660 foreach my $p (@pos) { | |
661 $aln->add_seq($self->get_seq_by_pos($p)); | |
662 } | |
663 $aln->id($self->id); | |
664 return $aln; | |
665 } | |
666 | |
667 =head2 slice | |
668 | |
669 Title : slice | |
670 Usage : $aln2 = $aln->slice(20, 30) | |
671 Function : | |
672 | |
673 Creates a slice from the alignment inclusive of start and | |
674 end columns. Sequences with no residues in the slice are | |
675 excluded from the new alignment and a warning is printed. | |
676 Slice beyond the length of the sequence does not do | |
677 padding. | |
678 | |
679 Returns : a Bio::SimpleAlign object | |
680 Args : positive integer for start column | |
681 positive integer for end column | |
682 | |
683 =cut | |
684 | |
685 sub slice { | |
686 my $self = shift; | |
687 my ($start, $end) = @_; | |
688 | |
689 $self->throw("Slice start has to be a positive integer, not [$start]") | |
690 unless $start =~ /^\d+$/ and $start > 0; | |
691 $self->throw("Slice end has to be a positive integer, not [$end]") | |
692 unless $end =~ /^\d+$/ and $end > 0; | |
693 $self->throw("Slice $start [$start] has to be smaller than or equal to end [$end]") | |
694 unless $start <= $end; | |
695 my $aln_length = $self->length; | |
696 $self->throw("This alignment has only ". $self->length. | |
697 " residues. Slice start [$start] is too bigger.") | |
698 if $start > $self->length; | |
699 | |
700 my $aln = new $self; | |
701 $aln->id($self->id); | |
702 foreach my $seq ( $self->each_seq() ) { | |
703 | |
704 my $new_seq = new Bio::LocatableSeq (-id => $seq->id); | |
705 | |
706 # seq | |
707 my $seq_end = $end; | |
708 $seq_end = $seq->length if $end > $seq->length; | |
709 my $slice_seq = $seq->subseq($start, $seq_end); | |
710 $new_seq->seq( $slice_seq ); | |
711 | |
712 # start | |
713 if ($start > 1) { | |
714 my $pre_start_seq = $seq->subseq(1, $start - 1); | |
715 $pre_start_seq =~ s/\W//g; #print "$pre_start_seq\n"; | |
716 $new_seq->start( $seq->start + CORE::length($pre_start_seq) ); | |
717 } else { | |
718 $new_seq->start( $seq->start); | |
719 } | |
720 | |
721 # end | |
722 $slice_seq =~ s/\W//g; | |
723 $new_seq->end( $new_seq->start + CORE::length($slice_seq) - 1 ); | |
724 | |
725 if ($new_seq->start and $new_seq->end >= $new_seq->start) { | |
726 $aln->add_seq($new_seq); | |
727 } else { | |
728 my $nse = $seq->get_nse(); | |
729 $self->warn("Slice [$start-$end] of sequence [$nse] contains no residues.". | |
730 " Sequence excluded from the new alignment."); | |
731 } | |
732 | |
733 } | |
734 | |
735 return $aln; | |
736 } | |
737 | |
738 =head2 remove_columns | |
739 | |
740 Title : remove_column | |
741 Usage : $aln2 = $aln->remove_columns(['mismatch','weak']) | |
742 Function : | |
743 Creates an aligment with columns removed corresponding to | |
744 the specified criteria. | |
745 Returns : a L<Bio::SimpleAlign> object | |
746 Args : array ref of types, 'match'|'weak'|'strong'|'mismatch' | |
747 | |
748 =cut | |
749 | |
750 sub remove_columns{ | |
751 my ($self,$type) = @_; | |
752 my %matchchars = ( 'match' => '\*', | |
753 'weak' => '\.', | |
754 'strong' => ':', | |
755 'mismatch'=> ' ', | |
756 ); | |
757 #get the characters to delete against | |
758 my $del_char; | |
759 foreach my $type(@{$type}){ | |
760 $del_char.= $matchchars{$type}; | |
761 } | |
762 | |
763 my $match_line = $self->match_line; | |
764 my $aln = new $self; | |
765 | |
766 my @remove; | |
767 my $length = 0; | |
768 | |
769 #do the matching to get the segments to remove | |
770 while($match_line=~m/[$del_char]/g){ | |
771 my $start = pos($match_line)-1; | |
772 $match_line=~/\G[$del_char]+/gc; | |
773 my $end = pos($match_line)-1; | |
774 | |
775 #have to offset the start and end for subsequent removes | |
776 $start-=$length; | |
777 $end -=$length; | |
778 $length += ($end-$start+1); | |
779 push @remove, [$start,$end]; | |
780 } | |
781 | |
782 #remove the segments | |
783 $aln = $self->_remove_col($aln,\@remove); | |
784 | |
785 return $aln; | |
786 } | |
787 | |
788 sub _remove_col { | |
789 my ($self,$aln,$remove) = @_; | |
790 my @new; | |
791 | |
792 #splice out the segments and create new seq | |
793 foreach my $seq($self->each_seq){ | |
794 my $new_seq = new Bio::LocatableSeq(-id=>$seq->id); | |
795 my $sequence; | |
796 foreach my $pair(@{$remove}){ | |
797 my $start = $pair->[0]; | |
798 my $end = $pair->[1]; | |
799 $sequence = $seq->seq unless $sequence; | |
800 my $spliced; | |
801 $spliced .= $start > 0 ? substr($sequence,0,$start) : ''; | |
802 $spliced .= substr($sequence,$end+1,$seq->length-$end+1); | |
803 $sequence = $spliced; | |
804 if ($start == 1) { | |
805 $new_seq->start($end); | |
806 } | |
807 else { | |
808 $new_seq->start( $seq->start); | |
809 } | |
810 # end | |
811 if($end >= $seq->end){ | |
812 $new_seq->end( $start); | |
813 } | |
814 else { | |
815 $new_seq->end($seq->end); | |
816 } | |
817 } | |
818 $new_seq->seq($sequence); | |
819 push @new, $new_seq; | |
820 } | |
821 #add the new seqs to the alignment | |
822 foreach my $new(@new){ | |
823 $aln->add_seq($new); | |
824 } | |
825 return $aln; | |
826 } | |
827 | |
828 =head1 Change sequences within the MSE | |
829 | |
830 These methods affect characters in all sequences without changeing the | |
831 alignment. | |
832 | |
833 | |
834 =head2 map_chars | |
835 | |
836 Title : map_chars | |
837 Usage : $ali->map_chars('\.','-') | |
838 Function : | |
839 | |
840 Does a s/$arg1/$arg2/ on the sequences. Useful for gap | |
841 characters | |
842 | |
843 Notice that the from (arg1) is interpretted as a regex, | |
844 so be careful about quoting meta characters (eg | |
845 $ali->map_chars('.','-') wont do what you want) | |
846 | |
847 Returns : | |
848 Argument : 'from' rexexp | |
849 'to' string | |
850 | |
851 =cut | |
852 | |
853 sub map_chars { | |
854 my $self = shift; | |
855 my $from = shift; | |
856 my $to = shift; | |
857 my ($seq,$temp); | |
858 | |
859 $self->throw("Need exactly two arguments") | |
860 unless defined $from and defined $to; | |
861 | |
862 foreach $seq ( $self->each_seq() ) { | |
863 $temp = $seq->seq(); | |
864 $temp =~ s/$from/$to/g; | |
865 $seq->seq($temp); | |
866 } | |
867 return 1; | |
868 } | |
869 | |
870 | |
871 =head2 uppercase | |
872 | |
873 Title : uppercase() | |
874 Usage : $ali->uppercase() | |
875 Function : Sets all the sequences to uppercase | |
876 Returns : | |
877 Argument : | |
878 | |
879 =cut | |
880 | |
881 sub uppercase { | |
882 my $self = shift; | |
883 my $seq; | |
884 my $temp; | |
885 | |
886 foreach $seq ( $self->each_seq() ) { | |
887 $temp = $seq->seq(); | |
888 $temp =~ tr/[a-z]/[A-Z]/; | |
889 | |
890 $seq->seq($temp); | |
891 } | |
892 return 1; | |
893 } | |
894 | |
895 =head2 cigar_line | |
896 | |
897 Title : cigar_line() | |
898 Usage : $align->cigar_line() | |
899 Function : Generates a "cigar" line for each sequence in the alignment | |
900 The format is simply A-1,60;B-1,1:4,60;C-5,10:12,58 | |
901 where A,B,C,etc. are the sequence identifiers, and the numbers | |
902 refer to conserved positions within the alignment | |
903 Args : none | |
904 | |
905 =cut | |
906 | |
907 sub cigar_line { | |
908 my ($self) = @_; | |
909 | |
910 my %cigar; | |
911 my %clines; | |
912 my @seqchars; | |
913 my $seqcount = 0; | |
914 my $sc; | |
915 foreach my $seq ( $self->each_seq ) { | |
916 push @seqchars, [ split(//, uc ($seq->seq)) ]; | |
917 $sc = scalar(@seqchars); | |
918 } | |
919 | |
920 foreach my $pos ( 0..$self->length ) { | |
921 my $i=0; | |
922 foreach my $seq ( @seqchars ) { | |
923 $i++; | |
924 # print STDERR "Seq $i at pos $pos: ".$seq->[$pos]."\n"; | |
925 if ($seq->[$pos] eq '.') { | |
926 if (defined $cigar{$i} && $clines{$i} !~ $cigar{$i}) { | |
927 $clines{$i}.=$cigar{$i}; | |
928 } | |
929 } | |
930 else { | |
931 if (! defined $cigar{$i}) { | |
932 $clines{$i}.=($pos+1).","; | |
933 } | |
934 $cigar{$i}=$pos+1; | |
935 } | |
936 if ($pos+1 == $self->length && ($clines{$i} =~ /\,$/) ) { | |
937 $clines{$i}.=$cigar{$i}; | |
938 } | |
939 } | |
940 } | |
941 for(my $i=1; $i<$sc+1;$i++) { | |
942 print STDERR "Seq $i cigar line ".$clines{$i}."\n"; | |
943 } | |
944 return %clines; | |
945 } | |
946 | |
947 =head2 match_line | |
948 | |
949 Title : match_line() | |
950 Usage : $align->match_line() | |
951 Function : Generates a match line - much like consensus string | |
952 except that a line indicating the '*' for a match. | |
953 Args : (optional) Match line characters ('*' by default) | |
954 (optional) Strong match char (':' by default) | |
955 (optional) Weak match char ('.' by default) | |
956 | |
957 =cut | |
958 | |
959 sub match_line { | |
960 my ($self,$matchlinechar, $strong, $weak) = @_; | |
961 my %matchchars = ( 'match' => $matchlinechar || '*', | |
962 'weak' => $weak || '.', | |
963 'strong' => $strong || ':', | |
964 'mismatch'=> ' ', | |
965 ); | |
966 | |
967 | |
968 my @seqchars; | |
969 my $seqcount = 0; | |
970 my $alphabet; | |
971 foreach my $seq ( $self->each_seq ) { | |
972 push @seqchars, [ split(//, uc ($seq->seq)) ]; | |
973 $alphabet = $seq->alphabet unless defined $alphabet; | |
974 } | |
975 my $refseq = shift @seqchars; | |
976 # let's just march down the columns | |
977 my $matchline; | |
978 POS: foreach my $pos ( 0..$self->length ) { | |
979 my $refchar = $refseq->[$pos]; | |
980 next unless $refchar; # skip '' | |
981 my %col = ($refchar => 1); | |
982 my $dash = ($refchar eq '-' || $refchar eq '.' || $refchar eq ' '); | |
983 foreach my $seq ( @seqchars ) { | |
984 $dash = 1 if( $seq->[$pos] eq '-' || $seq->[$pos] eq '.' || | |
985 $seq->[$pos] eq ' ' ); | |
986 $col{$seq->[$pos]}++; | |
987 } | |
988 my @colresidues = sort keys %col; | |
989 my $char = $matchchars{'mismatch'}; | |
990 # if all the values are the same | |
991 if( $dash ) { $char = $matchchars{'mismatch'} } | |
992 elsif( @colresidues == 1 ) { $char = $matchchars{'match'} } | |
993 elsif( $alphabet eq 'protein' ) { # only try to do weak/strong | |
994 # matches for protein seqs | |
995 TYPE: foreach my $type ( qw(strong weak) ) { | |
996 # iterate through categories | |
997 my %groups; | |
998 # iterate through each of the aa in the col | |
999 # look to see which groups it is in | |
1000 foreach my $c ( @colresidues ) { | |
1001 foreach my $f ( grep /\Q$c/, @{$CONSERVATION_GROUPS{$type}} ) { | |
1002 push @{$groups{$f}},$c; | |
1003 } | |
1004 } | |
1005 GRP: foreach my $cols ( values %groups ) { | |
1006 @$cols = sort @$cols; | |
1007 # now we are just testing to see if two arrays | |
1008 # are identical w/o changing either one | |
1009 | |
1010 # have to be same len | |
1011 next if( scalar @$cols != scalar @colresidues ); | |
1012 # walk down the length and check each slot | |
1013 for($_=0;$_ < (scalar @$cols);$_++ ) { | |
1014 next GRP if( $cols->[$_] ne $colresidues[$_] ); | |
1015 } | |
1016 $char = $matchchars{$type}; | |
1017 last TYPE; | |
1018 } | |
1019 } | |
1020 } | |
1021 $matchline .= $char; | |
1022 } | |
1023 return $matchline; | |
1024 } | |
1025 | |
1026 =head2 match | |
1027 | |
1028 Title : match() | |
1029 Usage : $ali->match() | |
1030 Function : | |
1031 | |
1032 Goes through all columns and changes residues that are | |
1033 identical to residue in first sequence to match '.' | |
1034 character. Sets match_char. | |
1035 | |
1036 USE WITH CARE: Most MSE formats do not support match | |
1037 characters in sequences, so this is mostly for output | |
1038 only. NEXUS format (Bio::AlignIO::nexus) can handle | |
1039 it. | |
1040 | |
1041 Returns : 1 | |
1042 Argument : a match character, optional, defaults to '.' | |
1043 | |
1044 =cut | |
1045 | |
1046 sub match { | |
1047 my ($self, $match) = @_; | |
1048 | |
1049 $match ||= '.'; | |
1050 my ($matching_char) = $match; | |
1051 $matching_char = "\\$match" if $match =~ /[\^.$|()\[\]]/ ; #'; | |
1052 $self->map_chars($matching_char, '-'); | |
1053 | |
1054 my @seqs = $self->each_seq(); | |
1055 return 1 unless scalar @seqs > 1; | |
1056 | |
1057 my $refseq = shift @seqs ; | |
1058 my @refseq = split //, $refseq->seq; | |
1059 my $gapchar = $self->gap_char; | |
1060 | |
1061 foreach my $seq ( @seqs ) { | |
1062 my @varseq = split //, $seq->seq(); | |
1063 for ( my $i=0; $i < scalar @varseq; $i++) { | |
1064 $varseq[$i] = $match if defined $refseq[$i] && | |
1065 ( $refseq[$i] =~ /[A-Za-z\*]/ || | |
1066 $refseq[$i] =~ /$gapchar/ ) | |
1067 && $refseq[$i] eq $varseq[$i]; | |
1068 } | |
1069 $seq->seq(join '', @varseq); | |
1070 } | |
1071 $self->match_char($match); | |
1072 return 1; | |
1073 } | |
1074 | |
1075 | |
1076 =head2 unmatch | |
1077 | |
1078 Title : unmatch() | |
1079 Usage : $ali->unmatch() | |
1080 Function : Undoes the effect of method match. Unsets match_char. | |
1081 | |
1082 Returns : 1 | |
1083 Argument : a match character, optional, defaults to '.' | |
1084 | |
1085 See L<match> and L<match_char> | |
1086 | |
1087 =cut | |
1088 | |
1089 sub unmatch { | |
1090 my ($self, $match) = @_; | |
1091 | |
1092 $match ||= '.'; | |
1093 | |
1094 my @seqs = $self->each_seq(); | |
1095 return 1 unless scalar @seqs > 1; | |
1096 | |
1097 my $refseq = shift @seqs ; | |
1098 my @refseq = split //, $refseq->seq; | |
1099 my $gapchar = $self->gap_char; | |
1100 foreach my $seq ( @seqs ) { | |
1101 my @varseq = split //, $seq->seq(); | |
1102 for ( my $i=0; $i < scalar @varseq; $i++) { | |
1103 $varseq[$i] = $refseq[$i] if defined $refseq[$i] && | |
1104 ( $refseq[$i] =~ /[A-Za-z\*]/ || | |
1105 $refseq[$i] =~ /$gapchar/ ) && | |
1106 $varseq[$i] eq $match; | |
1107 } | |
1108 $seq->seq(join '', @varseq); | |
1109 } | |
1110 $self->match_char(''); | |
1111 return 1; | |
1112 } | |
1113 | |
1114 =head1 MSE attibutes | |
1115 | |
1116 Methods for setting and reading the MSE attributes. | |
1117 | |
1118 Note that the methods defining character semantics depend on the user | |
1119 to set them sensibly. They are needed only by certain input/output | |
1120 methods. Unset them by setting to an empty string (''). | |
1121 | |
1122 =head2 id | |
1123 | |
1124 Title : id | |
1125 Usage : $myalign->id("Ig") | |
1126 Function : Gets/sets the id field of the alignment | |
1127 Returns : An id string | |
1128 Argument : An id string (optional) | |
1129 | |
1130 =cut | |
1131 | |
1132 sub id { | |
1133 my ($self, $name) = @_; | |
1134 | |
1135 if (defined( $name )) { | |
1136 $self->{'_id'} = $name; | |
1137 } | |
1138 | |
1139 return $self->{'_id'}; | |
1140 } | |
1141 | |
1142 =head2 missing_char | |
1143 | |
1144 Title : missing_char | |
1145 Usage : $myalign->missing_char("?") | |
1146 Function : Gets/sets the missing_char attribute of the alignment | |
1147 It is generally recommended to set it to 'n' or 'N' | |
1148 for nucleotides and to 'X' for protein. | |
1149 Returns : An missing_char string, | |
1150 Argument : An missing_char string (optional) | |
1151 | |
1152 =cut | |
1153 | |
1154 sub missing_char { | |
1155 my ($self, $char) = @_; | |
1156 | |
1157 if (defined $char ) { | |
1158 $self->throw("Single missing character, not [$char]!") if CORE::length($char) > 1; | |
1159 $self->{'_missing_char'} = $char; | |
1160 } | |
1161 | |
1162 return $self->{'_missing_char'}; | |
1163 } | |
1164 | |
1165 =head2 match_char | |
1166 | |
1167 Title : match_char | |
1168 Usage : $myalign->match_char('.') | |
1169 Function : Gets/sets the match_char attribute of the alignment | |
1170 Returns : An match_char string, | |
1171 Argument : An match_char string (optional) | |
1172 | |
1173 =cut | |
1174 | |
1175 sub match_char { | |
1176 my ($self, $char) = @_; | |
1177 | |
1178 if (defined $char ) { | |
1179 $self->throw("Single match character, not [$char]!") if CORE::length($char) > 1; | |
1180 $self->{'_match_char'} = $char; | |
1181 } | |
1182 | |
1183 return $self->{'_match_char'}; | |
1184 } | |
1185 | |
1186 =head2 gap_char | |
1187 | |
1188 Title : gap_char | |
1189 Usage : $myalign->gap_char('-') | |
1190 Function : Gets/sets the gap_char attribute of the alignment | |
1191 Returns : An gap_char string, defaults to '-' | |
1192 Argument : An gap_char string (optional) | |
1193 | |
1194 =cut | |
1195 | |
1196 sub gap_char { | |
1197 my ($self, $char) = @_; | |
1198 | |
1199 if (defined $char || ! defined $self->{'_gap_char'} ) { | |
1200 $char= '-' unless defined $char; | |
1201 $self->throw("Single gap character, not [$char]!") if CORE::length($char) > 1; | |
1202 $self->{'_gap_char'} = $char; | |
1203 } | |
1204 return $self->{'_gap_char'}; | |
1205 } | |
1206 | |
1207 =head2 symbol_chars | |
1208 | |
1209 Title : symbol_chars | |
1210 Usage : my @symbolchars = $aln->symbol_chars; | |
1211 Function: Returns all the seen symbols (other than gaps) | |
1212 Returns : array of characters that are the seen symbols | |
1213 Args : boolean to include the gap/missing/match characters | |
1214 | |
1215 =cut | |
1216 | |
1217 sub symbol_chars{ | |
1218 my ($self,$includeextra) = @_; | |
1219 if( ! defined $self->{'_symbols'} ) { | |
1220 $self->warn("Symbol list was not initialized"); | |
1221 return (); | |
1222 } | |
1223 my %copy = %{$self->{'_symbols'}}; | |
1224 if( ! $includeextra ) { | |
1225 foreach my $char ( $self->gap_char, $self->match_char, | |
1226 $self->missing_char) { | |
1227 delete $copy{$char} if( defined $char ); | |
1228 } | |
1229 } | |
1230 return keys %copy; | |
1231 } | |
1232 | |
1233 =head1 Alignment descriptors | |
1234 | |
1235 These read only methods describe the MSE in various ways. | |
1236 | |
1237 | |
1238 =head2 consensus_string | |
1239 | |
1240 Title : consensus_string | |
1241 Usage : $str = $ali->consensus_string($threshold_percent) | |
1242 Function : Makes a strict consensus | |
1243 Returns : | |
1244 Argument : Optional treshold ranging from 0 to 100. | |
1245 The consensus residue has to appear at least threshold % | |
1246 of the sequences at a given location, otherwise a '?' | |
1247 character will be placed at that location. | |
1248 (Default value = 0%) | |
1249 | |
1250 =cut | |
1251 | |
1252 sub consensus_string { | |
1253 my $self = shift; | |
1254 my $threshold = shift; | |
1255 my $len; | |
1256 my ($out,$count); | |
1257 | |
1258 $out = ""; | |
1259 | |
1260 $len = $self->length - 1; | |
1261 | |
1262 foreach $count ( 0 .. $len ) { | |
1263 $out .= $self->_consensus_aa($count,$threshold); | |
1264 } | |
1265 return $out; | |
1266 } | |
1267 | |
1268 sub _consensus_aa { | |
1269 my $self = shift; | |
1270 my $point = shift; | |
1271 my $threshold_percent = shift || -1 ; | |
1272 my ($seq,%hash,$count,$letter,$key); | |
1273 | |
1274 foreach $seq ( $self->each_seq() ) { | |
1275 $letter = substr($seq->seq,$point,1); | |
1276 $self->throw("--$point-----------") if $letter eq ''; | |
1277 ($letter =~ /\./) && next; | |
1278 # print "Looking at $letter\n"; | |
1279 $hash{$letter}++; | |
1280 } | |
1281 my $number_of_sequences = $self->no_sequences(); | |
1282 my $threshold = $number_of_sequences * $threshold_percent / 100. ; | |
1283 $count = -1; | |
1284 $letter = '?'; | |
1285 | |
1286 foreach $key ( sort keys %hash ) { | |
1287 # print "Now at $key $hash{$key}\n"; | |
1288 if( $hash{$key} > $count && $hash{$key} >= $threshold) { | |
1289 $letter = $key; | |
1290 $count = $hash{$key}; | |
1291 } | |
1292 } | |
1293 return $letter; | |
1294 } | |
1295 | |
1296 | |
1297 =head2 consensus_iupac | |
1298 | |
1299 Title : consensus_iupac | |
1300 Usage : $str = $ali->consensus_iupac() | |
1301 Function : | |
1302 | |
1303 Makes a consensus using IUPAC ambiguity codes from DNA | |
1304 and RNA. The output is in upper case except when gaps in | |
1305 a column force output to be in lower case. | |
1306 | |
1307 Note that if your alignment sequences contain a lot of | |
1308 IUPAC ambiquity codes you often have to manually set | |
1309 alphabet. Bio::PrimarySeq::_guess_type thinks they | |
1310 indicate a protein sequence. | |
1311 | |
1312 Returns : consensus string | |
1313 Argument : none | |
1314 Throws : on protein sequences | |
1315 | |
1316 =cut | |
1317 | |
1318 sub consensus_iupac { | |
1319 my $self = shift; | |
1320 my $out = ""; | |
1321 my $len = $self->length-1; | |
1322 | |
1323 # only DNA and RNA sequences are valid | |
1324 foreach my $seq ( $self->each_seq() ) { | |
1325 $self->throw("Seq [". $seq->get_nse. "] is a protein") | |
1326 if $seq->alphabet eq 'protein'; | |
1327 } | |
1328 # loop over the alignment columns | |
1329 foreach my $count ( 0 .. $len ) { | |
1330 $out .= $self->_consensus_iupac($count); | |
1331 } | |
1332 return $out; | |
1333 } | |
1334 | |
1335 sub _consensus_iupac { | |
1336 my ($self, $column) = @_; | |
1337 my ($string, $char, $rna); | |
1338 | |
1339 #determine all residues in a column | |
1340 foreach my $seq ( $self->each_seq() ) { | |
1341 $string .= substr($seq->seq, $column, 1); | |
1342 } | |
1343 $string = uc $string; | |
1344 | |
1345 # quick exit if there's an N in the string | |
1346 if ($string =~ /N/) { | |
1347 $string =~ /\W/ ? return 'n' : return 'N'; | |
1348 } | |
1349 # ... or if there are only gap characters | |
1350 return '-' if $string =~ /^\W+$/; | |
1351 | |
1352 # treat RNA as DNA in regexps | |
1353 if ($string =~ /U/) { | |
1354 $string =~ s/U/T/; | |
1355 $rna = 1; | |
1356 } | |
1357 | |
1358 # the following s///'s only need to be done to the _first_ ambiguity code | |
1359 # as we only need to see the _range_ of characters in $string | |
1360 | |
1361 if ($string =~ /[VDHB]/) { | |
1362 $string =~ s/V/AGC/; | |
1363 $string =~ s/D/AGT/; | |
1364 $string =~ s/H/ACT/; | |
1365 $string =~ s/B/CTG/; | |
1366 } | |
1367 | |
1368 if ($string =~ /[SKYRWM]/) { | |
1369 $string =~ s/S/GC/; | |
1370 $string =~ s/K/GT/; | |
1371 $string =~ s/Y/CT/; | |
1372 $string =~ s/R/AG/; | |
1373 $string =~ s/W/AT/; | |
1374 $string =~ s/M/AC/; | |
1375 } | |
1376 | |
1377 # and now the guts of the thing | |
1378 | |
1379 if ($string =~ /A/) { | |
1380 $char = 'A'; # A A | |
1381 if ($string =~ /G/) { | |
1382 $char = 'R'; # A and G (purines) R | |
1383 if ($string =~ /C/) { | |
1384 $char = 'V'; # A and G and C V | |
1385 if ($string =~ /T/) { | |
1386 $char = 'N'; # A and G and C and T N | |
1387 } | |
1388 } elsif ($string =~ /T/) { | |
1389 $char = 'D'; # A and G and T D | |
1390 } | |
1391 } elsif ($string =~ /C/) { | |
1392 $char = 'M'; # A and C M | |
1393 if ($string =~ /T/) { | |
1394 $char = 'H'; # A and C and T H | |
1395 } | |
1396 } elsif ($string =~ /T/) { | |
1397 $char = 'W'; # A and T W | |
1398 } | |
1399 } elsif ($string =~ /C/) { | |
1400 $char = 'C'; # C C | |
1401 if ($string =~ /T/) { | |
1402 $char = 'Y'; # C and T (pyrimidines) Y | |
1403 if ($string =~ /G/) { | |
1404 $char = 'B'; # C and T and G B | |
1405 } | |
1406 } elsif ($string =~ /G/) { | |
1407 $char = 'S'; # C and G S | |
1408 } | |
1409 } elsif ($string =~ /G/) { | |
1410 $char = 'G'; # G G | |
1411 if ($string =~ /C/) { | |
1412 $char = 'S'; # G and C S | |
1413 } elsif ($string =~ /T/) { | |
1414 $char = 'K'; # G and T K | |
1415 } | |
1416 } elsif ($string =~ /T/) { | |
1417 $char = 'T'; # T T | |
1418 } | |
1419 | |
1420 $char = 'U' if $rna and $char eq 'T'; | |
1421 $char = lc $char if $string =~ /\W/; | |
1422 | |
1423 return $char; | |
1424 } | |
1425 | |
1426 =head2 is_flush | |
1427 | |
1428 Title : is_flush | |
1429 Usage : if( $ali->is_flush() ) | |
1430 : | |
1431 : | |
1432 Function : Tells you whether the alignment | |
1433 : is flush, ie all of the same length | |
1434 : | |
1435 : | |
1436 Returns : 1 or 0 | |
1437 Argument : | |
1438 | |
1439 =cut | |
1440 | |
1441 sub is_flush { | |
1442 my ($self,$report) = @_; | |
1443 my $seq; | |
1444 my $length = (-1); | |
1445 my $temp; | |
1446 | |
1447 foreach $seq ( $self->each_seq() ) { | |
1448 if( $length == (-1) ) { | |
1449 $length = CORE::length($seq->seq()); | |
1450 next; | |
1451 } | |
1452 | |
1453 $temp = CORE::length($seq->seq()); | |
1454 if( $temp != $length ) { | |
1455 $self->warn("expecting $length not $temp from ". | |
1456 $seq->display_id) if( $report ); | |
1457 $self->debug("expecting $length not $temp from ". | |
1458 $seq->display_id); | |
1459 $self->debug($seq->seq(). "\n"); | |
1460 return 0; | |
1461 } | |
1462 } | |
1463 | |
1464 return 1; | |
1465 } | |
1466 | |
1467 | |
1468 =head2 length | |
1469 | |
1470 Title : length() | |
1471 Usage : $len = $ali->length() | |
1472 Function : Returns the maximum length of the alignment. | |
1473 To be sure the alignment is a block, use is_flush | |
1474 Returns : | |
1475 Argument : | |
1476 | |
1477 =cut | |
1478 | |
1479 sub length_aln { | |
1480 my $self = shift; | |
1481 $self->warn(ref($self). "::length_aln - deprecated method. Use length() instead."); | |
1482 $self->length(@_); | |
1483 } | |
1484 | |
1485 sub length { | |
1486 my $self = shift; | |
1487 my $seq; | |
1488 my $length = (-1); | |
1489 my ($temp,$len); | |
1490 | |
1491 foreach $seq ( $self->each_seq() ) { | |
1492 $temp = CORE::length($seq->seq()); | |
1493 if( $temp > $length ) { | |
1494 $length = $temp; | |
1495 } | |
1496 } | |
1497 | |
1498 return $length; | |
1499 } | |
1500 | |
1501 | |
1502 =head2 maxdisplayname_length | |
1503 | |
1504 Title : maxdisplayname_length | |
1505 Usage : $ali->maxdisplayname_length() | |
1506 Function : | |
1507 | |
1508 Gets the maximum length of the displayname in the | |
1509 alignment. Used in writing out various MSE formats. | |
1510 | |
1511 Returns : integer | |
1512 Argument : | |
1513 | |
1514 =cut | |
1515 | |
1516 sub maxname_length { | |
1517 my $self = shift; | |
1518 $self->warn(ref($self). "::maxname_length - deprecated method.". | |
1519 " Use maxdisplayname_length() instead."); | |
1520 $self->maxdisplayname_length(); | |
1521 } | |
1522 | |
1523 sub maxnse_length { | |
1524 my $self = shift; | |
1525 $self->warn(ref($self). "::maxnse_length - deprecated method.". | |
1526 " Use maxnse_length() instead."); | |
1527 $self->maxdisplayname_length(); | |
1528 } | |
1529 | |
1530 sub maxdisplayname_length { | |
1531 my $self = shift; | |
1532 my $maxname = (-1); | |
1533 my ($seq,$len); | |
1534 | |
1535 foreach $seq ( $self->each_seq() ) { | |
1536 $len = CORE::length $self->displayname($seq->get_nse()); | |
1537 | |
1538 if( $len > $maxname ) { | |
1539 $maxname = $len; | |
1540 } | |
1541 } | |
1542 | |
1543 return $maxname; | |
1544 } | |
1545 | |
1546 =head2 no_residues | |
1547 | |
1548 Title : no_residues | |
1549 Usage : $no = $ali->no_residues | |
1550 Function : number of residues in total in the alignment | |
1551 Returns : integer | |
1552 Argument : | |
1553 | |
1554 =cut | |
1555 | |
1556 sub no_residues { | |
1557 my $self = shift; | |
1558 my $count = 0; | |
1559 | |
1560 foreach my $seq ($self->each_seq) { | |
1561 my $str = $seq->seq(); | |
1562 | |
1563 $count += ($str =~ s/[^A-Za-z]//g); | |
1564 } | |
1565 | |
1566 return $count; | |
1567 } | |
1568 | |
1569 =head2 no_sequences | |
1570 | |
1571 Title : no_sequences | |
1572 Usage : $depth = $ali->no_sequences | |
1573 Function : number of sequence in the sequence alignment | |
1574 Returns : integer | |
1575 Argument : | |
1576 | |
1577 =cut | |
1578 | |
1579 sub no_sequences { | |
1580 my $self = shift; | |
1581 | |
1582 return scalar($self->each_seq); | |
1583 } | |
1584 | |
1585 | |
1586 =head2 average_percentage_identity | |
1587 | |
1588 Title : average_percentage_identity | |
1589 Usage : $id = $align->average_percentage_identity | |
1590 Function: The function uses a fast method to calculate the average | |
1591 percentage identity of the alignment | |
1592 Returns : The average percentage identity of the alignment | |
1593 Args : None | |
1594 Notes : This method implemented by Kevin Howe calculates a figure that is | |
1595 designed to be similar to the average pairwise identity of the | |
1596 alignment (identical in the absence of gaps), without having to | |
1597 explicitly calculate pairwise identities proposed by Richard Durbin. | |
1598 Validated by Ewan Birney ad Alex Bateman. | |
1599 | |
1600 =cut | |
1601 | |
1602 sub average_percentage_identity{ | |
1603 my ($self,@args) = @_; | |
1604 | |
1605 my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M', | |
1606 'N','O','P','Q','R','S','T','U','V','W','X','Y','Z'); | |
1607 | |
1608 my ($len, $total, $subtotal, $divisor, $subdivisor, @seqs, @countHashes); | |
1609 | |
1610 if (! $self->is_flush()) { | |
1611 $self->throw("All sequences in the alignment must be the same length"); | |
1612 } | |
1613 | |
1614 @seqs = $self->each_seq(); | |
1615 $len = $self->length(); | |
1616 | |
1617 # load the each hash with correct keys for existence checks | |
1618 | |
1619 for( my $index=0; $index < $len; $index++) { | |
1620 foreach my $letter (@alphabet) { | |
1621 $countHashes[$index]->{$letter} = 0; | |
1622 } | |
1623 } | |
1624 foreach my $seq (@seqs) { | |
1625 my @seqChars = split //, $seq->seq(); | |
1626 for( my $column=0; $column < @seqChars; $column++ ) { | |
1627 my $char = uc($seqChars[$column]); | |
1628 if (exists $countHashes[$column]->{$char}) { | |
1629 $countHashes[$column]->{$char}++; | |
1630 } | |
1631 } | |
1632 } | |
1633 | |
1634 $total = 0; | |
1635 $divisor = 0; | |
1636 for(my $column =0; $column < $len; $column++) { | |
1637 my %hash = %{$countHashes[$column]}; | |
1638 $subdivisor = 0; | |
1639 foreach my $res (keys %hash) { | |
1640 $total += $hash{$res}*($hash{$res} - 1); | |
1641 $subdivisor += $hash{$res}; | |
1642 } | |
1643 $divisor += $subdivisor * ($subdivisor - 1); | |
1644 } | |
1645 return $divisor > 0 ? ($total / $divisor )*100.0 : 0; | |
1646 } | |
1647 | |
1648 =head2 percentage_identity | |
1649 | |
1650 Title : percentage_identity | |
1651 Usage : $id = $align->percentage_identity | |
1652 Function: The function calculates the average percentage identity | |
1653 (aliased for average_percentage_identity) | |
1654 Returns : The average percentage identity | |
1655 Args : None | |
1656 | |
1657 =cut | |
1658 | |
1659 sub percentage_identity { | |
1660 my $self = shift; | |
1661 return $self->average_percentage_identity(); | |
1662 } | |
1663 | |
1664 =head2 overall_percentage_identity | |
1665 | |
1666 Title : percentage_identity | |
1667 Usage : $id = $align->percentage_identity | |
1668 Function: The function calculates the percentage identity of | |
1669 the conserved columns | |
1670 Returns : The percentage identity of the conserved columns | |
1671 Args : None | |
1672 | |
1673 =cut | |
1674 | |
1675 sub overall_percentage_identity{ | |
1676 my ($self,@args) = @_; | |
1677 | |
1678 my @alphabet = ('A','B','C','D','E','F','G','H','I','J','K','L','M', | |
1679 'N','O','P','Q','R','S','T','U','V','W','X','Y','Z'); | |
1680 | |
1681 my ($len, $total, @seqs, @countHashes); | |
1682 | |
1683 if (! $self->is_flush()) { | |
1684 $self->throw("All sequences in the alignment must be the same length"); | |
1685 } | |
1686 | |
1687 @seqs = $self->each_seq(); | |
1688 $len = $self->length(); | |
1689 | |
1690 # load the each hash with correct keys for existence checks | |
1691 for( my $index=0; $index < $len; $index++) { | |
1692 foreach my $letter (@alphabet) { | |
1693 $countHashes[$index]->{$letter} = 0; | |
1694 } | |
1695 } | |
1696 foreach my $seq (@seqs) { | |
1697 my @seqChars = split //, $seq->seq(); | |
1698 for( my $column=0; $column < @seqChars; $column++ ) { | |
1699 my $char = uc($seqChars[$column]); | |
1700 if (exists $countHashes[$column]->{$char}) { | |
1701 $countHashes[$column]->{$char}++; | |
1702 } | |
1703 } | |
1704 } | |
1705 | |
1706 $total = 0; | |
1707 for(my $column =0; $column < $len; $column++) { | |
1708 my %hash = %{$countHashes[$column]}; | |
1709 foreach ( values %hash ) { | |
1710 next if( $_ == 0 ); | |
1711 $total++ if( $_ == scalar @seqs ); | |
1712 last; | |
1713 } | |
1714 } | |
1715 return ($total / $len ) * 100.0; | |
1716 } | |
1717 | |
1718 =head1 Alignment positions | |
1719 | |
1720 Methods to map a sequence position into an alignment column and back. | |
1721 column_from_residue_number() does the former. The latter is really a | |
1722 property of the sequence object and can done using | |
1723 L<Bio::LocatableSeq::location_from_column>: | |
1724 | |
1725 # select somehow a sequence from the alignment, e.g. | |
1726 my $seq = $aln->get_seq_by_pos(1); | |
1727 #$loc is undef or Bio::LocationI object | |
1728 my $loc = $seq->location_from_column(5); | |
1729 | |
1730 | |
1731 =head2 column_from_residue_number | |
1732 | |
1733 Title : column_from_residue_number | |
1734 Usage : $col = $ali->column_from_residue_number( $seqname, $resnumber) | |
1735 Function: | |
1736 | |
1737 This function gives the position in the alignment | |
1738 (i.e. column number) of the given residue number in the | |
1739 sequence with the given name. For example, for the | |
1740 alignment | |
1741 | |
1742 Seq1/91-97 AC..DEF.GH | |
1743 Seq2/24-30 ACGG.RTY.. | |
1744 Seq3/43-51 AC.DDEFGHI | |
1745 | |
1746 column_from_residue_number( "Seq1", 94 ) returns 5. | |
1747 column_from_residue_number( "Seq2", 25 ) returns 2. | |
1748 column_from_residue_number( "Seq3", 50 ) returns 9. | |
1749 | |
1750 An exception is thrown if the residue number would lie | |
1751 outside the length of the aligment | |
1752 (e.g. column_from_residue_number( "Seq2", 22 ) | |
1753 | |
1754 Note: If the the parent sequence is represented by more than | |
1755 one alignment sequence and the residue number is present in | |
1756 them, this method finds only the first one. | |
1757 | |
1758 Returns : A column number for the position in the alignment of the | |
1759 given residue in the given sequence (1 = first column) | |
1760 Args : A sequence id/name (not a name/start-end) | |
1761 A residue number in the whole sequence (not just that | |
1762 segment of it in the alignment) | |
1763 | |
1764 =cut | |
1765 | |
1766 sub column_from_residue_number { | |
1767 my ($self, $name, $resnumber) = @_; | |
1768 | |
1769 $self->throw("No sequence with name [$name]") unless $self->{'_start_end_lists'}->{$name}; | |
1770 $self->throw("Second argument residue number missing") unless $resnumber; | |
1771 | |
1772 foreach my $seq ($self->each_seq_with_id($name)) { | |
1773 my $col; | |
1774 eval { | |
1775 $col = $seq->column_from_residue_number($resnumber); | |
1776 }; | |
1777 next if $@; | |
1778 return $col; | |
1779 } | |
1780 | |
1781 $self->throw("Could not find a sequence segment in $name ". | |
1782 "containing residue number $resnumber"); | |
1783 | |
1784 } | |
1785 | |
1786 =head1 Sequence names | |
1787 | |
1788 Methods to manipulate the display name. The default name based on the | |
1789 sequence id and subsequence positions can be overridden in various | |
1790 ways. | |
1791 | |
1792 =head2 displayname | |
1793 | |
1794 Title : displayname | |
1795 Usage : $myalign->displayname("Ig", "IgA") | |
1796 Function : Gets/sets the display name of a sequence in the alignment | |
1797 : | |
1798 Returns : A display name string | |
1799 Argument : name of the sequence | |
1800 displayname of the sequence (optional) | |
1801 | |
1802 =cut | |
1803 | |
1804 sub get_displayname { | |
1805 my $self = shift; | |
1806 $self->warn(ref($self). "::get_displayname - deprecated method. Use displayname() instead."); | |
1807 $self->displayname(@_); | |
1808 } | |
1809 | |
1810 sub set_displayname { | |
1811 my $self = shift; | |
1812 $self->warn(ref($self). "::set_displayname - deprecated method. Use displayname() instead."); | |
1813 $self->displayname(@_); | |
1814 } | |
1815 | |
1816 sub displayname { | |
1817 my ($self, $name, $disname) = @_; | |
1818 | |
1819 $self->throw("No sequence with name [$name]") unless $self->{'_seq'}->{$name}; | |
1820 | |
1821 if( $disname and $name) { | |
1822 $self->{'_dis_name'}->{$name} = $disname; | |
1823 return $disname; | |
1824 } | |
1825 elsif( defined $self->{'_dis_name'}->{$name} ) { | |
1826 return $self->{'_dis_name'}->{$name}; | |
1827 } else { | |
1828 return $name; | |
1829 } | |
1830 } | |
1831 | |
1832 =head2 set_displayname_count | |
1833 | |
1834 Title : set_displayname_count | |
1835 Usage : $ali->set_displayname_count | |
1836 Function : | |
1837 | |
1838 Sets the names to be name_# where # is the number of | |
1839 times this name has been used. | |
1840 | |
1841 Returns : | |
1842 Argument : | |
1843 | |
1844 =cut | |
1845 | |
1846 sub set_displayname_count { | |
1847 my $self= shift; | |
1848 my (@arr,$name,$seq,$count,$temp,$nse); | |
1849 | |
1850 foreach $seq ( $self->each_alphabetically() ) { | |
1851 $nse = $seq->get_nse(); | |
1852 | |
1853 #name will be set when this is the second | |
1854 #time (or greater) is has been seen | |
1855 | |
1856 if( defined $name and $name eq ($seq->id()) ) { | |
1857 $temp = sprintf("%s_%s",$name,$count); | |
1858 $self->displayname($nse,$temp); | |
1859 $count++; | |
1860 } else { | |
1861 $count = 1; | |
1862 $name = $seq->id(); | |
1863 $temp = sprintf("%s_%s",$name,$count); | |
1864 $self->displayname($nse,$temp); | |
1865 $count++; | |
1866 } | |
1867 } | |
1868 return 1; | |
1869 } | |
1870 | |
1871 =head2 set_displayname_flat | |
1872 | |
1873 Title : set_displayname_flat | |
1874 Usage : $ali->set_displayname_flat() | |
1875 Function : Makes all the sequences be displayed as just their name, | |
1876 not name/start-end | |
1877 Returns : 1 | |
1878 Argument : | |
1879 | |
1880 =cut | |
1881 | |
1882 sub set_displayname_flat { | |
1883 my $self = shift; | |
1884 my ($nse,$seq); | |
1885 | |
1886 foreach $seq ( $self->each_seq() ) { | |
1887 $nse = $seq->get_nse(); | |
1888 $self->displayname($nse,$seq->id()); | |
1889 } | |
1890 return 1; | |
1891 } | |
1892 | |
1893 =head2 set_displayname_normal | |
1894 | |
1895 Title : set_displayname_normal | |
1896 Usage : $ali->set_displayname_normal() | |
1897 Function : Makes all the sequences be displayed as name/start-end | |
1898 Returns : | |
1899 Argument : | |
1900 | |
1901 =cut | |
1902 | |
1903 sub set_displayname_normal { | |
1904 my $self = shift; | |
1905 my ($nse,$seq); | |
1906 | |
1907 foreach $seq ( $self->each_seq() ) { | |
1908 $nse = $seq->get_nse(); | |
1909 $self->displayname($nse,$nse); | |
1910 } | |
1911 return 1; | |
1912 } | |
1913 | |
1914 =head2 source | |
1915 | |
1916 Title : source | |
1917 Usage : $obj->source($newval) | |
1918 Function: sets the Alignment source program | |
1919 Example : | |
1920 Returns : value of source | |
1921 Args : newvalue (optional) | |
1922 | |
1923 | |
1924 =cut | |
1925 | |
1926 sub source{ | |
1927 my ($self,$value) = @_; | |
1928 if( defined $value) { | |
1929 $self->{'_source'} = $value; | |
1930 } | |
1931 return $self->{'_source'}; | |
1932 } | |
1933 | |
1934 1; |