comparison variant_effect_predictor/Bio/EnsEMBL/Compara/AlignedMemberSet.pm @ 0:21066c0abaf5 draft

Uploaded
author willmclaren
date Fri, 03 Aug 2012 10:04:48 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:21066c0abaf5
1 =head1 LICENSE
2
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
4 Genome Research Limited. All rights reserved.
5
6 This software is distributed under a modified Apache license.
7 For license details, please see
8
9 http://www.ensembl.org/info/about/code_licence.html
10
11 =head1 CONTACT
12
13 Please email comments or questions to the public Ensembl
14 developers list at <dev@ensembl.org>.
15
16 Questions may also be sent to the Ensembl help desk at
17 <helpdesk@ensembl.org>.
18
19 =head1 AUTHORSHIP
20
21 Ensembl Team. Individual contributions can be found in the CVS log.
22
23 =cut
24
25 =head1 NAME
26
27 AlignedMemberSet - A superclass for pairwise or multiple relationships, base of
28 Bio::EnsEMBL::Compara::Family, Bio::EnsEMBL::Compara::Homology and
29 Bio::EnsEMBL::Compara::Domain.
30
31 =head1 DESCRIPTION
32
33 A superclass for pairwise and multiple relationships
34
35 Currently the AlignedMember objects are used in the GeneTree structure
36 to represent the leaves of the trees. Each leaf contains an aligned
37 sequence, which is represented as an AlignedMember object.
38
39 =head1 INHERITANCE TREE
40
41 Bio::EnsEMBL::Compara::AlignedMemberSet
42
43 =head1 METHODS
44
45 =cut
46
47 package Bio::EnsEMBL::Compara::AlignedMemberSet;
48
49 use strict;
50 use Scalar::Util qw(weaken);
51
52 use Bio::AlignIO;
53
54 use Bio::EnsEMBL::Utils::Argument;
55 use Bio::EnsEMBL::Utils::Scalar;
56 use Bio::EnsEMBL::Utils::Exception;
57
58 use Bio::EnsEMBL::Compara::AlignedMember;
59
60 use base ('Bio::EnsEMBL::Compara::MemberSet');
61
62
63 =head2 member_class
64
65 Description: Returns the type of member used in the set
66 Returntype : String: Bio::EnsEMBL::Compara::AlignedMember
67 Caller : general
68 Status : Stable
69
70 =cut
71
72 sub member_class {
73 return 'Bio::EnsEMBL::Compara::AlignedMember';
74 }
75
76 ######################
77 # Alignment sections #
78 ######################
79
80
81 sub print_sequences_to_fasta {
82 my ($self, $pep_file) = @_;
83 my $pep_counter = 0;
84 open PEP, ">$pep_file";
85 foreach my $member (@{$self->get_all_Members}) {
86 next if $member->source_name eq 'ENSEMBLGENE';
87 my $member_stable_id = $member->stable_id;
88 my $seq = $member->sequence;
89
90 print PEP ">$member_stable_id\n";
91 $seq =~ s/(.{72})/$1\n/g;
92 chomp $seq;
93 unless (defined($seq)) {
94 my $family_id = $self->dbID;
95 die "member $member_stable_id in family $family_id doesn't have a sequence";
96 }
97 print PEP $seq,"\n";
98 $pep_counter++;
99 }
100 close PEP;
101 return $pep_counter;
102 }
103
104
105 =head2 read_clustalw
106
107 Arg [1] : string $file
108 The name of the file containing the clustalw output
109 Example : $family->read_clustalw('/tmp/clustalw.aln');
110 Description: Parses the output from clustalw and sets the alignment strings
111 of each of the memebers of this family
112 Returntype : none
113 Exceptions : thrown if file cannot be parsed
114 warning if alignment file contains identifiers for sequences
115 which are not members of this family
116 Caller : general
117
118 =cut
119
120 sub read_clustalw {
121 my $self = shift;
122 my $file = shift;
123
124 my %align_hash;
125 my $FH = IO::File->new();
126 $FH->open($file) || throw("Could not open alignment file [$file]");
127
128 <$FH>; #skip header
129 while(<$FH>) {
130 next if($_ =~ /^\s+/); #skip lines that start with space
131
132 my ($id, $align) = split;
133 $align_hash{$id} ||= '';
134 $align_hash{$id} .= $align;
135 }
136
137 $FH->close;
138
139 #place all members in a hash on their member name
140 my %member_hash;
141 foreach my $member (@{$self->get_all_Members}) {
142 $member_hash{$member->stable_id} = $member;
143 }
144
145 #assign cigar_line to each of the member attribute
146 foreach my $id (keys %align_hash) {
147 throw("No member for alignment portion: [$id]") unless exists $member_hash{$id};
148
149 my $alignment_string = $align_hash{$id};
150 $alignment_string =~ s/\-([A-Z])/\- $1/g;
151 $alignment_string =~ s/([A-Z])\-/$1 \-/g;
152
153 my @cigar_segments = split " ",$alignment_string;
154
155 my $cigar_line = "";
156 foreach my $segment (@cigar_segments) {
157 my $seglength = length($segment);
158 $seglength = "" if ($seglength == 1);
159 if ($segment =~ /^\-+$/) {
160 $cigar_line .= $seglength . "D";
161 } else {
162 $cigar_line .= $seglength . "M";
163 }
164 }
165
166 $member_hash{$id}->cigar_line($cigar_line);
167 }
168 }
169
170 sub load_cigars_from_fasta {
171 my ($self, $file) = @_;
172
173 my $alignio = Bio::AlignIO->new(-file => "$file", -format => "fasta");
174
175 my $aln = $alignio->next_aln or die "Bio::AlignIO could not get next_aln() from file '$file'";
176
177 #place all members in a hash on their member name
178 my %member_hash;
179 foreach my $member (@{$self->get_all_Members}) {
180 $member_hash{$member->stable_id} = $member;
181 }
182
183 #assign cigar_line to each of the member attribute
184 foreach my $seq ($aln->each_seq) {
185 my $id = $seq->display_id;
186 throw("No member for alignment portion: [$id]") unless exists $member_hash{$id};
187
188 my $cigar_line = '';
189 my $seq_string = $seq->seq();
190 while($seq_string=~/(?:\b|^)(.)(.*?)(?:\b|$)/g) {
191 $cigar_line .= ($2 ? length($2)+1 : '').(($1 eq '-') ? 'D' : 'M');
192 }
193
194 $member_hash{$id}->cigar_line($cigar_line);
195 }
196 }
197
198
199
200 sub get_SimpleAlign {
201
202 my ($self, @args) = @_;
203
204 my $id_type = 'STABLE';
205 my $unique_seqs = 0;
206 my $cdna = 0;
207 my $stop2x = 0;
208 my $append_taxon_id = 0;
209 my $append_sp_short_name = 0;
210 my $append_genomedb_id = 0;
211 my $exon_cased = 0;
212 my $alignment = 'protein';
213 my $changeSelenos = 0;
214 if (scalar @args) {
215 ($unique_seqs, $cdna, $id_type, $stop2x, $append_taxon_id, $append_sp_short_name, $append_genomedb_id, $exon_cased, $alignment, $changeSelenos) =
216 rearrange([qw(UNIQ_SEQ CDNA ID_TYPE STOP2X APPEND_TAXON_ID APPEND_SP_SHORT_NAME APPEND_GENOMEDB_ID EXON_CASED ALIGNMENT CHANGE_SELENO)], @args);
217 }
218
219 my $sa = Bio::SimpleAlign->new();
220
221 #Hack to try to work with both bioperl 0.7 and 1.2:
222 #Check to see if the method is called 'addSeq' or 'add_seq'
223 my $bio07 = ($sa->can('add_seq') ? 0 : 1);
224
225 my $seq_id_hash = {};
226 foreach my $member (@{$self->get_all_Members}) {
227
228 next if $member->source_name eq 'ENSEMBLGENE';
229
230 # Print unique sequences only ?
231 next if($unique_seqs and $seq_id_hash->{$member->sequence_id});
232 $seq_id_hash->{$member->sequence_id} = 1;
233
234 # The correct codon table
235 if ($member->chr_name =~ /mt/i) {
236 # codeml icodes
237 # 0:universal code (default)
238 my $class;
239 eval {$class = member->taxon->classification;};
240 unless ($@) {
241 if ($class =~ /vertebrata/i) {
242 # 1:mamalian mt
243 $sa->{_special_codeml_icode} = 1;
244 } else {
245 # 4:invertebrate mt
246 $sa->{_special_codeml_icode} = 4;
247 }
248 }
249 }
250
251 my $seqstr;
252 my $alphabet;
253 if ($cdna or (lc($alignment) eq 'cdna')) {
254 $seqstr = $member->cdna_alignment_string($changeSelenos);
255 $seqstr =~ s/\s+//g;
256 $alphabet = 'dna';
257 } else {
258 $seqstr = $member->alignment_string($exon_cased);
259 $alphabet = 'protein';
260 }
261 next if(!$seqstr);
262
263 # Sequence name
264 my $seqID = $member->stable_id;
265 $seqID = $member->sequence_id if($id_type eq "SEQ");
266 $seqID = $member->member_id if($id_type eq "MEMBER");
267 $seqID .= "_" . $member->taxon_id if($append_taxon_id);
268 $seqID .= "_" . $member->genome_db_id if ($append_genomedb_id);
269
270 ## Append $seqID with Speciae short name, if required
271 if ($append_sp_short_name) {
272 my $species = $member->genome_db->short_name;
273 $species =~ s/\s/_/g;
274 $seqID .= "_" . $species . "_";
275 }
276
277 # Sequence length
278 my $aln_end = $member->seq_length;
279 $aln_end = $aln_end*3 if $alphabet eq 'dna';
280
281 $seqstr =~ s/\*/X/g if ($stop2x);
282 my $seq = Bio::LocatableSeq->new(
283 -SEQ => $seqstr,
284 -ALPHABET => $alphabet,
285 -START => 1,
286 -END => $aln_end,
287 -ID => $seqID,
288 -STRAND => 0
289 );
290
291 if($bio07) {
292 $sa->addSeq($seq);
293 } else {
294 $sa->add_seq($seq);
295 }
296 }
297 $sa = $sa->remove_gaps(undef, 1) if UNIVERSAL::can($sa, 'remove_gaps');
298 return $sa;
299 }
300
301
302
303 # Takes a protein tree and creates a consensus cigar line from the
304 # constituent leaf nodes.
305 sub consensus_cigar_line {
306
307 my $self = shift;
308 my @cigars;
309
310 # First get an 'expanded' cigar string for each leaf of the subtree
311 my @all_members = @{$self->get_all_Members};
312 my $num_members = scalar(@all_members);
313 foreach my $leaf (@all_members) {
314 next unless( UNIVERSAL::can( $leaf, 'cigar_line' ) );
315 my $cigar = $leaf->cigar_line;
316 my $newcigar = "";
317 # $cigar =~ s/(\d*)([A-Z])/$2 x ($1||1)/ge; #Expand
318 while ($cigar =~ /(\d*)([A-Z])/g) {
319 $newcigar .= $2 x ($1 || 1);
320 }
321 $cigar = $newcigar;
322 push @cigars, $cigar;
323 }
324
325 # Itterate through each character of the expanded cigars.
326 # If there is a 'D' at a given location in any cigar,
327 # set the consensus to 'D', otherwise assume an 'M'.
328 # TODO: Fix assumption that cigar strings are always the same length,
329 # and start at the same point.
330 my $cigar_len = length( $cigars[0] );
331 my $cons_cigar;
332 for( my $i=0; $i<$cigar_len; $i++ ){
333 my $char = 'M';
334 my $num_deletions = 0;
335 foreach my $cigar( @cigars ){
336 if ( substr($cigar,$i,1) eq 'D'){
337 $num_deletions++;
338 }
339 }
340 if ($num_deletions * 3 > 2 * $num_members) {
341 $char = "D";
342 } elsif ($num_deletions * 3 > $num_members) {
343 $char = "m";
344 }
345 $cons_cigar .= $char;
346 }
347
348 # Collapse the consensus cigar, e.g. 'DDDD' = 4D
349 # $cons_cigar =~ s/(\w)(\1*)/($2?length($2)+1:"").$1/ge;
350 my $collapsed_cigar = "";
351 while ($cons_cigar =~ /(D+|M+|I+|m+)/g) {
352 $collapsed_cigar .= (length($1) == 1 ? "" : length($1)) . substr($1,0,1)
353 }
354 $cons_cigar = $collapsed_cigar;
355 # Return the consensus
356 return $cons_cigar;
357 }
358
359
360 my %TWOD_CODONS = ("TTT" => "Phe",#Phe
361 "TTC" => "Phe",
362
363 "TTA" => "Leu",#Leu
364 "TTG" => "Leu",
365
366 "TAT" => "Tyr",#Tyr
367 "TAC" => "Tyr",
368
369 "CAT" => "His",#His
370 "CAC" => "His",
371
372 "CAA" => "Gln",#Gln
373 "CAG" => "Gln",
374
375 "AAT" => "Asn",#Asn
376 "AAC" => "Asn",
377
378 "AAA" => "Lys",#Lys
379 "AAG" => "Lys",
380
381 "GAT" => "Asp",#Asp
382 "GAC" => "Asp",
383
384 "GAA" => "Glu",#Glu
385 "GAG" => "Glu",
386
387 "TGT" => "Cys",#Cys
388 "TGC" => "Cys",
389
390 "AGT" => "Ser",#Ser
391 "AGC" => "Ser",
392
393 "AGA" => "Arg",#Arg
394 "AGG" => "Arg",
395
396 "ATT" => "Ile",#Ile
397 "ATC" => "Ile",
398 "ATA" => "Ile");
399
400 my %FOURD_CODONS = ("CTT" => "Leu",#Leu
401 "CTC" => "Leu",
402 "CTA" => "Leu",
403 "CTG" => "Leu",
404
405 "GTT" => "Val",#Val
406 "GTC" => "Val",
407 "GTA" => "Val",
408 "GTG" => "Val",
409
410 "TCT" => "Ser",#Ser
411 "TCC" => "Ser",
412 "TCA" => "Ser",
413 "TCG" => "Ser",
414
415 "CCT" => "Pro",#Pro
416 "CCC" => "Pro",
417 "CCA" => "Pro",
418 "CCG" => "Pro",
419
420 "ACT" => "Thr",#Thr
421 "ACC" => "Thr",
422 "ACA" => "Thr",
423 "ACG" => "Thr",
424
425 "GCT" => "Ala",#Ala
426 "GCC" => "Ala",
427 "GCA" => "Ala",
428 "GCG" => "Ala",
429
430 "CGT" => "Arg",#Arg
431 "CGC" => "Arg",
432 "CGA" => "Arg",
433 "CGG" => "Arg",
434
435 "GGT" => "Gly",#Gly
436 "GGC" => "Gly",
437 "GGA" => "Gly",
438 "GGG" => "Gly");
439
440 my %CODONS = ("ATG" => "Met",
441 "TGG" => "Trp",
442 "TAA" => "TER",
443 "TAG" => "TER",
444 "TGA" => "TER",
445 "---" => "---",
446 %TWOD_CODONS,
447 %FOURD_CODONS,
448 );
449
450
451 =head2 get_4D_SimpleAlign
452
453 Example : $4d_align = $homology->get_4D_SimpleAlign();
454 Description: get 4 times degenerate positions pairwise simple alignment
455 Returntype : Bio::SimpleAlign
456
457 =cut
458
459 sub get_4D_SimpleAlign {
460 my $self = shift;
461
462 my $sa = Bio::SimpleAlign->new();
463
464 #Hack to try to work with both bioperl 0.7 and 1.2:
465 #Check to see if the method is called 'addSeq' or 'add_seq'
466 my $bio07 = 0;
467 if(!$sa->can('add_seq')) {
468 $bio07 = 1;
469 }
470
471 my $ma = $self->adaptor->db->get_MemberAdaptor;
472
473 my %member_seqstr;
474 foreach my $member (@{$self->get_all_Members}) {
475 next if $member->source_name ne 'ENSEMBLPEP';
476 my $seqstr = $member->cdna_alignment_string();
477 next if(!$seqstr);
478 #print STDERR $seqstr,"\n";
479 my @tmp_tab = split /\s+/, $seqstr;
480 #print STDERR "tnp_tab 0: ", $tmp_tab[0],"\n";
481 $member_seqstr{$member->stable_id} = \@tmp_tab;
482 }
483
484 my $seqstr_length;
485 foreach my $seqid (keys %member_seqstr) {
486 unless (defined $seqstr_length) {
487 #print STDERR $member_seqstr{$seqid}->[0],"\n";
488 $seqstr_length = scalar @{$member_seqstr{$seqid}};
489 next;
490 }
491 unless ($seqstr_length == scalar @{$member_seqstr{$seqid}}) {
492 die "Length of dna alignment are not the same, $seqstr_length and " . scalar @{$member_seqstr{$seqid}} ." respectively for homology_id " . $self->dbID . "\n";
493 }
494 }
495
496 my %FourD_member_seqstr;
497 for (my $i=0; $i < $seqstr_length; $i++) {
498 my $FourD_codon = 1;
499 my $FourD_aminoacid;
500 foreach my $seqid (keys %member_seqstr) {
501 if (defined $FOURD_CODONS{$member_seqstr{$seqid}->[$i]}) {
502 if (defined $FourD_aminoacid && $FourD_aminoacid eq $FOURD_CODONS{$member_seqstr{$seqid}->[$i]}) {
503 #print STDERR "YES ",$FOURD_CODONS{$member_seqstr{$seqid}->[$i]}," ",$member_seqstr{$seqid}->[$i],"\n";
504 next;
505 } elsif (defined $FourD_aminoacid) {
506 #print STDERR "NO ",$FOURD_CODONS{$member_seqstr{$seqid}->[$i]}," ",$member_seqstr{$seqid}->[$i],"\n";
507 $FourD_codon = 0;
508 last;
509 } else {
510 $FourD_aminoacid = $FOURD_CODONS{$member_seqstr{$seqid}->[$i]};
511 #print STDERR $FOURD_CODONS{$member_seqstr{$seqid}->[$i]}," ",$member_seqstr{$seqid}->[$i]," ";
512 }
513 next;
514 } else {
515 #print STDERR "NO ",$CODONS{$member_seqstr{$seqid}->[$i]}," ",$member_seqstr{$seqid}->[$i],"\n";
516 $FourD_codon = 0;
517 last;
518 }
519 }
520 next unless ($FourD_codon);
521 foreach my $seqid (keys %member_seqstr) {
522 $FourD_member_seqstr{$seqid} .= substr($member_seqstr{$seqid}->[$i],2,1);
523 }
524 }
525
526 foreach my $seqid (keys %FourD_member_seqstr) {
527
528 my $seq = Bio::LocatableSeq->new(
529 -SEQ => $FourD_member_seqstr{$seqid},
530 -START => 1,
531 -END => length($FourD_member_seqstr{$seqid}),
532 -ID => $seqid,
533 -STRAND => 0
534 );
535
536 if($bio07) {
537 $sa->addSeq($seq);
538 } else {
539 $sa->add_seq($seq);
540 }
541 }
542
543 return $sa;
544 }
545
546 my %matrix_hash;
547
548 {
549 my $BLOSUM62 = "# Matrix made by matblas from blosum62.iij
550 # * column uses minimum score
551 # BLOSUM Clustered Scoring Matrix in 1/2 Bit Units
552 # Blocks Database = /data/blocks_5.0/blocks.dat
553 # Cluster Percentage: >= 62
554 # Entropy = 0.6979, Expected = -0.5209
555 A R N D C Q E G H I L K M F P S T W Y V B Z X *
556 A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 0 -4
557 R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0 -1 -4
558 N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 3 0 -1 -4
559 D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 1 -1 -4
560 C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4
561 Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 3 -1 -4
562 E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4
563 G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -2 -1 -4
564 H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 0 -1 -4
565 I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 -3 -1 -4
566 L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 -3 -1 -4
567 K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 1 -1 -4
568 M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 -1 -1 -4
569 F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 -3 -1 -4
570 P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -1 -2 -4
571 S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 0 0 -4
572 T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 0 -4
573 W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 -3 -2 -4
574 Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -2 -1 -4
575 V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 -2 -1 -4
576 B -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1 -1 -4
577 Z -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4
578 X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1 -1 -1 -1 -1 -4
579 * -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1
580 ";
581 my $matrix_string;
582 my @lines = split(/\n/,$BLOSUM62);
583 foreach my $line (@lines) {
584 next if ($line =~ /^\#/);
585 if ($line =~ /^[A-Z\*\s]+$/) {
586 $matrix_string .= sprintf "$line\n";
587 } else {
588 my @t = split(/\s+/,$line);
589 shift @t;
590 # print scalar @t,"\n";
591 $matrix_string .= sprintf(join(" ",@t)."\n");
592 }
593 }
594
595 # my %matrix_hash;
596 @lines = ();
597 @lines = split /\n/, $matrix_string;
598 my $lts = shift @lines;
599 $lts =~ s/^\s+//;
600 $lts =~ s/\s+$//;
601 my @letters = split /\s+/, $lts;
602
603 foreach my $letter (@letters) {
604 my $line = shift @lines;
605 $line =~ s/^\s+//;
606 $line =~ s/\s+$//;
607 my @penalties = split /\s+/, $line;
608 die "Size of letters array and penalties array are different\n"
609 unless (scalar @letters == scalar @penalties);
610 for (my $i=0; $i < scalar @letters; $i++) {
611 $matrix_hash{uc $letter}{uc $letters[$i]} = $penalties[$i];
612 }
613 }
614 }
615
616
617 sub update_alignment_stats {
618 my $self = shift;
619
620 my $genes = $self->get_all_Members;
621 my $ngenes = scalar(@$genes);
622
623 if ($ngenes == 2) {
624 # This code is >4 times faster with pairs of genes
625
626 my $gene1 = $genes->[0];
627 my $gene2 = $genes->[1];
628 my $new_aln1_cigarline = "";
629 my $new_aln2_cigarline = "";
630
631 my $identical_matches = 0;
632 my $positive_matches = 0;
633
634 my ($aln1state, $aln2state);
635 my ($aln1count, $aln2count);
636 my ($aln1cov, $aln2cov) = (0,0);
637
638 my @aln1 = split(//, $gene1->alignment_string);
639 my @aln2 = split(//, $gene2->alignment_string);
640
641 for (my $i=0; $i <= $#aln1; $i++) {
642 next if ($aln1[$i] eq '-' && $aln2[$i] eq '-');
643 my $cur_aln1state = ($aln1[$i] eq '-' ? 'D' : 'M');
644 my $cur_aln2state = ($aln2[$i] eq '-' ? 'D' : 'M');
645 $aln1cov++ if $cur_aln1state ne 'D';
646 $aln2cov++ if $cur_aln2state ne 'D';
647 if ($cur_aln1state eq 'M' && $cur_aln2state eq 'M') {
648 if ($aln1[$i] eq $aln2[$i]) {
649 $identical_matches++;
650 $positive_matches++;
651 } elsif ($matrix_hash{uc $aln1[$i]}{uc $aln2[$i]} > 0) {
652 $positive_matches++;
653 }
654 }
655 unless (defined $aln1state) {
656 $aln1count = 1;
657 $aln2count = 1;
658 $aln1state = $cur_aln1state;
659 $aln2state = $cur_aln2state;
660 next;
661 }
662 if ($cur_aln1state eq $aln1state) {
663 $aln1count++;
664 } else {
665 if ($aln1count == 1) {
666 $new_aln1_cigarline .= $aln1state;
667 } else {
668 $new_aln1_cigarline .= $aln1count.$aln1state;
669 }
670 $aln1count = 1;
671 $aln1state = $cur_aln1state;
672 }
673 if ($cur_aln2state eq $aln2state) {
674 $aln2count++;
675 } else {
676 if ($aln2count == 1) {
677 $new_aln2_cigarline .= $aln2state;
678 } else {
679 $new_aln2_cigarline .= $aln2count.$aln2state;
680 }
681 $aln2count = 1;
682 $aln2state = $cur_aln2state;
683 }
684 }
685 if ($aln1count == 1) {
686 $new_aln1_cigarline .= $aln1state;
687 } else {
688 $new_aln1_cigarline .= $aln1count.$aln1state;
689 }
690 if ($aln2count == 1) {
691 $new_aln2_cigarline .= $aln2state;
692 } else {
693 $new_aln2_cigarline .= $aln2count.$aln2state;
694 }
695 my $seq_length1 = $gene1->seq_length;
696 unless (0 == $seq_length1) {
697 $gene1->cigar_line($new_aln1_cigarline);
698 $gene1->perc_id( int((100.0 * $identical_matches / $seq_length1 + 0.5)) );
699 $gene1->perc_pos( int((100.0 * $positive_matches / $seq_length1 + 0.5)) );
700 $gene1->perc_cov( int((100.0 * $aln1cov / $seq_length1 + 0.5)) );
701 }
702 my $seq_length2 = $gene2->seq_length;
703 unless (0 == $seq_length2) {
704 $gene2->cigar_line($new_aln2_cigarline);
705 $gene2->perc_id( int((100.0 * $identical_matches / $seq_length2 + 0.5)) );
706 $gene2->perc_pos( int((100.0 * $positive_matches / $seq_length2 + 0.5)) );
707 $gene2->perc_cov( int((100.0 * $aln2cov / $seq_length2 + 0.5)) );
708 }
709 return undef;
710 }
711
712 my $min_seq = shift;
713 $min_seq = int($min_seq * $ngenes) if $min_seq <= 1;
714
715 my @new_cigars = ('') x $ngenes;
716 my @nmatch_id = (0) x $ngenes;
717 my @nmatch_pos = (0) x $ngenes;
718 my @nmatch_cov = (0) x $ngenes;
719 my @alncount = (1) x $ngenes;
720 my @alnstate = (undef) x $ngenes;
721 my @cur_alnstate = (undef) x $ngenes;
722
723 my @aln = map {$_->alignment_string} @$genes;
724 my $aln_length = length($aln[0]);
725
726 for (my $i=0; $i < $aln_length; $i++) {
727
728 my @char_i = map {substr($_, $i, 1)} @aln;
729 #print "pos $i: ", join('/', @char_i), "\n";
730
731 my %seen;
732 map {$seen{$_}++} @char_i;
733 next if $seen{'-'} == $ngenes;
734 delete $seen{'-'};
735
736 my %pos_chars = ();
737 my @chars = keys %seen;
738 while (my $c1 = shift @chars) {
739 foreach my $c2 (@chars) {
740 if (($matrix_hash{uc $c1}{uc $c2} > 0) and ($seen{$c1}+$seen{$c2} >= $min_seq)) {
741 $pos_chars{$c1} = 1;
742 $pos_chars{$c2} = 1;
743 }
744 }
745 }
746
747 for (my $j=0; $j<$ngenes; $j++) {
748 if ($char_i[$j] eq '-') {
749 $cur_alnstate[$j] = 'D';
750 } else {
751 $nmatch_cov[$j]++;
752 $cur_alnstate[$j] = 'M';
753 if ($seen{$char_i[$j]} >= $min_seq) {
754 $nmatch_id[$j]++;
755 $nmatch_pos[$j]++;
756 } elsif (exists $pos_chars{$char_i[$j]}) {
757 $nmatch_pos[$j]++;
758 }
759 }
760 }
761
762 if ($i == 0) {
763 @alnstate = @cur_alnstate;
764 next;
765 }
766
767 for (my $j=0; $j<$ngenes; $j++) {
768 if ($cur_alnstate[$j] eq $alnstate[$j]) {
769 $alncount[$j]++;
770 } else {
771 if ($alncount[$j] == 1) {
772 $new_cigars[$j] .= $alnstate[$j];
773 } else {
774 $new_cigars[$j] .= $alncount[$j].$alnstate[$j];
775 }
776 $alncount[$j] = 1;
777 $alnstate[$j] = $cur_alnstate[$j];
778 }
779 }
780 }
781
782 for (my $j=0; $j<$ngenes; $j++) {
783 if ($alncount[$j] == 1) {
784 $new_cigars[$j] .= $alnstate[$j];
785 } else {
786 $new_cigars[$j] .= $alncount[$j].$alnstate[$j];
787 }
788 $genes->[$j]->cigar_line($new_cigars[$j]);
789 my $seq_length = $genes->[$j]->seq_length;
790 unless (0 == $seq_length) {
791 $genes->[$j]->perc_id( int((100.0 * $nmatch_id[$j] / $seq_length + 0.5)) );
792 $genes->[$j]->perc_pos( int((100.0 * $nmatch_pos[$j] / $seq_length + 0.5)) );
793 $genes->[$j]->perc_cov( int((100.0 * $nmatch_cov[$j] / $seq_length + 0.5)) );
794 }
795 }
796 }
797
798
799
800 1;