0
|
1 =head1 LICENSE
|
|
2
|
|
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
|
|
4 Genome Research Limited. All rights reserved.
|
|
5
|
|
6 This software is distributed under a modified Apache license.
|
|
7 For license details, please see
|
|
8
|
|
9 http://www.ensembl.org/info/about/code_licence.html
|
|
10
|
|
11 =head1 CONTACT
|
|
12
|
|
13 Please email comments or questions to the public Ensembl
|
|
14 developers list at <dev@ensembl.org>.
|
|
15
|
|
16 Questions may also be sent to the Ensembl help desk at
|
|
17 <helpdesk@ensembl.org>.
|
|
18
|
|
19 =head1 AUTHORSHIP
|
|
20
|
|
21 Ensembl Team. Individual contributions can be found in the CVS log.
|
|
22
|
|
23 =cut
|
|
24
|
|
25 =head1 NAME
|
|
26
|
|
27 AlignedMemberSet - A superclass for pairwise or multiple relationships, base of
|
|
28 Bio::EnsEMBL::Compara::Family, Bio::EnsEMBL::Compara::Homology and
|
|
29 Bio::EnsEMBL::Compara::Domain.
|
|
30
|
|
31 =head1 DESCRIPTION
|
|
32
|
|
33 A superclass for pairwise and multiple relationships
|
|
34
|
|
35 Currently the AlignedMember objects are used in the GeneTree structure
|
|
36 to represent the leaves of the trees. Each leaf contains an aligned
|
|
37 sequence, which is represented as an AlignedMember object.
|
|
38
|
|
39 =head1 INHERITANCE TREE
|
|
40
|
|
41 Bio::EnsEMBL::Compara::AlignedMemberSet
|
|
42
|
|
43 =head1 METHODS
|
|
44
|
|
45 =cut
|
|
46
|
|
47 package Bio::EnsEMBL::Compara::AlignedMemberSet;
|
|
48
|
|
49 use strict;
|
|
50 use Scalar::Util qw(weaken);
|
|
51
|
|
52 use Bio::AlignIO;
|
|
53
|
|
54 use Bio::EnsEMBL::Utils::Argument;
|
|
55 use Bio::EnsEMBL::Utils::Scalar;
|
|
56 use Bio::EnsEMBL::Utils::Exception;
|
|
57
|
|
58 use Bio::EnsEMBL::Compara::AlignedMember;
|
|
59
|
|
60 use base ('Bio::EnsEMBL::Compara::MemberSet');
|
|
61
|
|
62
|
|
63 =head2 member_class
|
|
64
|
|
65 Description: Returns the type of member used in the set
|
|
66 Returntype : String: Bio::EnsEMBL::Compara::AlignedMember
|
|
67 Caller : general
|
|
68 Status : Stable
|
|
69
|
|
70 =cut
|
|
71
|
|
72 sub member_class {
|
|
73 return 'Bio::EnsEMBL::Compara::AlignedMember';
|
|
74 }
|
|
75
|
|
76 ######################
|
|
77 # Alignment sections #
|
|
78 ######################
|
|
79
|
|
80
|
|
81 sub print_sequences_to_fasta {
|
|
82 my ($self, $pep_file) = @_;
|
|
83 my $pep_counter = 0;
|
|
84 open PEP, ">$pep_file";
|
|
85 foreach my $member (@{$self->get_all_Members}) {
|
|
86 next if $member->source_name eq 'ENSEMBLGENE';
|
|
87 my $member_stable_id = $member->stable_id;
|
|
88 my $seq = $member->sequence;
|
|
89
|
|
90 print PEP ">$member_stable_id\n";
|
|
91 $seq =~ s/(.{72})/$1\n/g;
|
|
92 chomp $seq;
|
|
93 unless (defined($seq)) {
|
|
94 my $family_id = $self->dbID;
|
|
95 die "member $member_stable_id in family $family_id doesn't have a sequence";
|
|
96 }
|
|
97 print PEP $seq,"\n";
|
|
98 $pep_counter++;
|
|
99 }
|
|
100 close PEP;
|
|
101 return $pep_counter;
|
|
102 }
|
|
103
|
|
104
|
|
105 =head2 read_clustalw
|
|
106
|
|
107 Arg [1] : string $file
|
|
108 The name of the file containing the clustalw output
|
|
109 Example : $family->read_clustalw('/tmp/clustalw.aln');
|
|
110 Description: Parses the output from clustalw and sets the alignment strings
|
|
111 of each of the memebers of this family
|
|
112 Returntype : none
|
|
113 Exceptions : thrown if file cannot be parsed
|
|
114 warning if alignment file contains identifiers for sequences
|
|
115 which are not members of this family
|
|
116 Caller : general
|
|
117
|
|
118 =cut
|
|
119
|
|
120 sub read_clustalw {
|
|
121 my $self = shift;
|
|
122 my $file = shift;
|
|
123
|
|
124 my %align_hash;
|
|
125 my $FH = IO::File->new();
|
|
126 $FH->open($file) || throw("Could not open alignment file [$file]");
|
|
127
|
|
128 <$FH>; #skip header
|
|
129 while(<$FH>) {
|
|
130 next if($_ =~ /^\s+/); #skip lines that start with space
|
|
131
|
|
132 my ($id, $align) = split;
|
|
133 $align_hash{$id} ||= '';
|
|
134 $align_hash{$id} .= $align;
|
|
135 }
|
|
136
|
|
137 $FH->close;
|
|
138
|
|
139 #place all members in a hash on their member name
|
|
140 my %member_hash;
|
|
141 foreach my $member (@{$self->get_all_Members}) {
|
|
142 $member_hash{$member->stable_id} = $member;
|
|
143 }
|
|
144
|
|
145 #assign cigar_line to each of the member attribute
|
|
146 foreach my $id (keys %align_hash) {
|
|
147 throw("No member for alignment portion: [$id]") unless exists $member_hash{$id};
|
|
148
|
|
149 my $alignment_string = $align_hash{$id};
|
|
150 $alignment_string =~ s/\-([A-Z])/\- $1/g;
|
|
151 $alignment_string =~ s/([A-Z])\-/$1 \-/g;
|
|
152
|
|
153 my @cigar_segments = split " ",$alignment_string;
|
|
154
|
|
155 my $cigar_line = "";
|
|
156 foreach my $segment (@cigar_segments) {
|
|
157 my $seglength = length($segment);
|
|
158 $seglength = "" if ($seglength == 1);
|
|
159 if ($segment =~ /^\-+$/) {
|
|
160 $cigar_line .= $seglength . "D";
|
|
161 } else {
|
|
162 $cigar_line .= $seglength . "M";
|
|
163 }
|
|
164 }
|
|
165
|
|
166 $member_hash{$id}->cigar_line($cigar_line);
|
|
167 }
|
|
168 }
|
|
169
|
|
170 sub load_cigars_from_fasta {
|
|
171 my ($self, $file) = @_;
|
|
172
|
|
173 my $alignio = Bio::AlignIO->new(-file => "$file", -format => "fasta");
|
|
174
|
|
175 my $aln = $alignio->next_aln or die "Bio::AlignIO could not get next_aln() from file '$file'";
|
|
176
|
|
177 #place all members in a hash on their member name
|
|
178 my %member_hash;
|
|
179 foreach my $member (@{$self->get_all_Members}) {
|
|
180 $member_hash{$member->stable_id} = $member;
|
|
181 }
|
|
182
|
|
183 #assign cigar_line to each of the member attribute
|
|
184 foreach my $seq ($aln->each_seq) {
|
|
185 my $id = $seq->display_id;
|
|
186 throw("No member for alignment portion: [$id]") unless exists $member_hash{$id};
|
|
187
|
|
188 my $cigar_line = '';
|
|
189 my $seq_string = $seq->seq();
|
|
190 while($seq_string=~/(?:\b|^)(.)(.*?)(?:\b|$)/g) {
|
|
191 $cigar_line .= ($2 ? length($2)+1 : '').(($1 eq '-') ? 'D' : 'M');
|
|
192 }
|
|
193
|
|
194 $member_hash{$id}->cigar_line($cigar_line);
|
|
195 }
|
|
196 }
|
|
197
|
|
198
|
|
199
|
|
200 sub get_SimpleAlign {
|
|
201
|
|
202 my ($self, @args) = @_;
|
|
203
|
|
204 my $id_type = 'STABLE';
|
|
205 my $unique_seqs = 0;
|
|
206 my $cdna = 0;
|
|
207 my $stop2x = 0;
|
|
208 my $append_taxon_id = 0;
|
|
209 my $append_sp_short_name = 0;
|
|
210 my $append_genomedb_id = 0;
|
|
211 my $exon_cased = 0;
|
|
212 my $alignment = 'protein';
|
|
213 my $changeSelenos = 0;
|
|
214 if (scalar @args) {
|
|
215 ($unique_seqs, $cdna, $id_type, $stop2x, $append_taxon_id, $append_sp_short_name, $append_genomedb_id, $exon_cased, $alignment, $changeSelenos) =
|
|
216 rearrange([qw(UNIQ_SEQ CDNA ID_TYPE STOP2X APPEND_TAXON_ID APPEND_SP_SHORT_NAME APPEND_GENOMEDB_ID EXON_CASED ALIGNMENT CHANGE_SELENO)], @args);
|
|
217 }
|
|
218
|
|
219 my $sa = Bio::SimpleAlign->new();
|
|
220
|
|
221 #Hack to try to work with both bioperl 0.7 and 1.2:
|
|
222 #Check to see if the method is called 'addSeq' or 'add_seq'
|
|
223 my $bio07 = ($sa->can('add_seq') ? 0 : 1);
|
|
224
|
|
225 my $seq_id_hash = {};
|
|
226 foreach my $member (@{$self->get_all_Members}) {
|
|
227
|
|
228 next if $member->source_name eq 'ENSEMBLGENE';
|
|
229
|
|
230 # Print unique sequences only ?
|
|
231 next if($unique_seqs and $seq_id_hash->{$member->sequence_id});
|
|
232 $seq_id_hash->{$member->sequence_id} = 1;
|
|
233
|
|
234 # The correct codon table
|
|
235 if ($member->chr_name =~ /mt/i) {
|
|
236 # codeml icodes
|
|
237 # 0:universal code (default)
|
|
238 my $class;
|
|
239 eval {$class = member->taxon->classification;};
|
|
240 unless ($@) {
|
|
241 if ($class =~ /vertebrata/i) {
|
|
242 # 1:mamalian mt
|
|
243 $sa->{_special_codeml_icode} = 1;
|
|
244 } else {
|
|
245 # 4:invertebrate mt
|
|
246 $sa->{_special_codeml_icode} = 4;
|
|
247 }
|
|
248 }
|
|
249 }
|
|
250
|
|
251 my $seqstr;
|
|
252 my $alphabet;
|
|
253 if ($cdna or (lc($alignment) eq 'cdna')) {
|
|
254 $seqstr = $member->cdna_alignment_string($changeSelenos);
|
|
255 $seqstr =~ s/\s+//g;
|
|
256 $alphabet = 'dna';
|
|
257 } else {
|
|
258 $seqstr = $member->alignment_string($exon_cased);
|
|
259 $alphabet = 'protein';
|
|
260 }
|
|
261 next if(!$seqstr);
|
|
262
|
|
263 # Sequence name
|
|
264 my $seqID = $member->stable_id;
|
|
265 $seqID = $member->sequence_id if($id_type eq "SEQ");
|
|
266 $seqID = $member->member_id if($id_type eq "MEMBER");
|
|
267 $seqID .= "_" . $member->taxon_id if($append_taxon_id);
|
|
268 $seqID .= "_" . $member->genome_db_id if ($append_genomedb_id);
|
|
269
|
|
270 ## Append $seqID with Speciae short name, if required
|
|
271 if ($append_sp_short_name) {
|
|
272 my $species = $member->genome_db->short_name;
|
|
273 $species =~ s/\s/_/g;
|
|
274 $seqID .= "_" . $species . "_";
|
|
275 }
|
|
276
|
|
277 # Sequence length
|
|
278 my $aln_end = $member->seq_length;
|
|
279 $aln_end = $aln_end*3 if $alphabet eq 'dna';
|
|
280
|
|
281 $seqstr =~ s/\*/X/g if ($stop2x);
|
|
282 my $seq = Bio::LocatableSeq->new(
|
|
283 -SEQ => $seqstr,
|
|
284 -ALPHABET => $alphabet,
|
|
285 -START => 1,
|
|
286 -END => $aln_end,
|
|
287 -ID => $seqID,
|
|
288 -STRAND => 0
|
|
289 );
|
|
290
|
|
291 if($bio07) {
|
|
292 $sa->addSeq($seq);
|
|
293 } else {
|
|
294 $sa->add_seq($seq);
|
|
295 }
|
|
296 }
|
|
297 $sa = $sa->remove_gaps(undef, 1) if UNIVERSAL::can($sa, 'remove_gaps');
|
|
298 return $sa;
|
|
299 }
|
|
300
|
|
301
|
|
302
|
|
303 # Takes a protein tree and creates a consensus cigar line from the
|
|
304 # constituent leaf nodes.
|
|
305 sub consensus_cigar_line {
|
|
306
|
|
307 my $self = shift;
|
|
308 my @cigars;
|
|
309
|
|
310 # First get an 'expanded' cigar string for each leaf of the subtree
|
|
311 my @all_members = @{$self->get_all_Members};
|
|
312 my $num_members = scalar(@all_members);
|
|
313 foreach my $leaf (@all_members) {
|
|
314 next unless( UNIVERSAL::can( $leaf, 'cigar_line' ) );
|
|
315 my $cigar = $leaf->cigar_line;
|
|
316 my $newcigar = "";
|
|
317 # $cigar =~ s/(\d*)([A-Z])/$2 x ($1||1)/ge; #Expand
|
|
318 while ($cigar =~ /(\d*)([A-Z])/g) {
|
|
319 $newcigar .= $2 x ($1 || 1);
|
|
320 }
|
|
321 $cigar = $newcigar;
|
|
322 push @cigars, $cigar;
|
|
323 }
|
|
324
|
|
325 # Itterate through each character of the expanded cigars.
|
|
326 # If there is a 'D' at a given location in any cigar,
|
|
327 # set the consensus to 'D', otherwise assume an 'M'.
|
|
328 # TODO: Fix assumption that cigar strings are always the same length,
|
|
329 # and start at the same point.
|
|
330 my $cigar_len = length( $cigars[0] );
|
|
331 my $cons_cigar;
|
|
332 for( my $i=0; $i<$cigar_len; $i++ ){
|
|
333 my $char = 'M';
|
|
334 my $num_deletions = 0;
|
|
335 foreach my $cigar( @cigars ){
|
|
336 if ( substr($cigar,$i,1) eq 'D'){
|
|
337 $num_deletions++;
|
|
338 }
|
|
339 }
|
|
340 if ($num_deletions * 3 > 2 * $num_members) {
|
|
341 $char = "D";
|
|
342 } elsif ($num_deletions * 3 > $num_members) {
|
|
343 $char = "m";
|
|
344 }
|
|
345 $cons_cigar .= $char;
|
|
346 }
|
|
347
|
|
348 # Collapse the consensus cigar, e.g. 'DDDD' = 4D
|
|
349 # $cons_cigar =~ s/(\w)(\1*)/($2?length($2)+1:"").$1/ge;
|
|
350 my $collapsed_cigar = "";
|
|
351 while ($cons_cigar =~ /(D+|M+|I+|m+)/g) {
|
|
352 $collapsed_cigar .= (length($1) == 1 ? "" : length($1)) . substr($1,0,1)
|
|
353 }
|
|
354 $cons_cigar = $collapsed_cigar;
|
|
355 # Return the consensus
|
|
356 return $cons_cigar;
|
|
357 }
|
|
358
|
|
359
|
|
360 my %TWOD_CODONS = ("TTT" => "Phe",#Phe
|
|
361 "TTC" => "Phe",
|
|
362
|
|
363 "TTA" => "Leu",#Leu
|
|
364 "TTG" => "Leu",
|
|
365
|
|
366 "TAT" => "Tyr",#Tyr
|
|
367 "TAC" => "Tyr",
|
|
368
|
|
369 "CAT" => "His",#His
|
|
370 "CAC" => "His",
|
|
371
|
|
372 "CAA" => "Gln",#Gln
|
|
373 "CAG" => "Gln",
|
|
374
|
|
375 "AAT" => "Asn",#Asn
|
|
376 "AAC" => "Asn",
|
|
377
|
|
378 "AAA" => "Lys",#Lys
|
|
379 "AAG" => "Lys",
|
|
380
|
|
381 "GAT" => "Asp",#Asp
|
|
382 "GAC" => "Asp",
|
|
383
|
|
384 "GAA" => "Glu",#Glu
|
|
385 "GAG" => "Glu",
|
|
386
|
|
387 "TGT" => "Cys",#Cys
|
|
388 "TGC" => "Cys",
|
|
389
|
|
390 "AGT" => "Ser",#Ser
|
|
391 "AGC" => "Ser",
|
|
392
|
|
393 "AGA" => "Arg",#Arg
|
|
394 "AGG" => "Arg",
|
|
395
|
|
396 "ATT" => "Ile",#Ile
|
|
397 "ATC" => "Ile",
|
|
398 "ATA" => "Ile");
|
|
399
|
|
400 my %FOURD_CODONS = ("CTT" => "Leu",#Leu
|
|
401 "CTC" => "Leu",
|
|
402 "CTA" => "Leu",
|
|
403 "CTG" => "Leu",
|
|
404
|
|
405 "GTT" => "Val",#Val
|
|
406 "GTC" => "Val",
|
|
407 "GTA" => "Val",
|
|
408 "GTG" => "Val",
|
|
409
|
|
410 "TCT" => "Ser",#Ser
|
|
411 "TCC" => "Ser",
|
|
412 "TCA" => "Ser",
|
|
413 "TCG" => "Ser",
|
|
414
|
|
415 "CCT" => "Pro",#Pro
|
|
416 "CCC" => "Pro",
|
|
417 "CCA" => "Pro",
|
|
418 "CCG" => "Pro",
|
|
419
|
|
420 "ACT" => "Thr",#Thr
|
|
421 "ACC" => "Thr",
|
|
422 "ACA" => "Thr",
|
|
423 "ACG" => "Thr",
|
|
424
|
|
425 "GCT" => "Ala",#Ala
|
|
426 "GCC" => "Ala",
|
|
427 "GCA" => "Ala",
|
|
428 "GCG" => "Ala",
|
|
429
|
|
430 "CGT" => "Arg",#Arg
|
|
431 "CGC" => "Arg",
|
|
432 "CGA" => "Arg",
|
|
433 "CGG" => "Arg",
|
|
434
|
|
435 "GGT" => "Gly",#Gly
|
|
436 "GGC" => "Gly",
|
|
437 "GGA" => "Gly",
|
|
438 "GGG" => "Gly");
|
|
439
|
|
440 my %CODONS = ("ATG" => "Met",
|
|
441 "TGG" => "Trp",
|
|
442 "TAA" => "TER",
|
|
443 "TAG" => "TER",
|
|
444 "TGA" => "TER",
|
|
445 "---" => "---",
|
|
446 %TWOD_CODONS,
|
|
447 %FOURD_CODONS,
|
|
448 );
|
|
449
|
|
450
|
|
451 =head2 get_4D_SimpleAlign
|
|
452
|
|
453 Example : $4d_align = $homology->get_4D_SimpleAlign();
|
|
454 Description: get 4 times degenerate positions pairwise simple alignment
|
|
455 Returntype : Bio::SimpleAlign
|
|
456
|
|
457 =cut
|
|
458
|
|
459 sub get_4D_SimpleAlign {
|
|
460 my $self = shift;
|
|
461
|
|
462 my $sa = Bio::SimpleAlign->new();
|
|
463
|
|
464 #Hack to try to work with both bioperl 0.7 and 1.2:
|
|
465 #Check to see if the method is called 'addSeq' or 'add_seq'
|
|
466 my $bio07 = 0;
|
|
467 if(!$sa->can('add_seq')) {
|
|
468 $bio07 = 1;
|
|
469 }
|
|
470
|
|
471 my $ma = $self->adaptor->db->get_MemberAdaptor;
|
|
472
|
|
473 my %member_seqstr;
|
|
474 foreach my $member (@{$self->get_all_Members}) {
|
|
475 next if $member->source_name ne 'ENSEMBLPEP';
|
|
476 my $seqstr = $member->cdna_alignment_string();
|
|
477 next if(!$seqstr);
|
|
478 #print STDERR $seqstr,"\n";
|
|
479 my @tmp_tab = split /\s+/, $seqstr;
|
|
480 #print STDERR "tnp_tab 0: ", $tmp_tab[0],"\n";
|
|
481 $member_seqstr{$member->stable_id} = \@tmp_tab;
|
|
482 }
|
|
483
|
|
484 my $seqstr_length;
|
|
485 foreach my $seqid (keys %member_seqstr) {
|
|
486 unless (defined $seqstr_length) {
|
|
487 #print STDERR $member_seqstr{$seqid}->[0],"\n";
|
|
488 $seqstr_length = scalar @{$member_seqstr{$seqid}};
|
|
489 next;
|
|
490 }
|
|
491 unless ($seqstr_length == scalar @{$member_seqstr{$seqid}}) {
|
|
492 die "Length of dna alignment are not the same, $seqstr_length and " . scalar @{$member_seqstr{$seqid}} ." respectively for homology_id " . $self->dbID . "\n";
|
|
493 }
|
|
494 }
|
|
495
|
|
496 my %FourD_member_seqstr;
|
|
497 for (my $i=0; $i < $seqstr_length; $i++) {
|
|
498 my $FourD_codon = 1;
|
|
499 my $FourD_aminoacid;
|
|
500 foreach my $seqid (keys %member_seqstr) {
|
|
501 if (defined $FOURD_CODONS{$member_seqstr{$seqid}->[$i]}) {
|
|
502 if (defined $FourD_aminoacid && $FourD_aminoacid eq $FOURD_CODONS{$member_seqstr{$seqid}->[$i]}) {
|
|
503 #print STDERR "YES ",$FOURD_CODONS{$member_seqstr{$seqid}->[$i]}," ",$member_seqstr{$seqid}->[$i],"\n";
|
|
504 next;
|
|
505 } elsif (defined $FourD_aminoacid) {
|
|
506 #print STDERR "NO ",$FOURD_CODONS{$member_seqstr{$seqid}->[$i]}," ",$member_seqstr{$seqid}->[$i],"\n";
|
|
507 $FourD_codon = 0;
|
|
508 last;
|
|
509 } else {
|
|
510 $FourD_aminoacid = $FOURD_CODONS{$member_seqstr{$seqid}->[$i]};
|
|
511 #print STDERR $FOURD_CODONS{$member_seqstr{$seqid}->[$i]}," ",$member_seqstr{$seqid}->[$i]," ";
|
|
512 }
|
|
513 next;
|
|
514 } else {
|
|
515 #print STDERR "NO ",$CODONS{$member_seqstr{$seqid}->[$i]}," ",$member_seqstr{$seqid}->[$i],"\n";
|
|
516 $FourD_codon = 0;
|
|
517 last;
|
|
518 }
|
|
519 }
|
|
520 next unless ($FourD_codon);
|
|
521 foreach my $seqid (keys %member_seqstr) {
|
|
522 $FourD_member_seqstr{$seqid} .= substr($member_seqstr{$seqid}->[$i],2,1);
|
|
523 }
|
|
524 }
|
|
525
|
|
526 foreach my $seqid (keys %FourD_member_seqstr) {
|
|
527
|
|
528 my $seq = Bio::LocatableSeq->new(
|
|
529 -SEQ => $FourD_member_seqstr{$seqid},
|
|
530 -START => 1,
|
|
531 -END => length($FourD_member_seqstr{$seqid}),
|
|
532 -ID => $seqid,
|
|
533 -STRAND => 0
|
|
534 );
|
|
535
|
|
536 if($bio07) {
|
|
537 $sa->addSeq($seq);
|
|
538 } else {
|
|
539 $sa->add_seq($seq);
|
|
540 }
|
|
541 }
|
|
542
|
|
543 return $sa;
|
|
544 }
|
|
545
|
|
546 my %matrix_hash;
|
|
547
|
|
548 {
|
|
549 my $BLOSUM62 = "# Matrix made by matblas from blosum62.iij
|
|
550 # * column uses minimum score
|
|
551 # BLOSUM Clustered Scoring Matrix in 1/2 Bit Units
|
|
552 # Blocks Database = /data/blocks_5.0/blocks.dat
|
|
553 # Cluster Percentage: >= 62
|
|
554 # Entropy = 0.6979, Expected = -0.5209
|
|
555 A R N D C Q E G H I L K M F P S T W Y V B Z X *
|
|
556 A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 0 -4
|
|
557 R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0 -1 -4
|
|
558 N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 3 0 -1 -4
|
|
559 D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 1 -1 -4
|
|
560 C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4
|
|
561 Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 3 -1 -4
|
|
562 E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4
|
|
563 G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -2 -1 -4
|
|
564 H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 0 -1 -4
|
|
565 I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 -3 -1 -4
|
|
566 L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 -3 -1 -4
|
|
567 K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 1 -1 -4
|
|
568 M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 -1 -1 -4
|
|
569 F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 -3 -1 -4
|
|
570 P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -1 -2 -4
|
|
571 S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 0 0 -4
|
|
572 T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 0 -4
|
|
573 W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 -3 -2 -4
|
|
574 Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -2 -1 -4
|
|
575 V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 -2 -1 -4
|
|
576 B -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1 -1 -4
|
|
577 Z -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4
|
|
578 X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1 -1 -1 -1 -1 -4
|
|
579 * -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1
|
|
580 ";
|
|
581 my $matrix_string;
|
|
582 my @lines = split(/\n/,$BLOSUM62);
|
|
583 foreach my $line (@lines) {
|
|
584 next if ($line =~ /^\#/);
|
|
585 if ($line =~ /^[A-Z\*\s]+$/) {
|
|
586 $matrix_string .= sprintf "$line\n";
|
|
587 } else {
|
|
588 my @t = split(/\s+/,$line);
|
|
589 shift @t;
|
|
590 # print scalar @t,"\n";
|
|
591 $matrix_string .= sprintf(join(" ",@t)."\n");
|
|
592 }
|
|
593 }
|
|
594
|
|
595 # my %matrix_hash;
|
|
596 @lines = ();
|
|
597 @lines = split /\n/, $matrix_string;
|
|
598 my $lts = shift @lines;
|
|
599 $lts =~ s/^\s+//;
|
|
600 $lts =~ s/\s+$//;
|
|
601 my @letters = split /\s+/, $lts;
|
|
602
|
|
603 foreach my $letter (@letters) {
|
|
604 my $line = shift @lines;
|
|
605 $line =~ s/^\s+//;
|
|
606 $line =~ s/\s+$//;
|
|
607 my @penalties = split /\s+/, $line;
|
|
608 die "Size of letters array and penalties array are different\n"
|
|
609 unless (scalar @letters == scalar @penalties);
|
|
610 for (my $i=0; $i < scalar @letters; $i++) {
|
|
611 $matrix_hash{uc $letter}{uc $letters[$i]} = $penalties[$i];
|
|
612 }
|
|
613 }
|
|
614 }
|
|
615
|
|
616
|
|
617 sub update_alignment_stats {
|
|
618 my $self = shift;
|
|
619
|
|
620 my $genes = $self->get_all_Members;
|
|
621 my $ngenes = scalar(@$genes);
|
|
622
|
|
623 if ($ngenes == 2) {
|
|
624 # This code is >4 times faster with pairs of genes
|
|
625
|
|
626 my $gene1 = $genes->[0];
|
|
627 my $gene2 = $genes->[1];
|
|
628 my $new_aln1_cigarline = "";
|
|
629 my $new_aln2_cigarline = "";
|
|
630
|
|
631 my $identical_matches = 0;
|
|
632 my $positive_matches = 0;
|
|
633
|
|
634 my ($aln1state, $aln2state);
|
|
635 my ($aln1count, $aln2count);
|
|
636 my ($aln1cov, $aln2cov) = (0,0);
|
|
637
|
|
638 my @aln1 = split(//, $gene1->alignment_string);
|
|
639 my @aln2 = split(//, $gene2->alignment_string);
|
|
640
|
|
641 for (my $i=0; $i <= $#aln1; $i++) {
|
|
642 next if ($aln1[$i] eq '-' && $aln2[$i] eq '-');
|
|
643 my $cur_aln1state = ($aln1[$i] eq '-' ? 'D' : 'M');
|
|
644 my $cur_aln2state = ($aln2[$i] eq '-' ? 'D' : 'M');
|
|
645 $aln1cov++ if $cur_aln1state ne 'D';
|
|
646 $aln2cov++ if $cur_aln2state ne 'D';
|
|
647 if ($cur_aln1state eq 'M' && $cur_aln2state eq 'M') {
|
|
648 if ($aln1[$i] eq $aln2[$i]) {
|
|
649 $identical_matches++;
|
|
650 $positive_matches++;
|
|
651 } elsif ($matrix_hash{uc $aln1[$i]}{uc $aln2[$i]} > 0) {
|
|
652 $positive_matches++;
|
|
653 }
|
|
654 }
|
|
655 unless (defined $aln1state) {
|
|
656 $aln1count = 1;
|
|
657 $aln2count = 1;
|
|
658 $aln1state = $cur_aln1state;
|
|
659 $aln2state = $cur_aln2state;
|
|
660 next;
|
|
661 }
|
|
662 if ($cur_aln1state eq $aln1state) {
|
|
663 $aln1count++;
|
|
664 } else {
|
|
665 if ($aln1count == 1) {
|
|
666 $new_aln1_cigarline .= $aln1state;
|
|
667 } else {
|
|
668 $new_aln1_cigarline .= $aln1count.$aln1state;
|
|
669 }
|
|
670 $aln1count = 1;
|
|
671 $aln1state = $cur_aln1state;
|
|
672 }
|
|
673 if ($cur_aln2state eq $aln2state) {
|
|
674 $aln2count++;
|
|
675 } else {
|
|
676 if ($aln2count == 1) {
|
|
677 $new_aln2_cigarline .= $aln2state;
|
|
678 } else {
|
|
679 $new_aln2_cigarline .= $aln2count.$aln2state;
|
|
680 }
|
|
681 $aln2count = 1;
|
|
682 $aln2state = $cur_aln2state;
|
|
683 }
|
|
684 }
|
|
685 if ($aln1count == 1) {
|
|
686 $new_aln1_cigarline .= $aln1state;
|
|
687 } else {
|
|
688 $new_aln1_cigarline .= $aln1count.$aln1state;
|
|
689 }
|
|
690 if ($aln2count == 1) {
|
|
691 $new_aln2_cigarline .= $aln2state;
|
|
692 } else {
|
|
693 $new_aln2_cigarline .= $aln2count.$aln2state;
|
|
694 }
|
|
695 my $seq_length1 = $gene1->seq_length;
|
|
696 unless (0 == $seq_length1) {
|
|
697 $gene1->cigar_line($new_aln1_cigarline);
|
|
698 $gene1->perc_id( int((100.0 * $identical_matches / $seq_length1 + 0.5)) );
|
|
699 $gene1->perc_pos( int((100.0 * $positive_matches / $seq_length1 + 0.5)) );
|
|
700 $gene1->perc_cov( int((100.0 * $aln1cov / $seq_length1 + 0.5)) );
|
|
701 }
|
|
702 my $seq_length2 = $gene2->seq_length;
|
|
703 unless (0 == $seq_length2) {
|
|
704 $gene2->cigar_line($new_aln2_cigarline);
|
|
705 $gene2->perc_id( int((100.0 * $identical_matches / $seq_length2 + 0.5)) );
|
|
706 $gene2->perc_pos( int((100.0 * $positive_matches / $seq_length2 + 0.5)) );
|
|
707 $gene2->perc_cov( int((100.0 * $aln2cov / $seq_length2 + 0.5)) );
|
|
708 }
|
|
709 return undef;
|
|
710 }
|
|
711
|
|
712 my $min_seq = shift;
|
|
713 $min_seq = int($min_seq * $ngenes) if $min_seq <= 1;
|
|
714
|
|
715 my @new_cigars = ('') x $ngenes;
|
|
716 my @nmatch_id = (0) x $ngenes;
|
|
717 my @nmatch_pos = (0) x $ngenes;
|
|
718 my @nmatch_cov = (0) x $ngenes;
|
|
719 my @alncount = (1) x $ngenes;
|
|
720 my @alnstate = (undef) x $ngenes;
|
|
721 my @cur_alnstate = (undef) x $ngenes;
|
|
722
|
|
723 my @aln = map {$_->alignment_string} @$genes;
|
|
724 my $aln_length = length($aln[0]);
|
|
725
|
|
726 for (my $i=0; $i < $aln_length; $i++) {
|
|
727
|
|
728 my @char_i = map {substr($_, $i, 1)} @aln;
|
|
729 #print "pos $i: ", join('/', @char_i), "\n";
|
|
730
|
|
731 my %seen;
|
|
732 map {$seen{$_}++} @char_i;
|
|
733 next if $seen{'-'} == $ngenes;
|
|
734 delete $seen{'-'};
|
|
735
|
|
736 my %pos_chars = ();
|
|
737 my @chars = keys %seen;
|
|
738 while (my $c1 = shift @chars) {
|
|
739 foreach my $c2 (@chars) {
|
|
740 if (($matrix_hash{uc $c1}{uc $c2} > 0) and ($seen{$c1}+$seen{$c2} >= $min_seq)) {
|
|
741 $pos_chars{$c1} = 1;
|
|
742 $pos_chars{$c2} = 1;
|
|
743 }
|
|
744 }
|
|
745 }
|
|
746
|
|
747 for (my $j=0; $j<$ngenes; $j++) {
|
|
748 if ($char_i[$j] eq '-') {
|
|
749 $cur_alnstate[$j] = 'D';
|
|
750 } else {
|
|
751 $nmatch_cov[$j]++;
|
|
752 $cur_alnstate[$j] = 'M';
|
|
753 if ($seen{$char_i[$j]} >= $min_seq) {
|
|
754 $nmatch_id[$j]++;
|
|
755 $nmatch_pos[$j]++;
|
|
756 } elsif (exists $pos_chars{$char_i[$j]}) {
|
|
757 $nmatch_pos[$j]++;
|
|
758 }
|
|
759 }
|
|
760 }
|
|
761
|
|
762 if ($i == 0) {
|
|
763 @alnstate = @cur_alnstate;
|
|
764 next;
|
|
765 }
|
|
766
|
|
767 for (my $j=0; $j<$ngenes; $j++) {
|
|
768 if ($cur_alnstate[$j] eq $alnstate[$j]) {
|
|
769 $alncount[$j]++;
|
|
770 } else {
|
|
771 if ($alncount[$j] == 1) {
|
|
772 $new_cigars[$j] .= $alnstate[$j];
|
|
773 } else {
|
|
774 $new_cigars[$j] .= $alncount[$j].$alnstate[$j];
|
|
775 }
|
|
776 $alncount[$j] = 1;
|
|
777 $alnstate[$j] = $cur_alnstate[$j];
|
|
778 }
|
|
779 }
|
|
780 }
|
|
781
|
|
782 for (my $j=0; $j<$ngenes; $j++) {
|
|
783 if ($alncount[$j] == 1) {
|
|
784 $new_cigars[$j] .= $alnstate[$j];
|
|
785 } else {
|
|
786 $new_cigars[$j] .= $alncount[$j].$alnstate[$j];
|
|
787 }
|
|
788 $genes->[$j]->cigar_line($new_cigars[$j]);
|
|
789 my $seq_length = $genes->[$j]->seq_length;
|
|
790 unless (0 == $seq_length) {
|
|
791 $genes->[$j]->perc_id( int((100.0 * $nmatch_id[$j] / $seq_length + 0.5)) );
|
|
792 $genes->[$j]->perc_pos( int((100.0 * $nmatch_pos[$j] / $seq_length + 0.5)) );
|
|
793 $genes->[$j]->perc_cov( int((100.0 * $nmatch_cov[$j] / $seq_length + 0.5)) );
|
|
794 }
|
|
795 }
|
|
796 }
|
|
797
|
|
798
|
|
799
|
|
800 1;
|