Mercurial > repos > willmclaren > ensembl_vep
comparison variant_effect_predictor/Bio/EnsEMBL/Compara/AlignedMemberSet.pm @ 0:21066c0abaf5 draft
Uploaded
author | willmclaren |
---|---|
date | Fri, 03 Aug 2012 10:04:48 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:21066c0abaf5 |
---|---|
1 =head1 LICENSE | |
2 | |
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and | |
4 Genome Research Limited. All rights reserved. | |
5 | |
6 This software is distributed under a modified Apache license. | |
7 For license details, please see | |
8 | |
9 http://www.ensembl.org/info/about/code_licence.html | |
10 | |
11 =head1 CONTACT | |
12 | |
13 Please email comments or questions to the public Ensembl | |
14 developers list at <dev@ensembl.org>. | |
15 | |
16 Questions may also be sent to the Ensembl help desk at | |
17 <helpdesk@ensembl.org>. | |
18 | |
19 =head1 AUTHORSHIP | |
20 | |
21 Ensembl Team. Individual contributions can be found in the CVS log. | |
22 | |
23 =cut | |
24 | |
25 =head1 NAME | |
26 | |
27 AlignedMemberSet - A superclass for pairwise or multiple relationships, base of | |
28 Bio::EnsEMBL::Compara::Family, Bio::EnsEMBL::Compara::Homology and | |
29 Bio::EnsEMBL::Compara::Domain. | |
30 | |
31 =head1 DESCRIPTION | |
32 | |
33 A superclass for pairwise and multiple relationships | |
34 | |
35 Currently the AlignedMember objects are used in the GeneTree structure | |
36 to represent the leaves of the trees. Each leaf contains an aligned | |
37 sequence, which is represented as an AlignedMember object. | |
38 | |
39 =head1 INHERITANCE TREE | |
40 | |
41 Bio::EnsEMBL::Compara::AlignedMemberSet | |
42 | |
43 =head1 METHODS | |
44 | |
45 =cut | |
46 | |
47 package Bio::EnsEMBL::Compara::AlignedMemberSet; | |
48 | |
49 use strict; | |
50 use Scalar::Util qw(weaken); | |
51 | |
52 use Bio::AlignIO; | |
53 | |
54 use Bio::EnsEMBL::Utils::Argument; | |
55 use Bio::EnsEMBL::Utils::Scalar; | |
56 use Bio::EnsEMBL::Utils::Exception; | |
57 | |
58 use Bio::EnsEMBL::Compara::AlignedMember; | |
59 | |
60 use base ('Bio::EnsEMBL::Compara::MemberSet'); | |
61 | |
62 | |
63 =head2 member_class | |
64 | |
65 Description: Returns the type of member used in the set | |
66 Returntype : String: Bio::EnsEMBL::Compara::AlignedMember | |
67 Caller : general | |
68 Status : Stable | |
69 | |
70 =cut | |
71 | |
72 sub member_class { | |
73 return 'Bio::EnsEMBL::Compara::AlignedMember'; | |
74 } | |
75 | |
76 ###################### | |
77 # Alignment sections # | |
78 ###################### | |
79 | |
80 | |
81 sub print_sequences_to_fasta { | |
82 my ($self, $pep_file) = @_; | |
83 my $pep_counter = 0; | |
84 open PEP, ">$pep_file"; | |
85 foreach my $member (@{$self->get_all_Members}) { | |
86 next if $member->source_name eq 'ENSEMBLGENE'; | |
87 my $member_stable_id = $member->stable_id; | |
88 my $seq = $member->sequence; | |
89 | |
90 print PEP ">$member_stable_id\n"; | |
91 $seq =~ s/(.{72})/$1\n/g; | |
92 chomp $seq; | |
93 unless (defined($seq)) { | |
94 my $family_id = $self->dbID; | |
95 die "member $member_stable_id in family $family_id doesn't have a sequence"; | |
96 } | |
97 print PEP $seq,"\n"; | |
98 $pep_counter++; | |
99 } | |
100 close PEP; | |
101 return $pep_counter; | |
102 } | |
103 | |
104 | |
105 =head2 read_clustalw | |
106 | |
107 Arg [1] : string $file | |
108 The name of the file containing the clustalw output | |
109 Example : $family->read_clustalw('/tmp/clustalw.aln'); | |
110 Description: Parses the output from clustalw and sets the alignment strings | |
111 of each of the memebers of this family | |
112 Returntype : none | |
113 Exceptions : thrown if file cannot be parsed | |
114 warning if alignment file contains identifiers for sequences | |
115 which are not members of this family | |
116 Caller : general | |
117 | |
118 =cut | |
119 | |
120 sub read_clustalw { | |
121 my $self = shift; | |
122 my $file = shift; | |
123 | |
124 my %align_hash; | |
125 my $FH = IO::File->new(); | |
126 $FH->open($file) || throw("Could not open alignment file [$file]"); | |
127 | |
128 <$FH>; #skip header | |
129 while(<$FH>) { | |
130 next if($_ =~ /^\s+/); #skip lines that start with space | |
131 | |
132 my ($id, $align) = split; | |
133 $align_hash{$id} ||= ''; | |
134 $align_hash{$id} .= $align; | |
135 } | |
136 | |
137 $FH->close; | |
138 | |
139 #place all members in a hash on their member name | |
140 my %member_hash; | |
141 foreach my $member (@{$self->get_all_Members}) { | |
142 $member_hash{$member->stable_id} = $member; | |
143 } | |
144 | |
145 #assign cigar_line to each of the member attribute | |
146 foreach my $id (keys %align_hash) { | |
147 throw("No member for alignment portion: [$id]") unless exists $member_hash{$id}; | |
148 | |
149 my $alignment_string = $align_hash{$id}; | |
150 $alignment_string =~ s/\-([A-Z])/\- $1/g; | |
151 $alignment_string =~ s/([A-Z])\-/$1 \-/g; | |
152 | |
153 my @cigar_segments = split " ",$alignment_string; | |
154 | |
155 my $cigar_line = ""; | |
156 foreach my $segment (@cigar_segments) { | |
157 my $seglength = length($segment); | |
158 $seglength = "" if ($seglength == 1); | |
159 if ($segment =~ /^\-+$/) { | |
160 $cigar_line .= $seglength . "D"; | |
161 } else { | |
162 $cigar_line .= $seglength . "M"; | |
163 } | |
164 } | |
165 | |
166 $member_hash{$id}->cigar_line($cigar_line); | |
167 } | |
168 } | |
169 | |
170 sub load_cigars_from_fasta { | |
171 my ($self, $file) = @_; | |
172 | |
173 my $alignio = Bio::AlignIO->new(-file => "$file", -format => "fasta"); | |
174 | |
175 my $aln = $alignio->next_aln or die "Bio::AlignIO could not get next_aln() from file '$file'"; | |
176 | |
177 #place all members in a hash on their member name | |
178 my %member_hash; | |
179 foreach my $member (@{$self->get_all_Members}) { | |
180 $member_hash{$member->stable_id} = $member; | |
181 } | |
182 | |
183 #assign cigar_line to each of the member attribute | |
184 foreach my $seq ($aln->each_seq) { | |
185 my $id = $seq->display_id; | |
186 throw("No member for alignment portion: [$id]") unless exists $member_hash{$id}; | |
187 | |
188 my $cigar_line = ''; | |
189 my $seq_string = $seq->seq(); | |
190 while($seq_string=~/(?:\b|^)(.)(.*?)(?:\b|$)/g) { | |
191 $cigar_line .= ($2 ? length($2)+1 : '').(($1 eq '-') ? 'D' : 'M'); | |
192 } | |
193 | |
194 $member_hash{$id}->cigar_line($cigar_line); | |
195 } | |
196 } | |
197 | |
198 | |
199 | |
200 sub get_SimpleAlign { | |
201 | |
202 my ($self, @args) = @_; | |
203 | |
204 my $id_type = 'STABLE'; | |
205 my $unique_seqs = 0; | |
206 my $cdna = 0; | |
207 my $stop2x = 0; | |
208 my $append_taxon_id = 0; | |
209 my $append_sp_short_name = 0; | |
210 my $append_genomedb_id = 0; | |
211 my $exon_cased = 0; | |
212 my $alignment = 'protein'; | |
213 my $changeSelenos = 0; | |
214 if (scalar @args) { | |
215 ($unique_seqs, $cdna, $id_type, $stop2x, $append_taxon_id, $append_sp_short_name, $append_genomedb_id, $exon_cased, $alignment, $changeSelenos) = | |
216 rearrange([qw(UNIQ_SEQ CDNA ID_TYPE STOP2X APPEND_TAXON_ID APPEND_SP_SHORT_NAME APPEND_GENOMEDB_ID EXON_CASED ALIGNMENT CHANGE_SELENO)], @args); | |
217 } | |
218 | |
219 my $sa = Bio::SimpleAlign->new(); | |
220 | |
221 #Hack to try to work with both bioperl 0.7 and 1.2: | |
222 #Check to see if the method is called 'addSeq' or 'add_seq' | |
223 my $bio07 = ($sa->can('add_seq') ? 0 : 1); | |
224 | |
225 my $seq_id_hash = {}; | |
226 foreach my $member (@{$self->get_all_Members}) { | |
227 | |
228 next if $member->source_name eq 'ENSEMBLGENE'; | |
229 | |
230 # Print unique sequences only ? | |
231 next if($unique_seqs and $seq_id_hash->{$member->sequence_id}); | |
232 $seq_id_hash->{$member->sequence_id} = 1; | |
233 | |
234 # The correct codon table | |
235 if ($member->chr_name =~ /mt/i) { | |
236 # codeml icodes | |
237 # 0:universal code (default) | |
238 my $class; | |
239 eval {$class = member->taxon->classification;}; | |
240 unless ($@) { | |
241 if ($class =~ /vertebrata/i) { | |
242 # 1:mamalian mt | |
243 $sa->{_special_codeml_icode} = 1; | |
244 } else { | |
245 # 4:invertebrate mt | |
246 $sa->{_special_codeml_icode} = 4; | |
247 } | |
248 } | |
249 } | |
250 | |
251 my $seqstr; | |
252 my $alphabet; | |
253 if ($cdna or (lc($alignment) eq 'cdna')) { | |
254 $seqstr = $member->cdna_alignment_string($changeSelenos); | |
255 $seqstr =~ s/\s+//g; | |
256 $alphabet = 'dna'; | |
257 } else { | |
258 $seqstr = $member->alignment_string($exon_cased); | |
259 $alphabet = 'protein'; | |
260 } | |
261 next if(!$seqstr); | |
262 | |
263 # Sequence name | |
264 my $seqID = $member->stable_id; | |
265 $seqID = $member->sequence_id if($id_type eq "SEQ"); | |
266 $seqID = $member->member_id if($id_type eq "MEMBER"); | |
267 $seqID .= "_" . $member->taxon_id if($append_taxon_id); | |
268 $seqID .= "_" . $member->genome_db_id if ($append_genomedb_id); | |
269 | |
270 ## Append $seqID with Speciae short name, if required | |
271 if ($append_sp_short_name) { | |
272 my $species = $member->genome_db->short_name; | |
273 $species =~ s/\s/_/g; | |
274 $seqID .= "_" . $species . "_"; | |
275 } | |
276 | |
277 # Sequence length | |
278 my $aln_end = $member->seq_length; | |
279 $aln_end = $aln_end*3 if $alphabet eq 'dna'; | |
280 | |
281 $seqstr =~ s/\*/X/g if ($stop2x); | |
282 my $seq = Bio::LocatableSeq->new( | |
283 -SEQ => $seqstr, | |
284 -ALPHABET => $alphabet, | |
285 -START => 1, | |
286 -END => $aln_end, | |
287 -ID => $seqID, | |
288 -STRAND => 0 | |
289 ); | |
290 | |
291 if($bio07) { | |
292 $sa->addSeq($seq); | |
293 } else { | |
294 $sa->add_seq($seq); | |
295 } | |
296 } | |
297 $sa = $sa->remove_gaps(undef, 1) if UNIVERSAL::can($sa, 'remove_gaps'); | |
298 return $sa; | |
299 } | |
300 | |
301 | |
302 | |
303 # Takes a protein tree and creates a consensus cigar line from the | |
304 # constituent leaf nodes. | |
305 sub consensus_cigar_line { | |
306 | |
307 my $self = shift; | |
308 my @cigars; | |
309 | |
310 # First get an 'expanded' cigar string for each leaf of the subtree | |
311 my @all_members = @{$self->get_all_Members}; | |
312 my $num_members = scalar(@all_members); | |
313 foreach my $leaf (@all_members) { | |
314 next unless( UNIVERSAL::can( $leaf, 'cigar_line' ) ); | |
315 my $cigar = $leaf->cigar_line; | |
316 my $newcigar = ""; | |
317 # $cigar =~ s/(\d*)([A-Z])/$2 x ($1||1)/ge; #Expand | |
318 while ($cigar =~ /(\d*)([A-Z])/g) { | |
319 $newcigar .= $2 x ($1 || 1); | |
320 } | |
321 $cigar = $newcigar; | |
322 push @cigars, $cigar; | |
323 } | |
324 | |
325 # Itterate through each character of the expanded cigars. | |
326 # If there is a 'D' at a given location in any cigar, | |
327 # set the consensus to 'D', otherwise assume an 'M'. | |
328 # TODO: Fix assumption that cigar strings are always the same length, | |
329 # and start at the same point. | |
330 my $cigar_len = length( $cigars[0] ); | |
331 my $cons_cigar; | |
332 for( my $i=0; $i<$cigar_len; $i++ ){ | |
333 my $char = 'M'; | |
334 my $num_deletions = 0; | |
335 foreach my $cigar( @cigars ){ | |
336 if ( substr($cigar,$i,1) eq 'D'){ | |
337 $num_deletions++; | |
338 } | |
339 } | |
340 if ($num_deletions * 3 > 2 * $num_members) { | |
341 $char = "D"; | |
342 } elsif ($num_deletions * 3 > $num_members) { | |
343 $char = "m"; | |
344 } | |
345 $cons_cigar .= $char; | |
346 } | |
347 | |
348 # Collapse the consensus cigar, e.g. 'DDDD' = 4D | |
349 # $cons_cigar =~ s/(\w)(\1*)/($2?length($2)+1:"").$1/ge; | |
350 my $collapsed_cigar = ""; | |
351 while ($cons_cigar =~ /(D+|M+|I+|m+)/g) { | |
352 $collapsed_cigar .= (length($1) == 1 ? "" : length($1)) . substr($1,0,1) | |
353 } | |
354 $cons_cigar = $collapsed_cigar; | |
355 # Return the consensus | |
356 return $cons_cigar; | |
357 } | |
358 | |
359 | |
360 my %TWOD_CODONS = ("TTT" => "Phe",#Phe | |
361 "TTC" => "Phe", | |
362 | |
363 "TTA" => "Leu",#Leu | |
364 "TTG" => "Leu", | |
365 | |
366 "TAT" => "Tyr",#Tyr | |
367 "TAC" => "Tyr", | |
368 | |
369 "CAT" => "His",#His | |
370 "CAC" => "His", | |
371 | |
372 "CAA" => "Gln",#Gln | |
373 "CAG" => "Gln", | |
374 | |
375 "AAT" => "Asn",#Asn | |
376 "AAC" => "Asn", | |
377 | |
378 "AAA" => "Lys",#Lys | |
379 "AAG" => "Lys", | |
380 | |
381 "GAT" => "Asp",#Asp | |
382 "GAC" => "Asp", | |
383 | |
384 "GAA" => "Glu",#Glu | |
385 "GAG" => "Glu", | |
386 | |
387 "TGT" => "Cys",#Cys | |
388 "TGC" => "Cys", | |
389 | |
390 "AGT" => "Ser",#Ser | |
391 "AGC" => "Ser", | |
392 | |
393 "AGA" => "Arg",#Arg | |
394 "AGG" => "Arg", | |
395 | |
396 "ATT" => "Ile",#Ile | |
397 "ATC" => "Ile", | |
398 "ATA" => "Ile"); | |
399 | |
400 my %FOURD_CODONS = ("CTT" => "Leu",#Leu | |
401 "CTC" => "Leu", | |
402 "CTA" => "Leu", | |
403 "CTG" => "Leu", | |
404 | |
405 "GTT" => "Val",#Val | |
406 "GTC" => "Val", | |
407 "GTA" => "Val", | |
408 "GTG" => "Val", | |
409 | |
410 "TCT" => "Ser",#Ser | |
411 "TCC" => "Ser", | |
412 "TCA" => "Ser", | |
413 "TCG" => "Ser", | |
414 | |
415 "CCT" => "Pro",#Pro | |
416 "CCC" => "Pro", | |
417 "CCA" => "Pro", | |
418 "CCG" => "Pro", | |
419 | |
420 "ACT" => "Thr",#Thr | |
421 "ACC" => "Thr", | |
422 "ACA" => "Thr", | |
423 "ACG" => "Thr", | |
424 | |
425 "GCT" => "Ala",#Ala | |
426 "GCC" => "Ala", | |
427 "GCA" => "Ala", | |
428 "GCG" => "Ala", | |
429 | |
430 "CGT" => "Arg",#Arg | |
431 "CGC" => "Arg", | |
432 "CGA" => "Arg", | |
433 "CGG" => "Arg", | |
434 | |
435 "GGT" => "Gly",#Gly | |
436 "GGC" => "Gly", | |
437 "GGA" => "Gly", | |
438 "GGG" => "Gly"); | |
439 | |
440 my %CODONS = ("ATG" => "Met", | |
441 "TGG" => "Trp", | |
442 "TAA" => "TER", | |
443 "TAG" => "TER", | |
444 "TGA" => "TER", | |
445 "---" => "---", | |
446 %TWOD_CODONS, | |
447 %FOURD_CODONS, | |
448 ); | |
449 | |
450 | |
451 =head2 get_4D_SimpleAlign | |
452 | |
453 Example : $4d_align = $homology->get_4D_SimpleAlign(); | |
454 Description: get 4 times degenerate positions pairwise simple alignment | |
455 Returntype : Bio::SimpleAlign | |
456 | |
457 =cut | |
458 | |
459 sub get_4D_SimpleAlign { | |
460 my $self = shift; | |
461 | |
462 my $sa = Bio::SimpleAlign->new(); | |
463 | |
464 #Hack to try to work with both bioperl 0.7 and 1.2: | |
465 #Check to see if the method is called 'addSeq' or 'add_seq' | |
466 my $bio07 = 0; | |
467 if(!$sa->can('add_seq')) { | |
468 $bio07 = 1; | |
469 } | |
470 | |
471 my $ma = $self->adaptor->db->get_MemberAdaptor; | |
472 | |
473 my %member_seqstr; | |
474 foreach my $member (@{$self->get_all_Members}) { | |
475 next if $member->source_name ne 'ENSEMBLPEP'; | |
476 my $seqstr = $member->cdna_alignment_string(); | |
477 next if(!$seqstr); | |
478 #print STDERR $seqstr,"\n"; | |
479 my @tmp_tab = split /\s+/, $seqstr; | |
480 #print STDERR "tnp_tab 0: ", $tmp_tab[0],"\n"; | |
481 $member_seqstr{$member->stable_id} = \@tmp_tab; | |
482 } | |
483 | |
484 my $seqstr_length; | |
485 foreach my $seqid (keys %member_seqstr) { | |
486 unless (defined $seqstr_length) { | |
487 #print STDERR $member_seqstr{$seqid}->[0],"\n"; | |
488 $seqstr_length = scalar @{$member_seqstr{$seqid}}; | |
489 next; | |
490 } | |
491 unless ($seqstr_length == scalar @{$member_seqstr{$seqid}}) { | |
492 die "Length of dna alignment are not the same, $seqstr_length and " . scalar @{$member_seqstr{$seqid}} ." respectively for homology_id " . $self->dbID . "\n"; | |
493 } | |
494 } | |
495 | |
496 my %FourD_member_seqstr; | |
497 for (my $i=0; $i < $seqstr_length; $i++) { | |
498 my $FourD_codon = 1; | |
499 my $FourD_aminoacid; | |
500 foreach my $seqid (keys %member_seqstr) { | |
501 if (defined $FOURD_CODONS{$member_seqstr{$seqid}->[$i]}) { | |
502 if (defined $FourD_aminoacid && $FourD_aminoacid eq $FOURD_CODONS{$member_seqstr{$seqid}->[$i]}) { | |
503 #print STDERR "YES ",$FOURD_CODONS{$member_seqstr{$seqid}->[$i]}," ",$member_seqstr{$seqid}->[$i],"\n"; | |
504 next; | |
505 } elsif (defined $FourD_aminoacid) { | |
506 #print STDERR "NO ",$FOURD_CODONS{$member_seqstr{$seqid}->[$i]}," ",$member_seqstr{$seqid}->[$i],"\n"; | |
507 $FourD_codon = 0; | |
508 last; | |
509 } else { | |
510 $FourD_aminoacid = $FOURD_CODONS{$member_seqstr{$seqid}->[$i]}; | |
511 #print STDERR $FOURD_CODONS{$member_seqstr{$seqid}->[$i]}," ",$member_seqstr{$seqid}->[$i]," "; | |
512 } | |
513 next; | |
514 } else { | |
515 #print STDERR "NO ",$CODONS{$member_seqstr{$seqid}->[$i]}," ",$member_seqstr{$seqid}->[$i],"\n"; | |
516 $FourD_codon = 0; | |
517 last; | |
518 } | |
519 } | |
520 next unless ($FourD_codon); | |
521 foreach my $seqid (keys %member_seqstr) { | |
522 $FourD_member_seqstr{$seqid} .= substr($member_seqstr{$seqid}->[$i],2,1); | |
523 } | |
524 } | |
525 | |
526 foreach my $seqid (keys %FourD_member_seqstr) { | |
527 | |
528 my $seq = Bio::LocatableSeq->new( | |
529 -SEQ => $FourD_member_seqstr{$seqid}, | |
530 -START => 1, | |
531 -END => length($FourD_member_seqstr{$seqid}), | |
532 -ID => $seqid, | |
533 -STRAND => 0 | |
534 ); | |
535 | |
536 if($bio07) { | |
537 $sa->addSeq($seq); | |
538 } else { | |
539 $sa->add_seq($seq); | |
540 } | |
541 } | |
542 | |
543 return $sa; | |
544 } | |
545 | |
546 my %matrix_hash; | |
547 | |
548 { | |
549 my $BLOSUM62 = "# Matrix made by matblas from blosum62.iij | |
550 # * column uses minimum score | |
551 # BLOSUM Clustered Scoring Matrix in 1/2 Bit Units | |
552 # Blocks Database = /data/blocks_5.0/blocks.dat | |
553 # Cluster Percentage: >= 62 | |
554 # Entropy = 0.6979, Expected = -0.5209 | |
555 A R N D C Q E G H I L K M F P S T W Y V B Z X * | |
556 A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 0 -4 | |
557 R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0 -1 -4 | |
558 N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 3 0 -1 -4 | |
559 D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 1 -1 -4 | |
560 C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4 | |
561 Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 3 -1 -4 | |
562 E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4 | |
563 G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -2 -1 -4 | |
564 H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 0 -1 -4 | |
565 I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 -3 -1 -4 | |
566 L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 -3 -1 -4 | |
567 K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 1 -1 -4 | |
568 M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 -1 -1 -4 | |
569 F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 -3 -1 -4 | |
570 P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -1 -2 -4 | |
571 S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 0 0 -4 | |
572 T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 0 -4 | |
573 W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 -3 -2 -4 | |
574 Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -2 -1 -4 | |
575 V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 -2 -1 -4 | |
576 B -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1 -1 -4 | |
577 Z -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4 | |
578 X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1 -1 -1 -1 -1 -4 | |
579 * -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1 | |
580 "; | |
581 my $matrix_string; | |
582 my @lines = split(/\n/,$BLOSUM62); | |
583 foreach my $line (@lines) { | |
584 next if ($line =~ /^\#/); | |
585 if ($line =~ /^[A-Z\*\s]+$/) { | |
586 $matrix_string .= sprintf "$line\n"; | |
587 } else { | |
588 my @t = split(/\s+/,$line); | |
589 shift @t; | |
590 # print scalar @t,"\n"; | |
591 $matrix_string .= sprintf(join(" ",@t)."\n"); | |
592 } | |
593 } | |
594 | |
595 # my %matrix_hash; | |
596 @lines = (); | |
597 @lines = split /\n/, $matrix_string; | |
598 my $lts = shift @lines; | |
599 $lts =~ s/^\s+//; | |
600 $lts =~ s/\s+$//; | |
601 my @letters = split /\s+/, $lts; | |
602 | |
603 foreach my $letter (@letters) { | |
604 my $line = shift @lines; | |
605 $line =~ s/^\s+//; | |
606 $line =~ s/\s+$//; | |
607 my @penalties = split /\s+/, $line; | |
608 die "Size of letters array and penalties array are different\n" | |
609 unless (scalar @letters == scalar @penalties); | |
610 for (my $i=0; $i < scalar @letters; $i++) { | |
611 $matrix_hash{uc $letter}{uc $letters[$i]} = $penalties[$i]; | |
612 } | |
613 } | |
614 } | |
615 | |
616 | |
617 sub update_alignment_stats { | |
618 my $self = shift; | |
619 | |
620 my $genes = $self->get_all_Members; | |
621 my $ngenes = scalar(@$genes); | |
622 | |
623 if ($ngenes == 2) { | |
624 # This code is >4 times faster with pairs of genes | |
625 | |
626 my $gene1 = $genes->[0]; | |
627 my $gene2 = $genes->[1]; | |
628 my $new_aln1_cigarline = ""; | |
629 my $new_aln2_cigarline = ""; | |
630 | |
631 my $identical_matches = 0; | |
632 my $positive_matches = 0; | |
633 | |
634 my ($aln1state, $aln2state); | |
635 my ($aln1count, $aln2count); | |
636 my ($aln1cov, $aln2cov) = (0,0); | |
637 | |
638 my @aln1 = split(//, $gene1->alignment_string); | |
639 my @aln2 = split(//, $gene2->alignment_string); | |
640 | |
641 for (my $i=0; $i <= $#aln1; $i++) { | |
642 next if ($aln1[$i] eq '-' && $aln2[$i] eq '-'); | |
643 my $cur_aln1state = ($aln1[$i] eq '-' ? 'D' : 'M'); | |
644 my $cur_aln2state = ($aln2[$i] eq '-' ? 'D' : 'M'); | |
645 $aln1cov++ if $cur_aln1state ne 'D'; | |
646 $aln2cov++ if $cur_aln2state ne 'D'; | |
647 if ($cur_aln1state eq 'M' && $cur_aln2state eq 'M') { | |
648 if ($aln1[$i] eq $aln2[$i]) { | |
649 $identical_matches++; | |
650 $positive_matches++; | |
651 } elsif ($matrix_hash{uc $aln1[$i]}{uc $aln2[$i]} > 0) { | |
652 $positive_matches++; | |
653 } | |
654 } | |
655 unless (defined $aln1state) { | |
656 $aln1count = 1; | |
657 $aln2count = 1; | |
658 $aln1state = $cur_aln1state; | |
659 $aln2state = $cur_aln2state; | |
660 next; | |
661 } | |
662 if ($cur_aln1state eq $aln1state) { | |
663 $aln1count++; | |
664 } else { | |
665 if ($aln1count == 1) { | |
666 $new_aln1_cigarline .= $aln1state; | |
667 } else { | |
668 $new_aln1_cigarline .= $aln1count.$aln1state; | |
669 } | |
670 $aln1count = 1; | |
671 $aln1state = $cur_aln1state; | |
672 } | |
673 if ($cur_aln2state eq $aln2state) { | |
674 $aln2count++; | |
675 } else { | |
676 if ($aln2count == 1) { | |
677 $new_aln2_cigarline .= $aln2state; | |
678 } else { | |
679 $new_aln2_cigarline .= $aln2count.$aln2state; | |
680 } | |
681 $aln2count = 1; | |
682 $aln2state = $cur_aln2state; | |
683 } | |
684 } | |
685 if ($aln1count == 1) { | |
686 $new_aln1_cigarline .= $aln1state; | |
687 } else { | |
688 $new_aln1_cigarline .= $aln1count.$aln1state; | |
689 } | |
690 if ($aln2count == 1) { | |
691 $new_aln2_cigarline .= $aln2state; | |
692 } else { | |
693 $new_aln2_cigarline .= $aln2count.$aln2state; | |
694 } | |
695 my $seq_length1 = $gene1->seq_length; | |
696 unless (0 == $seq_length1) { | |
697 $gene1->cigar_line($new_aln1_cigarline); | |
698 $gene1->perc_id( int((100.0 * $identical_matches / $seq_length1 + 0.5)) ); | |
699 $gene1->perc_pos( int((100.0 * $positive_matches / $seq_length1 + 0.5)) ); | |
700 $gene1->perc_cov( int((100.0 * $aln1cov / $seq_length1 + 0.5)) ); | |
701 } | |
702 my $seq_length2 = $gene2->seq_length; | |
703 unless (0 == $seq_length2) { | |
704 $gene2->cigar_line($new_aln2_cigarline); | |
705 $gene2->perc_id( int((100.0 * $identical_matches / $seq_length2 + 0.5)) ); | |
706 $gene2->perc_pos( int((100.0 * $positive_matches / $seq_length2 + 0.5)) ); | |
707 $gene2->perc_cov( int((100.0 * $aln2cov / $seq_length2 + 0.5)) ); | |
708 } | |
709 return undef; | |
710 } | |
711 | |
712 my $min_seq = shift; | |
713 $min_seq = int($min_seq * $ngenes) if $min_seq <= 1; | |
714 | |
715 my @new_cigars = ('') x $ngenes; | |
716 my @nmatch_id = (0) x $ngenes; | |
717 my @nmatch_pos = (0) x $ngenes; | |
718 my @nmatch_cov = (0) x $ngenes; | |
719 my @alncount = (1) x $ngenes; | |
720 my @alnstate = (undef) x $ngenes; | |
721 my @cur_alnstate = (undef) x $ngenes; | |
722 | |
723 my @aln = map {$_->alignment_string} @$genes; | |
724 my $aln_length = length($aln[0]); | |
725 | |
726 for (my $i=0; $i < $aln_length; $i++) { | |
727 | |
728 my @char_i = map {substr($_, $i, 1)} @aln; | |
729 #print "pos $i: ", join('/', @char_i), "\n"; | |
730 | |
731 my %seen; | |
732 map {$seen{$_}++} @char_i; | |
733 next if $seen{'-'} == $ngenes; | |
734 delete $seen{'-'}; | |
735 | |
736 my %pos_chars = (); | |
737 my @chars = keys %seen; | |
738 while (my $c1 = shift @chars) { | |
739 foreach my $c2 (@chars) { | |
740 if (($matrix_hash{uc $c1}{uc $c2} > 0) and ($seen{$c1}+$seen{$c2} >= $min_seq)) { | |
741 $pos_chars{$c1} = 1; | |
742 $pos_chars{$c2} = 1; | |
743 } | |
744 } | |
745 } | |
746 | |
747 for (my $j=0; $j<$ngenes; $j++) { | |
748 if ($char_i[$j] eq '-') { | |
749 $cur_alnstate[$j] = 'D'; | |
750 } else { | |
751 $nmatch_cov[$j]++; | |
752 $cur_alnstate[$j] = 'M'; | |
753 if ($seen{$char_i[$j]} >= $min_seq) { | |
754 $nmatch_id[$j]++; | |
755 $nmatch_pos[$j]++; | |
756 } elsif (exists $pos_chars{$char_i[$j]}) { | |
757 $nmatch_pos[$j]++; | |
758 } | |
759 } | |
760 } | |
761 | |
762 if ($i == 0) { | |
763 @alnstate = @cur_alnstate; | |
764 next; | |
765 } | |
766 | |
767 for (my $j=0; $j<$ngenes; $j++) { | |
768 if ($cur_alnstate[$j] eq $alnstate[$j]) { | |
769 $alncount[$j]++; | |
770 } else { | |
771 if ($alncount[$j] == 1) { | |
772 $new_cigars[$j] .= $alnstate[$j]; | |
773 } else { | |
774 $new_cigars[$j] .= $alncount[$j].$alnstate[$j]; | |
775 } | |
776 $alncount[$j] = 1; | |
777 $alnstate[$j] = $cur_alnstate[$j]; | |
778 } | |
779 } | |
780 } | |
781 | |
782 for (my $j=0; $j<$ngenes; $j++) { | |
783 if ($alncount[$j] == 1) { | |
784 $new_cigars[$j] .= $alnstate[$j]; | |
785 } else { | |
786 $new_cigars[$j] .= $alncount[$j].$alnstate[$j]; | |
787 } | |
788 $genes->[$j]->cigar_line($new_cigars[$j]); | |
789 my $seq_length = $genes->[$j]->seq_length; | |
790 unless (0 == $seq_length) { | |
791 $genes->[$j]->perc_id( int((100.0 * $nmatch_id[$j] / $seq_length + 0.5)) ); | |
792 $genes->[$j]->perc_pos( int((100.0 * $nmatch_pos[$j] / $seq_length + 0.5)) ); | |
793 $genes->[$j]->perc_cov( int((100.0 * $nmatch_cov[$j] / $seq_length + 0.5)) ); | |
794 } | |
795 } | |
796 } | |
797 | |
798 | |
799 | |
800 1; |