Mercurial > repos > willmclaren > ensembl_vep

diff variant_effect_predictor/Bio/EnsEMBL/Compara/AlignedMemberSet.pm @ 0:21066c0abaf5 draft
Uploaded
author: willmclaren
date: Fri, 03 Aug 2012 10:04:48 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/EnsEMBL/Compara/AlignedMemberSet.pm	Fri Aug 03 10:04:48 2012 -0400
@@ -0,0 +1,800 @@
+=head1 LICENSE
+
+  Copyright (c) 1999-2012 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 CONTACT
+
+  Please email comments or questions to the public Ensembl
+  developers list at <dev@ensembl.org>.
+
+  Questions may also be sent to the Ensembl help desk at
+  <helpdesk@ensembl.org>.
+
+=head1 AUTHORSHIP
+
+Ensembl Team. Individual contributions can be found in the CVS log.
+
+=cut
+
+=head1 NAME
+
+AlignedMemberSet - A superclass for pairwise or multiple relationships, base of
+Bio::EnsEMBL::Compara::Family, Bio::EnsEMBL::Compara::Homology and
+Bio::EnsEMBL::Compara::Domain.
+
+=head1 DESCRIPTION
+
+A superclass for pairwise and multiple relationships
+
+Currently the AlignedMember objects are used in the GeneTree structure
+to represent the leaves of the trees. Each leaf contains an aligned
+sequence, which is represented as an AlignedMember object.
+
+=head1 INHERITANCE TREE
+
+  Bio::EnsEMBL::Compara::AlignedMemberSet
+
+=head1 METHODS
+
+=cut
+
+package Bio::EnsEMBL::Compara::AlignedMemberSet;
+
+use strict;
+use Scalar::Util qw(weaken);
+
+use Bio::AlignIO;
+
+use Bio::EnsEMBL::Utils::Argument;
+use Bio::EnsEMBL::Utils::Scalar;
+use Bio::EnsEMBL::Utils::Exception;
+
+use Bio::EnsEMBL::Compara::AlignedMember;
+
+use base ('Bio::EnsEMBL::Compara::MemberSet');
+
+
+=head2 member_class
+
+  Description: Returns the type of member used in the set
+  Returntype : String: Bio::EnsEMBL::Compara::AlignedMember
+  Caller     : general
+  Status     : Stable
+
+=cut
+
+sub member_class {
+    return 'Bio::EnsEMBL::Compara::AlignedMember';
+}
+
+######################
+# Alignment sections #
+######################
+
+
+sub print_sequences_to_fasta {
+    my ($self, $pep_file) = @_;
+    my $pep_counter = 0;
+    open PEP, ">$pep_file";
+    foreach my $member (@{$self->get_all_Members}) {
+        next if $member->source_name eq 'ENSEMBLGENE';
+        my $member_stable_id = $member->stable_id;
+        my $seq = $member->sequence;
+
+        print PEP ">$member_stable_id\n";
+        $seq =~ s/(.{72})/$1\n/g;
+        chomp $seq;
+        unless (defined($seq)) {
+            my $family_id = $self->dbID;
+            die "member $member_stable_id in family $family_id doesn't have a sequence";
+        }
+        print PEP $seq,"\n";
+        $pep_counter++;
+    }
+    close PEP;
+    return $pep_counter;
+}
+
+
+=head2 read_clustalw
+
+  Arg [1]    : string $file 
+               The name of the file containing the clustalw output  
+  Example    : $family->read_clustalw('/tmp/clustalw.aln');
+  Description: Parses the output from clustalw and sets the alignment strings
+               of each of the memebers of this family
+  Returntype : none
+  Exceptions : thrown if file cannot be parsed
+               warning if alignment file contains identifiers for sequences
+               which are not members of this family
+  Caller     : general
+
+=cut
+
+sub read_clustalw {
+    my $self = shift;
+    my $file = shift;
+
+    my %align_hash;
+    my $FH = IO::File->new();
+    $FH->open($file) || throw("Could not open alignment file [$file]");
+
+    <$FH>; #skip header
+    while(<$FH>) {
+        next if($_ =~ /^\s+/);  #skip lines that start with space
+
+        my ($id, $align) = split;
+        $align_hash{$id} ||= '';
+        $align_hash{$id} .= $align;
+    }
+
+    $FH->close;
+
+    #place all members in a hash on their member name
+    my %member_hash;
+    foreach my $member (@{$self->get_all_Members}) {
+        $member_hash{$member->stable_id} = $member;
+    }
+
+    #assign cigar_line to each of the member attribute
+    foreach my $id (keys %align_hash) {
+        throw("No member for alignment portion: [$id]") unless exists $member_hash{$id};
+
+        my $alignment_string = $align_hash{$id};
+        $alignment_string =~ s/\-([A-Z])/\- $1/g;
+        $alignment_string =~ s/([A-Z])\-/$1 \-/g;
+
+        my @cigar_segments = split " ",$alignment_string;
+
+        my $cigar_line = "";
+        foreach my $segment (@cigar_segments) {
+            my $seglength = length($segment);
+            $seglength = "" if ($seglength == 1);
+            if ($segment =~ /^\-+$/) {
+                $cigar_line .= $seglength . "D";
+            } else {
+                $cigar_line .= $seglength . "M";
+            }
+        }
+
+        $member_hash{$id}->cigar_line($cigar_line);
+    }
+}
+
+sub load_cigars_from_fasta {
+    my ($self, $file) = @_;
+
+    my $alignio = Bio::AlignIO->new(-file => "$file", -format => "fasta");
+
+    my $aln = $alignio->next_aln or die "Bio::AlignIO could not get next_aln() from file '$file'";
+
+    #place all members in a hash on their member name
+    my %member_hash;
+    foreach my $member (@{$self->get_all_Members}) {
+        $member_hash{$member->stable_id} = $member;
+    }
+
+    #assign cigar_line to each of the member attribute
+    foreach my $seq ($aln->each_seq) {
+        my $id = $seq->display_id;
+        throw("No member for alignment portion: [$id]") unless exists $member_hash{$id};
+
+        my $cigar_line = '';
+        my $seq_string = $seq->seq();
+        while($seq_string=~/(?:\b|^)(.)(.*?)(?:\b|$)/g) {
+            $cigar_line .= ($2 ? length($2)+1 : '').(($1 eq '-') ? 'D' : 'M');
+        }
+
+        $member_hash{$id}->cigar_line($cigar_line);
+    }
+}
+
+
+
+sub get_SimpleAlign {
+
+    my ($self, @args) = @_;
+
+    my $id_type = 'STABLE';
+    my $unique_seqs = 0;
+    my $cdna = 0;
+    my $stop2x = 0;
+    my $append_taxon_id = 0;
+    my $append_sp_short_name = 0;
+    my $append_genomedb_id = 0;
+    my $exon_cased = 0;
+    my $alignment = 'protein';
+    my $changeSelenos = 0;
+    if (scalar @args) {
+        ($unique_seqs, $cdna, $id_type, $stop2x, $append_taxon_id, $append_sp_short_name, $append_genomedb_id, $exon_cased, $alignment, $changeSelenos) =
+            rearrange([qw(UNIQ_SEQ CDNA ID_TYPE STOP2X APPEND_TAXON_ID APPEND_SP_SHORT_NAME APPEND_GENOMEDB_ID EXON_CASED ALIGNMENT CHANGE_SELENO)], @args);
+    }
+
+    my $sa = Bio::SimpleAlign->new();
+
+    #Hack to try to work with both bioperl 0.7 and 1.2:
+    #Check to see if the method is called 'addSeq' or 'add_seq'
+    my $bio07 = ($sa->can('add_seq') ? 0 : 1);
+
+    my $seq_id_hash = {};
+    foreach my $member (@{$self->get_all_Members}) {
+
+        next if $member->source_name eq 'ENSEMBLGENE';
+
+        # Print unique sequences only ?
+        next if($unique_seqs and $seq_id_hash->{$member->sequence_id});
+        $seq_id_hash->{$member->sequence_id} = 1;
+
+        # The correct codon table
+        if ($member->chr_name =~ /mt/i) {
+            # codeml icodes
+            # 0:universal code (default)
+            my $class;
+            eval {$class = member->taxon->classification;};
+            unless ($@) {
+                if ($class =~ /vertebrata/i) {
+                    # 1:mamalian mt
+                    $sa->{_special_codeml_icode} = 1;
+                } else {
+                    # 4:invertebrate mt
+                    $sa->{_special_codeml_icode} = 4;
+                }
+            }
+        }
+
+        my $seqstr;
+        my $alphabet;
+        if ($cdna or (lc($alignment) eq 'cdna')) {
+            $seqstr = $member->cdna_alignment_string($changeSelenos);
+            $seqstr =~ s/\s+//g;
+            $alphabet = 'dna';
+        } else {
+            $seqstr = $member->alignment_string($exon_cased);
+            $alphabet = 'protein';
+        }
+        next if(!$seqstr);
+
+        # Sequence name
+        my $seqID = $member->stable_id;
+        $seqID = $member->sequence_id if($id_type eq "SEQ");
+        $seqID = $member->member_id if($id_type eq "MEMBER");
+        $seqID .= "_" . $member->taxon_id if($append_taxon_id);
+        $seqID .= "_" . $member->genome_db_id if ($append_genomedb_id);
+
+        ## Append $seqID with Speciae short name, if required
+        if ($append_sp_short_name) {
+            my $species = $member->genome_db->short_name;
+            $species =~ s/\s/_/g;
+            $seqID .= "_" . $species . "_";
+        }
+
+        # Sequence length
+        my $aln_end = $member->seq_length;
+        $aln_end = $aln_end*3 if $alphabet eq 'dna';
+
+        $seqstr =~ s/\*/X/g if ($stop2x);
+        my $seq = Bio::LocatableSeq->new(
+                -SEQ        => $seqstr,
+                -ALPHABET   => $alphabet,
+                -START      => 1,
+                -END        => $aln_end,
+                -ID         => $seqID,
+                -STRAND     => 0
+        );
+
+        if($bio07) {
+            $sa->addSeq($seq);
+        } else {
+            $sa->add_seq($seq);
+        }
+    }
+    $sa = $sa->remove_gaps(undef, 1) if UNIVERSAL::can($sa, 'remove_gaps');
+    return $sa;
+}
+
+
+
+# Takes a protein tree and creates a consensus cigar line from the
+# constituent leaf nodes.
+sub consensus_cigar_line {
+
+   my $self = shift;
+   my @cigars;
+
+   # First get an 'expanded' cigar string for each leaf of the subtree
+   my @all_members = @{$self->get_all_Members};
+   my $num_members = scalar(@all_members);
+   foreach my $leaf (@all_members) {
+     next unless( UNIVERSAL::can( $leaf, 'cigar_line' ) );
+     my $cigar = $leaf->cigar_line;
+     my $newcigar = "";
+#     $cigar =~ s/(\d*)([A-Z])/$2 x ($1||1)/ge; #Expand
+      while ($cigar =~ /(\d*)([A-Z])/g) {
+          $newcigar .= $2 x ($1 || 1);
+      }
+     $cigar = $newcigar;
+     push @cigars, $cigar;
+   }
+
+   # Itterate through each character of the expanded cigars.
+   # If there is a 'D' at a given location in any cigar,
+   # set the consensus to 'D', otherwise assume an 'M'.
+   # TODO: Fix assumption that cigar strings are always the same length,
+   # and start at the same point.
+   my $cigar_len = length( $cigars[0] );
+   my $cons_cigar;
+   for( my $i=0; $i<$cigar_len; $i++ ){
+     my $char = 'M';
+     my $num_deletions = 0;
+     foreach my $cigar( @cigars ){
+       if ( substr($cigar,$i,1) eq 'D'){
+         $num_deletions++;
+       }
+     }
+     if ($num_deletions * 3 > 2 * $num_members) {
+       $char = "D";
+     } elsif ($num_deletions * 3 > $num_members) {
+       $char = "m";
+     }
+     $cons_cigar .= $char;
+   }
+
+   # Collapse the consensus cigar, e.g. 'DDDD' = 4D
+#   $cons_cigar =~ s/(\w)(\1*)/($2?length($2)+1:"").$1/ge;
+   my $collapsed_cigar = "";
+   while ($cons_cigar =~ /(D+|M+|I+|m+)/g) {
+     $collapsed_cigar .= (length($1) == 1 ? "" : length($1)) . substr($1,0,1)
+ }
+   $cons_cigar = $collapsed_cigar;
+   # Return the consensus
+   return $cons_cigar;
+}
+
+
+my %TWOD_CODONS = ("TTT" => "Phe",#Phe
+                   "TTC" => "Phe",
+                   
+                   "TTA" => "Leu",#Leu
+                   "TTG" => "Leu",
+                   
+                   "TAT" => "Tyr",#Tyr
+                   "TAC" => "Tyr",
+                   
+                   "CAT" => "His",#His
+                   "CAC" => "His",
+
+                   "CAA" => "Gln",#Gln
+                   "CAG" => "Gln",
+                   
+                   "AAT" => "Asn",#Asn
+                   "AAC" => "Asn",
+                   
+                   "AAA" => "Lys",#Lys
+                   "AAG" => "Lys",
+                   
+                   "GAT" => "Asp",#Asp
+                   "GAC" => "Asp",
+
+                   "GAA" => "Glu",#Glu
+                   "GAG" => "Glu",
+                   
+                   "TGT" => "Cys",#Cys
+                   "TGC" => "Cys",
+                   
+                   "AGT" => "Ser",#Ser
+                   "AGC" => "Ser",
+                   
+                   "AGA" => "Arg",#Arg
+                   "AGG" => "Arg",
+                   
+                   "ATT" => "Ile",#Ile
+                   "ATC" => "Ile",
+                   "ATA" => "Ile");
+
+my %FOURD_CODONS = ("CTT" => "Leu",#Leu
+                    "CTC" => "Leu",
+                    "CTA" => "Leu",
+                    "CTG" => "Leu",
+                    
+                    "GTT" => "Val",#Val 
+                    "GTC" => "Val",
+                    "GTA" => "Val",
+                    "GTG" => "Val",
+                    
+                    "TCT" => "Ser",#Ser
+                    "TCC" => "Ser",
+                    "TCA" => "Ser",
+                    "TCG" => "Ser",
+                    
+                    "CCT" => "Pro",#Pro
+                    "CCC" => "Pro",
+                    "CCA" => "Pro",
+                    "CCG" => "Pro",
+                    
+                    "ACT" => "Thr",#Thr
+                    "ACC" => "Thr",
+                    "ACA" => "Thr",
+                    "ACG" => "Thr",
+                    
+                    "GCT" => "Ala",#Ala
+                    "GCC" => "Ala",
+                    "GCA" => "Ala",
+                    "GCG" => "Ala",
+                    
+                    "CGT" => "Arg",#Arg
+                    "CGC" => "Arg",
+                    "CGA" => "Arg",
+                    "CGG" => "Arg",
+                    
+                    "GGT" => "Gly",#Gly
+                    "GGC" => "Gly",
+                    "GGA" => "Gly",
+                    "GGG" => "Gly");
+                    
+my %CODONS =   ("ATG" => "Met",
+                "TGG" => "Trp",
+                "TAA" => "TER",
+                "TAG" => "TER",
+                "TGA" => "TER",
+                "---" => "---",
+                %TWOD_CODONS,
+                %FOURD_CODONS,
+                );
+
+
+=head2 get_4D_SimpleAlign
+
+  Example    : $4d_align = $homology->get_4D_SimpleAlign();
+  Description: get 4 times degenerate positions pairwise simple alignment
+  Returntype : Bio::SimpleAlign
+
+=cut
+
+sub get_4D_SimpleAlign {
+    my $self = shift;
+
+    my $sa = Bio::SimpleAlign->new();
+
+    #Hack to try to work with both bioperl 0.7 and 1.2:
+    #Check to see if the method is called 'addSeq' or 'add_seq'
+    my $bio07 = 0;
+    if(!$sa->can('add_seq')) {
+        $bio07 = 1;
+    }
+
+    my $ma = $self->adaptor->db->get_MemberAdaptor;
+
+    my %member_seqstr;
+    foreach my $member (@{$self->get_all_Members}) {
+        next if $member->source_name ne 'ENSEMBLPEP';
+        my $seqstr = $member->cdna_alignment_string();
+        next if(!$seqstr);
+        #print STDERR $seqstr,"\n";
+        my @tmp_tab = split /\s+/, $seqstr;
+        #print STDERR "tnp_tab 0: ", $tmp_tab[0],"\n";
+        $member_seqstr{$member->stable_id} = \@tmp_tab;
+    }
+
+    my $seqstr_length;
+    foreach my $seqid (keys %member_seqstr) {
+        unless (defined $seqstr_length) {
+            #print STDERR $member_seqstr{$seqid}->[0],"\n";
+            $seqstr_length = scalar @{$member_seqstr{$seqid}};
+            next;
+        }
+        unless ($seqstr_length == scalar @{$member_seqstr{$seqid}}) {
+            die "Length of dna alignment are not the same, $seqstr_length and " . scalar @{$member_seqstr{$seqid}} ." respectively for homology_id " . $self->dbID . "\n";
+        }
+    }
+
+    my %FourD_member_seqstr;
+    for (my $i=0; $i < $seqstr_length; $i++) {
+        my $FourD_codon = 1;
+        my $FourD_aminoacid;
+        foreach my $seqid (keys %member_seqstr) {
+            if (defined $FOURD_CODONS{$member_seqstr{$seqid}->[$i]}) {
+                if (defined $FourD_aminoacid && $FourD_aminoacid eq $FOURD_CODONS{$member_seqstr{$seqid}->[$i]}) {
+                    #print STDERR "YES ",$FOURD_CODONS{$member_seqstr{$seqid}->[$i]}," ",$member_seqstr{$seqid}->[$i],"\n";
+                    next;
+                } elsif (defined $FourD_aminoacid) {
+                    #print STDERR "NO ",$FOURD_CODONS{$member_seqstr{$seqid}->[$i]}," ",$member_seqstr{$seqid}->[$i],"\n";
+                    $FourD_codon = 0;
+                    last;
+                } else {
+                    $FourD_aminoacid = $FOURD_CODONS{$member_seqstr{$seqid}->[$i]};
+                    #print STDERR $FOURD_CODONS{$member_seqstr{$seqid}->[$i]}," ",$member_seqstr{$seqid}->[$i]," ";
+                }
+                next;
+            } else {
+                #print STDERR "NO ",$CODONS{$member_seqstr{$seqid}->[$i]}," ",$member_seqstr{$seqid}->[$i],"\n";
+                $FourD_codon = 0;
+                last;
+            }
+        }
+        next unless ($FourD_codon);
+        foreach my $seqid (keys %member_seqstr) {
+            $FourD_member_seqstr{$seqid} .= substr($member_seqstr{$seqid}->[$i],2,1);
+        }
+    }
+
+    foreach my $seqid (keys %FourD_member_seqstr) {
+
+        my $seq = Bio::LocatableSeq->new(
+                -SEQ    => $FourD_member_seqstr{$seqid},
+                -START  => 1,
+                -END    => length($FourD_member_seqstr{$seqid}),
+                -ID     => $seqid,
+                -STRAND => 0
+        );
+
+        if($bio07) {
+            $sa->addSeq($seq);
+        } else {
+            $sa->add_seq($seq);
+        }
+    }
+
+    return $sa;
+}
+
+my %matrix_hash;
+
+{
+  my $BLOSUM62 = "#  Matrix made by matblas from blosum62.iij
+#  * column uses minimum score
+#  BLOSUM Clustered Scoring Matrix in 1/2 Bit Units
+#  Blocks Database = /data/blocks_5.0/blocks.dat
+#  Cluster Percentage: >= 62
+#  Entropy =   0.6979, Expected =  -0.5209
+   A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V  B  Z  X  *
+A  4 -1 -2 -2  0 -1 -1  0 -2 -1 -1 -1 -1 -2 -1  1  0 -3 -2  0 -2 -1  0 -4
+R -1  5  0 -2 -3  1  0 -2  0 -3 -2  2 -1 -3 -2 -1 -1 -3 -2 -3 -1  0 -1 -4
+N -2  0  6  1 -3  0  0  0  1 -3 -3  0 -2 -3 -2  1  0 -4 -2 -3  3  0 -1 -4
+D -2 -2  1  6 -3  0  2 -1 -1 -3 -4 -1 -3 -3 -1  0 -1 -4 -3 -3  4  1 -1 -4
+C  0 -3 -3 -3  9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4
+Q -1  1  0  0 -3  5  2 -2  0 -3 -2  1  0 -3 -1  0 -1 -2 -1 -2  0  3 -1 -4
+E -1  0  0  2 -4  2  5 -2  0 -3 -3  1 -2 -3 -1  0 -1 -3 -2 -2  1  4 -1 -4
+G  0 -2  0 -1 -3 -2 -2  6 -2 -4 -4 -2 -3 -3 -2  0 -2 -2 -3 -3 -1 -2 -1 -4
+H -2  0  1 -1 -3  0  0 -2  8 -3 -3 -1 -2 -1 -2 -1 -2 -2  2 -3  0  0 -1 -4
+I -1 -3 -3 -3 -1 -3 -3 -4 -3  4  2 -3  1  0 -3 -2 -1 -3 -1  3 -3 -3 -1 -4
+L -1 -2 -3 -4 -1 -2 -3 -4 -3  2  4 -2  2  0 -3 -2 -1 -2 -1  1 -4 -3 -1 -4
+K -1  2  0 -1 -3  1  1 -2 -1 -3 -2  5 -1 -3 -1  0 -1 -3 -2 -2  0  1 -1 -4
+M -1 -1 -2 -3 -1  0 -2 -3 -2  1  2 -1  5  0 -2 -1 -1 -1 -1  1 -3 -1 -1 -4
+F -2 -3 -3 -3 -2 -3 -3 -3 -1  0  0 -3  0  6 -4 -2 -2  1  3 -1 -3 -3 -1 -4
+P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4  7 -1 -1 -4 -3 -2 -2 -1 -2 -4
+S  1 -1  1  0 -1  0  0  0 -1 -2 -2  0 -1 -2 -1  4  1 -3 -2 -2  0  0  0 -4
+T  0 -1  0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1  1  5 -2 -2  0 -1 -1  0 -4
+W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1  1 -4 -3 -2 11  2 -3 -4 -3 -2 -4
+Y -2 -2 -2 -3 -2 -1 -2 -3  2 -1 -1 -2 -1  3 -3 -2 -2  2  7 -1 -3 -2 -1 -4
+V  0 -3 -3 -3 -1 -2 -2 -3 -3  3  1 -2  1 -1 -2 -2  0 -3 -1  4 -3 -2 -1 -4
+B -2 -1  3  4 -3  0  1 -1  0 -3 -4  0 -3 -3 -2  0 -1 -4 -3 -3  4  1 -1 -4
+Z -1  0  0  1 -3  3  4 -2  0 -3 -3  1 -1 -3 -1  0 -1 -3 -2 -2  1  4 -1 -4
+X  0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2  0  0 -2 -1 -1 -1 -1 -1 -4
+* -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4  1
+";
+  my $matrix_string;
+  my @lines = split(/\n/,$BLOSUM62);
+  foreach my $line (@lines) {
+    next if ($line =~ /^\#/);
+    if ($line =~ /^[A-Z\*\s]+$/) {
+      $matrix_string .= sprintf "$line\n";
+    } else {
+      my @t = split(/\s+/,$line);
+      shift @t;
+      #       print scalar @t,"\n";
+      $matrix_string .= sprintf(join(" ",@t)."\n");
+    }
+  }
+
+#  my %matrix_hash;
+  @lines = ();
+  @lines = split /\n/, $matrix_string;
+  my $lts = shift @lines;
+  $lts =~ s/^\s+//;
+  $lts =~ s/\s+$//;
+  my @letters = split /\s+/, $lts;
+
+  foreach my $letter (@letters) {
+    my $line = shift @lines;
+    $line =~ s/^\s+//;
+    $line =~ s/\s+$//;
+    my @penalties = split /\s+/, $line;
+    die "Size of letters array and penalties array are different\n"
+      unless (scalar @letters == scalar @penalties);
+    for (my $i=0; $i < scalar @letters; $i++) {
+      $matrix_hash{uc $letter}{uc $letters[$i]} = $penalties[$i];
+    }
+  }
+}
+
+
+sub update_alignment_stats {
+    my $self = shift;
+
+    my $genes = $self->get_all_Members;
+    my $ngenes = scalar(@$genes);
+
+    if ($ngenes == 2) {
+        # This code is >4 times faster with pairs of genes
+
+        my $gene1 = $genes->[0];
+        my $gene2 = $genes->[1];
+        my $new_aln1_cigarline = "";
+        my $new_aln2_cigarline = "";
+
+        my $identical_matches = 0;
+        my $positive_matches = 0;
+
+        my ($aln1state, $aln2state);
+        my ($aln1count, $aln2count);
+        my ($aln1cov, $aln2cov) = (0,0);
+
+        my @aln1 = split(//, $gene1->alignment_string);
+        my @aln2 = split(//, $gene2->alignment_string);
+
+        for (my $i=0; $i <= $#aln1; $i++) {
+            next if ($aln1[$i] eq '-' && $aln2[$i] eq '-');
+            my $cur_aln1state = ($aln1[$i] eq '-' ? 'D' : 'M');
+            my $cur_aln2state = ($aln2[$i] eq '-' ? 'D' : 'M');
+            $aln1cov++ if $cur_aln1state ne 'D';
+            $aln2cov++ if $cur_aln2state ne 'D';
+            if ($cur_aln1state eq 'M' && $cur_aln2state eq 'M') {
+                if ($aln1[$i] eq $aln2[$i]) {
+                    $identical_matches++;
+                    $positive_matches++;
+                } elsif ($matrix_hash{uc $aln1[$i]}{uc $aln2[$i]} > 0) {
+                    $positive_matches++;
+                }
+            }
+            unless (defined $aln1state) {
+                $aln1count = 1;
+                $aln2count = 1;
+                $aln1state = $cur_aln1state;
+                $aln2state = $cur_aln2state;
+                next;
+            }
+            if ($cur_aln1state eq $aln1state) {
+                $aln1count++;
+            } else {
+                if ($aln1count == 1) {
+                    $new_aln1_cigarline .= $aln1state;
+                } else {
+                    $new_aln1_cigarline .= $aln1count.$aln1state;
+                }
+                $aln1count = 1;
+                $aln1state = $cur_aln1state;
+            }
+            if ($cur_aln2state eq $aln2state) {
+                $aln2count++;
+            } else {
+                if ($aln2count == 1) {
+                    $new_aln2_cigarline .= $aln2state;
+                } else {
+                    $new_aln2_cigarline .= $aln2count.$aln2state;
+                }
+                $aln2count = 1;
+                $aln2state = $cur_aln2state;
+            }
+        }
+        if ($aln1count == 1) {
+            $new_aln1_cigarline .= $aln1state;
+        } else {
+            $new_aln1_cigarline .= $aln1count.$aln1state;
+        }
+        if ($aln2count == 1) {
+            $new_aln2_cigarline .= $aln2state;
+        } else {
+            $new_aln2_cigarline .= $aln2count.$aln2state;
+        }
+        my $seq_length1 = $gene1->seq_length;
+        unless (0 == $seq_length1) {
+            $gene1->cigar_line($new_aln1_cigarline);
+            $gene1->perc_id( int((100.0 * $identical_matches / $seq_length1 + 0.5)) );
+            $gene1->perc_pos( int((100.0 * $positive_matches  / $seq_length1 + 0.5)) );
+            $gene1->perc_cov( int((100.0 * $aln1cov / $seq_length1 + 0.5)) );
+        }
+        my $seq_length2 = $gene2->seq_length;
+        unless (0 == $seq_length2) {
+            $gene2->cigar_line($new_aln2_cigarline);
+            $gene2->perc_id( int((100.0 * $identical_matches / $seq_length2 + 0.5)) );
+            $gene2->perc_pos( int((100.0 * $positive_matches  / $seq_length2 + 0.5)) );
+            $gene2->perc_cov( int((100.0 * $aln2cov / $seq_length2 + 0.5)) );
+        }
+        return undef;
+    }
+
+    my $min_seq = shift;
+    $min_seq = int($min_seq * $ngenes) if $min_seq <= 1;
+
+    my @new_cigars   = ('') x $ngenes;
+    my @nmatch_id    = (0) x $ngenes; 
+    my @nmatch_pos   = (0) x $ngenes; 
+    my @nmatch_cov   = (0) x $ngenes; 
+    my @alncount     = (1) x $ngenes;
+    my @alnstate     = (undef) x $ngenes;
+    my @cur_alnstate = (undef) x $ngenes;
+
+    my @aln = map {$_->alignment_string} @$genes;
+    my $aln_length = length($aln[0]);
+
+    for (my $i=0; $i < $aln_length; $i++) {
+
+        my @char_i =  map {substr($_, $i, 1)} @aln;
+        #print "pos $i: ", join('/', @char_i), "\n";
+
+        my %seen;
+        map {$seen{$_}++} @char_i;
+        next if $seen{'-'} == $ngenes;
+        delete $seen{'-'};
+        
+        my %pos_chars = ();
+        my @chars = keys %seen;
+        while (my $c1 = shift @chars) {
+            foreach my $c2 (@chars) {
+                if (($matrix_hash{uc $c1}{uc $c2} > 0) and ($seen{$c1}+$seen{$c2} >= $min_seq)) {
+                    $pos_chars{$c1} = 1;
+                    $pos_chars{$c2} = 1;
+                }
+            }
+        }
+
+        for (my $j=0; $j<$ngenes; $j++) {
+            if ($char_i[$j] eq '-') {
+                $cur_alnstate[$j] = 'D';
+            } else {
+                $nmatch_cov[$j]++;
+                $cur_alnstate[$j] = 'M';
+                if ($seen{$char_i[$j]} >= $min_seq) {
+                    $nmatch_id[$j]++;
+                    $nmatch_pos[$j]++;
+                } elsif (exists $pos_chars{$char_i[$j]}) {
+                    $nmatch_pos[$j]++;
+                }
+            }
+        }
+
+        if ($i == 0) {
+            @alnstate = @cur_alnstate;
+            next;
+        }
+
+        for (my $j=0; $j<$ngenes; $j++) {
+            if ($cur_alnstate[$j] eq $alnstate[$j]) {
+                $alncount[$j]++;
+            } else {
+                if ($alncount[$j] == 1) {
+                    $new_cigars[$j] .= $alnstate[$j];
+                } else {
+                    $new_cigars[$j] .= $alncount[$j].$alnstate[$j];
+                }
+                $alncount[$j] = 1;
+                $alnstate[$j] = $cur_alnstate[$j];
+            }
+        }
+    }
+
+    for (my $j=0; $j<$ngenes; $j++) {
+        if ($alncount[$j] == 1) {
+            $new_cigars[$j] .= $alnstate[$j];
+        } else {
+            $new_cigars[$j] .= $alncount[$j].$alnstate[$j];
+        }
+        $genes->[$j]->cigar_line($new_cigars[$j]);
+        my $seq_length = $genes->[$j]->seq_length;
+        unless (0 == $seq_length) {
+            $genes->[$j]->perc_id( int((100.0 * $nmatch_id[$j] / $seq_length + 0.5)) );
+            $genes->[$j]->perc_pos( int((100.0 * $nmatch_pos[$j] / $seq_length + 0.5)) );
+            $genes->[$j]->perc_cov( int((100.0 * $nmatch_cov[$j] / $seq_length + 0.5)) );
+        }
+    }
+}
+
+
+
+1;
author	willmclaren
date	Fri, 03 Aug 2012 10:04:48 -0400
parents
children