Mercurial > repos > willmclaren > ensembl_vep

=head1 LICENSE

  Copyright (c) 1999-2012 The European Bioinformatics Institute and
  Genome Research Limited.  All rights reserved.

  This software is distributed under a modified Apache license.
  For license details, please see

    http://www.ensembl.org/info/about/code_licence.html

=head1 CONTACT

  Please email comments or questions to the public Ensembl
  developers list at <dev@ensembl.org>.

  Questions may also be sent to the Ensembl help desk at
  <helpdesk@ensembl.org>.

=head1 AUTHORSHIP

Ensembl Team. Individual contributions can be found in the CVS log.

=cut

=head1 NAME

AlignedMemberSet - A superclass for pairwise or multiple relationships, base of
Bio::EnsEMBL::Compara::Family, Bio::EnsEMBL::Compara::Homology and
Bio::EnsEMBL::Compara::Domain.

=head1 DESCRIPTION

A superclass for pairwise and multiple relationships

Currently the AlignedMember objects are used in the GeneTree structure
to represent the leaves of the trees. Each leaf contains an aligned
sequence, which is represented as an AlignedMember object.

=head1 INHERITANCE TREE

  Bio::EnsEMBL::Compara::AlignedMemberSet

=head1 METHODS

=cut

package Bio::EnsEMBL::Compara::AlignedMemberSet;

use strict;
use Scalar::Util qw(weaken);

use Bio::AlignIO;

use Bio::EnsEMBL::Utils::Argument;
use Bio::EnsEMBL::Utils::Scalar;
use Bio::EnsEMBL::Utils::Exception;

use Bio::EnsEMBL::Compara::AlignedMember;

use base ('Bio::EnsEMBL::Compara::MemberSet');


=head2 member_class

  Description: Returns the type of member used in the set
  Returntype : String: Bio::EnsEMBL::Compara::AlignedMember
  Caller     : general
  Status     : Stable

=cut

sub member_class {
    return 'Bio::EnsEMBL::Compara::AlignedMember';
}

######################
# Alignment sections #
######################


sub print_sequences_to_fasta {
    my ($self, $pep_file) = @_;
    my $pep_counter = 0;
    open PEP, ">$pep_file";
    foreach my $member (@{$self->get_all_Members}) {
        next if $member->source_name eq 'ENSEMBLGENE';
        my $member_stable_id = $member->stable_id;
        my $seq = $member->sequence;

        print PEP ">$member_stable_id\n";
        $seq =~ s/(.{72})/$1\n/g;
        chomp $seq;
        unless (defined($seq)) {
            my $family_id = $self->dbID;
            die "member $member_stable_id in family $family_id doesn't have a sequence";
        }
        print PEP $seq,"\n";
        $pep_counter++;
    }
    close PEP;
    return $pep_counter;
}


=head2 read_clustalw

  Arg [1]    : string $file
               The name of the file containing the clustalw output
  Example    : $family->read_clustalw('/tmp/clustalw.aln');
  Description: Parses the output from clustalw and sets the alignment strings
               of each of the memebers of this family
  Returntype : none
  Exceptions : thrown if file cannot be parsed
               warning if alignment file contains identifiers for sequences
               which are not members of this family
  Caller     : general

=cut

sub read_clustalw {
    my $self = shift;
    my $file = shift;

    my %align_hash;
    my $FH = IO::File->new();
    $FH->open($file) || throw("Could not open alignment file [$file]");

    <$FH>; #skip header
    while(<$FH>) {
        next if($_ =~ /^\s+/);  #skip lines that start with space

        my ($id, $align) = split;
        $align_hash{$id} ||= '';
        $align_hash{$id} .= $align;
    }

    $FH->close;

    #place all members in a hash on their member name
    my %member_hash;
    foreach my $member (@{$self->get_all_Members}) {
        $member_hash{$member->stable_id} = $member;
    }

    #assign cigar_line to each of the member attribute
    foreach my $id (keys %align_hash) {
        throw("No member for alignment portion: [$id]") unless exists $member_hash{$id};

        my $alignment_string = $align_hash{$id};
        $alignment_string =~ s/\-([A-Z])/\- $1/g;
        $alignment_string =~ s/([A-Z])\-/$1 \-/g;

        my @cigar_segments = split " ",$alignment_string;

        my $cigar_line = "";
        foreach my $segment (@cigar_segments) {
            my $seglength = length($segment);
            $seglength = "" if ($seglength == 1);
            if ($segment =~ /^\-+$/) {
                $cigar_line .= $seglength . "D";
            } else {
                $cigar_line .= $seglength . "M";
            }
        }

        $member_hash{$id}->cigar_line($cigar_line);
    }
}

sub load_cigars_from_fasta {
    my ($self, $file) = @_;

    my $alignio = Bio::AlignIO->new(-file => "$file", -format => "fasta");

    my $aln = $alignio->next_aln or die "Bio::AlignIO could not get next_aln() from file '$file'";

    #place all members in a hash on their member name
    my %member_hash;
    foreach my $member (@{$self->get_all_Members}) {
        $member_hash{$member->stable_id} = $member;
    }

    #assign cigar_line to each of the member attribute
    foreach my $seq ($aln->each_seq) {
        my $id = $seq->display_id;
        throw("No member for alignment portion: [$id]") unless exists $member_hash{$id};

        my $cigar_line = '';
        my $seq_string = $seq->seq();
        while($seq_string=~/(?:\b|^)(.)(.*?)(?:\b|$)/g) {
            $cigar_line .= ($2 ? length($2)+1 : '').(($1 eq '-') ? 'D' : 'M');
        }

        $member_hash{$id}->cigar_line($cigar_line);
    }
}


sub get_SimpleAlign {

    my ($self, @args) = @_;

    my $id_type = 'STABLE';
    my $unique_seqs = 0;
    my $cdna = 0;
    my $stop2x = 0;
    my $append_taxon_id = 0;
    my $append_sp_short_name = 0;
    my $append_genomedb_id = 0;
    my $exon_cased = 0;
    my $alignment = 'protein';
    my $changeSelenos = 0;
    if (scalar @args) {
        ($unique_seqs, $cdna, $id_type, $stop2x, $append_taxon_id, $append_sp_short_name, $append_genomedb_id, $exon_cased, $alignment, $changeSelenos) =
            rearrange([qw(UNIQ_SEQ CDNA ID_TYPE STOP2X APPEND_TAXON_ID APPEND_SP_SHORT_NAME APPEND_GENOMEDB_ID EXON_CASED ALIGNMENT CHANGE_SELENO)], @args);
    }

    my $sa = Bio::SimpleAlign->new();

    #Hack to try to work with both bioperl 0.7 and 1.2:
    #Check to see if the method is called 'addSeq' or 'add_seq'
    my $bio07 = ($sa->can('add_seq') ? 0 : 1);

    my $seq_id_hash = {};
    foreach my $member (@{$self->get_all_Members}) {

        next if $member->source_name eq 'ENSEMBLGENE';

        # Print unique sequences only ?
        next if($unique_seqs and $seq_id_hash->{$member->sequence_id});
        $seq_id_hash->{$member->sequence_id} = 1;

        # The correct codon table
        if ($member->chr_name =~ /mt/i) {
            # codeml icodes
            # 0:universal code (default)
            my $class;
            eval {$class = member->taxon->classification;};
            unless ($@) {
                if ($class =~ /vertebrata/i) {
                    # 1:mamalian mt
                    $sa->{_special_codeml_icode} = 1;
                } else {
                    # 4:invertebrate mt
                    $sa->{_special_codeml_icode} = 4;
                }
            }
        }

        my $seqstr;
        my $alphabet;
        if ($cdna or (lc($alignment) eq 'cdna')) {
            $seqstr = $member->cdna_alignment_string($changeSelenos);
            $seqstr =~ s/\s+//g;
            $alphabet = 'dna';
        } else {
            $seqstr = $member->alignment_string($exon_cased);
            $alphabet = 'protein';
        }
        next if(!$seqstr);

        # Sequence name
        my $seqID = $member->stable_id;
        $seqID = $member->sequence_id if($id_type eq "SEQ");
        $seqID = $member->member_id if($id_type eq "MEMBER");
        $seqID .= "_" . $member->taxon_id if($append_taxon_id);
        $seqID .= "_" . $member->genome_db_id if ($append_genomedb_id);

        ## Append $seqID with Speciae short name, if required
        if ($append_sp_short_name) {
            my $species = $member->genome_db->short_name;
            $species =~ s/\s/_/g;
            $seqID .= "_" . $species . "_";
        }

        # Sequence length
        my $aln_end = $member->seq_length;
        $aln_end = $aln_end*3 if $alphabet eq 'dna';

        $seqstr =~ s/\*/X/g if ($stop2x);
        my $seq = Bio::LocatableSeq->new(
                -SEQ        => $seqstr,
                -ALPHABET   => $alphabet,
                -START      => 1,
                -END        => $aln_end,
                -ID         => $seqID,
                -STRAND     => 0
        );

        if($bio07) {
            $sa->addSeq($seq);
        } else {
            $sa->add_seq($seq);
        }
    }
    $sa = $sa->remove_gaps(undef, 1) if UNIVERSAL::can($sa, 'remove_gaps');
    return $sa;
}


# Takes a protein tree and creates a consensus cigar line from the
# constituent leaf nodes.
sub consensus_cigar_line {

   my $self = shift;
   my @cigars;

   # First get an 'expanded' cigar string for each leaf of the subtree
   my @all_members = @{$self->get_all_Members};
   my $num_members = scalar(@all_members);
   foreach my $leaf (@all_members) {
     next unless( UNIVERSAL::can( $leaf, 'cigar_line' ) );
     my $cigar = $leaf->cigar_line;
     my $newcigar = "";
#     $cigar =~ s/(\d*)([A-Z])/$2 x ($1||1)/ge; #Expand
      while ($cigar =~ /(\d*)([A-Z])/g) {
          $newcigar .= $2 x ($1 || 1);
      }
     $cigar = $newcigar;
     push @cigars, $cigar;
   }

   # Itterate through each character of the expanded cigars.
   # If there is a 'D' at a given location in any cigar,
   # set the consensus to 'D', otherwise assume an 'M'.
   # TODO: Fix assumption that cigar strings are always the same length,
   # and start at the same point.
   my $cigar_len = length( $cigars[0] );
   my $cons_cigar;
   for( my $i=0; $i<$cigar_len; $i++ ){
     my $char = 'M';
     my $num_deletions = 0;
     foreach my $cigar( @cigars ){
       if ( substr($cigar,$i,1) eq 'D'){
         $num_deletions++;
       }
     }
     if ($num_deletions * 3 > 2 * $num_members) {
       $char = "D";
     } elsif ($num_deletions * 3 > $num_members) {
       $char = "m";
     }
     $cons_cigar .= $char;
   }

   # Collapse the consensus cigar, e.g. 'DDDD' = 4D
#   $cons_cigar =~ s/(\w)(\1*)/($2?length($2)+1:"").$1/ge;
   my $collapsed_cigar = "";
   while ($cons_cigar =~ /(D+|M+|I+|m+)/g) {
     $collapsed_cigar .= (length($1) == 1 ? "" : length($1)) . substr($1,0,1)
 }
   $cons_cigar = $collapsed_cigar;
   # Return the consensus
   return $cons_cigar;
}


my %TWOD_CODONS = ("TTT" => "Phe",#Phe
                   "TTC" => "Phe",

                   "TTA" => "Leu",#Leu
                   "TTG" => "Leu",

                   "TAT" => "Tyr",#Tyr
                   "TAC" => "Tyr",

                   "CAT" => "His",#His
                   "CAC" => "His",

                   "CAA" => "Gln",#Gln
                   "CAG" => "Gln",

                   "AAT" => "Asn",#Asn
                   "AAC" => "Asn",

                   "AAA" => "Lys",#Lys
                   "AAG" => "Lys",

                   "GAT" => "Asp",#Asp
                   "GAC" => "Asp",

                   "GAA" => "Glu",#Glu
                   "GAG" => "Glu",

                   "TGT" => "Cys",#Cys
                   "TGC" => "Cys",

                   "AGT" => "Ser",#Ser
                   "AGC" => "Ser",

                   "AGA" => "Arg",#Arg
                   "AGG" => "Arg",

                   "ATT" => "Ile",#Ile
                   "ATC" => "Ile",
                   "ATA" => "Ile");

my %FOURD_CODONS = ("CTT" => "Leu",#Leu
                    "CTC" => "Leu",
                    "CTA" => "Leu",
                    "CTG" => "Leu",

                    "GTT" => "Val",#Val
                    "GTC" => "Val",
                    "GTA" => "Val",
                    "GTG" => "Val",

                    "TCT" => "Ser",#Ser
                    "TCC" => "Ser",
                    "TCA" => "Ser",
                    "TCG" => "Ser",

                    "CCT" => "Pro",#Pro
                    "CCC" => "Pro",
                    "CCA" => "Pro",
                    "CCG" => "Pro",

                    "ACT" => "Thr",#Thr
                    "ACC" => "Thr",
                    "ACA" => "Thr",
                    "ACG" => "Thr",

                    "GCT" => "Ala",#Ala
                    "GCC" => "Ala",
                    "GCA" => "Ala",
                    "GCG" => "Ala",

                    "CGT" => "Arg",#Arg
                    "CGC" => "Arg",
                    "CGA" => "Arg",
                    "CGG" => "Arg",

                    "GGT" => "Gly",#Gly
                    "GGC" => "Gly",
                    "GGA" => "Gly",
                    "GGG" => "Gly");

my %CODONS =   ("ATG" => "Met",
                "TGG" => "Trp",
                "TAA" => "TER",
                "TAG" => "TER",
                "TGA" => "TER",
                "---" => "---",
                %TWOD_CODONS,
                %FOURD_CODONS,
                );


=head2 get_4D_SimpleAlign

  Example    : $4d_align = $homology->get_4D_SimpleAlign();
  Description: get 4 times degenerate positions pairwise simple alignment
  Returntype : Bio::SimpleAlign

=cut

sub get_4D_SimpleAlign {
    my $self = shift;

    my $sa = Bio::SimpleAlign->new();

    #Hack to try to work with both bioperl 0.7 and 1.2:
    #Check to see if the method is called 'addSeq' or 'add_seq'
    my $bio07 = 0;
    if(!$sa->can('add_seq')) {
        $bio07 = 1;
    }

    my $ma = $self->adaptor->db->get_MemberAdaptor;

    my %member_seqstr;
    foreach my $member (@{$self->get_all_Members}) {
        next if $member->source_name ne 'ENSEMBLPEP';
        my $seqstr = $member->cdna_alignment_string();
        next if(!$seqstr);
        #print STDERR $seqstr,"\n";
        my @tmp_tab = split /\s+/, $seqstr;
        #print STDERR "tnp_tab 0: ", $tmp_tab[0],"\n";
        $member_seqstr{$member->stable_id} = \@tmp_tab;
    }

    my $seqstr_length;
    foreach my $seqid (keys %member_seqstr) {
        unless (defined $seqstr_length) {
            #print STDERR $member_seqstr{$seqid}->[0],"\n";
            $seqstr_length = scalar @{$member_seqstr{$seqid}};
            next;
        }
        unless ($seqstr_length == scalar @{$member_seqstr{$seqid}}) {
            die "Length of dna alignment are not the same, $seqstr_length and " . scalar @{$member_seqstr{$seqid}} ." respectively for homology_id " . $self->dbID . "\n";
        }
    }

    my %FourD_member_seqstr;
    for (my $i=0; $i < $seqstr_length; $i++) {
        my $FourD_codon = 1;
        my $FourD_aminoacid;
        foreach my $seqid (keys %member_seqstr) {
            if (defined $FOURD_CODONS{$member_seqstr{$seqid}->[$i]}) {
                if (defined $FourD_aminoacid && $FourD_aminoacid eq $FOURD_CODONS{$member_seqstr{$seqid}->[$i]}) {
                    #print STDERR "YES ",$FOURD_CODONS{$member_seqstr{$seqid}->[$i]}," ",$member_seqstr{$seqid}->[$i],"\n";
                    next;
                } elsif (defined $FourD_aminoacid) {
                    #print STDERR "NO ",$FOURD_CODONS{$member_seqstr{$seqid}->[$i]}," ",$member_seqstr{$seqid}->[$i],"\n";
                    $FourD_codon = 0;
                    last;
                } else {
                    $FourD_aminoacid = $FOURD_CODONS{$member_seqstr{$seqid}->[$i]};
                    #print STDERR $FOURD_CODONS{$member_seqstr{$seqid}->[$i]}," ",$member_seqstr{$seqid}->[$i]," ";
                }
                next;
            } else {
                #print STDERR "NO ",$CODONS{$member_seqstr{$seqid}->[$i]}," ",$member_seqstr{$seqid}->[$i],"\n";
                $FourD_codon = 0;
                last;
            }
        }
        next unless ($FourD_codon);
        foreach my $seqid (keys %member_seqstr) {
            $FourD_member_seqstr{$seqid} .= substr($member_seqstr{$seqid}->[$i],2,1);
        }
    }

    foreach my $seqid (keys %FourD_member_seqstr) {

        my $seq = Bio::LocatableSeq->new(
                -SEQ    => $FourD_member_seqstr{$seqid},
                -START  => 1,
                -END    => length($FourD_member_seqstr{$seqid}),
                -ID     => $seqid,
                -STRAND => 0
        );

        if($bio07) {
            $sa->addSeq($seq);
        } else {
            $sa->add_seq($seq);
        }
    }

    return $sa;
}

my %matrix_hash;

{
  my $BLOSUM62 = "#  Matrix made by matblas from blosum62.iij
#  * column uses minimum score
#  BLOSUM Clustered Scoring Matrix in 1/2 Bit Units
#  Blocks Database = /data/blocks_5.0/blocks.dat
#  Cluster Percentage: >= 62
#  Entropy =   0.6979, Expected =  -0.5209
   A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V  B  Z  X  *
A  4 -1 -2 -2  0 -1 -1  0 -2 -1 -1 -1 -1 -2 -1  1  0 -3 -2  0 -2 -1  0 -4
R -1  5  0 -2 -3  1  0 -2  0 -3 -2  2 -1 -3 -2 -1 -1 -3 -2 -3 -1  0 -1 -4
N -2  0  6  1 -3  0  0  0  1 -3 -3  0 -2 -3 -2  1  0 -4 -2 -3  3  0 -1 -4
D -2 -2  1  6 -3  0  2 -1 -1 -3 -4 -1 -3 -3 -1  0 -1 -4 -3 -3  4  1 -1 -4
C  0 -3 -3 -3  9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4
Q -1  1  0  0 -3  5  2 -2  0 -3 -2  1  0 -3 -1  0 -1 -2 -1 -2  0  3 -1 -4
E -1  0  0  2 -4  2  5 -2  0 -3 -3  1 -2 -3 -1  0 -1 -3 -2 -2  1  4 -1 -4
G  0 -2  0 -1 -3 -2 -2  6 -2 -4 -4 -2 -3 -3 -2  0 -2 -2 -3 -3 -1 -2 -1 -4
H -2  0  1 -1 -3  0  0 -2  8 -3 -3 -1 -2 -1 -2 -1 -2 -2  2 -3  0  0 -1 -4
I -1 -3 -3 -3 -1 -3 -3 -4 -3  4  2 -3  1  0 -3 -2 -1 -3 -1  3 -3 -3 -1 -4
L -1 -2 -3 -4 -1 -2 -3 -4 -3  2  4 -2  2  0 -3 -2 -1 -2 -1  1 -4 -3 -1 -4
K -1  2  0 -1 -3  1  1 -2 -1 -3 -2  5 -1 -3 -1  0 -1 -3 -2 -2  0  1 -1 -4
M -1 -1 -2 -3 -1  0 -2 -3 -2  1  2 -1  5  0 -2 -1 -1 -1 -1  1 -3 -1 -1 -4
F -2 -3 -3 -3 -2 -3 -3 -3 -1  0  0 -3  0  6 -4 -2 -2  1  3 -1 -3 -3 -1 -4
P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4  7 -1 -1 -4 -3 -2 -2 -1 -2 -4
S  1 -1  1  0 -1  0  0  0 -1 -2 -2  0 -1 -2 -1  4  1 -3 -2 -2  0  0  0 -4
T  0 -1  0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1  1  5 -2 -2  0 -1 -1  0 -4
W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1  1 -4 -3 -2 11  2 -3 -4 -3 -2 -4
Y -2 -2 -2 -3 -2 -1 -2 -3  2 -1 -1 -2 -1  3 -3 -2 -2  2  7 -1 -3 -2 -1 -4
V  0 -3 -3 -3 -1 -2 -2 -3 -3  3  1 -2  1 -1 -2 -2  0 -3 -1  4 -3 -2 -1 -4
B -2 -1  3  4 -3  0  1 -1  0 -3 -4  0 -3 -3 -2  0 -1 -4 -3 -3  4  1 -1 -4
Z -1  0  0  1 -3  3  4 -2  0 -3 -3  1 -1 -3 -1  0 -1 -3 -2 -2  1  4 -1 -4
X  0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2  0  0 -2 -1 -1 -1 -1 -1 -4
* -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4  1
";
  my $matrix_string;
  my @lines = split(/\n/,$BLOSUM62);
  foreach my $line (@lines) {
    next if ($line =~ /^\#/);
    if ($line =~ /^[A-Z\*\s]+$/) {
      $matrix_string .= sprintf "$line\n";
    } else {
      my @t = split(/\s+/,$line);
      shift @t;
      #       print scalar @t,"\n";
      $matrix_string .= sprintf(join(" ",@t)."\n");
    }
  }

#  my %matrix_hash;
  @lines = ();
  @lines = split /\n/, $matrix_string;
  my $lts = shift @lines;
  $lts =~ s/^\s+//;
  $lts =~ s/\s+$//;
  my @letters = split /\s+/, $lts;

  foreach my $letter (@letters) {
    my $line = shift @lines;
    $line =~ s/^\s+//;
    $line =~ s/\s+$//;
    my @penalties = split /\s+/, $line;
    die "Size of letters array and penalties array are different\n"
      unless (scalar @letters == scalar @penalties);
    for (my $i=0; $i < scalar @letters; $i++) {
      $matrix_hash{uc $letter}{uc $letters[$i]} = $penalties[$i];
    }
  }
}


sub update_alignment_stats {
    my $self = shift;

    my $genes = $self->get_all_Members;
    my $ngenes = scalar(@$genes);

    if ($ngenes == 2) {
        # This code is >4 times faster with pairs of genes

        my $gene1 = $genes->[0];
        my $gene2 = $genes->[1];
        my $new_aln1_cigarline = "";
        my $new_aln2_cigarline = "";

        my $identical_matches = 0;
        my $positive_matches = 0;

        my ($aln1state, $aln2state);
        my ($aln1count, $aln2count);
        my ($aln1cov, $aln2cov) = (0,0);

        my @aln1 = split(//, $gene1->alignment_string);
        my @aln2 = split(//, $gene2->alignment_string);

        for (my $i=0; $i <= $#aln1; $i++) {
            next if ($aln1[$i] eq '-' && $aln2[$i] eq '-');
            my $cur_aln1state = ($aln1[$i] eq '-' ? 'D' : 'M');
            my $cur_aln2state = ($aln2[$i] eq '-' ? 'D' : 'M');
            $aln1cov++ if $cur_aln1state ne 'D';
            $aln2cov++ if $cur_aln2state ne 'D';
            if ($cur_aln1state eq 'M' && $cur_aln2state eq 'M') {
                if ($aln1[$i] eq $aln2[$i]) {
                    $identical_matches++;
                    $positive_matches++;
                } elsif ($matrix_hash{uc $aln1[$i]}{uc $aln2[$i]} > 0) {
                    $positive_matches++;
                }
            }
            unless (defined $aln1state) {
                $aln1count = 1;
                $aln2count = 1;
                $aln1state = $cur_aln1state;
                $aln2state = $cur_aln2state;
                next;
            }
            if ($cur_aln1state eq $aln1state) {
                $aln1count++;
            } else {
                if ($aln1count == 1) {
                    $new_aln1_cigarline .= $aln1state;
                } else {
                    $new_aln1_cigarline .= $aln1count.$aln1state;
                }
                $aln1count = 1;
                $aln1state = $cur_aln1state;
            }
            if ($cur_aln2state eq $aln2state) {
                $aln2count++;
            } else {
                if ($aln2count == 1) {
                    $new_aln2_cigarline .= $aln2state;
                } else {
                    $new_aln2_cigarline .= $aln2count.$aln2state;
                }
                $aln2count = 1;
                $aln2state = $cur_aln2state;
            }
        }
        if ($aln1count == 1) {
            $new_aln1_cigarline .= $aln1state;
        } else {
            $new_aln1_cigarline .= $aln1count.$aln1state;
        }
        if ($aln2count == 1) {
            $new_aln2_cigarline .= $aln2state;
        } else {
            $new_aln2_cigarline .= $aln2count.$aln2state;
        }
        my $seq_length1 = $gene1->seq_length;
        unless (0 == $seq_length1) {
            $gene1->cigar_line($new_aln1_cigarline);
            $gene1->perc_id( int((100.0 * $identical_matches / $seq_length1 + 0.5)) );
            $gene1->perc_pos( int((100.0 * $positive_matches  / $seq_length1 + 0.5)) );
            $gene1->perc_cov( int((100.0 * $aln1cov / $seq_length1 + 0.5)) );
        }
        my $seq_length2 = $gene2->seq_length;
        unless (0 == $seq_length2) {
            $gene2->cigar_line($new_aln2_cigarline);
            $gene2->perc_id( int((100.0 * $identical_matches / $seq_length2 + 0.5)) );
            $gene2->perc_pos( int((100.0 * $positive_matches  / $seq_length2 + 0.5)) );
            $gene2->perc_cov( int((100.0 * $aln2cov / $seq_length2 + 0.5)) );
        }
        return undef;
    }

    my $min_seq = shift;
    $min_seq = int($min_seq * $ngenes) if $min_seq <= 1;

    my @new_cigars   = ('') x $ngenes;
    my @nmatch_id    = (0) x $ngenes;
    my @nmatch_pos   = (0) x $ngenes;
    my @nmatch_cov   = (0) x $ngenes;
    my @alncount     = (1) x $ngenes;
    my @alnstate     = (undef) x $ngenes;
    my @cur_alnstate = (undef) x $ngenes;

    my @aln = map {$_->alignment_string} @$genes;
    my $aln_length = length($aln[0]);

    for (my $i=0; $i < $aln_length; $i++) {

        my @char_i =  map {substr($_, $i, 1)} @aln;
        #print "pos $i: ", join('/', @char_i), "\n";

        my %seen;
        map {$seen{$_}++} @char_i;
        next if $seen{'-'} == $ngenes;
        delete $seen{'-'};

        my %pos_chars = ();
        my @chars = keys %seen;
        while (my $c1 = shift @chars) {
            foreach my $c2 (@chars) {
                if (($matrix_hash{uc $c1}{uc $c2} > 0) and ($seen{$c1}+$seen{$c2} >= $min_seq)) {
                    $pos_chars{$c1} = 1;
                    $pos_chars{$c2} = 1;
                }
            }
        }

        for (my $j=0; $j<$ngenes; $j++) {
            if ($char_i[$j] eq '-') {
                $cur_alnstate[$j] = 'D';
            } else {
                $nmatch_cov[$j]++;
                $cur_alnstate[$j] = 'M';
                if ($seen{$char_i[$j]} >= $min_seq) {
                    $nmatch_id[$j]++;
                    $nmatch_pos[$j]++;
                } elsif (exists $pos_chars{$char_i[$j]}) {
                    $nmatch_pos[$j]++;
                }
            }
        }

        if ($i == 0) {
            @alnstate = @cur_alnstate;
            next;
        }

        for (my $j=0; $j<$ngenes; $j++) {
            if ($cur_alnstate[$j] eq $alnstate[$j]) {
                $alncount[$j]++;
            } else {
                if ($alncount[$j] == 1) {
                    $new_cigars[$j] .= $alnstate[$j];
                } else {
                    $new_cigars[$j] .= $alncount[$j].$alnstate[$j];
                }
                $alncount[$j] = 1;
                $alnstate[$j] = $cur_alnstate[$j];
            }
        }
    }

    for (my $j=0; $j<$ngenes; $j++) {
        if ($alncount[$j] == 1) {
            $new_cigars[$j] .= $alnstate[$j];
        } else {
            $new_cigars[$j] .= $alncount[$j].$alnstate[$j];
        }
        $genes->[$j]->cigar_line($new_cigars[$j]);
        my $seq_length = $genes->[$j]->seq_length;
        unless (0 == $seq_length) {
            $genes->[$j]->perc_id( int((100.0 * $nmatch_id[$j] / $seq_length + 0.5)) );
            $genes->[$j]->perc_pos( int((100.0 * $nmatch_pos[$j] / $seq_length + 0.5)) );
            $genes->[$j]->perc_cov( int((100.0 * $nmatch_cov[$j] / $seq_length + 0.5)) );
        }
    }
}


1;
author	willmclaren
date	Fri, 03 Aug 2012 10:05:43 -0400
parents	21066c0abaf5
children