Mercurial > repos > willmclaren > ensembl_vep
view variant_effect_predictor/Bio/EnsEMBL/Compara/AlignedMemberSet.pm @ 1:d6778b5d8382 draft default tip
Deleted selected files
author | willmclaren |
---|---|
date | Fri, 03 Aug 2012 10:05:43 -0400 |
parents | 21066c0abaf5 |
children |
line wrap: on
line source
=head1 LICENSE Copyright (c) 1999-2012 The European Bioinformatics Institute and Genome Research Limited. All rights reserved. This software is distributed under a modified Apache license. For license details, please see http://www.ensembl.org/info/about/code_licence.html =head1 CONTACT Please email comments or questions to the public Ensembl developers list at <dev@ensembl.org>. Questions may also be sent to the Ensembl help desk at <helpdesk@ensembl.org>. =head1 AUTHORSHIP Ensembl Team. Individual contributions can be found in the CVS log. =cut =head1 NAME AlignedMemberSet - A superclass for pairwise or multiple relationships, base of Bio::EnsEMBL::Compara::Family, Bio::EnsEMBL::Compara::Homology and Bio::EnsEMBL::Compara::Domain. =head1 DESCRIPTION A superclass for pairwise and multiple relationships Currently the AlignedMember objects are used in the GeneTree structure to represent the leaves of the trees. Each leaf contains an aligned sequence, which is represented as an AlignedMember object. =head1 INHERITANCE TREE Bio::EnsEMBL::Compara::AlignedMemberSet =head1 METHODS =cut package Bio::EnsEMBL::Compara::AlignedMemberSet; use strict; use Scalar::Util qw(weaken); use Bio::AlignIO; use Bio::EnsEMBL::Utils::Argument; use Bio::EnsEMBL::Utils::Scalar; use Bio::EnsEMBL::Utils::Exception; use Bio::EnsEMBL::Compara::AlignedMember; use base ('Bio::EnsEMBL::Compara::MemberSet'); =head2 member_class Description: Returns the type of member used in the set Returntype : String: Bio::EnsEMBL::Compara::AlignedMember Caller : general Status : Stable =cut sub member_class { return 'Bio::EnsEMBL::Compara::AlignedMember'; } ###################### # Alignment sections # ###################### sub print_sequences_to_fasta { my ($self, $pep_file) = @_; my $pep_counter = 0; open PEP, ">$pep_file"; foreach my $member (@{$self->get_all_Members}) { next if $member->source_name eq 'ENSEMBLGENE'; my $member_stable_id = $member->stable_id; my $seq = $member->sequence; print PEP ">$member_stable_id\n"; $seq =~ s/(.{72})/$1\n/g; chomp $seq; unless (defined($seq)) { my $family_id = $self->dbID; die "member $member_stable_id in family $family_id doesn't have a sequence"; } print PEP $seq,"\n"; $pep_counter++; } close PEP; return $pep_counter; } =head2 read_clustalw Arg [1] : string $file The name of the file containing the clustalw output Example : $family->read_clustalw('/tmp/clustalw.aln'); Description: Parses the output from clustalw and sets the alignment strings of each of the memebers of this family Returntype : none Exceptions : thrown if file cannot be parsed warning if alignment file contains identifiers for sequences which are not members of this family Caller : general =cut sub read_clustalw { my $self = shift; my $file = shift; my %align_hash; my $FH = IO::File->new(); $FH->open($file) || throw("Could not open alignment file [$file]"); <$FH>; #skip header while(<$FH>) { next if($_ =~ /^\s+/); #skip lines that start with space my ($id, $align) = split; $align_hash{$id} ||= ''; $align_hash{$id} .= $align; } $FH->close; #place all members in a hash on their member name my %member_hash; foreach my $member (@{$self->get_all_Members}) { $member_hash{$member->stable_id} = $member; } #assign cigar_line to each of the member attribute foreach my $id (keys %align_hash) { throw("No member for alignment portion: [$id]") unless exists $member_hash{$id}; my $alignment_string = $align_hash{$id}; $alignment_string =~ s/\-([A-Z])/\- $1/g; $alignment_string =~ s/([A-Z])\-/$1 \-/g; my @cigar_segments = split " ",$alignment_string; my $cigar_line = ""; foreach my $segment (@cigar_segments) { my $seglength = length($segment); $seglength = "" if ($seglength == 1); if ($segment =~ /^\-+$/) { $cigar_line .= $seglength . "D"; } else { $cigar_line .= $seglength . "M"; } } $member_hash{$id}->cigar_line($cigar_line); } } sub load_cigars_from_fasta { my ($self, $file) = @_; my $alignio = Bio::AlignIO->new(-file => "$file", -format => "fasta"); my $aln = $alignio->next_aln or die "Bio::AlignIO could not get next_aln() from file '$file'"; #place all members in a hash on their member name my %member_hash; foreach my $member (@{$self->get_all_Members}) { $member_hash{$member->stable_id} = $member; } #assign cigar_line to each of the member attribute foreach my $seq ($aln->each_seq) { my $id = $seq->display_id; throw("No member for alignment portion: [$id]") unless exists $member_hash{$id}; my $cigar_line = ''; my $seq_string = $seq->seq(); while($seq_string=~/(?:\b|^)(.)(.*?)(?:\b|$)/g) { $cigar_line .= ($2 ? length($2)+1 : '').(($1 eq '-') ? 'D' : 'M'); } $member_hash{$id}->cigar_line($cigar_line); } } sub get_SimpleAlign { my ($self, @args) = @_; my $id_type = 'STABLE'; my $unique_seqs = 0; my $cdna = 0; my $stop2x = 0; my $append_taxon_id = 0; my $append_sp_short_name = 0; my $append_genomedb_id = 0; my $exon_cased = 0; my $alignment = 'protein'; my $changeSelenos = 0; if (scalar @args) { ($unique_seqs, $cdna, $id_type, $stop2x, $append_taxon_id, $append_sp_short_name, $append_genomedb_id, $exon_cased, $alignment, $changeSelenos) = rearrange([qw(UNIQ_SEQ CDNA ID_TYPE STOP2X APPEND_TAXON_ID APPEND_SP_SHORT_NAME APPEND_GENOMEDB_ID EXON_CASED ALIGNMENT CHANGE_SELENO)], @args); } my $sa = Bio::SimpleAlign->new(); #Hack to try to work with both bioperl 0.7 and 1.2: #Check to see if the method is called 'addSeq' or 'add_seq' my $bio07 = ($sa->can('add_seq') ? 0 : 1); my $seq_id_hash = {}; foreach my $member (@{$self->get_all_Members}) { next if $member->source_name eq 'ENSEMBLGENE'; # Print unique sequences only ? next if($unique_seqs and $seq_id_hash->{$member->sequence_id}); $seq_id_hash->{$member->sequence_id} = 1; # The correct codon table if ($member->chr_name =~ /mt/i) { # codeml icodes # 0:universal code (default) my $class; eval {$class = member->taxon->classification;}; unless ($@) { if ($class =~ /vertebrata/i) { # 1:mamalian mt $sa->{_special_codeml_icode} = 1; } else { # 4:invertebrate mt $sa->{_special_codeml_icode} = 4; } } } my $seqstr; my $alphabet; if ($cdna or (lc($alignment) eq 'cdna')) { $seqstr = $member->cdna_alignment_string($changeSelenos); $seqstr =~ s/\s+//g; $alphabet = 'dna'; } else { $seqstr = $member->alignment_string($exon_cased); $alphabet = 'protein'; } next if(!$seqstr); # Sequence name my $seqID = $member->stable_id; $seqID = $member->sequence_id if($id_type eq "SEQ"); $seqID = $member->member_id if($id_type eq "MEMBER"); $seqID .= "_" . $member->taxon_id if($append_taxon_id); $seqID .= "_" . $member->genome_db_id if ($append_genomedb_id); ## Append $seqID with Speciae short name, if required if ($append_sp_short_name) { my $species = $member->genome_db->short_name; $species =~ s/\s/_/g; $seqID .= "_" . $species . "_"; } # Sequence length my $aln_end = $member->seq_length; $aln_end = $aln_end*3 if $alphabet eq 'dna'; $seqstr =~ s/\*/X/g if ($stop2x); my $seq = Bio::LocatableSeq->new( -SEQ => $seqstr, -ALPHABET => $alphabet, -START => 1, -END => $aln_end, -ID => $seqID, -STRAND => 0 ); if($bio07) { $sa->addSeq($seq); } else { $sa->add_seq($seq); } } $sa = $sa->remove_gaps(undef, 1) if UNIVERSAL::can($sa, 'remove_gaps'); return $sa; } # Takes a protein tree and creates a consensus cigar line from the # constituent leaf nodes. sub consensus_cigar_line { my $self = shift; my @cigars; # First get an 'expanded' cigar string for each leaf of the subtree my @all_members = @{$self->get_all_Members}; my $num_members = scalar(@all_members); foreach my $leaf (@all_members) { next unless( UNIVERSAL::can( $leaf, 'cigar_line' ) ); my $cigar = $leaf->cigar_line; my $newcigar = ""; # $cigar =~ s/(\d*)([A-Z])/$2 x ($1||1)/ge; #Expand while ($cigar =~ /(\d*)([A-Z])/g) { $newcigar .= $2 x ($1 || 1); } $cigar = $newcigar; push @cigars, $cigar; } # Itterate through each character of the expanded cigars. # If there is a 'D' at a given location in any cigar, # set the consensus to 'D', otherwise assume an 'M'. # TODO: Fix assumption that cigar strings are always the same length, # and start at the same point. my $cigar_len = length( $cigars[0] ); my $cons_cigar; for( my $i=0; $i<$cigar_len; $i++ ){ my $char = 'M'; my $num_deletions = 0; foreach my $cigar( @cigars ){ if ( substr($cigar,$i,1) eq 'D'){ $num_deletions++; } } if ($num_deletions * 3 > 2 * $num_members) { $char = "D"; } elsif ($num_deletions * 3 > $num_members) { $char = "m"; } $cons_cigar .= $char; } # Collapse the consensus cigar, e.g. 'DDDD' = 4D # $cons_cigar =~ s/(\w)(\1*)/($2?length($2)+1:"").$1/ge; my $collapsed_cigar = ""; while ($cons_cigar =~ /(D+|M+|I+|m+)/g) { $collapsed_cigar .= (length($1) == 1 ? "" : length($1)) . substr($1,0,1) } $cons_cigar = $collapsed_cigar; # Return the consensus return $cons_cigar; } my %TWOD_CODONS = ("TTT" => "Phe",#Phe "TTC" => "Phe", "TTA" => "Leu",#Leu "TTG" => "Leu", "TAT" => "Tyr",#Tyr "TAC" => "Tyr", "CAT" => "His",#His "CAC" => "His", "CAA" => "Gln",#Gln "CAG" => "Gln", "AAT" => "Asn",#Asn "AAC" => "Asn", "AAA" => "Lys",#Lys "AAG" => "Lys", "GAT" => "Asp",#Asp "GAC" => "Asp", "GAA" => "Glu",#Glu "GAG" => "Glu", "TGT" => "Cys",#Cys "TGC" => "Cys", "AGT" => "Ser",#Ser "AGC" => "Ser", "AGA" => "Arg",#Arg "AGG" => "Arg", "ATT" => "Ile",#Ile "ATC" => "Ile", "ATA" => "Ile"); my %FOURD_CODONS = ("CTT" => "Leu",#Leu "CTC" => "Leu", "CTA" => "Leu", "CTG" => "Leu", "GTT" => "Val",#Val "GTC" => "Val", "GTA" => "Val", "GTG" => "Val", "TCT" => "Ser",#Ser "TCC" => "Ser", "TCA" => "Ser", "TCG" => "Ser", "CCT" => "Pro",#Pro "CCC" => "Pro", "CCA" => "Pro", "CCG" => "Pro", "ACT" => "Thr",#Thr "ACC" => "Thr", "ACA" => "Thr", "ACG" => "Thr", "GCT" => "Ala",#Ala "GCC" => "Ala", "GCA" => "Ala", "GCG" => "Ala", "CGT" => "Arg",#Arg "CGC" => "Arg", "CGA" => "Arg", "CGG" => "Arg", "GGT" => "Gly",#Gly "GGC" => "Gly", "GGA" => "Gly", "GGG" => "Gly"); my %CODONS = ("ATG" => "Met", "TGG" => "Trp", "TAA" => "TER", "TAG" => "TER", "TGA" => "TER", "---" => "---", %TWOD_CODONS, %FOURD_CODONS, ); =head2 get_4D_SimpleAlign Example : $4d_align = $homology->get_4D_SimpleAlign(); Description: get 4 times degenerate positions pairwise simple alignment Returntype : Bio::SimpleAlign =cut sub get_4D_SimpleAlign { my $self = shift; my $sa = Bio::SimpleAlign->new(); #Hack to try to work with both bioperl 0.7 and 1.2: #Check to see if the method is called 'addSeq' or 'add_seq' my $bio07 = 0; if(!$sa->can('add_seq')) { $bio07 = 1; } my $ma = $self->adaptor->db->get_MemberAdaptor; my %member_seqstr; foreach my $member (@{$self->get_all_Members}) { next if $member->source_name ne 'ENSEMBLPEP'; my $seqstr = $member->cdna_alignment_string(); next if(!$seqstr); #print STDERR $seqstr,"\n"; my @tmp_tab = split /\s+/, $seqstr; #print STDERR "tnp_tab 0: ", $tmp_tab[0],"\n"; $member_seqstr{$member->stable_id} = \@tmp_tab; } my $seqstr_length; foreach my $seqid (keys %member_seqstr) { unless (defined $seqstr_length) { #print STDERR $member_seqstr{$seqid}->[0],"\n"; $seqstr_length = scalar @{$member_seqstr{$seqid}}; next; } unless ($seqstr_length == scalar @{$member_seqstr{$seqid}}) { die "Length of dna alignment are not the same, $seqstr_length and " . scalar @{$member_seqstr{$seqid}} ." respectively for homology_id " . $self->dbID . "\n"; } } my %FourD_member_seqstr; for (my $i=0; $i < $seqstr_length; $i++) { my $FourD_codon = 1; my $FourD_aminoacid; foreach my $seqid (keys %member_seqstr) { if (defined $FOURD_CODONS{$member_seqstr{$seqid}->[$i]}) { if (defined $FourD_aminoacid && $FourD_aminoacid eq $FOURD_CODONS{$member_seqstr{$seqid}->[$i]}) { #print STDERR "YES ",$FOURD_CODONS{$member_seqstr{$seqid}->[$i]}," ",$member_seqstr{$seqid}->[$i],"\n"; next; } elsif (defined $FourD_aminoacid) { #print STDERR "NO ",$FOURD_CODONS{$member_seqstr{$seqid}->[$i]}," ",$member_seqstr{$seqid}->[$i],"\n"; $FourD_codon = 0; last; } else { $FourD_aminoacid = $FOURD_CODONS{$member_seqstr{$seqid}->[$i]}; #print STDERR $FOURD_CODONS{$member_seqstr{$seqid}->[$i]}," ",$member_seqstr{$seqid}->[$i]," "; } next; } else { #print STDERR "NO ",$CODONS{$member_seqstr{$seqid}->[$i]}," ",$member_seqstr{$seqid}->[$i],"\n"; $FourD_codon = 0; last; } } next unless ($FourD_codon); foreach my $seqid (keys %member_seqstr) { $FourD_member_seqstr{$seqid} .= substr($member_seqstr{$seqid}->[$i],2,1); } } foreach my $seqid (keys %FourD_member_seqstr) { my $seq = Bio::LocatableSeq->new( -SEQ => $FourD_member_seqstr{$seqid}, -START => 1, -END => length($FourD_member_seqstr{$seqid}), -ID => $seqid, -STRAND => 0 ); if($bio07) { $sa->addSeq($seq); } else { $sa->add_seq($seq); } } return $sa; } my %matrix_hash; { my $BLOSUM62 = "# Matrix made by matblas from blosum62.iij # * column uses minimum score # BLOSUM Clustered Scoring Matrix in 1/2 Bit Units # Blocks Database = /data/blocks_5.0/blocks.dat # Cluster Percentage: >= 62 # Entropy = 0.6979, Expected = -0.5209 A R N D C Q E G H I L K M F P S T W Y V B Z X * A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 0 -4 R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0 -1 -4 N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 3 0 -1 -4 D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 1 -1 -4 C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4 Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 3 -1 -4 E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4 G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -2 -1 -4 H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 0 -1 -4 I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 -3 -1 -4 L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 -3 -1 -4 K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 1 -1 -4 M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 -1 -1 -4 F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 -3 -1 -4 P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -1 -2 -4 S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 0 0 -4 T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 0 -4 W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 -3 -2 -4 Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -2 -1 -4 V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 -2 -1 -4 B -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1 -1 -4 Z -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4 X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1 -1 -1 -1 -1 -4 * -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1 "; my $matrix_string; my @lines = split(/\n/,$BLOSUM62); foreach my $line (@lines) { next if ($line =~ /^\#/); if ($line =~ /^[A-Z\*\s]+$/) { $matrix_string .= sprintf "$line\n"; } else { my @t = split(/\s+/,$line); shift @t; # print scalar @t,"\n"; $matrix_string .= sprintf(join(" ",@t)."\n"); } } # my %matrix_hash; @lines = (); @lines = split /\n/, $matrix_string; my $lts = shift @lines; $lts =~ s/^\s+//; $lts =~ s/\s+$//; my @letters = split /\s+/, $lts; foreach my $letter (@letters) { my $line = shift @lines; $line =~ s/^\s+//; $line =~ s/\s+$//; my @penalties = split /\s+/, $line; die "Size of letters array and penalties array are different\n" unless (scalar @letters == scalar @penalties); for (my $i=0; $i < scalar @letters; $i++) { $matrix_hash{uc $letter}{uc $letters[$i]} = $penalties[$i]; } } } sub update_alignment_stats { my $self = shift; my $genes = $self->get_all_Members; my $ngenes = scalar(@$genes); if ($ngenes == 2) { # This code is >4 times faster with pairs of genes my $gene1 = $genes->[0]; my $gene2 = $genes->[1]; my $new_aln1_cigarline = ""; my $new_aln2_cigarline = ""; my $identical_matches = 0; my $positive_matches = 0; my ($aln1state, $aln2state); my ($aln1count, $aln2count); my ($aln1cov, $aln2cov) = (0,0); my @aln1 = split(//, $gene1->alignment_string); my @aln2 = split(//, $gene2->alignment_string); for (my $i=0; $i <= $#aln1; $i++) { next if ($aln1[$i] eq '-' && $aln2[$i] eq '-'); my $cur_aln1state = ($aln1[$i] eq '-' ? 'D' : 'M'); my $cur_aln2state = ($aln2[$i] eq '-' ? 'D' : 'M'); $aln1cov++ if $cur_aln1state ne 'D'; $aln2cov++ if $cur_aln2state ne 'D'; if ($cur_aln1state eq 'M' && $cur_aln2state eq 'M') { if ($aln1[$i] eq $aln2[$i]) { $identical_matches++; $positive_matches++; } elsif ($matrix_hash{uc $aln1[$i]}{uc $aln2[$i]} > 0) { $positive_matches++; } } unless (defined $aln1state) { $aln1count = 1; $aln2count = 1; $aln1state = $cur_aln1state; $aln2state = $cur_aln2state; next; } if ($cur_aln1state eq $aln1state) { $aln1count++; } else { if ($aln1count == 1) { $new_aln1_cigarline .= $aln1state; } else { $new_aln1_cigarline .= $aln1count.$aln1state; } $aln1count = 1; $aln1state = $cur_aln1state; } if ($cur_aln2state eq $aln2state) { $aln2count++; } else { if ($aln2count == 1) { $new_aln2_cigarline .= $aln2state; } else { $new_aln2_cigarline .= $aln2count.$aln2state; } $aln2count = 1; $aln2state = $cur_aln2state; } } if ($aln1count == 1) { $new_aln1_cigarline .= $aln1state; } else { $new_aln1_cigarline .= $aln1count.$aln1state; } if ($aln2count == 1) { $new_aln2_cigarline .= $aln2state; } else { $new_aln2_cigarline .= $aln2count.$aln2state; } my $seq_length1 = $gene1->seq_length; unless (0 == $seq_length1) { $gene1->cigar_line($new_aln1_cigarline); $gene1->perc_id( int((100.0 * $identical_matches / $seq_length1 + 0.5)) ); $gene1->perc_pos( int((100.0 * $positive_matches / $seq_length1 + 0.5)) ); $gene1->perc_cov( int((100.0 * $aln1cov / $seq_length1 + 0.5)) ); } my $seq_length2 = $gene2->seq_length; unless (0 == $seq_length2) { $gene2->cigar_line($new_aln2_cigarline); $gene2->perc_id( int((100.0 * $identical_matches / $seq_length2 + 0.5)) ); $gene2->perc_pos( int((100.0 * $positive_matches / $seq_length2 + 0.5)) ); $gene2->perc_cov( int((100.0 * $aln2cov / $seq_length2 + 0.5)) ); } return undef; } my $min_seq = shift; $min_seq = int($min_seq * $ngenes) if $min_seq <= 1; my @new_cigars = ('') x $ngenes; my @nmatch_id = (0) x $ngenes; my @nmatch_pos = (0) x $ngenes; my @nmatch_cov = (0) x $ngenes; my @alncount = (1) x $ngenes; my @alnstate = (undef) x $ngenes; my @cur_alnstate = (undef) x $ngenes; my @aln = map {$_->alignment_string} @$genes; my $aln_length = length($aln[0]); for (my $i=0; $i < $aln_length; $i++) { my @char_i = map {substr($_, $i, 1)} @aln; #print "pos $i: ", join('/', @char_i), "\n"; my %seen; map {$seen{$_}++} @char_i; next if $seen{'-'} == $ngenes; delete $seen{'-'}; my %pos_chars = (); my @chars = keys %seen; while (my $c1 = shift @chars) { foreach my $c2 (@chars) { if (($matrix_hash{uc $c1}{uc $c2} > 0) and ($seen{$c1}+$seen{$c2} >= $min_seq)) { $pos_chars{$c1} = 1; $pos_chars{$c2} = 1; } } } for (my $j=0; $j<$ngenes; $j++) { if ($char_i[$j] eq '-') { $cur_alnstate[$j] = 'D'; } else { $nmatch_cov[$j]++; $cur_alnstate[$j] = 'M'; if ($seen{$char_i[$j]} >= $min_seq) { $nmatch_id[$j]++; $nmatch_pos[$j]++; } elsif (exists $pos_chars{$char_i[$j]}) { $nmatch_pos[$j]++; } } } if ($i == 0) { @alnstate = @cur_alnstate; next; } for (my $j=0; $j<$ngenes; $j++) { if ($cur_alnstate[$j] eq $alnstate[$j]) { $alncount[$j]++; } else { if ($alncount[$j] == 1) { $new_cigars[$j] .= $alnstate[$j]; } else { $new_cigars[$j] .= $alncount[$j].$alnstate[$j]; } $alncount[$j] = 1; $alnstate[$j] = $cur_alnstate[$j]; } } } for (my $j=0; $j<$ngenes; $j++) { if ($alncount[$j] == 1) { $new_cigars[$j] .= $alnstate[$j]; } else { $new_cigars[$j] .= $alncount[$j].$alnstate[$j]; } $genes->[$j]->cigar_line($new_cigars[$j]); my $seq_length = $genes->[$j]->seq_length; unless (0 == $seq_length) { $genes->[$j]->perc_id( int((100.0 * $nmatch_id[$j] / $seq_length + 0.5)) ); $genes->[$j]->perc_pos( int((100.0 * $nmatch_pos[$j] / $seq_length + 0.5)) ); $genes->[$j]->perc_cov( int((100.0 * $nmatch_cov[$j] / $seq_length + 0.5)) ); } } } 1;