ensembl_vep: variant_effect_predictor/Bio/EnsEMBL/Compara/AlignedMemberSet.pm comparison

comparison variant_effect_predictor/Bio/EnsEMBL/Compara/AlignedMemberSet.pm @ 0:21066c0abaf5 draft

Uploaded

author	willmclaren
date	Fri, 03 Aug 2012 10:04:48 -0400
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:21066c0abaf5
+=head1 LICENSE
+Copyright (c) 1999-2012 The European Bioinformatics Institute and
+Genome Research Limited.  All rights reserved.
+This software is distributed under a modified Apache license.
+For license details, please see
+http://www.ensembl.org/info/about/code_licence.html
+=head1 CONTACT
+Please email comments or questions to the public Ensembl
+developers list at <dev@ensembl.org>.
+Questions may also be sent to the Ensembl help desk at
+<helpdesk@ensembl.org>.
+=head1 AUTHORSHIP
+Ensembl Team. Individual contributions can be found in the CVS log.
+=cut
+=head1 NAME
+AlignedMemberSet - A superclass for pairwise or multiple relationships, base of
+Bio::EnsEMBL::Compara::Family, Bio::EnsEMBL::Compara::Homology and
+Bio::EnsEMBL::Compara::Domain.
+=head1 DESCRIPTION
+A superclass for pairwise and multiple relationships
+Currently the AlignedMember objects are used in the GeneTree structure
+to represent the leaves of the trees. Each leaf contains an aligned
+sequence, which is represented as an AlignedMember object.
+=head1 INHERITANCE TREE
+Bio::EnsEMBL::Compara::AlignedMemberSet
+=head1 METHODS
+=cut
+package Bio::EnsEMBL::Compara::AlignedMemberSet;
+use strict;
+use Scalar::Util qw(weaken);
+use Bio::AlignIO;
+use Bio::EnsEMBL::Utils::Argument;
+use Bio::EnsEMBL::Utils::Scalar;
+use Bio::EnsEMBL::Utils::Exception;
+use Bio::EnsEMBL::Compara::AlignedMember;
+use base ('Bio::EnsEMBL::Compara::MemberSet');
+=head2 member_class
+Description: Returns the type of member used in the set
+Returntype : String: Bio::EnsEMBL::Compara::AlignedMember
+Caller     : general
+Status     : Stable
+=cut
+sub member_class {
+return 'Bio::EnsEMBL::Compara::AlignedMember';
+}
+######################
+# Alignment sections #
+######################
+sub print_sequences_to_fasta {
+my ($self, $pep_file) = @_;
+my $pep_counter = 0;
+open PEP, ">$pep_file";
+foreach my $member (@{$self->get_all_Members}) {
+next if $member->source_name eq 'ENSEMBLGENE';
+my $member_stable_id = $member->stable_id;
+my $seq = $member->sequence;
+print PEP ">$member_stable_id\n";
+$seq =~ s/(.{72})/$1\n/g;
+chomp $seq;
+unless (defined($seq)) {
+my $family_id = $self->dbID;
+die "member $member_stable_id in family $family_id doesn't have a sequence";
+}
+print PEP $seq,"\n";
+$pep_counter++;
+}
+close PEP;
+return $pep_counter;
+}
+=head2 read_clustalw
+Arg [1]    : string $file
+The name of the file containing the clustalw output
+Example    : $family->read_clustalw('/tmp/clustalw.aln');
+Description: Parses the output from clustalw and sets the alignment strings
+of each of the memebers of this family
+Returntype : none
+Exceptions : thrown if file cannot be parsed
+warning if alignment file contains identifiers for sequences
+which are not members of this family
+Caller     : general
+=cut
+sub read_clustalw {
+my $self = shift;
+my $file = shift;
+my %align_hash;
+my $FH = IO::File->new();
+$FH->open($file) || throw("Could not open alignment file [$file]");
+<$FH>; #skip header
+while(<$FH>) {
+next if($_ =~ /^\s+/);  #skip lines that start with space
+my ($id, $align) = split;
+$align_hash{$id} ||= '';
+$align_hash{$id} .= $align;
+}
+$FH->close;
+#place all members in a hash on their member name
+my %member_hash;
+foreach my $member (@{$self->get_all_Members}) {
+$member_hash{$member->stable_id} = $member;
+}
+#assign cigar_line to each of the member attribute
+foreach my $id (keys %align_hash) {
+throw("No member for alignment portion: [$id]") unless exists $member_hash{$id};
+my $alignment_string = $align_hash{$id};
+$alignment_string =~ s/\-([A-Z])/\- $1/g;
+$alignment_string =~ s/([A-Z])\-/$1 \-/g;
+my @cigar_segments = split " ",$alignment_string;
+my $cigar_line = "";
+foreach my $segment (@cigar_segments) {
+my $seglength = length($segment);
+$seglength = "" if ($seglength == 1);
+if ($segment =~ /^\-+$/) {
+$cigar_line .= $seglength . "D";
+} else {
+$cigar_line .= $seglength . "M";
+}
+}
+$member_hash{$id}->cigar_line($cigar_line);
+}
+}
+sub load_cigars_from_fasta {
+my ($self, $file) = @_;
+my $alignio = Bio::AlignIO->new(-file => "$file", -format => "fasta");
+my $aln = $alignio->next_aln or die "Bio::AlignIO could not get next_aln() from file '$file'";
+#place all members in a hash on their member name
+my %member_hash;
+foreach my $member (@{$self->get_all_Members}) {
+$member_hash{$member->stable_id} = $member;
+}
+#assign cigar_line to each of the member attribute
+foreach my $seq ($aln->each_seq) {
+my $id = $seq->display_id;
+throw("No member for alignment portion: [$id]") unless exists $member_hash{$id};
+my $cigar_line = '';
+my $seq_string = $seq->seq();
+while($seq_string=~/(?:\b|^)(.)(.*?)(?:\b|$)/g) {
+$cigar_line .= ($2 ? length($2)+1 : '').(($1 eq '-') ? 'D' : 'M');
+}
+$member_hash{$id}->cigar_line($cigar_line);
+}
+}
+sub get_SimpleAlign {
+my ($self, @args) = @_;
+my $id_type = 'STABLE';
+my $unique_seqs = 0;
+my $cdna = 0;
+my $stop2x = 0;
+my $append_taxon_id = 0;
+my $append_sp_short_name = 0;
+my $append_genomedb_id = 0;
+my $exon_cased = 0;
+my $alignment = 'protein';
+my $changeSelenos = 0;
+if (scalar @args) {
+($unique_seqs, $cdna, $id_type, $stop2x, $append_taxon_id, $append_sp_short_name, $append_genomedb_id, $exon_cased, $alignment, $changeSelenos) =
+rearrange([qw(UNIQ_SEQ CDNA ID_TYPE STOP2X APPEND_TAXON_ID APPEND_SP_SHORT_NAME APPEND_GENOMEDB_ID EXON_CASED ALIGNMENT CHANGE_SELENO)], @args);
+}
+my $sa = Bio::SimpleAlign->new();
+#Hack to try to work with both bioperl 0.7 and 1.2:
+#Check to see if the method is called 'addSeq' or 'add_seq'
+my $bio07 = ($sa->can('add_seq') ? 0 : 1);
+my $seq_id_hash = {};
+foreach my $member (@{$self->get_all_Members}) {
+next if $member->source_name eq 'ENSEMBLGENE';
+# Print unique sequences only ?
+next if($unique_seqs and $seq_id_hash->{$member->sequence_id});
+$seq_id_hash->{$member->sequence_id} = 1;
+# The correct codon table
+if ($member->chr_name =~ /mt/i) {
+# codeml icodes
+# 0:universal code (default)
+my $class;
+eval {$class = member->taxon->classification;};
+unless ($@) {
+if ($class =~ /vertebrata/i) {
+# 1:mamalian mt
+$sa->{_special_codeml_icode} = 1;
+} else {
+# 4:invertebrate mt
+$sa->{_special_codeml_icode} = 4;
+}
+}
+}
+my $seqstr;
+my $alphabet;
+if ($cdna or (lc($alignment) eq 'cdna')) {
+$seqstr = $member->cdna_alignment_string($changeSelenos);
+$seqstr =~ s/\s+//g;
+$alphabet = 'dna';
+} else {
+$seqstr = $member->alignment_string($exon_cased);
+$alphabet = 'protein';
+}
+next if(!$seqstr);
+# Sequence name
+my $seqID = $member->stable_id;
+$seqID = $member->sequence_id if($id_type eq "SEQ");
+$seqID = $member->member_id if($id_type eq "MEMBER");
+$seqID .= "_" . $member->taxon_id if($append_taxon_id);
+$seqID .= "_" . $member->genome_db_id if ($append_genomedb_id);
+## Append $seqID with Speciae short name, if required
+if ($append_sp_short_name) {
+my $species = $member->genome_db->short_name;
+$species =~ s/\s/_/g;
+$seqID .= "_" . $species . "_";
+}
+# Sequence length
+my $aln_end = $member->seq_length;
+$aln_end = $aln_end*3 if $alphabet eq 'dna';
+$seqstr =~ s/\*/X/g if ($stop2x);
+my $seq = Bio::LocatableSeq->new(
+-SEQ        => $seqstr,
+-ALPHABET   => $alphabet,
+-START      => 1,
+-END        => $aln_end,
+-ID         => $seqID,
+-STRAND     => 0
+);
+if($bio07) {
+$sa->addSeq($seq);
+} else {
+$sa->add_seq($seq);
+}
+}
+$sa = $sa->remove_gaps(undef, 1) if UNIVERSAL::can($sa, 'remove_gaps');
+return $sa;
+}
+# Takes a protein tree and creates a consensus cigar line from the
+# constituent leaf nodes.
+sub consensus_cigar_line {
+my $self = shift;
+my @cigars;
+# First get an 'expanded' cigar string for each leaf of the subtree
+my @all_members = @{$self->get_all_Members};
+my $num_members = scalar(@all_members);
+foreach my $leaf (@all_members) {
+next unless( UNIVERSAL::can( $leaf, 'cigar_line' ) );
+my $cigar = $leaf->cigar_line;
+my $newcigar = "";
+#     $cigar =~ s/(\d*)([A-Z])/$2 x ($1||1)/ge; #Expand
+while ($cigar =~ /(\d*)([A-Z])/g) {
+$newcigar .= $2 x ($1 || 1);
+}
+$cigar = $newcigar;
+push @cigars, $cigar;
+}
+# Itterate through each character of the expanded cigars.
+# If there is a 'D' at a given location in any cigar,
+# set the consensus to 'D', otherwise assume an 'M'.
+# TODO: Fix assumption that cigar strings are always the same length,
+# and start at the same point.
+my $cigar_len = length( $cigars[0] );
+my $cons_cigar;
+for( my $i=0; $i<$cigar_len; $i++ ){
+my $char = 'M';
+my $num_deletions = 0;
+foreach my $cigar( @cigars ){
+if ( substr($cigar,$i,1) eq 'D'){
+$num_deletions++;
+}
+}
+if ($num_deletions * 3 > 2 * $num_members) {
+$char = "D";
+} elsif ($num_deletions * 3 > $num_members) {
+$char = "m";
+}
+$cons_cigar .= $char;
+}
+# Collapse the consensus cigar, e.g. 'DDDD' = 4D
+#   $cons_cigar =~ s/(\w)(\1*)/($2?length($2)+1:"").$1/ge;
+my $collapsed_cigar = "";
+while ($cons_cigar =~ /(D+|M+|I+|m+)/g) {
+$collapsed_cigar .= (length($1) == 1 ? "" : length($1)) . substr($1,0,1)
+}
+$cons_cigar = $collapsed_cigar;
+# Return the consensus
+return $cons_cigar;
+}
+my %TWOD_CODONS = ("TTT" => "Phe",#Phe
+"TTC" => "Phe",
+"TTA" => "Leu",#Leu
+"TTG" => "Leu",
+"TAT" => "Tyr",#Tyr
+"TAC" => "Tyr",
+"CAT" => "His",#His
+"CAC" => "His",
+"CAA" => "Gln",#Gln
+"CAG" => "Gln",
+"AAT" => "Asn",#Asn
+"AAC" => "Asn",
+"AAA" => "Lys",#Lys
+"AAG" => "Lys",
+"GAT" => "Asp",#Asp
+"GAC" => "Asp",
+"GAA" => "Glu",#Glu
+"GAG" => "Glu",
+"TGT" => "Cys",#Cys
+"TGC" => "Cys",
+"AGT" => "Ser",#Ser
+"AGC" => "Ser",
+"AGA" => "Arg",#Arg
+"AGG" => "Arg",
+"ATT" => "Ile",#Ile
+"ATC" => "Ile",
+"ATA" => "Ile");
+my %FOURD_CODONS = ("CTT" => "Leu",#Leu
+"CTC" => "Leu",
+"CTA" => "Leu",
+"CTG" => "Leu",
+"GTT" => "Val",#Val
+"GTC" => "Val",
+"GTA" => "Val",
+"GTG" => "Val",
+"TCT" => "Ser",#Ser
+"TCC" => "Ser",
+"TCA" => "Ser",
+"TCG" => "Ser",
+"CCT" => "Pro",#Pro
+"CCC" => "Pro",
+"CCA" => "Pro",
+"CCG" => "Pro",
+"ACT" => "Thr",#Thr
+"ACC" => "Thr",
+"ACA" => "Thr",
+"ACG" => "Thr",
+"GCT" => "Ala",#Ala
+"GCC" => "Ala",
+"GCA" => "Ala",
+"GCG" => "Ala",
+"CGT" => "Arg",#Arg
+"CGC" => "Arg",
+"CGA" => "Arg",
+"CGG" => "Arg",
+"GGT" => "Gly",#Gly
+"GGC" => "Gly",
+"GGA" => "Gly",
+"GGG" => "Gly");
+my %CODONS =   ("ATG" => "Met",
+"TGG" => "Trp",
+"TAA" => "TER",
+"TAG" => "TER",
+"TGA" => "TER",
+"---" => "---",
+%TWOD_CODONS,
+%FOURD_CODONS,
+);
+=head2 get_4D_SimpleAlign
+Example    : $4d_align = $homology->get_4D_SimpleAlign();
+Description: get 4 times degenerate positions pairwise simple alignment
+Returntype : Bio::SimpleAlign
+=cut
+sub get_4D_SimpleAlign {
+my $self = shift;
+my $sa = Bio::SimpleAlign->new();
+#Hack to try to work with both bioperl 0.7 and 1.2:
+#Check to see if the method is called 'addSeq' or 'add_seq'
+my $bio07 = 0;
+if(!$sa->can('add_seq')) {
+$bio07 = 1;
+}
+my $ma = $self->adaptor->db->get_MemberAdaptor;
+my %member_seqstr;
+foreach my $member (@{$self->get_all_Members}) {
+next if $member->source_name ne 'ENSEMBLPEP';
+my $seqstr = $member->cdna_alignment_string();
+next if(!$seqstr);
+#print STDERR $seqstr,"\n";
+my @tmp_tab = split /\s+/, $seqstr;
+#print STDERR "tnp_tab 0: ", $tmp_tab[0],"\n";
+$member_seqstr{$member->stable_id} = \@tmp_tab;
+}
+my $seqstr_length;
+foreach my $seqid (keys %member_seqstr) {
+unless (defined $seqstr_length) {
+#print STDERR $member_seqstr{$seqid}->[0],"\n";
+$seqstr_length = scalar @{$member_seqstr{$seqid}};
+next;
+}
+unless ($seqstr_length == scalar @{$member_seqstr{$seqid}}) {
+die "Length of dna alignment are not the same, $seqstr_length and " . scalar @{$member_seqstr{$seqid}} ." respectively for homology_id " . $self->dbID . "\n";
+}
+}
+my %FourD_member_seqstr;
+for (my $i=0; $i < $seqstr_length; $i++) {
+my $FourD_codon = 1;
+my $FourD_aminoacid;
+foreach my $seqid (keys %member_seqstr) {
+if (defined $FOURD_CODONS{$member_seqstr{$seqid}->[$i]}) {
+if (defined $FourD_aminoacid && $FourD_aminoacid eq $FOURD_CODONS{$member_seqstr{$seqid}->[$i]}) {
+#print STDERR "YES ",$FOURD_CODONS{$member_seqstr{$seqid}->[$i]}," ",$member_seqstr{$seqid}->[$i],"\n";
+next;
+} elsif (defined $FourD_aminoacid) {
+#print STDERR "NO ",$FOURD_CODONS{$member_seqstr{$seqid}->[$i]}," ",$member_seqstr{$seqid}->[$i],"\n";
+$FourD_codon = 0;
+last;
+} else {
+$FourD_aminoacid = $FOURD_CODONS{$member_seqstr{$seqid}->[$i]};
+#print STDERR $FOURD_CODONS{$member_seqstr{$seqid}->[$i]}," ",$member_seqstr{$seqid}->[$i]," ";
+}
+next;
+} else {
+#print STDERR "NO ",$CODONS{$member_seqstr{$seqid}->[$i]}," ",$member_seqstr{$seqid}->[$i],"\n";
+$FourD_codon = 0;
+last;
+}
+}
+next unless ($FourD_codon);
+foreach my $seqid (keys %member_seqstr) {
+$FourD_member_seqstr{$seqid} .= substr($member_seqstr{$seqid}->[$i],2,1);
+}
+}
+foreach my $seqid (keys %FourD_member_seqstr) {
+my $seq = Bio::LocatableSeq->new(
+-SEQ    => $FourD_member_seqstr{$seqid},
+-START  => 1,
+-END    => length($FourD_member_seqstr{$seqid}),
+-ID     => $seqid,
+-STRAND => 0
+);
+if($bio07) {
+$sa->addSeq($seq);
+} else {
+$sa->add_seq($seq);
+}
+}
+return $sa;
+}
+my %matrix_hash;
+{
+my $BLOSUM62 = "#  Matrix made by matblas from blosum62.iij
+#  * column uses minimum score
+#  BLOSUM Clustered Scoring Matrix in 1/2 Bit Units
+#  Blocks Database = /data/blocks_5.0/blocks.dat
+#  Cluster Percentage: >= 62
+#  Entropy =   0.6979, Expected =  -0.5209
+A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V  B  Z  X  *
+A  4 -1 -2 -2  0 -1 -1  0 -2 -1 -1 -1 -1 -2 -1  1  0 -3 -2  0 -2 -1  0 -4
+R -1  5  0 -2 -3  1  0 -2  0 -3 -2  2 -1 -3 -2 -1 -1 -3 -2 -3 -1  0 -1 -4
+N -2  0  6  1 -3  0  0  0  1 -3 -3  0 -2 -3 -2  1  0 -4 -2 -3  3  0 -1 -4
+D -2 -2  1  6 -3  0  2 -1 -1 -3 -4 -1 -3 -3 -1  0 -1 -4 -3 -3  4  1 -1 -4
+C  0 -3 -3 -3  9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4
+Q -1  1  0  0 -3  5  2 -2  0 -3 -2  1  0 -3 -1  0 -1 -2 -1 -2  0  3 -1 -4
+E -1  0  0  2 -4  2  5 -2  0 -3 -3  1 -2 -3 -1  0 -1 -3 -2 -2  1  4 -1 -4
+G  0 -2  0 -1 -3 -2 -2  6 -2 -4 -4 -2 -3 -3 -2  0 -2 -2 -3 -3 -1 -2 -1 -4
+H -2  0  1 -1 -3  0  0 -2  8 -3 -3 -1 -2 -1 -2 -1 -2 -2  2 -3  0  0 -1 -4
+I -1 -3 -3 -3 -1 -3 -3 -4 -3  4  2 -3  1  0 -3 -2 -1 -3 -1  3 -3 -3 -1 -4
+L -1 -2 -3 -4 -1 -2 -3 -4 -3  2  4 -2  2  0 -3 -2 -1 -2 -1  1 -4 -3 -1 -4
+K -1  2  0 -1 -3  1  1 -2 -1 -3 -2  5 -1 -3 -1  0 -1 -3 -2 -2  0  1 -1 -4
+M -1 -1 -2 -3 -1  0 -2 -3 -2  1  2 -1  5  0 -2 -1 -1 -1 -1  1 -3 -1 -1 -4
+F -2 -3 -3 -3 -2 -3 -3 -3 -1  0  0 -3  0  6 -4 -2 -2  1  3 -1 -3 -3 -1 -4
+P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4  7 -1 -1 -4 -3 -2 -2 -1 -2 -4
+S  1 -1  1  0 -1  0  0  0 -1 -2 -2  0 -1 -2 -1  4  1 -3 -2 -2  0  0  0 -4
+T  0 -1  0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1  1  5 -2 -2  0 -1 -1  0 -4
+W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1  1 -4 -3 -2 11  2 -3 -4 -3 -2 -4
+Y -2 -2 -2 -3 -2 -1 -2 -3  2 -1 -1 -2 -1  3 -3 -2 -2  2  7 -1 -3 -2 -1 -4
+V  0 -3 -3 -3 -1 -2 -2 -3 -3  3  1 -2  1 -1 -2 -2  0 -3 -1  4 -3 -2 -1 -4
+B -2 -1  3  4 -3  0  1 -1  0 -3 -4  0 -3 -3 -2  0 -1 -4 -3 -3  4  1 -1 -4
+Z -1  0  0  1 -3  3  4 -2  0 -3 -3  1 -1 -3 -1  0 -1 -3 -2 -2  1  4 -1 -4
+X  0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2  0  0 -2 -1 -1 -1 -1 -1 -4
+* -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4  1
+";
+my $matrix_string;
+my @lines = split(/\n/,$BLOSUM62);
+foreach my $line (@lines) {
+next if ($line =~ /^\#/);
+if ($line =~ /^[A-Z\*\s]+$/) {
+$matrix_string .= sprintf "$line\n";
+} else {
+my @t = split(/\s+/,$line);
+shift @t;
+#       print scalar @t,"\n";
+$matrix_string .= sprintf(join(" ",@t)."\n");
+}
+}
+#  my %matrix_hash;
+@lines = ();
+@lines = split /\n/, $matrix_string;
+my $lts = shift @lines;
+$lts =~ s/^\s+//;
+$lts =~ s/\s+$//;
+my @letters = split /\s+/, $lts;
+foreach my $letter (@letters) {
+my $line = shift @lines;
+$line =~ s/^\s+//;
+$line =~ s/\s+$//;
+my @penalties = split /\s+/, $line;
+die "Size of letters array and penalties array are different\n"
+unless (scalar @letters == scalar @penalties);
+for (my $i=0; $i < scalar @letters; $i++) {
+$matrix_hash{uc $letter}{uc $letters[$i]} = $penalties[$i];
+}
+}
+}
+sub update_alignment_stats {
+my $self = shift;
+my $genes = $self->get_all_Members;
+my $ngenes = scalar(@$genes);
+if ($ngenes == 2) {
+# This code is >4 times faster with pairs of genes
+my $gene1 = $genes->[0];
+my $gene2 = $genes->[1];
+my $new_aln1_cigarline = "";
+my $new_aln2_cigarline = "";
+my $identical_matches = 0;
+my $positive_matches = 0;
+my ($aln1state, $aln2state);
+my ($aln1count, $aln2count);
+my ($aln1cov, $aln2cov) = (0,0);
+my @aln1 = split(//, $gene1->alignment_string);
+my @aln2 = split(//, $gene2->alignment_string);
+for (my $i=0; $i <= $#aln1; $i++) {
+next if ($aln1[$i] eq '-' && $aln2[$i] eq '-');
+my $cur_aln1state = ($aln1[$i] eq '-' ? 'D' : 'M');
+my $cur_aln2state = ($aln2[$i] eq '-' ? 'D' : 'M');
+$aln1cov++ if $cur_aln1state ne 'D';
+$aln2cov++ if $cur_aln2state ne 'D';
+if ($cur_aln1state eq 'M' && $cur_aln2state eq 'M') {
+if ($aln1[$i] eq $aln2[$i]) {
+$identical_matches++;
+$positive_matches++;
+} elsif ($matrix_hash{uc $aln1[$i]}{uc $aln2[$i]} > 0) {
+$positive_matches++;
+}
+}
+unless (defined $aln1state) {
+$aln1count = 1;
+$aln2count = 1;
+$aln1state = $cur_aln1state;
+$aln2state = $cur_aln2state;
+next;
+}
+if ($cur_aln1state eq $aln1state) {
+$aln1count++;
+} else {
+if ($aln1count == 1) {
+$new_aln1_cigarline .= $aln1state;
+} else {
+$new_aln1_cigarline .= $aln1count.$aln1state;
+}
+$aln1count = 1;
+$aln1state = $cur_aln1state;
+}
+if ($cur_aln2state eq $aln2state) {
+$aln2count++;
+} else {
+if ($aln2count == 1) {
+$new_aln2_cigarline .= $aln2state;
+} else {
+$new_aln2_cigarline .= $aln2count.$aln2state;
+}
+$aln2count = 1;
+$aln2state = $cur_aln2state;
+}
+}
+if ($aln1count == 1) {
+$new_aln1_cigarline .= $aln1state;
+} else {
+$new_aln1_cigarline .= $aln1count.$aln1state;
+}
+if ($aln2count == 1) {
+$new_aln2_cigarline .= $aln2state;
+} else {
+$new_aln2_cigarline .= $aln2count.$aln2state;
+}
+my $seq_length1 = $gene1->seq_length;
+unless (0 == $seq_length1) {
+$gene1->cigar_line($new_aln1_cigarline);
+$gene1->perc_id( int((100.0 * $identical_matches / $seq_length1 + 0.5)) );
+$gene1->perc_pos( int((100.0 * $positive_matches  / $seq_length1 + 0.5)) );
+$gene1->perc_cov( int((100.0 * $aln1cov / $seq_length1 + 0.5)) );
+}
+my $seq_length2 = $gene2->seq_length;
+unless (0 == $seq_length2) {
+$gene2->cigar_line($new_aln2_cigarline);
+$gene2->perc_id( int((100.0 * $identical_matches / $seq_length2 + 0.5)) );
+$gene2->perc_pos( int((100.0 * $positive_matches  / $seq_length2 + 0.5)) );
+$gene2->perc_cov( int((100.0 * $aln2cov / $seq_length2 + 0.5)) );
+}
+return undef;
+}
+my $min_seq = shift;
+$min_seq = int($min_seq * $ngenes) if $min_seq <= 1;
+my @new_cigars   = ('') x $ngenes;
+my @nmatch_id    = (0) x $ngenes;
+my @nmatch_pos   = (0) x $ngenes;
+my @nmatch_cov   = (0) x $ngenes;
+my @alncount     = (1) x $ngenes;
+my @alnstate     = (undef) x $ngenes;
+my @cur_alnstate = (undef) x $ngenes;
+my @aln = map {$_->alignment_string} @$genes;
+my $aln_length = length($aln[0]);
+for (my $i=0; $i < $aln_length; $i++) {
+my @char_i =  map {substr($_, $i, 1)} @aln;
+#print "pos $i: ", join('/', @char_i), "\n";
+my %seen;
+map {$seen{$_}++} @char_i;
+next if $seen{'-'} == $ngenes;
+delete $seen{'-'};
+my %pos_chars = ();
+my @chars = keys %seen;
+while (my $c1 = shift @chars) {
+foreach my $c2 (@chars) {
+if (($matrix_hash{uc $c1}{uc $c2} > 0) and ($seen{$c1}+$seen{$c2} >= $min_seq)) {
+$pos_chars{$c1} = 1;
+$pos_chars{$c2} = 1;
+}
+}
+}
+for (my $j=0; $j<$ngenes; $j++) {
+if ($char_i[$j] eq '-') {
+$cur_alnstate[$j] = 'D';
+} else {
+$nmatch_cov[$j]++;
+$cur_alnstate[$j] = 'M';
+if ($seen{$char_i[$j]} >= $min_seq) {
+$nmatch_id[$j]++;
+$nmatch_pos[$j]++;
+} elsif (exists $pos_chars{$char_i[$j]}) {
+$nmatch_pos[$j]++;
+}
+}
+}
+if ($i == 0) {
+@alnstate = @cur_alnstate;
+next;
+}
+for (my $j=0; $j<$ngenes; $j++) {
+if ($cur_alnstate[$j] eq $alnstate[$j]) {
+$alncount[$j]++;
+} else {
+if ($alncount[$j] == 1) {
+$new_cigars[$j] .= $alnstate[$j];
+} else {
+$new_cigars[$j] .= $alncount[$j].$alnstate[$j];
+}
+$alncount[$j] = 1;
+$alnstate[$j] = $cur_alnstate[$j];
+}
+}
+}
+for (my $j=0; $j<$ngenes; $j++) {
+if ($alncount[$j] == 1) {
+$new_cigars[$j] .= $alnstate[$j];
+} else {
+$new_cigars[$j] .= $alncount[$j].$alnstate[$j];
+}
+$genes->[$j]->cigar_line($new_cigars[$j]);
+my $seq_length = $genes->[$j]->seq_length;
+unless (0 == $seq_length) {
+$genes->[$j]->perc_id( int((100.0 * $nmatch_id[$j] / $seq_length + 0.5)) );
+$genes->[$j]->perc_pos( int((100.0 * $nmatch_pos[$j] / $seq_length + 0.5)) );
+$genes->[$j]->perc_cov( int((100.0 * $nmatch_cov[$j] / $seq_length + 0.5)) );
+}
+}
+}
+1;

Mercurial > repos > willmclaren > ensembl_vep

comparison variant_effect_predictor/Bio/EnsEMBL/Compara/AlignedMemberSet.pm @ 0:21066c0abaf5 draft