diff variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/ComparaUtils.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/ComparaUtils.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,337 @@
+=head1 LICENSE
+
+ Copyright (c) 1999-2012 The European Bioinformatics Institute and
+ Genome Research Limited.  All rights reserved.
+
+ This software is distributed under a modified Apache license.
+ For license details, please see
+
+   http://www.ensembl.org/info/about/code_licence.html
+
+=head1 CONTACT
+
+ Please email comments or questions to the public Ensembl
+ developers list at <dev@ensembl.org>.
+
+ Questions may also be sent to the Ensembl help desk at
+ <helpdesk@ensembl.org>.
+
+=cut
+
+=head1 NAME
+
+Bio::EnsEMBL::Variation::Utils::ComparaUtils
+
+=head1 DESCRIPTION
+
+This module exports two subroutines, dump_alignment_for_polyphen and
+dump_alignment_for_sift that write Compara alignments to files in the 
+formats expected by PolyPhen and SIFT. This allows you to use
+Compara alignments in place of both these tools' alignment pipelines
+
+=cut
+
+package Bio::EnsEMBL::Variation::Utils::ComparaUtils;
+
+use strict;
+use warnings;
+
+use Bio::EnsEMBL::Utils::Exception qw(throw warning);
+use Bio::EnsEMBL::Registry;
+use Bio::SimpleAlign;
+use Bio::AlignIO;
+use Bio::LocatableSeq;
+
+use base qw(Exporter);
+
+our @EXPORT_OK = qw(dump_alignment_for_polyphen dump_alignment_for_sift);
+
+my $MAX_PSIC_SEQS   = 8190;
+my $MAX_PSIC_SEQLEN = 409650;
+
+sub _ungap_alignment {
+    
+    # turn a gapped alignment into an ungapped alignment
+    # with respect to the given query_id,
+    #
+    # e.g. if our query sequence is no. 1 below, we want to 
+    # turn this:
+    #
+    # 1: -ABC--DEFG-H---
+    # 2: ---IJKLMN--OPQR
+    # 3: ST------UVWXYZ-
+    #
+    # into:
+    #
+    # 1: ABCDEFGH
+    # 2: --ILMN-O
+    # 3: T----UVX
+
+    my ($gapped_alignment, $query_id, $include_query) = @_;
+
+    my @seqs = $gapped_alignment->each_seq;
+
+    # find the Seq object corresponding to the query id 
+
+    my $query_seq;
+
+    for my $seq (@seqs) {
+        $query_seq = $seq;
+        last if $seq->display_id eq $query_id;
+    }
+
+    throw("Could not find query sequence '$query_id' in the alignment")
+        unless $query_seq->display_id eq $query_id;
+
+    my $qseq = $query_seq->seq;
+
+    #print $qseq, " (".length($qseq).")\n";
+
+    # first we split the query sequence into its component parts and gaps,
+    # so -ABC--DEFG-H--- will become ('ABC','DEFG','H') and ('-','--','-','---')
+    
+    # we grep because we don't want any empty strings that otherwise seem to 
+    # make it through
+
+    my @parts = grep {$_} split /-+/, $qseq;
+    my @gaps = grep {$_} split /[^-]+/, $qseq;
+
+    my @chunks;
+
+    # then we compute the coordinates of each sequence chunk,
+    # taking into account that we might start with a gap
+
+    my $offset = substr($qseq,0,1) eq '-' ? length(shift @gaps) : 0;
+
+    # we build up a list of [start, length] pairs for each chunk
+    # of sequence, incrementing our offset by the length of each 
+    # chunk and the gap that follows it, for our example above this 
+    # will result in ([1,3],[6,4],[11,1])
+
+    for my $part (@parts) {
+
+        my $l = length($part);
+
+        push @chunks, [$offset, $l];
+
+        #print "$part: $offset - $l\n";
+
+        $offset += $l + length(shift @gaps || '');
+    }
+
+    # we then use this list of chunks to obtain the aligned portions
+    # of all other members of the alignment and create a new alignment,
+   
+    my @seq_objs;
+
+    for my $seq (@seqs) {
+        
+        my $old_seq = $seq->seq;
+        my $new_seq;
+
+        for my $chunk (@chunks) {
+            $new_seq .= substr($old_seq, $chunk->[0], $chunk->[1]);
+        }
+
+        next if $new_seq =~ /^-*$/;
+
+        #my $gap_count = ($new_seq =~ tr/-//);
+
+        #next if $gap_count / length($new_seq) > 0.1;
+
+        if ($seq->display_id eq $query_id) {
+
+            $query_seq =  Bio::LocatableSeq->new(
+                -SEQ    => $new_seq,
+                -START  => 1,
+                -END    => length($new_seq),
+                -ID     => 'QUERY',
+                -STRAND => 0
+            );
+            
+            unshift @seq_objs, $query_seq if $include_query;
+        }
+        else {
+            push @seq_objs,  Bio::LocatableSeq->new(
+                -SEQ    => $new_seq,
+                -START  => 1,
+                -END    => length($new_seq),
+                -ID     => $seq->display_id,
+                -STRAND => 0
+            )
+        }
+    }
+
+    my $new_align = Bio::SimpleAlign->new;
+
+    $new_align->add_seq($_) for @seq_objs;
+
+    # sometimes we want the query sequence back as well as the new alignment
+
+    return wantarray ? ($query_seq, $new_align) : $new_align;
+}
+
+sub _get_ungapped_alignment {
+
+    # get an ungapped Bio::SimpleAlign for the given query translation
+
+    my ($translation_stable_id, $include_query) = @_;
+
+    my $compara_dba = Bio::EnsEMBL::Registry->get_DBAdaptor('multi', 'compara')
+        or throw("Failed to get compara DBAdaptor");
+
+    my $ma = $compara_dba->get_MemberAdaptor
+        or throw("Failed to get member adaptor");
+
+    my $fa = $compara_dba->get_FamilyAdaptor
+        or throw("Failed to get family adaptor");
+
+    my $member = $ma->fetch_by_source_stable_id("ENSEMBLPEP", $translation_stable_id) 
+        or throw("Didn't find family member for $translation_stable_id");
+
+    my $fams = $fa->fetch_all_by_Member($member) 
+        or throw("Didn't find a family for $translation_stable_id");
+
+    throw("$translation_stable_id is in more than one family") if @$fams > 1;
+
+    my $orig_align = $fams->[0]->get_SimpleAlign;
+
+    $compara_dba->dbc->disconnect_if_idle;
+
+    return _ungap_alignment(
+        $orig_align, 
+        $translation_stable_id, 
+        $include_query
+    );
+}
+
+sub _percent_id {
+    my ($q, $a) = @_;
+
+    my @q = split //, $q->seq;
+    my @a = split //, $a->seq;
+
+    my $num = scalar(@q);
+        
+    my $tot = 0.0;
+
+    for (my $i = 0; $i < $num; $i++ ) {
+        $tot++ if $q[$i] eq $a[$i];
+    }
+
+    return ($tot / $num);
+}
+
+=head2 dump_alignment_for_polyphen
+
+  Arg[1]      : string $translation_stable_id - the stable of the Ensembl translation
+                you want to run PolyPhen on
+  Arg[2]      : string $file - the name of the file you want to write the alignment to
+  Description : Fetches the Compara protein family containing the specified translation
+                (if available), ungaps the alignment with respect to the translation, and 
+                writes the alignment to the specified file in the format expected by PolyPhen
+  Returntype  : none
+  Exceptions  : throws if an alignment cannot be found, or if the file cannot be written 
+  Status      : At Risk
+  
+=cut
+
+sub dump_alignment_for_polyphen {
+
+    my ($translation_stable_id, $file) = @_;
+
+    # polyphen does not want the query included in the alignment
+
+    my ($query_seq, $alignment) = _get_ungapped_alignment($translation_stable_id, 0);
+
+    my @seqs = $alignment->each_seq;
+    
+    unless (scalar(@seqs)) {
+        throw("No sequences in the alignment for $translation_stable_id");
+    }
+
+    if (length($seqs[0]->seq) > $MAX_PSIC_SEQLEN) {
+        throw("$translation_stable_id sequence too long for PSIC");
+    }
+
+    # polyphen expects the alignment to be sorted descending by % id
+
+    my $percent_id;
+
+    for my $seq (@seqs) {
+        $percent_id->{$seq->id} = _percent_id($query_seq, $seq);
+    }
+
+    my @sorted = sort { $percent_id->{$b->id} <=> $percent_id->{$a->id} } @seqs;
+
+    # the format is similar to a clustalw .aln file, but it *must* have a
+    # 70 character description beginning each line, and then the alignment string
+    # on the rest of the line with no line breaks between sequences, PSIC
+    # can also only deal with a fixed maximum sequence length and number
+    # of sequences on the alignment, these are set in the constants at the
+    # top of this file
+
+    my $num_seqs = scalar(@seqs);
+
+    open my $ALN, ">$file" or throw("Failed to open $file for writing");
+
+    print $ALN "CLUSTAL $translation_stable_id (".($num_seqs > $MAX_PSIC_SEQS ? $MAX_PSIC_SEQS : $num_seqs).")\n\n";
+
+    my $count = 0;
+
+    for my $seq (@sorted) {
+        if ($percent_id->{$seq->id} == 1) {
+            #warn "Ignoring identical seq ".$seq->id."\n";
+            #next;
+        }
+
+        my $id = $seq->id;
+        my $extra = 70 - length($id);
+        
+        print $ALN $seq->id, ' ' x $extra, $seq->seq, "\n";                
+
+        last if ++$count >= $MAX_PSIC_SEQS;
+    }
+
+    close $ALN;
+}
+
+=head2 dump_alignment_for_sift
+
+  Arg[1]      : string $translation_stable_id - the stable id of the Ensembl translation
+                you want to run SIFT on
+  Arg[2]      : string $file - the name of the file you want to write the alignment to
+  Description : Fetches the Compara protein family containing the specified translation
+                (if available), ungaps the alignment with respect to the translation, and 
+                writes the alignment to the specified file in the format expected by SIFT
+  Returntype  : none
+  Exceptions  : throws if an alignment cannot be found, or if the file cannot be written 
+  Status      : At Risk
+  
+=cut
+
+sub dump_alignment_for_sift {
+
+    my ($translation_stable_id, $file) = @_;
+
+    # SIFT is easy, we just fetch an ungapped alignment including the query 
+    # and dump it in FASTA format
+
+    my $aln = _get_ungapped_alignment($translation_stable_id, 1);
+
+    open my $ALN, ">$file" or throw("Failed to open $file for writing");
+
+    my $alignIO = Bio::AlignIO->newFh(
+        -interleaved => 0,
+        -fh          => $ALN,
+        -format      => 'fasta',
+        -idlength    => 20,
+    );
+
+    print $alignIO $aln;
+
+    close $ALN;
+}
+
+1;
+