ensembl: variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/ComparaUtils.pm comparison

comparison variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/ComparaUtils.pm @ 0:1f6dce3d34e0

Uploaded

author	mahtabm
date	Thu, 11 Apr 2013 02:01:53 -0400
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:1f6dce3d34e0
+=head1 LICENSE
+Copyright (c) 1999-2012 The European Bioinformatics Institute and
+Genome Research Limited.  All rights reserved.
+This software is distributed under a modified Apache license.
+For license details, please see
+http://www.ensembl.org/info/about/code_licence.html
+=head1 CONTACT
+Please email comments or questions to the public Ensembl
+developers list at <dev@ensembl.org>.
+Questions may also be sent to the Ensembl help desk at
+<helpdesk@ensembl.org>.
+=cut
+=head1 NAME
+Bio::EnsEMBL::Variation::Utils::ComparaUtils
+=head1 DESCRIPTION
+This module exports two subroutines, dump_alignment_for_polyphen and
+dump_alignment_for_sift that write Compara alignments to files in the
+formats expected by PolyPhen and SIFT. This allows you to use
+Compara alignments in place of both these tools' alignment pipelines
+=cut
+package Bio::EnsEMBL::Variation::Utils::ComparaUtils;
+use strict;
+use warnings;
+use Bio::EnsEMBL::Utils::Exception qw(throw warning);
+use Bio::EnsEMBL::Registry;
+use Bio::SimpleAlign;
+use Bio::AlignIO;
+use Bio::LocatableSeq;
+use base qw(Exporter);
+our @EXPORT_OK = qw(dump_alignment_for_polyphen dump_alignment_for_sift);
+my $MAX_PSIC_SEQS   = 8190;
+my $MAX_PSIC_SEQLEN = 409650;
+sub _ungap_alignment {
+# turn a gapped alignment into an ungapped alignment
+# with respect to the given query_id,
+#
+# e.g. if our query sequence is no. 1 below, we want to
+# turn this:
+#
+# 1: -ABC--DEFG-H---
+# 2: ---IJKLMN--OPQR
+# 3: ST------UVWXYZ-
+#
+# into:
+#
+# 1: ABCDEFGH
+# 2: --ILMN-O
+# 3: T----UVX
+my ($gapped_alignment, $query_id, $include_query) = @_;
+my @seqs = $gapped_alignment->each_seq;
+# find the Seq object corresponding to the query id
+my $query_seq;
+for my $seq (@seqs) {
+$query_seq = $seq;
+last if $seq->display_id eq $query_id;
+}
+throw("Could not find query sequence '$query_id' in the alignment")
+unless $query_seq->display_id eq $query_id;
+my $qseq = $query_seq->seq;
+#print $qseq, " (".length($qseq).")\n";
+# first we split the query sequence into its component parts and gaps,
+# so -ABC--DEFG-H--- will become ('ABC','DEFG','H') and ('-','--','-','---')
+# we grep because we don't want any empty strings that otherwise seem to
+# make it through
+my @parts = grep {$_} split /-+/, $qseq;
+my @gaps = grep {$_} split /[^-]+/, $qseq;
+my @chunks;
+# then we compute the coordinates of each sequence chunk,
+# taking into account that we might start with a gap
+my $offset = substr($qseq,0,1) eq '-' ? length(shift @gaps) : 0;
+# we build up a list of [start, length] pairs for each chunk
+# of sequence, incrementing our offset by the length of each
+# chunk and the gap that follows it, for our example above this
+# will result in ([1,3],[6,4],[11,1])
+for my $part (@parts) {
+my $l = length($part);
+push @chunks, [$offset, $l];
+#print "$part: $offset - $l\n";
+$offset += $l + length(shift @gaps || '');
+}
+# we then use this list of chunks to obtain the aligned portions
+# of all other members of the alignment and create a new alignment,
+my @seq_objs;
+for my $seq (@seqs) {
+my $old_seq = $seq->seq;
+my $new_seq;
+for my $chunk (@chunks) {
+$new_seq .= substr($old_seq, $chunk->[0], $chunk->[1]);
+}
+next if $new_seq =~ /^-*$/;
+#my $gap_count = ($new_seq =~ tr/-//);
+#next if $gap_count / length($new_seq) > 0.1;
+if ($seq->display_id eq $query_id) {
+$query_seq =  Bio::LocatableSeq->new(
+-SEQ    => $new_seq,
+-START  => 1,
+-END    => length($new_seq),
+-ID     => 'QUERY',
+-STRAND => 0
+);
+unshift @seq_objs, $query_seq if $include_query;
+}
+else {
+push @seq_objs,  Bio::LocatableSeq->new(
+-SEQ    => $new_seq,
+-START  => 1,
+-END    => length($new_seq),
+-ID     => $seq->display_id,
+-STRAND => 0
+)
+}
+}
+my $new_align = Bio::SimpleAlign->new;
+$new_align->add_seq($_) for @seq_objs;
+# sometimes we want the query sequence back as well as the new alignment
+return wantarray ? ($query_seq, $new_align) : $new_align;
+}
+sub _get_ungapped_alignment {
+# get an ungapped Bio::SimpleAlign for the given query translation
+my ($translation_stable_id, $include_query) = @_;
+my $compara_dba = Bio::EnsEMBL::Registry->get_DBAdaptor('multi', 'compara')
+or throw("Failed to get compara DBAdaptor");
+my $ma = $compara_dba->get_MemberAdaptor
+or throw("Failed to get member adaptor");
+my $fa = $compara_dba->get_FamilyAdaptor
+or throw("Failed to get family adaptor");
+my $member = $ma->fetch_by_source_stable_id("ENSEMBLPEP", $translation_stable_id)
+or throw("Didn't find family member for $translation_stable_id");
+my $fams = $fa->fetch_all_by_Member($member)
+or throw("Didn't find a family for $translation_stable_id");
+throw("$translation_stable_id is in more than one family") if @$fams > 1;
+my $orig_align = $fams->[0]->get_SimpleAlign;
+$compara_dba->dbc->disconnect_if_idle;
+return _ungap_alignment(
+$orig_align,
+$translation_stable_id,
+$include_query
+);
+}
+sub _percent_id {
+my ($q, $a) = @_;
+my @q = split //, $q->seq;
+my @a = split //, $a->seq;
+my $num = scalar(@q);
+my $tot = 0.0;
+for (my $i = 0; $i < $num; $i++ ) {
+$tot++ if $q[$i] eq $a[$i];
+}
+return ($tot / $num);
+}
+=head2 dump_alignment_for_polyphen
+Arg[1]      : string $translation_stable_id - the stable of the Ensembl translation
+you want to run PolyPhen on
+Arg[2]      : string $file - the name of the file you want to write the alignment to
+Description : Fetches the Compara protein family containing the specified translation
+(if available), ungaps the alignment with respect to the translation, and
+writes the alignment to the specified file in the format expected by PolyPhen
+Returntype  : none
+Exceptions  : throws if an alignment cannot be found, or if the file cannot be written
+Status      : At Risk
+=cut
+sub dump_alignment_for_polyphen {
+my ($translation_stable_id, $file) = @_;
+# polyphen does not want the query included in the alignment
+my ($query_seq, $alignment) = _get_ungapped_alignment($translation_stable_id, 0);
+my @seqs = $alignment->each_seq;
+unless (scalar(@seqs)) {
+throw("No sequences in the alignment for $translation_stable_id");
+}
+if (length($seqs[0]->seq) > $MAX_PSIC_SEQLEN) {
+throw("$translation_stable_id sequence too long for PSIC");
+}
+# polyphen expects the alignment to be sorted descending by % id
+my $percent_id;
+for my $seq (@seqs) {
+$percent_id->{$seq->id} = _percent_id($query_seq, $seq);
+}
+my @sorted = sort { $percent_id->{$b->id} <=> $percent_id->{$a->id} } @seqs;
+# the format is similar to a clustalw .aln file, but it *must* have a
+# 70 character description beginning each line, and then the alignment string
+# on the rest of the line with no line breaks between sequences, PSIC
+# can also only deal with a fixed maximum sequence length and number
+# of sequences on the alignment, these are set in the constants at the
+# top of this file
+my $num_seqs = scalar(@seqs);
+open my $ALN, ">$file" or throw("Failed to open $file for writing");
+print $ALN "CLUSTAL $translation_stable_id (".($num_seqs > $MAX_PSIC_SEQS ? $MAX_PSIC_SEQS : $num_seqs).")\n\n";
+my $count = 0;
+for my $seq (@sorted) {
+if ($percent_id->{$seq->id} == 1) {
+#warn "Ignoring identical seq ".$seq->id."\n";
+#next;
+}
+my $id = $seq->id;
+my $extra = 70 - length($id);
+print $ALN $seq->id, ' ' x $extra, $seq->seq, "\n";
+last if ++$count >= $MAX_PSIC_SEQS;
+}
+close $ALN;
+}
+=head2 dump_alignment_for_sift
+Arg[1]      : string $translation_stable_id - the stable id of the Ensembl translation
+you want to run SIFT on
+Arg[2]      : string $file - the name of the file you want to write the alignment to
+Description : Fetches the Compara protein family containing the specified translation
+(if available), ungaps the alignment with respect to the translation, and
+writes the alignment to the specified file in the format expected by SIFT
+Returntype  : none
+Exceptions  : throws if an alignment cannot be found, or if the file cannot be written
+Status      : At Risk
+=cut
+sub dump_alignment_for_sift {
+my ($translation_stable_id, $file) = @_;
+# SIFT is easy, we just fetch an ungapped alignment including the query
+# and dump it in FASTA format
+my $aln = _get_ungapped_alignment($translation_stable_id, 1);
+open my $ALN, ">$file" or throw("Failed to open $file for writing");
+my $alignIO = Bio::AlignIO->newFh(
+-interleaved => 0,
+-fh          => $ALN,
+-format      => 'fasta',
+-idlength    => 20,
+);
+print $alignIO $aln;
+close $ALN;
+}
+1;

Mercurial > repos > mahtabm > ensembl

comparison variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/ComparaUtils.pm @ 0:1f6dce3d34e0