Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/ComparaUtils.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/ComparaUtils.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,337 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <dev@ensembl.org>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=cut + +=head1 NAME + +Bio::EnsEMBL::Variation::Utils::ComparaUtils + +=head1 DESCRIPTION + +This module exports two subroutines, dump_alignment_for_polyphen and +dump_alignment_for_sift that write Compara alignments to files in the +formats expected by PolyPhen and SIFT. This allows you to use +Compara alignments in place of both these tools' alignment pipelines + +=cut + +package Bio::EnsEMBL::Variation::Utils::ComparaUtils; + +use strict; +use warnings; + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Bio::EnsEMBL::Registry; +use Bio::SimpleAlign; +use Bio::AlignIO; +use Bio::LocatableSeq; + +use base qw(Exporter); + +our @EXPORT_OK = qw(dump_alignment_for_polyphen dump_alignment_for_sift); + +my $MAX_PSIC_SEQS = 8190; +my $MAX_PSIC_SEQLEN = 409650; + +sub _ungap_alignment { + + # turn a gapped alignment into an ungapped alignment + # with respect to the given query_id, + # + # e.g. if our query sequence is no. 1 below, we want to + # turn this: + # + # 1: -ABC--DEFG-H--- + # 2: ---IJKLMN--OPQR + # 3: ST------UVWXYZ- + # + # into: + # + # 1: ABCDEFGH + # 2: --ILMN-O + # 3: T----UVX + + my ($gapped_alignment, $query_id, $include_query) = @_; + + my @seqs = $gapped_alignment->each_seq; + + # find the Seq object corresponding to the query id + + my $query_seq; + + for my $seq (@seqs) { + $query_seq = $seq; + last if $seq->display_id eq $query_id; + } + + throw("Could not find query sequence '$query_id' in the alignment") + unless $query_seq->display_id eq $query_id; + + my $qseq = $query_seq->seq; + + #print $qseq, " (".length($qseq).")\n"; + + # first we split the query sequence into its component parts and gaps, + # so -ABC--DEFG-H--- will become ('ABC','DEFG','H') and ('-','--','-','---') + + # we grep because we don't want any empty strings that otherwise seem to + # make it through + + my @parts = grep {$_} split /-+/, $qseq; + my @gaps = grep {$_} split /[^-]+/, $qseq; + + my @chunks; + + # then we compute the coordinates of each sequence chunk, + # taking into account that we might start with a gap + + my $offset = substr($qseq,0,1) eq '-' ? length(shift @gaps) : 0; + + # we build up a list of [start, length] pairs for each chunk + # of sequence, incrementing our offset by the length of each + # chunk and the gap that follows it, for our example above this + # will result in ([1,3],[6,4],[11,1]) + + for my $part (@parts) { + + my $l = length($part); + + push @chunks, [$offset, $l]; + + #print "$part: $offset - $l\n"; + + $offset += $l + length(shift @gaps || ''); + } + + # we then use this list of chunks to obtain the aligned portions + # of all other members of the alignment and create a new alignment, + + my @seq_objs; + + for my $seq (@seqs) { + + my $old_seq = $seq->seq; + my $new_seq; + + for my $chunk (@chunks) { + $new_seq .= substr($old_seq, $chunk->[0], $chunk->[1]); + } + + next if $new_seq =~ /^-*$/; + + #my $gap_count = ($new_seq =~ tr/-//); + + #next if $gap_count / length($new_seq) > 0.1; + + if ($seq->display_id eq $query_id) { + + $query_seq = Bio::LocatableSeq->new( + -SEQ => $new_seq, + -START => 1, + -END => length($new_seq), + -ID => 'QUERY', + -STRAND => 0 + ); + + unshift @seq_objs, $query_seq if $include_query; + } + else { + push @seq_objs, Bio::LocatableSeq->new( + -SEQ => $new_seq, + -START => 1, + -END => length($new_seq), + -ID => $seq->display_id, + -STRAND => 0 + ) + } + } + + my $new_align = Bio::SimpleAlign->new; + + $new_align->add_seq($_) for @seq_objs; + + # sometimes we want the query sequence back as well as the new alignment + + return wantarray ? ($query_seq, $new_align) : $new_align; +} + +sub _get_ungapped_alignment { + + # get an ungapped Bio::SimpleAlign for the given query translation + + my ($translation_stable_id, $include_query) = @_; + + my $compara_dba = Bio::EnsEMBL::Registry->get_DBAdaptor('multi', 'compara') + or throw("Failed to get compara DBAdaptor"); + + my $ma = $compara_dba->get_MemberAdaptor + or throw("Failed to get member adaptor"); + + my $fa = $compara_dba->get_FamilyAdaptor + or throw("Failed to get family adaptor"); + + my $member = $ma->fetch_by_source_stable_id("ENSEMBLPEP", $translation_stable_id) + or throw("Didn't find family member for $translation_stable_id"); + + my $fams = $fa->fetch_all_by_Member($member) + or throw("Didn't find a family for $translation_stable_id"); + + throw("$translation_stable_id is in more than one family") if @$fams > 1; + + my $orig_align = $fams->[0]->get_SimpleAlign; + + $compara_dba->dbc->disconnect_if_idle; + + return _ungap_alignment( + $orig_align, + $translation_stable_id, + $include_query + ); +} + +sub _percent_id { + my ($q, $a) = @_; + + my @q = split //, $q->seq; + my @a = split //, $a->seq; + + my $num = scalar(@q); + + my $tot = 0.0; + + for (my $i = 0; $i < $num; $i++ ) { + $tot++ if $q[$i] eq $a[$i]; + } + + return ($tot / $num); +} + +=head2 dump_alignment_for_polyphen + + Arg[1] : string $translation_stable_id - the stable of the Ensembl translation + you want to run PolyPhen on + Arg[2] : string $file - the name of the file you want to write the alignment to + Description : Fetches the Compara protein family containing the specified translation + (if available), ungaps the alignment with respect to the translation, and + writes the alignment to the specified file in the format expected by PolyPhen + Returntype : none + Exceptions : throws if an alignment cannot be found, or if the file cannot be written + Status : At Risk + +=cut + +sub dump_alignment_for_polyphen { + + my ($translation_stable_id, $file) = @_; + + # polyphen does not want the query included in the alignment + + my ($query_seq, $alignment) = _get_ungapped_alignment($translation_stable_id, 0); + + my @seqs = $alignment->each_seq; + + unless (scalar(@seqs)) { + throw("No sequences in the alignment for $translation_stable_id"); + } + + if (length($seqs[0]->seq) > $MAX_PSIC_SEQLEN) { + throw("$translation_stable_id sequence too long for PSIC"); + } + + # polyphen expects the alignment to be sorted descending by % id + + my $percent_id; + + for my $seq (@seqs) { + $percent_id->{$seq->id} = _percent_id($query_seq, $seq); + } + + my @sorted = sort { $percent_id->{$b->id} <=> $percent_id->{$a->id} } @seqs; + + # the format is similar to a clustalw .aln file, but it *must* have a + # 70 character description beginning each line, and then the alignment string + # on the rest of the line with no line breaks between sequences, PSIC + # can also only deal with a fixed maximum sequence length and number + # of sequences on the alignment, these are set in the constants at the + # top of this file + + my $num_seqs = scalar(@seqs); + + open my $ALN, ">$file" or throw("Failed to open $file for writing"); + + print $ALN "CLUSTAL $translation_stable_id (".($num_seqs > $MAX_PSIC_SEQS ? $MAX_PSIC_SEQS : $num_seqs).")\n\n"; + + my $count = 0; + + for my $seq (@sorted) { + if ($percent_id->{$seq->id} == 1) { + #warn "Ignoring identical seq ".$seq->id."\n"; + #next; + } + + my $id = $seq->id; + my $extra = 70 - length($id); + + print $ALN $seq->id, ' ' x $extra, $seq->seq, "\n"; + + last if ++$count >= $MAX_PSIC_SEQS; + } + + close $ALN; +} + +=head2 dump_alignment_for_sift + + Arg[1] : string $translation_stable_id - the stable id of the Ensembl translation + you want to run SIFT on + Arg[2] : string $file - the name of the file you want to write the alignment to + Description : Fetches the Compara protein family containing the specified translation + (if available), ungaps the alignment with respect to the translation, and + writes the alignment to the specified file in the format expected by SIFT + Returntype : none + Exceptions : throws if an alignment cannot be found, or if the file cannot be written + Status : At Risk + +=cut + +sub dump_alignment_for_sift { + + my ($translation_stable_id, $file) = @_; + + # SIFT is easy, we just fetch an ungapped alignment including the query + # and dump it in FASTA format + + my $aln = _get_ungapped_alignment($translation_stable_id, 1); + + open my $ALN, ">$file" or throw("Failed to open $file for writing"); + + my $alignIO = Bio::AlignIO->newFh( + -interleaved => 0, + -fh => $ALN, + -format => 'fasta', + -idlength => 20, + ); + + print $alignIO $aln; + + close $ALN; +} + +1; +
