comparison variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/ComparaUtils.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 =head1 LICENSE
2
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
4 Genome Research Limited. All rights reserved.
5
6 This software is distributed under a modified Apache license.
7 For license details, please see
8
9 http://www.ensembl.org/info/about/code_licence.html
10
11 =head1 CONTACT
12
13 Please email comments or questions to the public Ensembl
14 developers list at <dev@ensembl.org>.
15
16 Questions may also be sent to the Ensembl help desk at
17 <helpdesk@ensembl.org>.
18
19 =cut
20
21 =head1 NAME
22
23 Bio::EnsEMBL::Variation::Utils::ComparaUtils
24
25 =head1 DESCRIPTION
26
27 This module exports two subroutines, dump_alignment_for_polyphen and
28 dump_alignment_for_sift that write Compara alignments to files in the
29 formats expected by PolyPhen and SIFT. This allows you to use
30 Compara alignments in place of both these tools' alignment pipelines
31
32 =cut
33
34 package Bio::EnsEMBL::Variation::Utils::ComparaUtils;
35
36 use strict;
37 use warnings;
38
39 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
40 use Bio::EnsEMBL::Registry;
41 use Bio::SimpleAlign;
42 use Bio::AlignIO;
43 use Bio::LocatableSeq;
44
45 use base qw(Exporter);
46
47 our @EXPORT_OK = qw(dump_alignment_for_polyphen dump_alignment_for_sift);
48
49 my $MAX_PSIC_SEQS = 8190;
50 my $MAX_PSIC_SEQLEN = 409650;
51
52 sub _ungap_alignment {
53
54 # turn a gapped alignment into an ungapped alignment
55 # with respect to the given query_id,
56 #
57 # e.g. if our query sequence is no. 1 below, we want to
58 # turn this:
59 #
60 # 1: -ABC--DEFG-H---
61 # 2: ---IJKLMN--OPQR
62 # 3: ST------UVWXYZ-
63 #
64 # into:
65 #
66 # 1: ABCDEFGH
67 # 2: --ILMN-O
68 # 3: T----UVX
69
70 my ($gapped_alignment, $query_id, $include_query) = @_;
71
72 my @seqs = $gapped_alignment->each_seq;
73
74 # find the Seq object corresponding to the query id
75
76 my $query_seq;
77
78 for my $seq (@seqs) {
79 $query_seq = $seq;
80 last if $seq->display_id eq $query_id;
81 }
82
83 throw("Could not find query sequence '$query_id' in the alignment")
84 unless $query_seq->display_id eq $query_id;
85
86 my $qseq = $query_seq->seq;
87
88 #print $qseq, " (".length($qseq).")\n";
89
90 # first we split the query sequence into its component parts and gaps,
91 # so -ABC--DEFG-H--- will become ('ABC','DEFG','H') and ('-','--','-','---')
92
93 # we grep because we don't want any empty strings that otherwise seem to
94 # make it through
95
96 my @parts = grep {$_} split /-+/, $qseq;
97 my @gaps = grep {$_} split /[^-]+/, $qseq;
98
99 my @chunks;
100
101 # then we compute the coordinates of each sequence chunk,
102 # taking into account that we might start with a gap
103
104 my $offset = substr($qseq,0,1) eq '-' ? length(shift @gaps) : 0;
105
106 # we build up a list of [start, length] pairs for each chunk
107 # of sequence, incrementing our offset by the length of each
108 # chunk and the gap that follows it, for our example above this
109 # will result in ([1,3],[6,4],[11,1])
110
111 for my $part (@parts) {
112
113 my $l = length($part);
114
115 push @chunks, [$offset, $l];
116
117 #print "$part: $offset - $l\n";
118
119 $offset += $l + length(shift @gaps || '');
120 }
121
122 # we then use this list of chunks to obtain the aligned portions
123 # of all other members of the alignment and create a new alignment,
124
125 my @seq_objs;
126
127 for my $seq (@seqs) {
128
129 my $old_seq = $seq->seq;
130 my $new_seq;
131
132 for my $chunk (@chunks) {
133 $new_seq .= substr($old_seq, $chunk->[0], $chunk->[1]);
134 }
135
136 next if $new_seq =~ /^-*$/;
137
138 #my $gap_count = ($new_seq =~ tr/-//);
139
140 #next if $gap_count / length($new_seq) > 0.1;
141
142 if ($seq->display_id eq $query_id) {
143
144 $query_seq = Bio::LocatableSeq->new(
145 -SEQ => $new_seq,
146 -START => 1,
147 -END => length($new_seq),
148 -ID => 'QUERY',
149 -STRAND => 0
150 );
151
152 unshift @seq_objs, $query_seq if $include_query;
153 }
154 else {
155 push @seq_objs, Bio::LocatableSeq->new(
156 -SEQ => $new_seq,
157 -START => 1,
158 -END => length($new_seq),
159 -ID => $seq->display_id,
160 -STRAND => 0
161 )
162 }
163 }
164
165 my $new_align = Bio::SimpleAlign->new;
166
167 $new_align->add_seq($_) for @seq_objs;
168
169 # sometimes we want the query sequence back as well as the new alignment
170
171 return wantarray ? ($query_seq, $new_align) : $new_align;
172 }
173
174 sub _get_ungapped_alignment {
175
176 # get an ungapped Bio::SimpleAlign for the given query translation
177
178 my ($translation_stable_id, $include_query) = @_;
179
180 my $compara_dba = Bio::EnsEMBL::Registry->get_DBAdaptor('multi', 'compara')
181 or throw("Failed to get compara DBAdaptor");
182
183 my $ma = $compara_dba->get_MemberAdaptor
184 or throw("Failed to get member adaptor");
185
186 my $fa = $compara_dba->get_FamilyAdaptor
187 or throw("Failed to get family adaptor");
188
189 my $member = $ma->fetch_by_source_stable_id("ENSEMBLPEP", $translation_stable_id)
190 or throw("Didn't find family member for $translation_stable_id");
191
192 my $fams = $fa->fetch_all_by_Member($member)
193 or throw("Didn't find a family for $translation_stable_id");
194
195 throw("$translation_stable_id is in more than one family") if @$fams > 1;
196
197 my $orig_align = $fams->[0]->get_SimpleAlign;
198
199 $compara_dba->dbc->disconnect_if_idle;
200
201 return _ungap_alignment(
202 $orig_align,
203 $translation_stable_id,
204 $include_query
205 );
206 }
207
208 sub _percent_id {
209 my ($q, $a) = @_;
210
211 my @q = split //, $q->seq;
212 my @a = split //, $a->seq;
213
214 my $num = scalar(@q);
215
216 my $tot = 0.0;
217
218 for (my $i = 0; $i < $num; $i++ ) {
219 $tot++ if $q[$i] eq $a[$i];
220 }
221
222 return ($tot / $num);
223 }
224
225 =head2 dump_alignment_for_polyphen
226
227 Arg[1] : string $translation_stable_id - the stable of the Ensembl translation
228 you want to run PolyPhen on
229 Arg[2] : string $file - the name of the file you want to write the alignment to
230 Description : Fetches the Compara protein family containing the specified translation
231 (if available), ungaps the alignment with respect to the translation, and
232 writes the alignment to the specified file in the format expected by PolyPhen
233 Returntype : none
234 Exceptions : throws if an alignment cannot be found, or if the file cannot be written
235 Status : At Risk
236
237 =cut
238
239 sub dump_alignment_for_polyphen {
240
241 my ($translation_stable_id, $file) = @_;
242
243 # polyphen does not want the query included in the alignment
244
245 my ($query_seq, $alignment) = _get_ungapped_alignment($translation_stable_id, 0);
246
247 my @seqs = $alignment->each_seq;
248
249 unless (scalar(@seqs)) {
250 throw("No sequences in the alignment for $translation_stable_id");
251 }
252
253 if (length($seqs[0]->seq) > $MAX_PSIC_SEQLEN) {
254 throw("$translation_stable_id sequence too long for PSIC");
255 }
256
257 # polyphen expects the alignment to be sorted descending by % id
258
259 my $percent_id;
260
261 for my $seq (@seqs) {
262 $percent_id->{$seq->id} = _percent_id($query_seq, $seq);
263 }
264
265 my @sorted = sort { $percent_id->{$b->id} <=> $percent_id->{$a->id} } @seqs;
266
267 # the format is similar to a clustalw .aln file, but it *must* have a
268 # 70 character description beginning each line, and then the alignment string
269 # on the rest of the line with no line breaks between sequences, PSIC
270 # can also only deal with a fixed maximum sequence length and number
271 # of sequences on the alignment, these are set in the constants at the
272 # top of this file
273
274 my $num_seqs = scalar(@seqs);
275
276 open my $ALN, ">$file" or throw("Failed to open $file for writing");
277
278 print $ALN "CLUSTAL $translation_stable_id (".($num_seqs > $MAX_PSIC_SEQS ? $MAX_PSIC_SEQS : $num_seqs).")\n\n";
279
280 my $count = 0;
281
282 for my $seq (@sorted) {
283 if ($percent_id->{$seq->id} == 1) {
284 #warn "Ignoring identical seq ".$seq->id."\n";
285 #next;
286 }
287
288 my $id = $seq->id;
289 my $extra = 70 - length($id);
290
291 print $ALN $seq->id, ' ' x $extra, $seq->seq, "\n";
292
293 last if ++$count >= $MAX_PSIC_SEQS;
294 }
295
296 close $ALN;
297 }
298
299 =head2 dump_alignment_for_sift
300
301 Arg[1] : string $translation_stable_id - the stable id of the Ensembl translation
302 you want to run SIFT on
303 Arg[2] : string $file - the name of the file you want to write the alignment to
304 Description : Fetches the Compara protein family containing the specified translation
305 (if available), ungaps the alignment with respect to the translation, and
306 writes the alignment to the specified file in the format expected by SIFT
307 Returntype : none
308 Exceptions : throws if an alignment cannot be found, or if the file cannot be written
309 Status : At Risk
310
311 =cut
312
313 sub dump_alignment_for_sift {
314
315 my ($translation_stable_id, $file) = @_;
316
317 # SIFT is easy, we just fetch an ungapped alignment including the query
318 # and dump it in FASTA format
319
320 my $aln = _get_ungapped_alignment($translation_stable_id, 1);
321
322 open my $ALN, ">$file" or throw("Failed to open $file for writing");
323
324 my $alignIO = Bio::AlignIO->newFh(
325 -interleaved => 0,
326 -fh => $ALN,
327 -format => 'fasta',
328 -idlength => 20,
329 );
330
331 print $alignIO $aln;
332
333 close $ALN;
334 }
335
336 1;
337