0
|
1 =head1 LICENSE
|
|
2
|
|
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
|
|
4 Genome Research Limited. All rights reserved.
|
|
5
|
|
6 This software is distributed under a modified Apache license.
|
|
7 For license details, please see
|
|
8
|
|
9 http://www.ensembl.org/info/about/code_licence.html
|
|
10
|
|
11 =head1 CONTACT
|
|
12
|
|
13 Please email comments or questions to the public Ensembl
|
|
14 developers list at <dev@ensembl.org>.
|
|
15
|
|
16 Questions may also be sent to the Ensembl help desk at
|
|
17 <helpdesk@ensembl.org>.
|
|
18
|
|
19 =cut
|
|
20
|
|
21 =head1 NAME
|
|
22
|
|
23 Bio::EnsEMBL::Variation::Utils::ComparaUtils
|
|
24
|
|
25 =head1 DESCRIPTION
|
|
26
|
|
27 This module exports two subroutines, dump_alignment_for_polyphen and
|
|
28 dump_alignment_for_sift that write Compara alignments to files in the
|
|
29 formats expected by PolyPhen and SIFT. This allows you to use
|
|
30 Compara alignments in place of both these tools' alignment pipelines
|
|
31
|
|
32 =cut
|
|
33
|
|
34 package Bio::EnsEMBL::Variation::Utils::ComparaUtils;
|
|
35
|
|
36 use strict;
|
|
37 use warnings;
|
|
38
|
|
39 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
|
|
40 use Bio::EnsEMBL::Registry;
|
|
41 use Bio::SimpleAlign;
|
|
42 use Bio::AlignIO;
|
|
43 use Bio::LocatableSeq;
|
|
44
|
|
45 use base qw(Exporter);
|
|
46
|
|
47 our @EXPORT_OK = qw(dump_alignment_for_polyphen dump_alignment_for_sift);
|
|
48
|
|
49 my $MAX_PSIC_SEQS = 8190;
|
|
50 my $MAX_PSIC_SEQLEN = 409650;
|
|
51
|
|
52 sub _ungap_alignment {
|
|
53
|
|
54 # turn a gapped alignment into an ungapped alignment
|
|
55 # with respect to the given query_id,
|
|
56 #
|
|
57 # e.g. if our query sequence is no. 1 below, we want to
|
|
58 # turn this:
|
|
59 #
|
|
60 # 1: -ABC--DEFG-H---
|
|
61 # 2: ---IJKLMN--OPQR
|
|
62 # 3: ST------UVWXYZ-
|
|
63 #
|
|
64 # into:
|
|
65 #
|
|
66 # 1: ABCDEFGH
|
|
67 # 2: --ILMN-O
|
|
68 # 3: T----UVX
|
|
69
|
|
70 my ($gapped_alignment, $query_id, $include_query) = @_;
|
|
71
|
|
72 my @seqs = $gapped_alignment->each_seq;
|
|
73
|
|
74 # find the Seq object corresponding to the query id
|
|
75
|
|
76 my $query_seq;
|
|
77
|
|
78 for my $seq (@seqs) {
|
|
79 $query_seq = $seq;
|
|
80 last if $seq->display_id eq $query_id;
|
|
81 }
|
|
82
|
|
83 throw("Could not find query sequence '$query_id' in the alignment")
|
|
84 unless $query_seq->display_id eq $query_id;
|
|
85
|
|
86 my $qseq = $query_seq->seq;
|
|
87
|
|
88 #print $qseq, " (".length($qseq).")\n";
|
|
89
|
|
90 # first we split the query sequence into its component parts and gaps,
|
|
91 # so -ABC--DEFG-H--- will become ('ABC','DEFG','H') and ('-','--','-','---')
|
|
92
|
|
93 # we grep because we don't want any empty strings that otherwise seem to
|
|
94 # make it through
|
|
95
|
|
96 my @parts = grep {$_} split /-+/, $qseq;
|
|
97 my @gaps = grep {$_} split /[^-]+/, $qseq;
|
|
98
|
|
99 my @chunks;
|
|
100
|
|
101 # then we compute the coordinates of each sequence chunk,
|
|
102 # taking into account that we might start with a gap
|
|
103
|
|
104 my $offset = substr($qseq,0,1) eq '-' ? length(shift @gaps) : 0;
|
|
105
|
|
106 # we build up a list of [start, length] pairs for each chunk
|
|
107 # of sequence, incrementing our offset by the length of each
|
|
108 # chunk and the gap that follows it, for our example above this
|
|
109 # will result in ([1,3],[6,4],[11,1])
|
|
110
|
|
111 for my $part (@parts) {
|
|
112
|
|
113 my $l = length($part);
|
|
114
|
|
115 push @chunks, [$offset, $l];
|
|
116
|
|
117 #print "$part: $offset - $l\n";
|
|
118
|
|
119 $offset += $l + length(shift @gaps || '');
|
|
120 }
|
|
121
|
|
122 # we then use this list of chunks to obtain the aligned portions
|
|
123 # of all other members of the alignment and create a new alignment,
|
|
124
|
|
125 my @seq_objs;
|
|
126
|
|
127 for my $seq (@seqs) {
|
|
128
|
|
129 my $old_seq = $seq->seq;
|
|
130 my $new_seq;
|
|
131
|
|
132 for my $chunk (@chunks) {
|
|
133 $new_seq .= substr($old_seq, $chunk->[0], $chunk->[1]);
|
|
134 }
|
|
135
|
|
136 next if $new_seq =~ /^-*$/;
|
|
137
|
|
138 #my $gap_count = ($new_seq =~ tr/-//);
|
|
139
|
|
140 #next if $gap_count / length($new_seq) > 0.1;
|
|
141
|
|
142 if ($seq->display_id eq $query_id) {
|
|
143
|
|
144 $query_seq = Bio::LocatableSeq->new(
|
|
145 -SEQ => $new_seq,
|
|
146 -START => 1,
|
|
147 -END => length($new_seq),
|
|
148 -ID => 'QUERY',
|
|
149 -STRAND => 0
|
|
150 );
|
|
151
|
|
152 unshift @seq_objs, $query_seq if $include_query;
|
|
153 }
|
|
154 else {
|
|
155 push @seq_objs, Bio::LocatableSeq->new(
|
|
156 -SEQ => $new_seq,
|
|
157 -START => 1,
|
|
158 -END => length($new_seq),
|
|
159 -ID => $seq->display_id,
|
|
160 -STRAND => 0
|
|
161 )
|
|
162 }
|
|
163 }
|
|
164
|
|
165 my $new_align = Bio::SimpleAlign->new;
|
|
166
|
|
167 $new_align->add_seq($_) for @seq_objs;
|
|
168
|
|
169 # sometimes we want the query sequence back as well as the new alignment
|
|
170
|
|
171 return wantarray ? ($query_seq, $new_align) : $new_align;
|
|
172 }
|
|
173
|
|
174 sub _get_ungapped_alignment {
|
|
175
|
|
176 # get an ungapped Bio::SimpleAlign for the given query translation
|
|
177
|
|
178 my ($translation_stable_id, $include_query) = @_;
|
|
179
|
|
180 my $compara_dba = Bio::EnsEMBL::Registry->get_DBAdaptor('multi', 'compara')
|
|
181 or throw("Failed to get compara DBAdaptor");
|
|
182
|
|
183 my $ma = $compara_dba->get_MemberAdaptor
|
|
184 or throw("Failed to get member adaptor");
|
|
185
|
|
186 my $fa = $compara_dba->get_FamilyAdaptor
|
|
187 or throw("Failed to get family adaptor");
|
|
188
|
|
189 my $member = $ma->fetch_by_source_stable_id("ENSEMBLPEP", $translation_stable_id)
|
|
190 or throw("Didn't find family member for $translation_stable_id");
|
|
191
|
|
192 my $fams = $fa->fetch_all_by_Member($member)
|
|
193 or throw("Didn't find a family for $translation_stable_id");
|
|
194
|
|
195 throw("$translation_stable_id is in more than one family") if @$fams > 1;
|
|
196
|
|
197 my $orig_align = $fams->[0]->get_SimpleAlign;
|
|
198
|
|
199 $compara_dba->dbc->disconnect_if_idle;
|
|
200
|
|
201 return _ungap_alignment(
|
|
202 $orig_align,
|
|
203 $translation_stable_id,
|
|
204 $include_query
|
|
205 );
|
|
206 }
|
|
207
|
|
208 sub _percent_id {
|
|
209 my ($q, $a) = @_;
|
|
210
|
|
211 my @q = split //, $q->seq;
|
|
212 my @a = split //, $a->seq;
|
|
213
|
|
214 my $num = scalar(@q);
|
|
215
|
|
216 my $tot = 0.0;
|
|
217
|
|
218 for (my $i = 0; $i < $num; $i++ ) {
|
|
219 $tot++ if $q[$i] eq $a[$i];
|
|
220 }
|
|
221
|
|
222 return ($tot / $num);
|
|
223 }
|
|
224
|
|
225 =head2 dump_alignment_for_polyphen
|
|
226
|
|
227 Arg[1] : string $translation_stable_id - the stable of the Ensembl translation
|
|
228 you want to run PolyPhen on
|
|
229 Arg[2] : string $file - the name of the file you want to write the alignment to
|
|
230 Description : Fetches the Compara protein family containing the specified translation
|
|
231 (if available), ungaps the alignment with respect to the translation, and
|
|
232 writes the alignment to the specified file in the format expected by PolyPhen
|
|
233 Returntype : none
|
|
234 Exceptions : throws if an alignment cannot be found, or if the file cannot be written
|
|
235 Status : At Risk
|
|
236
|
|
237 =cut
|
|
238
|
|
239 sub dump_alignment_for_polyphen {
|
|
240
|
|
241 my ($translation_stable_id, $file) = @_;
|
|
242
|
|
243 # polyphen does not want the query included in the alignment
|
|
244
|
|
245 my ($query_seq, $alignment) = _get_ungapped_alignment($translation_stable_id, 0);
|
|
246
|
|
247 my @seqs = $alignment->each_seq;
|
|
248
|
|
249 unless (scalar(@seqs)) {
|
|
250 throw("No sequences in the alignment for $translation_stable_id");
|
|
251 }
|
|
252
|
|
253 if (length($seqs[0]->seq) > $MAX_PSIC_SEQLEN) {
|
|
254 throw("$translation_stable_id sequence too long for PSIC");
|
|
255 }
|
|
256
|
|
257 # polyphen expects the alignment to be sorted descending by % id
|
|
258
|
|
259 my $percent_id;
|
|
260
|
|
261 for my $seq (@seqs) {
|
|
262 $percent_id->{$seq->id} = _percent_id($query_seq, $seq);
|
|
263 }
|
|
264
|
|
265 my @sorted = sort { $percent_id->{$b->id} <=> $percent_id->{$a->id} } @seqs;
|
|
266
|
|
267 # the format is similar to a clustalw .aln file, but it *must* have a
|
|
268 # 70 character description beginning each line, and then the alignment string
|
|
269 # on the rest of the line with no line breaks between sequences, PSIC
|
|
270 # can also only deal with a fixed maximum sequence length and number
|
|
271 # of sequences on the alignment, these are set in the constants at the
|
|
272 # top of this file
|
|
273
|
|
274 my $num_seqs = scalar(@seqs);
|
|
275
|
|
276 open my $ALN, ">$file" or throw("Failed to open $file for writing");
|
|
277
|
|
278 print $ALN "CLUSTAL $translation_stable_id (".($num_seqs > $MAX_PSIC_SEQS ? $MAX_PSIC_SEQS : $num_seqs).")\n\n";
|
|
279
|
|
280 my $count = 0;
|
|
281
|
|
282 for my $seq (@sorted) {
|
|
283 if ($percent_id->{$seq->id} == 1) {
|
|
284 #warn "Ignoring identical seq ".$seq->id."\n";
|
|
285 #next;
|
|
286 }
|
|
287
|
|
288 my $id = $seq->id;
|
|
289 my $extra = 70 - length($id);
|
|
290
|
|
291 print $ALN $seq->id, ' ' x $extra, $seq->seq, "\n";
|
|
292
|
|
293 last if ++$count >= $MAX_PSIC_SEQS;
|
|
294 }
|
|
295
|
|
296 close $ALN;
|
|
297 }
|
|
298
|
|
299 =head2 dump_alignment_for_sift
|
|
300
|
|
301 Arg[1] : string $translation_stable_id - the stable id of the Ensembl translation
|
|
302 you want to run SIFT on
|
|
303 Arg[2] : string $file - the name of the file you want to write the alignment to
|
|
304 Description : Fetches the Compara protein family containing the specified translation
|
|
305 (if available), ungaps the alignment with respect to the translation, and
|
|
306 writes the alignment to the specified file in the format expected by SIFT
|
|
307 Returntype : none
|
|
308 Exceptions : throws if an alignment cannot be found, or if the file cannot be written
|
|
309 Status : At Risk
|
|
310
|
|
311 =cut
|
|
312
|
|
313 sub dump_alignment_for_sift {
|
|
314
|
|
315 my ($translation_stable_id, $file) = @_;
|
|
316
|
|
317 # SIFT is easy, we just fetch an ungapped alignment including the query
|
|
318 # and dump it in FASTA format
|
|
319
|
|
320 my $aln = _get_ungapped_alignment($translation_stable_id, 1);
|
|
321
|
|
322 open my $ALN, ">$file" or throw("Failed to open $file for writing");
|
|
323
|
|
324 my $alignIO = Bio::AlignIO->newFh(
|
|
325 -interleaved => 0,
|
|
326 -fh => $ALN,
|
|
327 -format => 'fasta',
|
|
328 -idlength => 20,
|
|
329 );
|
|
330
|
|
331 print $alignIO $aln;
|
|
332
|
|
333 close $ALN;
|
|
334 }
|
|
335
|
|
336 1;
|
|
337
|