Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/ComparaUtils.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 =head1 LICENSE | |
2 | |
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and | |
4 Genome Research Limited. All rights reserved. | |
5 | |
6 This software is distributed under a modified Apache license. | |
7 For license details, please see | |
8 | |
9 http://www.ensembl.org/info/about/code_licence.html | |
10 | |
11 =head1 CONTACT | |
12 | |
13 Please email comments or questions to the public Ensembl | |
14 developers list at <dev@ensembl.org>. | |
15 | |
16 Questions may also be sent to the Ensembl help desk at | |
17 <helpdesk@ensembl.org>. | |
18 | |
19 =cut | |
20 | |
21 =head1 NAME | |
22 | |
23 Bio::EnsEMBL::Variation::Utils::ComparaUtils | |
24 | |
25 =head1 DESCRIPTION | |
26 | |
27 This module exports two subroutines, dump_alignment_for_polyphen and | |
28 dump_alignment_for_sift that write Compara alignments to files in the | |
29 formats expected by PolyPhen and SIFT. This allows you to use | |
30 Compara alignments in place of both these tools' alignment pipelines | |
31 | |
32 =cut | |
33 | |
34 package Bio::EnsEMBL::Variation::Utils::ComparaUtils; | |
35 | |
36 use strict; | |
37 use warnings; | |
38 | |
39 use Bio::EnsEMBL::Utils::Exception qw(throw warning); | |
40 use Bio::EnsEMBL::Registry; | |
41 use Bio::SimpleAlign; | |
42 use Bio::AlignIO; | |
43 use Bio::LocatableSeq; | |
44 | |
45 use base qw(Exporter); | |
46 | |
47 our @EXPORT_OK = qw(dump_alignment_for_polyphen dump_alignment_for_sift); | |
48 | |
49 my $MAX_PSIC_SEQS = 8190; | |
50 my $MAX_PSIC_SEQLEN = 409650; | |
51 | |
52 sub _ungap_alignment { | |
53 | |
54 # turn a gapped alignment into an ungapped alignment | |
55 # with respect to the given query_id, | |
56 # | |
57 # e.g. if our query sequence is no. 1 below, we want to | |
58 # turn this: | |
59 # | |
60 # 1: -ABC--DEFG-H--- | |
61 # 2: ---IJKLMN--OPQR | |
62 # 3: ST------UVWXYZ- | |
63 # | |
64 # into: | |
65 # | |
66 # 1: ABCDEFGH | |
67 # 2: --ILMN-O | |
68 # 3: T----UVX | |
69 | |
70 my ($gapped_alignment, $query_id, $include_query) = @_; | |
71 | |
72 my @seqs = $gapped_alignment->each_seq; | |
73 | |
74 # find the Seq object corresponding to the query id | |
75 | |
76 my $query_seq; | |
77 | |
78 for my $seq (@seqs) { | |
79 $query_seq = $seq; | |
80 last if $seq->display_id eq $query_id; | |
81 } | |
82 | |
83 throw("Could not find query sequence '$query_id' in the alignment") | |
84 unless $query_seq->display_id eq $query_id; | |
85 | |
86 my $qseq = $query_seq->seq; | |
87 | |
88 #print $qseq, " (".length($qseq).")\n"; | |
89 | |
90 # first we split the query sequence into its component parts and gaps, | |
91 # so -ABC--DEFG-H--- will become ('ABC','DEFG','H') and ('-','--','-','---') | |
92 | |
93 # we grep because we don't want any empty strings that otherwise seem to | |
94 # make it through | |
95 | |
96 my @parts = grep {$_} split /-+/, $qseq; | |
97 my @gaps = grep {$_} split /[^-]+/, $qseq; | |
98 | |
99 my @chunks; | |
100 | |
101 # then we compute the coordinates of each sequence chunk, | |
102 # taking into account that we might start with a gap | |
103 | |
104 my $offset = substr($qseq,0,1) eq '-' ? length(shift @gaps) : 0; | |
105 | |
106 # we build up a list of [start, length] pairs for each chunk | |
107 # of sequence, incrementing our offset by the length of each | |
108 # chunk and the gap that follows it, for our example above this | |
109 # will result in ([1,3],[6,4],[11,1]) | |
110 | |
111 for my $part (@parts) { | |
112 | |
113 my $l = length($part); | |
114 | |
115 push @chunks, [$offset, $l]; | |
116 | |
117 #print "$part: $offset - $l\n"; | |
118 | |
119 $offset += $l + length(shift @gaps || ''); | |
120 } | |
121 | |
122 # we then use this list of chunks to obtain the aligned portions | |
123 # of all other members of the alignment and create a new alignment, | |
124 | |
125 my @seq_objs; | |
126 | |
127 for my $seq (@seqs) { | |
128 | |
129 my $old_seq = $seq->seq; | |
130 my $new_seq; | |
131 | |
132 for my $chunk (@chunks) { | |
133 $new_seq .= substr($old_seq, $chunk->[0], $chunk->[1]); | |
134 } | |
135 | |
136 next if $new_seq =~ /^-*$/; | |
137 | |
138 #my $gap_count = ($new_seq =~ tr/-//); | |
139 | |
140 #next if $gap_count / length($new_seq) > 0.1; | |
141 | |
142 if ($seq->display_id eq $query_id) { | |
143 | |
144 $query_seq = Bio::LocatableSeq->new( | |
145 -SEQ => $new_seq, | |
146 -START => 1, | |
147 -END => length($new_seq), | |
148 -ID => 'QUERY', | |
149 -STRAND => 0 | |
150 ); | |
151 | |
152 unshift @seq_objs, $query_seq if $include_query; | |
153 } | |
154 else { | |
155 push @seq_objs, Bio::LocatableSeq->new( | |
156 -SEQ => $new_seq, | |
157 -START => 1, | |
158 -END => length($new_seq), | |
159 -ID => $seq->display_id, | |
160 -STRAND => 0 | |
161 ) | |
162 } | |
163 } | |
164 | |
165 my $new_align = Bio::SimpleAlign->new; | |
166 | |
167 $new_align->add_seq($_) for @seq_objs; | |
168 | |
169 # sometimes we want the query sequence back as well as the new alignment | |
170 | |
171 return wantarray ? ($query_seq, $new_align) : $new_align; | |
172 } | |
173 | |
174 sub _get_ungapped_alignment { | |
175 | |
176 # get an ungapped Bio::SimpleAlign for the given query translation | |
177 | |
178 my ($translation_stable_id, $include_query) = @_; | |
179 | |
180 my $compara_dba = Bio::EnsEMBL::Registry->get_DBAdaptor('multi', 'compara') | |
181 or throw("Failed to get compara DBAdaptor"); | |
182 | |
183 my $ma = $compara_dba->get_MemberAdaptor | |
184 or throw("Failed to get member adaptor"); | |
185 | |
186 my $fa = $compara_dba->get_FamilyAdaptor | |
187 or throw("Failed to get family adaptor"); | |
188 | |
189 my $member = $ma->fetch_by_source_stable_id("ENSEMBLPEP", $translation_stable_id) | |
190 or throw("Didn't find family member for $translation_stable_id"); | |
191 | |
192 my $fams = $fa->fetch_all_by_Member($member) | |
193 or throw("Didn't find a family for $translation_stable_id"); | |
194 | |
195 throw("$translation_stable_id is in more than one family") if @$fams > 1; | |
196 | |
197 my $orig_align = $fams->[0]->get_SimpleAlign; | |
198 | |
199 $compara_dba->dbc->disconnect_if_idle; | |
200 | |
201 return _ungap_alignment( | |
202 $orig_align, | |
203 $translation_stable_id, | |
204 $include_query | |
205 ); | |
206 } | |
207 | |
208 sub _percent_id { | |
209 my ($q, $a) = @_; | |
210 | |
211 my @q = split //, $q->seq; | |
212 my @a = split //, $a->seq; | |
213 | |
214 my $num = scalar(@q); | |
215 | |
216 my $tot = 0.0; | |
217 | |
218 for (my $i = 0; $i < $num; $i++ ) { | |
219 $tot++ if $q[$i] eq $a[$i]; | |
220 } | |
221 | |
222 return ($tot / $num); | |
223 } | |
224 | |
225 =head2 dump_alignment_for_polyphen | |
226 | |
227 Arg[1] : string $translation_stable_id - the stable of the Ensembl translation | |
228 you want to run PolyPhen on | |
229 Arg[2] : string $file - the name of the file you want to write the alignment to | |
230 Description : Fetches the Compara protein family containing the specified translation | |
231 (if available), ungaps the alignment with respect to the translation, and | |
232 writes the alignment to the specified file in the format expected by PolyPhen | |
233 Returntype : none | |
234 Exceptions : throws if an alignment cannot be found, or if the file cannot be written | |
235 Status : At Risk | |
236 | |
237 =cut | |
238 | |
239 sub dump_alignment_for_polyphen { | |
240 | |
241 my ($translation_stable_id, $file) = @_; | |
242 | |
243 # polyphen does not want the query included in the alignment | |
244 | |
245 my ($query_seq, $alignment) = _get_ungapped_alignment($translation_stable_id, 0); | |
246 | |
247 my @seqs = $alignment->each_seq; | |
248 | |
249 unless (scalar(@seqs)) { | |
250 throw("No sequences in the alignment for $translation_stable_id"); | |
251 } | |
252 | |
253 if (length($seqs[0]->seq) > $MAX_PSIC_SEQLEN) { | |
254 throw("$translation_stable_id sequence too long for PSIC"); | |
255 } | |
256 | |
257 # polyphen expects the alignment to be sorted descending by % id | |
258 | |
259 my $percent_id; | |
260 | |
261 for my $seq (@seqs) { | |
262 $percent_id->{$seq->id} = _percent_id($query_seq, $seq); | |
263 } | |
264 | |
265 my @sorted = sort { $percent_id->{$b->id} <=> $percent_id->{$a->id} } @seqs; | |
266 | |
267 # the format is similar to a clustalw .aln file, but it *must* have a | |
268 # 70 character description beginning each line, and then the alignment string | |
269 # on the rest of the line with no line breaks between sequences, PSIC | |
270 # can also only deal with a fixed maximum sequence length and number | |
271 # of sequences on the alignment, these are set in the constants at the | |
272 # top of this file | |
273 | |
274 my $num_seqs = scalar(@seqs); | |
275 | |
276 open my $ALN, ">$file" or throw("Failed to open $file for writing"); | |
277 | |
278 print $ALN "CLUSTAL $translation_stable_id (".($num_seqs > $MAX_PSIC_SEQS ? $MAX_PSIC_SEQS : $num_seqs).")\n\n"; | |
279 | |
280 my $count = 0; | |
281 | |
282 for my $seq (@sorted) { | |
283 if ($percent_id->{$seq->id} == 1) { | |
284 #warn "Ignoring identical seq ".$seq->id."\n"; | |
285 #next; | |
286 } | |
287 | |
288 my $id = $seq->id; | |
289 my $extra = 70 - length($id); | |
290 | |
291 print $ALN $seq->id, ' ' x $extra, $seq->seq, "\n"; | |
292 | |
293 last if ++$count >= $MAX_PSIC_SEQS; | |
294 } | |
295 | |
296 close $ALN; | |
297 } | |
298 | |
299 =head2 dump_alignment_for_sift | |
300 | |
301 Arg[1] : string $translation_stable_id - the stable id of the Ensembl translation | |
302 you want to run SIFT on | |
303 Arg[2] : string $file - the name of the file you want to write the alignment to | |
304 Description : Fetches the Compara protein family containing the specified translation | |
305 (if available), ungaps the alignment with respect to the translation, and | |
306 writes the alignment to the specified file in the format expected by SIFT | |
307 Returntype : none | |
308 Exceptions : throws if an alignment cannot be found, or if the file cannot be written | |
309 Status : At Risk | |
310 | |
311 =cut | |
312 | |
313 sub dump_alignment_for_sift { | |
314 | |
315 my ($translation_stable_id, $file) = @_; | |
316 | |
317 # SIFT is easy, we just fetch an ungapped alignment including the query | |
318 # and dump it in FASTA format | |
319 | |
320 my $aln = _get_ungapped_alignment($translation_stable_id, 1); | |
321 | |
322 open my $ALN, ">$file" or throw("Failed to open $file for writing"); | |
323 | |
324 my $alignIO = Bio::AlignIO->newFh( | |
325 -interleaved => 0, | |
326 -fh => $ALN, | |
327 -format => 'fasta', | |
328 -idlength => 20, | |
329 ); | |
330 | |
331 print $alignIO $aln; | |
332 | |
333 close $ALN; | |
334 } | |
335 | |
336 1; | |
337 |