annotate variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/ComparaUtils.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
1 =head1 LICENSE
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
2
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
4 Genome Research Limited. All rights reserved.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
5
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
6 This software is distributed under a modified Apache license.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
7 For license details, please see
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
8
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
9 http://www.ensembl.org/info/about/code_licence.html
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
10
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
11 =head1 CONTACT
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
12
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
13 Please email comments or questions to the public Ensembl
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
14 developers list at <dev@ensembl.org>.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
15
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
16 Questions may also be sent to the Ensembl help desk at
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
17 <helpdesk@ensembl.org>.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
18
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
19 =cut
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
20
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
21 =head1 NAME
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
22
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
23 Bio::EnsEMBL::Variation::Utils::ComparaUtils
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
24
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
25 =head1 DESCRIPTION
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
26
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
27 This module exports two subroutines, dump_alignment_for_polyphen and
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
28 dump_alignment_for_sift that write Compara alignments to files in the
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
29 formats expected by PolyPhen and SIFT. This allows you to use
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
30 Compara alignments in place of both these tools' alignment pipelines
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
31
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
32 =cut
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
33
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
34 package Bio::EnsEMBL::Variation::Utils::ComparaUtils;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
35
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
36 use strict;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
37 use warnings;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
38
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
39 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
40 use Bio::EnsEMBL::Registry;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
41 use Bio::SimpleAlign;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
42 use Bio::AlignIO;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
43 use Bio::LocatableSeq;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
44
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
45 use base qw(Exporter);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
46
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
47 our @EXPORT_OK = qw(dump_alignment_for_polyphen dump_alignment_for_sift);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
48
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
49 my $MAX_PSIC_SEQS = 8190;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
50 my $MAX_PSIC_SEQLEN = 409650;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
51
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
52 sub _ungap_alignment {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
53
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
54 # turn a gapped alignment into an ungapped alignment
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
55 # with respect to the given query_id,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
56 #
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
57 # e.g. if our query sequence is no. 1 below, we want to
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
58 # turn this:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
59 #
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
60 # 1: -ABC--DEFG-H---
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
61 # 2: ---IJKLMN--OPQR
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
62 # 3: ST------UVWXYZ-
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
63 #
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
64 # into:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
65 #
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
66 # 1: ABCDEFGH
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
67 # 2: --ILMN-O
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
68 # 3: T----UVX
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
69
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
70 my ($gapped_alignment, $query_id, $include_query) = @_;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
71
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
72 my @seqs = $gapped_alignment->each_seq;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
73
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
74 # find the Seq object corresponding to the query id
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
75
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
76 my $query_seq;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
77
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
78 for my $seq (@seqs) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
79 $query_seq = $seq;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
80 last if $seq->display_id eq $query_id;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
81 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
82
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
83 throw("Could not find query sequence '$query_id' in the alignment")
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
84 unless $query_seq->display_id eq $query_id;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
85
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
86 my $qseq = $query_seq->seq;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
87
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
88 #print $qseq, " (".length($qseq).")\n";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
89
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
90 # first we split the query sequence into its component parts and gaps,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
91 # so -ABC--DEFG-H--- will become ('ABC','DEFG','H') and ('-','--','-','---')
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
92
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
93 # we grep because we don't want any empty strings that otherwise seem to
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
94 # make it through
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
95
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
96 my @parts = grep {$_} split /-+/, $qseq;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
97 my @gaps = grep {$_} split /[^-]+/, $qseq;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
98
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
99 my @chunks;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
100
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
101 # then we compute the coordinates of each sequence chunk,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
102 # taking into account that we might start with a gap
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
103
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
104 my $offset = substr($qseq,0,1) eq '-' ? length(shift @gaps) : 0;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
105
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
106 # we build up a list of [start, length] pairs for each chunk
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
107 # of sequence, incrementing our offset by the length of each
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
108 # chunk and the gap that follows it, for our example above this
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
109 # will result in ([1,3],[6,4],[11,1])
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
110
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
111 for my $part (@parts) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
112
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
113 my $l = length($part);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
114
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
115 push @chunks, [$offset, $l];
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
116
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
117 #print "$part: $offset - $l\n";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
118
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
119 $offset += $l + length(shift @gaps || '');
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
120 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
121
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
122 # we then use this list of chunks to obtain the aligned portions
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
123 # of all other members of the alignment and create a new alignment,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
124
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
125 my @seq_objs;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
126
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
127 for my $seq (@seqs) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
128
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
129 my $old_seq = $seq->seq;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
130 my $new_seq;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
131
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
132 for my $chunk (@chunks) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
133 $new_seq .= substr($old_seq, $chunk->[0], $chunk->[1]);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
134 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
135
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
136 next if $new_seq =~ /^-*$/;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
137
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
138 #my $gap_count = ($new_seq =~ tr/-//);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
139
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
140 #next if $gap_count / length($new_seq) > 0.1;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
141
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
142 if ($seq->display_id eq $query_id) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
143
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
144 $query_seq = Bio::LocatableSeq->new(
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
145 -SEQ => $new_seq,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
146 -START => 1,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
147 -END => length($new_seq),
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
148 -ID => 'QUERY',
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
149 -STRAND => 0
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
150 );
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
151
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
152 unshift @seq_objs, $query_seq if $include_query;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
153 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
154 else {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
155 push @seq_objs, Bio::LocatableSeq->new(
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
156 -SEQ => $new_seq,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
157 -START => 1,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
158 -END => length($new_seq),
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
159 -ID => $seq->display_id,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
160 -STRAND => 0
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
161 )
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
162 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
163 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
164
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
165 my $new_align = Bio::SimpleAlign->new;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
166
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
167 $new_align->add_seq($_) for @seq_objs;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
168
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
169 # sometimes we want the query sequence back as well as the new alignment
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
170
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
171 return wantarray ? ($query_seq, $new_align) : $new_align;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
172 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
173
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
174 sub _get_ungapped_alignment {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
175
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
176 # get an ungapped Bio::SimpleAlign for the given query translation
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
177
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
178 my ($translation_stable_id, $include_query) = @_;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
179
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
180 my $compara_dba = Bio::EnsEMBL::Registry->get_DBAdaptor('multi', 'compara')
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
181 or throw("Failed to get compara DBAdaptor");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
182
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
183 my $ma = $compara_dba->get_MemberAdaptor
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
184 or throw("Failed to get member adaptor");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
185
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
186 my $fa = $compara_dba->get_FamilyAdaptor
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
187 or throw("Failed to get family adaptor");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
188
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
189 my $member = $ma->fetch_by_source_stable_id("ENSEMBLPEP", $translation_stable_id)
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
190 or throw("Didn't find family member for $translation_stable_id");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
191
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
192 my $fams = $fa->fetch_all_by_Member($member)
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
193 or throw("Didn't find a family for $translation_stable_id");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
194
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
195 throw("$translation_stable_id is in more than one family") if @$fams > 1;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
196
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
197 my $orig_align = $fams->[0]->get_SimpleAlign;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
198
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
199 $compara_dba->dbc->disconnect_if_idle;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
200
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
201 return _ungap_alignment(
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
202 $orig_align,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
203 $translation_stable_id,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
204 $include_query
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
205 );
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
206 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
207
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
208 sub _percent_id {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
209 my ($q, $a) = @_;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
210
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
211 my @q = split //, $q->seq;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
212 my @a = split //, $a->seq;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
213
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
214 my $num = scalar(@q);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
215
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
216 my $tot = 0.0;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
217
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
218 for (my $i = 0; $i < $num; $i++ ) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
219 $tot++ if $q[$i] eq $a[$i];
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
220 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
221
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
222 return ($tot / $num);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
223 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
224
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
225 =head2 dump_alignment_for_polyphen
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
226
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
227 Arg[1] : string $translation_stable_id - the stable of the Ensembl translation
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
228 you want to run PolyPhen on
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
229 Arg[2] : string $file - the name of the file you want to write the alignment to
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
230 Description : Fetches the Compara protein family containing the specified translation
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
231 (if available), ungaps the alignment with respect to the translation, and
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
232 writes the alignment to the specified file in the format expected by PolyPhen
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
233 Returntype : none
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
234 Exceptions : throws if an alignment cannot be found, or if the file cannot be written
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
235 Status : At Risk
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
236
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
237 =cut
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
238
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
239 sub dump_alignment_for_polyphen {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
240
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
241 my ($translation_stable_id, $file) = @_;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
242
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
243 # polyphen does not want the query included in the alignment
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
244
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
245 my ($query_seq, $alignment) = _get_ungapped_alignment($translation_stable_id, 0);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
246
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
247 my @seqs = $alignment->each_seq;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
248
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
249 unless (scalar(@seqs)) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
250 throw("No sequences in the alignment for $translation_stable_id");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
251 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
252
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
253 if (length($seqs[0]->seq) > $MAX_PSIC_SEQLEN) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
254 throw("$translation_stable_id sequence too long for PSIC");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
255 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
256
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
257 # polyphen expects the alignment to be sorted descending by % id
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
258
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
259 my $percent_id;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
260
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
261 for my $seq (@seqs) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
262 $percent_id->{$seq->id} = _percent_id($query_seq, $seq);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
263 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
264
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
265 my @sorted = sort { $percent_id->{$b->id} <=> $percent_id->{$a->id} } @seqs;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
266
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
267 # the format is similar to a clustalw .aln file, but it *must* have a
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
268 # 70 character description beginning each line, and then the alignment string
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
269 # on the rest of the line with no line breaks between sequences, PSIC
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
270 # can also only deal with a fixed maximum sequence length and number
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
271 # of sequences on the alignment, these are set in the constants at the
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
272 # top of this file
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
273
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
274 my $num_seqs = scalar(@seqs);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
275
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
276 open my $ALN, ">$file" or throw("Failed to open $file for writing");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
277
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
278 print $ALN "CLUSTAL $translation_stable_id (".($num_seqs > $MAX_PSIC_SEQS ? $MAX_PSIC_SEQS : $num_seqs).")\n\n";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
279
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
280 my $count = 0;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
281
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
282 for my $seq (@sorted) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
283 if ($percent_id->{$seq->id} == 1) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
284 #warn "Ignoring identical seq ".$seq->id."\n";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
285 #next;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
286 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
287
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
288 my $id = $seq->id;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
289 my $extra = 70 - length($id);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
290
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
291 print $ALN $seq->id, ' ' x $extra, $seq->seq, "\n";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
292
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
293 last if ++$count >= $MAX_PSIC_SEQS;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
294 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
295
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
296 close $ALN;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
297 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
298
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
299 =head2 dump_alignment_for_sift
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
300
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
301 Arg[1] : string $translation_stable_id - the stable id of the Ensembl translation
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
302 you want to run SIFT on
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
303 Arg[2] : string $file - the name of the file you want to write the alignment to
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
304 Description : Fetches the Compara protein family containing the specified translation
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
305 (if available), ungaps the alignment with respect to the translation, and
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
306 writes the alignment to the specified file in the format expected by SIFT
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
307 Returntype : none
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
308 Exceptions : throws if an alignment cannot be found, or if the file cannot be written
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
309 Status : At Risk
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
310
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
311 =cut
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
312
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
313 sub dump_alignment_for_sift {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
314
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
315 my ($translation_stable_id, $file) = @_;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
316
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
317 # SIFT is easy, we just fetch an ungapped alignment including the query
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
318 # and dump it in FASTA format
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
319
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
320 my $aln = _get_ungapped_alignment($translation_stable_id, 1);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
321
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
322 open my $ALN, ">$file" or throw("Failed to open $file for writing");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
323
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
324 my $alignIO = Bio::AlignIO->newFh(
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
325 -interleaved => 0,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
326 -fh => $ALN,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
327 -format => 'fasta',
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
328 -idlength => 20,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
329 );
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
330
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
331 print $alignIO $aln;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
332
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
333 close $ALN;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
334 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
335
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
336 1;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
337