|
0
|
1 =head1 LICENSE
|
|
|
2
|
|
|
3 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
|
|
|
4 Copyright [2016-2018] EMBL-European Bioinformatics Institute
|
|
|
5
|
|
|
6 Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
7 you may not use this file except in compliance with the License.
|
|
|
8 You may obtain a copy of the License at
|
|
|
9
|
|
|
10 http://www.apache.org/licenses/LICENSE-2.0
|
|
|
11
|
|
|
12 Unless required by applicable law or agreed to in writing, software
|
|
|
13 distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
15 See the License for the specific language governing permissions and
|
|
|
16 limitations under the License.
|
|
|
17
|
|
|
18 =head1 CONTACT
|
|
|
19
|
|
|
20 Ensembl <http://www.ensembl.org/info/about/contact/index.html>
|
|
|
21
|
|
|
22 =cut
|
|
|
23
|
|
|
24 =head1 NAME
|
|
|
25
|
|
|
26 ProteinSeqs
|
|
|
27
|
|
|
28 =head1 SYNOPSIS
|
|
|
29
|
|
|
30 mv ProteinSeqs.pm ~/.vep/Plugins
|
|
|
31 ./vep -i variations.vcf --plugin ProteinSeqs,reference.fa,mutated.fa
|
|
|
32
|
|
|
33 =head1 DESCRIPTION
|
|
|
34
|
|
|
35 This is a plugin for the Ensembl Variant Effect Predictor (VEP) that
|
|
|
36 prints out the reference and mutated protein sequences of any
|
|
|
37 proteins found with non-synonymous mutations in the input file.
|
|
|
38
|
|
|
39 You should supply the name of file where you want to store the
|
|
|
40 reference protein sequences as the first argument, and a file to
|
|
|
41 store the mutated sequences as the second argument.
|
|
|
42
|
|
|
43 Note that, for simplicity, where stop codons are gained the plugin
|
|
|
44 simply substitutes a '*' into the sequence and does not truncate the
|
|
|
45 protein. Where a stop codon is lost any new amino acids encoded by the
|
|
|
46 mutation are appended to the sequence, but the plugin does not attempt
|
|
|
47 to translate until the next downstream stop codon. Also, the protein
|
|
|
48 sequence resulting from each mutation is printed separately, no attempt
|
|
|
49 is made to apply multiple mutations to the same protein.
|
|
|
50
|
|
|
51 =cut
|
|
|
52
|
|
|
53 package ProteinSeqs;
|
|
|
54
|
|
|
55 use strict;
|
|
|
56 use warnings;
|
|
|
57
|
|
|
58 use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin);
|
|
|
59
|
|
|
60 sub version {
|
|
|
61 return '2.4';
|
|
|
62 }
|
|
|
63
|
|
|
64 sub feature_types {
|
|
|
65 return ['Transcript'];
|
|
|
66 }
|
|
|
67
|
|
|
68 sub new {
|
|
|
69 my $class = shift;
|
|
|
70
|
|
|
71 my $self = $class->SUPER::new(@_);
|
|
|
72
|
|
|
73 if($self->{config}->{fork}) {
|
|
|
74 print STDERR "WARNING: Plugin ProteinSeqs is disabling forking\n" unless $self->{config}->{quiet};
|
|
|
75 delete($self->{config}->{fork});
|
|
|
76 }
|
|
|
77
|
|
|
78 # use some default file names if none are supplied
|
|
|
79
|
|
|
80 my $ref_file = $self->params->[0] || 'reference.fa';
|
|
|
81 my $mut_file = $self->params->[1] || 'mutated.fa';
|
|
|
82
|
|
|
83 open $self->{ref_file}, ">$ref_file" or die "Failed to open $ref_file";
|
|
|
84 open $self->{mut_file}, ">$mut_file" or die "Failed to open $mut_file";
|
|
|
85
|
|
|
86 return $self;
|
|
|
87 }
|
|
|
88
|
|
|
89 sub run {
|
|
|
90 my ($self, $tva) = @_;
|
|
|
91
|
|
|
92 # check if we have a mutant amino acid, if not there isn't much we can do!
|
|
|
93
|
|
|
94 if (my $mut_aa = $tva->peptide) {
|
|
|
95
|
|
|
96 # get the peptide coordinates
|
|
|
97
|
|
|
98 my $tl_start = $tva->transcript_variation->translation_start;
|
|
|
99 my $tl_end = $tva->transcript_variation->translation_end;
|
|
|
100
|
|
|
101 # and our reference sequence
|
|
|
102
|
|
|
103 my $ref_seq = $tva->transcript_variation->_peptide;
|
|
|
104
|
|
|
105 # splice the mutant peptide sequence into the reference sequence
|
|
|
106
|
|
|
107 my $mut_seq = $ref_seq;
|
|
|
108
|
|
|
109 substr($mut_seq, $tl_start-1, $tl_end - $tl_start + 1) = $mut_aa;
|
|
|
110
|
|
|
111 # print out our reference and mutant sequences
|
|
|
112
|
|
|
113 my $translation_id = $tva->transcript->translation->stable_id;
|
|
|
114
|
|
|
115 # only print the reference sequence if we haven't printed it yet
|
|
|
116
|
|
|
117 $self->print_fasta($ref_seq, $translation_id, $self->{ref_file})
|
|
|
118 unless $self->{printed_ref}->{$translation_id}++;
|
|
|
119
|
|
|
120 # we always print the mutated sequence as each mutation may have
|
|
|
121 # a different consequence
|
|
|
122
|
|
|
123 $self->print_fasta($mut_seq, $tva->hgvs_protein, $self->{mut_file});
|
|
|
124 }
|
|
|
125
|
|
|
126 # return an empty hashref because we don't want to add
|
|
|
127 # anything to the VEP output file
|
|
|
128
|
|
|
129 return {};
|
|
|
130 }
|
|
|
131
|
|
|
132 sub print_fasta {
|
|
|
133 my ($self, $peptide, $id, $fh) = @_;
|
|
|
134
|
|
|
135 # break the sequence into 80 characters per line
|
|
|
136
|
|
|
137 $peptide =~ s/(.{80})/$1\n/g;
|
|
|
138
|
|
|
139 # get rid of any trailing newline
|
|
|
140
|
|
|
141 chomp $peptide;
|
|
|
142
|
|
|
143 # print the sequence
|
|
|
144
|
|
|
145 print $fh ">$id\n$peptide\n";
|
|
|
146 }
|
|
|
147
|
|
|
148 sub STORABLE_freeze {
|
|
|
149 my ($self, $cloning) = @_;
|
|
|
150 return if $cloning;
|
|
|
151
|
|
|
152 close $self->{ref_file};
|
|
|
153 close $self->{mut_file};
|
|
|
154
|
|
|
155 delete $self->{ref_file};
|
|
|
156 delete $self->{ref_file};
|
|
|
157 }
|
|
|
158
|
|
|
159 sub STORABLE_thaw {
|
|
|
160 my ($self, $cloning) = @_;
|
|
|
161 return if $cloning;
|
|
|
162
|
|
|
163 my $ref_file = $self->params->[0] || 'reference.fa';
|
|
|
164 my $mut_file = $self->params->[1] || 'mutated.fa';
|
|
|
165
|
|
|
166 open $self->{ref_file}, ">>$ref_file" or die "Failed to open $ref_file";
|
|
|
167 open $self->{mut_file}, ">>$mut_file" or die "Failed to open $mut_file";
|
|
|
168 }
|
|
|
169
|
|
|
170 1;
|