Mercurial > repos > dvanzessen > vep_emc
diff dir_plugins/ProteinSeqs.pm @ 3:49397129aec0 draft
Uploaded
| author | dvanzessen |
|---|---|
| date | Mon, 15 Jul 2019 05:20:39 -0400 |
| parents | e545d0a25ffe |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/dir_plugins/ProteinSeqs.pm Mon Jul 15 05:20:39 2019 -0400 @@ -0,0 +1,170 @@ +=head1 LICENSE + +Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute +Copyright [2016-2018] EMBL-European Bioinformatics Institute + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +=head1 CONTACT + + Ensembl <http://www.ensembl.org/info/about/contact/index.html> + +=cut + +=head1 NAME + + ProteinSeqs + +=head1 SYNOPSIS + + mv ProteinSeqs.pm ~/.vep/Plugins + ./vep -i variations.vcf --plugin ProteinSeqs,reference.fa,mutated.fa + +=head1 DESCRIPTION + + This is a plugin for the Ensembl Variant Effect Predictor (VEP) that + prints out the reference and mutated protein sequences of any + proteins found with non-synonymous mutations in the input file. + + You should supply the name of file where you want to store the + reference protein sequences as the first argument, and a file to + store the mutated sequences as the second argument. + + Note that, for simplicity, where stop codons are gained the plugin + simply substitutes a '*' into the sequence and does not truncate the + protein. Where a stop codon is lost any new amino acids encoded by the + mutation are appended to the sequence, but the plugin does not attempt + to translate until the next downstream stop codon. Also, the protein + sequence resulting from each mutation is printed separately, no attempt + is made to apply multiple mutations to the same protein. + +=cut + +package ProteinSeqs; + +use strict; +use warnings; + +use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin); + +sub version { + return '2.4'; +} + +sub feature_types { + return ['Transcript']; +} + +sub new { + my $class = shift; + + my $self = $class->SUPER::new(@_); + + if($self->{config}->{fork}) { + print STDERR "WARNING: Plugin ProteinSeqs is disabling forking\n" unless $self->{config}->{quiet}; + delete($self->{config}->{fork}); + } + + # use some default file names if none are supplied + + my $ref_file = $self->params->[0] || 'reference.fa'; + my $mut_file = $self->params->[1] || 'mutated.fa'; + + open $self->{ref_file}, ">$ref_file" or die "Failed to open $ref_file"; + open $self->{mut_file}, ">$mut_file" or die "Failed to open $mut_file"; + + return $self; +} + +sub run { + my ($self, $tva) = @_; + + # check if we have a mutant amino acid, if not there isn't much we can do! + + if (my $mut_aa = $tva->peptide) { + + # get the peptide coordinates + + my $tl_start = $tva->transcript_variation->translation_start; + my $tl_end = $tva->transcript_variation->translation_end; + + # and our reference sequence + + my $ref_seq = $tva->transcript_variation->_peptide; + + # splice the mutant peptide sequence into the reference sequence + + my $mut_seq = $ref_seq; + + substr($mut_seq, $tl_start-1, $tl_end - $tl_start + 1) = $mut_aa; + + # print out our reference and mutant sequences + + my $translation_id = $tva->transcript->translation->stable_id; + + # only print the reference sequence if we haven't printed it yet + + $self->print_fasta($ref_seq, $translation_id, $self->{ref_file}) + unless $self->{printed_ref}->{$translation_id}++; + + # we always print the mutated sequence as each mutation may have + # a different consequence + + $self->print_fasta($mut_seq, $tva->hgvs_protein, $self->{mut_file}); + } + + # return an empty hashref because we don't want to add + # anything to the VEP output file + + return {}; +} + +sub print_fasta { + my ($self, $peptide, $id, $fh) = @_; + + # break the sequence into 80 characters per line + + $peptide =~ s/(.{80})/$1\n/g; + + # get rid of any trailing newline + + chomp $peptide; + + # print the sequence + + print $fh ">$id\n$peptide\n"; +} + +sub STORABLE_freeze { + my ($self, $cloning) = @_; + return if $cloning; + + close $self->{ref_file}; + close $self->{mut_file}; + + delete $self->{ref_file}; + delete $self->{ref_file}; +} + +sub STORABLE_thaw { + my ($self, $cloning) = @_; + return if $cloning; + + my $ref_file = $self->params->[0] || 'reference.fa'; + my $mut_file = $self->params->[1] || 'mutated.fa'; + + open $self->{ref_file}, ">>$ref_file" or die "Failed to open $ref_file"; + open $self->{mut_file}, ">>$mut_file" or die "Failed to open $mut_file"; +} + +1;
