Mercurial > repos > dvanzessen > vep_emc
diff dir_plugins/GeneSplicer.pm @ 3:49397129aec0 draft
Uploaded
| author | dvanzessen |
|---|---|
| date | Mon, 15 Jul 2019 05:20:39 -0400 |
| parents | e545d0a25ffe |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/dir_plugins/GeneSplicer.pm Mon Jul 15 05:20:39 2019 -0400 @@ -0,0 +1,369 @@ +=head1 LICENSE + +Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute +Copyright [2016-2018] EMBL-European Bioinformatics Institute + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +=head1 CONTACT + + Ensembl <http://www.ensembl.org/info/about/contact/index.html> + +=cut + +=head1 NAME + + GeneSplicer + +=head1 SYNOPSIS + + mv GeneSplicer.pm ~/.vep/Plugins + ./vep -i variants.vcf --plugin GeneSplicer,[path_to_genesplicer_bin],[path_to_training_dir],[option1=value],[option2=value] + +=head1 DESCRIPTION + + This is a plugin for the Ensembl Variant Effect Predictor (VEP) that + runs GeneSplicer (https://ccb.jhu.edu/software/genesplicer/) to get + splice site predictions. + + It evaluates a tract of sequence either side of and including the + variant, both in reference and alternate states. The amount of + sequence included either side defaults to 100bp, but can be modified + by passing e.g. "context=50" as a parameter to the plugin. + + Any predicted splicing regions that overlap the variant are reported + in the output with one of four states: no_change, diff, gain, loss + + There follows a "/"-separated string consisting of the following data: + + 1) type (donor, acceptor) + 2) coordinates (start-end) + 3) confidence (Low, Medium, High) + 4) score + + Example: loss/acceptor/727006-727007/High/16.231924 + + If multiple sites are predicted, their reports are separated by ",". + + For diff, the confidence and score for both the reference and alternate + sequences is reported as REF-ALT. + + Example: diff/donor/621915-621914/Medium-Medium/7.020731-6.988368 + + Several parameters can be modified by passing them to the plugin string: + + context : change the amount of sequence added either side of + the variant (default: 100bp) + tmpdir : change the temporary directory used (default: /tmp) + cache_size : change how many sequences' scores are cached in memory + (default: 50) + + Example: --plugin GeneSplicer,$GS/bin/linux/genesplicer,$GS/human,context=200,tmpdir=/mytmp + + On some systems the binaries provided will not execute, but can be compiled from source: + + cd $GS/sources + make + cd - + ./vep [options] --plugin GeneSplicer,$GS/sources/genesplicer,$GS/human + + On Mac OSX the make step is known to fail; the genesplicer.cpp file requires modification: + + cd $GS/sources + perl -pi -e "s/^main /int main /" genesplicer.cpp + make + + +=cut + +package GeneSplicer; + +use strict; +use warnings; + +use Digest::MD5 qw(md5_hex); + +use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); +use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(overlap); + +use Bio::EnsEMBL::Variation::Utils::BaseVepPlugin; +use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin); + +our %DEFAULTS = ( + context => 100, + tmpdir => '/tmp', + cache_size => 50, +); + +sub new { + my $class = shift; + + my $self = $class->SUPER::new(@_); + + # we need sequence, so no offline mode unless we have FASTA + die("ERROR: cannot function in offline mode without a FASTA file\n") if $self->{config}->{offline} && !$self->{config}->{fasta}; + + my $params = $self->params; + + my $bin = shift @$params; + die("ERROR: genesplicer binary not specified\n") unless $bin; + die("ERROR: genesplicer binary not found\n") unless -e $bin; + my $test = `$bin 2>&1`; + die("ERROR: failed to run genesplicer binary:\n$test\n") unless $test =~ /^USAGE/; + $self->{_bin} = $bin; + + my $training_dir = shift @$params; + die("ERROR: training directory not specified\n") unless $training_dir; + die("ERROR: training directory not found\n") unless -d $training_dir; + $self->{_training_dir} = $training_dir; + + # defaults + $self->{'_param_'.$_} = $DEFAULTS{$_} for keys %DEFAULTS; + + # REST API passes 1 as first param + shift @$params if $params->[0] && $params->[0] eq '1'; + + # set/override with user params + foreach my $param(@$params) { + my ($key, $val) = split('=', $param); + die("ERROR: Failed to parse parameter $param\n") unless defined($key) && defined($val); + + $self->{'_param_'.$key} = $val; + } + + return $self; +} + +sub feature_types { + return ['Transcript']; +} + +sub get_header_info { + return { + GeneSplicer => "GeneSplicer predictions" + }; +} + +sub run { + my ($self, $tva) = @_; + + my $vf = $tva->variation_feature; + + # get up and downstream sequences + my $up_seq = $vf->{slice}->sub_Slice( + $vf->{start} - $self->{'_param_context'}, + $vf->{start} - 1, + $vf->strand + )->seq; + + my $down_seq = $vf->{slice}->sub_Slice( + $vf->{end} + 1, + $vf->{end} + $self->{'_param_context'}, + $vf->strand + )->seq; + + # create ref seq by grabbing reference TVA + my $ref_seq = join("", + $up_seq, + $tva->transcript_variation->get_reference_TranscriptVariationAllele->variation_feature_seq, + $down_seq + ); + + return {} unless $ref_seq =~ /^[ACGT]+$/; + + # create alt seq + my $alt_allele = $tva->variation_feature_seq; + $alt_allele =~ s/\-//g; + my $alt_seq = $up_seq.$alt_allele.$down_seq; + + + return {} unless $alt_seq =~ /^[ACGT]+$/; + + # reverse comp if strands differ + if($tva->transcript->strand != $vf->strand) { + reverse_comp(\$ref_seq); + reverse_comp(\$alt_seq); + } + + # get results + my $ref_results = $self->results_from_cache($ref_seq) || $self->results_from_seq($ref_seq); + my $alt_results = $self->results_from_cache($alt_seq) || $self->results_from_seq($alt_seq); + + # compare results both ways + my $diff_ref_to_alt = $self->compare_results($ref_results, $alt_results); + my $diff_alt_to_ref = $self->compare_results($alt_results, $ref_results); + + # get VF pos relative to tested sequence + my ($vf_start, $vf_end) = ($self->{'_param_context'} + 1, $self->{'_param_context'} + (($vf->{end} - $vf->{start}) + 1)); + + # get overlapping losses and gains + # and map to chromosome coords + my @losses = + map {$_->{gl} = 'loss'; $_} + @{$diff_ref_to_alt->{lost}}; + + my @gains = + map {$_->{gl} = 'gain'; $_} + @{$diff_alt_to_ref->{lost}}; + + my @diffs = + map {$_->{gl} = 'diff'; $_} + @{$diff_ref_to_alt->{diff}}; + + my $return = join(',', + map { + join('/', + $_->[0]->{gl}, + $_->[0]->{type}, + $_->[1]->{end5}.'-'.$_->[1]->{end3}, + $_->[0]->{confidence}, + $_->[0]->{score} + ) + } + map {[$_, $self->map_ss_coords($_, $vf)]} + grep {overlap($vf_start, $vf_end, $_->{end5}, $_->{end3})} + (@losses, @gains, @diffs) + ); + + # probably of interest to report splice sites were found + # but no difference between ref and alt + if(!$return && grep {overlap($vf_start, $vf_end, $_->{end5}, $_->{end3})} @$ref_results) { + $return = join(',', + map { + join('/', + 'no_change', + $_->[0]->{type}, + $_->[1]->{end5}.'-'.$_->[1]->{end3}, + $_->[0]->{confidence}, + $_->[0]->{score} + ) + } + map {[$_, $self->map_ss_coords($_, $vf)]} + grep {overlap($vf_start, $vf_end, $_->{end5}, $_->{end3})} @$ref_results + ); + } + + return $return ? { GeneSplicer => $return } : {}; +} + +sub results_from_seq { + my $self = shift; + my $seq = shift; + + # write seqs to file + my $seq_file = $self->{'_param_tmpdir'}."/genesplicer_$$.fa"; + open SEQ, ">$seq_file" or die("ERROR: Could not write to temporary sequence file $seq_file\n"); + print SEQ ">SEQ\n$seq\n"; + close SEQ; + + my $result_file = $self->{'_param_tmpdir'}."/genesplicer_$$.results"; + + my $cmd = sprintf( + '%s %s %s -f %s', + $self->{'_bin'}, + $seq_file, + $self->{'_training_dir'}, + $result_file + ); + + my $output = `$cmd 2>&1`; + unlink($seq_file); + + return [] unless -e $result_file; + + open RES, $result_file; + my @results; + + while(<RES>) { + chomp; + my ($end5, $end3, $score, $confidence, $type) = split; + + push @results, { + end5 => $end5, + end3 => $end3, + score => $score, + confidence => $confidence, + type => $type + }; + } + close RES; + + unlink($result_file); + + push @{$self->{cache}}, { hex => md5_hex($seq), results => \@results}; + shift @{$self->{cache}} while scalar @{$self->{cache}} > $self->{_param_cache_size}; + + return \@results; +} + +sub results_from_cache { + my $self = shift; + my $seq = shift; + + my ($results) = map {$_->{results}} grep {$_->{hex} eq md5_hex($seq)} @{$self->{cache} || []}; + + return $results; +} + +sub compare_results { + my $self = shift; + my $a = shift; + my $b = shift; + + my (@diff, @lost); + + foreach my $res_a(@$a) { + my @match = grep { + $_->{end5} == $res_a->{end5} && + $_->{end3} == $res_a->{end3} && + $_->{type} eq $res_a->{type} + } @$b; + + # result not found in b + if(!@match) { + push @lost, $res_a; + } + + # >1 result found + elsif(scalar @match > 1) { + warn("WARNING: Found two matches?\n"); + } + + # 1 match + elsif($match[0]->{score} != $res_a->{score}) { + my %diff = %$res_a; + $diff{score} .= '-'.$match[0]->{score}; + $diff{confidence} .= '-'.$match[0]->{confidence}; + push @diff, \%diff; + } + } + + return { diff => \@diff, lost => \@lost}; +} + +sub map_ss_coords { + my $self = shift; + my $res = shift; + my $vf = shift; + + my $return = {}; + + foreach my $coord(qw(end5 end3)) { + $return->{$coord} = (($res->{$coord} - $self->{'_param_context'}) + $vf->{start}) - 1; + } + + return $return; +} + +1; +
