diff dir_plugins/MTR.pm @ 3:49397129aec0 draft

Uploaded
author dvanzessen
date Mon, 15 Jul 2019 05:20:39 -0400
parents e545d0a25ffe
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/dir_plugins/MTR.pm	Mon Jul 15 05:20:39 2019 -0400
@@ -0,0 +1,153 @@
+=head1 CONTACT
+
+  Slave Petrovski <slavep@unimelb.edu.au>
+  Michael Silk <silkm@student.unimelb.edu.au>
+
+=cut
+
+=head1 NAME
+
+  MTR (Missense Tolerance Ratio)
+
+=head1 SYNOPSIS
+
+  mv MTR.pm ~/.vep/Plugins
+  curl -O ftp://mtr-viewer.mdhs.unimelb.edu.au/pub/mtrflatfile_1.0.txt.gz
+  curl -O ftp://mtr-viewer.mdhs.unimelb.edu.au/pub/mtrflatfile_1.0.txt.gz.tbi
+  perl variant_effect_predictor.pl -i variations.vcf --plugin MTR,mtrflatfile_1.0.txt.gz
+
+=head1 DESCRIPTION
+
+A VEP plugin that retrieves Missense Tolerance Ratio (MTR) scores for
+variants from a tabix-indexed flat file.
+
+MTR scores quantify the amount of purifying selection acting
+specifically on missense variants in a given window of protein-coding
+sequence. It is estimated across a sliding window of 31 codons and uses
+observed standing variation data from the WES component of the Exome
+Aggregation Consortium Database (ExAC), version 2.0
+(http://gnomad.broadinstitute.org).
+
+Please cite the MTR publication alongside the VEP if you use this resource:
+http://genome.cshlp.org/content/27/10/1715
+
+The Bio::DB::HTS perl library or tabix utility must be installed in your path
+to use this plugin. MTR flat files can be downloaded from:
+ftp://mtr-viewer.mdhs.unimelb.edu.au/pub
+
+NB: Data are available for GRCh37 only
+
+=cut
+
+package MTR;
+
+use strict;
+use warnings;
+
+use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
+
+use Bio::EnsEMBL::Variation::Utils::BaseVepTabixPlugin;
+
+use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepTabixPlugin);
+
+sub new {
+  my $class = shift;
+  my $self = $class->SUPER::new(@_);
+
+  # test tabix
+  die "ERROR: tabix does not seem to be in your path\n" unless `which tabix 2>&1` =~ /tabix$/;
+
+  # get MTR file
+  my $file = $self->params->[0];
+  $self->add_file($file);
+
+  # remote files?
+  if($file =~ /tp\:\/\//) {
+    my $remote_test = `tabix -f $file 1:1-1 2>&1`;
+    if($remote_test && $remote_test !~ /get_local_version/) {
+      die "$remote_test\nERROR: Could not find file or index file for remote annotation file $file\n";
+    }
+  }
+
+  # check files exist
+  else {
+    die "ERROR: MTR file $file not found\n" unless -e $file;
+    die "ERROR: Tabix index file $file\.tbi not found - perhaps you need to create it first?\n" unless -e $file.'.tbi';
+  }
+
+  # get headers and store on self
+  open HEAD, "tabix -fh $file 1:1-1 2>&1 | ";
+  while(<HEAD>) {
+    next unless /^\#/;
+    chomp;
+    $self->{headers} = [split];
+  }
+  close HEAD;
+
+  return $self;
+}
+
+sub feature_types {
+  return ['Transcript'];
+}
+
+sub variation_feature_types {
+  return ['VariationFeature'];
+}
+
+sub get_header_info {
+  my $self = shift;
+  return {
+	  MTR         => 'MTR score',
+	  FDR         => 'MTR false discovery rate adjusted binomial exact test.',
+	  MTR_centile => 'MTR gene-specific percentile'
+	 }
+}
+
+sub run {
+  my ($self, $tva) = @_;
+  my $vf = $tva->variation_feature;
+
+  # get allele, reverse comp if needed
+  my $allele = $tva->variation_feature_seq;
+  reverse_comp(\$allele) if $vf->{strand} < 0;
+
+  return {} unless $allele =~ /^[ACGT]$/;
+
+  my $tr_id = $tva->transcript->stable_id;
+
+  # data is written by pos, allele, transcript ID (feature)
+  # grep lines read in matched on position so that they also are matched on allele and transcript ID
+  my ($res) = grep {
+    $_->{Genomic_position} eq $vf->{start} &&
+    $_->{Genomic_position} eq $vf->{end} &&
+    $_->{alt}              eq $allele &&
+    $_->{Feature}          eq $tr_id
+  } @{$self->get_data($vf->{chr}, $vf->{start}, $vf->{end})};
+
+  # return only the keys defined by get_header_info()
+  return $res ? { map {$_ => $res->{$_}} grep {defined($res->{$_}) && $res->{$_} ne '.'} keys %{$self->get_header_info} } : {};
+}
+
+sub parse_data {
+  my ($self, $line) = @_;
+
+  $line =~ s/\r$//g;
+
+  my @split = split /\t/, $line;
+  
+  # parse data into hash of col names and values
+  my %data = map {$self->{headers}->[$_] => $split[$_]} (0..(scalar @{$self->{headers}} - 1));
+
+  return \%data;
+}
+
+sub get_start {  
+  return $_[1]->{'Genomic_position'};
+}
+
+sub get_end {
+  return $_[1]->{'Genomic_position'};
+}
+
+1;