Mercurial > repos > dvanzessen > vep_emc
diff dir_plugins/RefSeqHGVS.pm @ 0:e545d0a25ffe draft
Uploaded
| author | dvanzessen |
|---|---|
| date | Mon, 15 Jul 2019 05:17:17 -0400 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/dir_plugins/RefSeqHGVS.pm Mon Jul 15 05:17:17 2019 -0400 @@ -0,0 +1,208 @@ +=head1 NAME + + RefSeqHGVS -- provide RefSeq-based HGVS tags for VEP output + +=head1 SYNOPSIS + + cp RefSeqHGVS.pm ~/.vep/Plugins/ (or elsewhere in PERL5LIB) + perl variant_effect_predictor.pl -i variations.vcf --plugin RefSeqHGVS + + Output: + Variant lines with the following addtiional tags: + HGVSc-RefSeq=NM_198156.2:c.403delA;HGVSp-RefSeq=NP_937799.1:p.Arg135GlyfsX26 + +=head1 DESCRIPTION + + RefSeqHGVS is a plugin for Ensembl's Variant Effect Predictor that + provides variant annotatoins in HGVS format [1] using RefSeq accessions + (typically NM and NP). It provides the analog to VEP's HGVSc and HGVSp + annotations, which use Ensembl ENST and ENSP accessions. This module + relies RefSeq data in the OtherFeatures database. + + Converting ENST HGVS tags to RefSeq tags is confounded subtle differences + between exon structures for the same CDS. The plugin is intended to be + conservative by requiring exact matches of both CDS and exon structure + when converting variants. + + [1] http://www.hgvs.org/mutnomen/ + +=head1 BUGS AND LIMITATIONS + +=over + +=item * RefSeq transcripts limited to those in the OtherFeatures database. + +Archived RefSeq transcripts are not available. + +=item * Discrepancies between RefSeq and GRCh37 + + Approximately 20% of RefSeqs differ from the GRCh37 due to substitution + (16.2%), insertion/deletion (3.5%), or both (1.2%) differences. [1] + Variants are annotated by difference with respect to the genome, not the + RefSeq transcript. + + [1] MU2A--reconciling the genome and transcriptome to determine the + effects of base substitutions. +Garla, V., Kong, Y., Szpakowski, S., & Krauthammer, M. (2011). + Bioinformatics, 27(3), 416-8. doi:10.1093/bioinformatics/btq658 + http://www.ncbi.nlm.nih.gov/pubmed/21149339 + +=item * Conservative selection of equivalent RefSeq transcripts. + +Only NM transcripts which are exactly identical to the ENST transcript in +both CDS and exon structure are used. In the future, this might be +relaxed to exclude only regions of mismatch. Currently, this implies that +HGVS tags are not constructued when the RefSeq differs from the reference +geneome. + +=back + +=head1 CONTACT + + Reece Hart <reecehart@gmail.com> + +=head1 LICENSE + + This code and all rights to it are hereby donated to EnsEMBL project by + Reece Hart and Locus Development. + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=cut + + +package RefSeqHGVS; + +use strict; +use warnings; + +use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin); + +use Data::Dumper; + +my %mt_cache; # cache of ENST -> NM mappings + +sub version { + return '2.3'; +} + +sub new { + my $class = shift; + my $self = $class->SUPER::new(@_); + my $reg = 'Bio::EnsEMBL::Registry'; + + $self->{ofsa} = $reg->get_adaptor('Human', 'otherfeatures', 'Slice') + or die "Failed to create transcript adaptor in human otherfeatures database\n"; + $self->{ofta} = $reg->get_adaptor('Human', 'otherfeatures', 'Transcript') + or die "Failed to create transcript adaptor in human otherfeatures database\n"; + + return $self; +} + +sub feature_types { + return ['Transcript']; +} + +sub variant_feature_types { + return ['VariationFeature']; +} + +sub get_header_info { + return { + 'HGVSc-RefSeq' => "HGVSc using RefSeq transcripts accessions", + 'HGVSp-RefSeq' => "HGVSp using RefSeq protein accessions", + }; +} + +sub run { + my ($self, $tva) = @_; + my $t = $tva->transcript; + my %rv; + + my $ofsa = $self->{ofsa}; + my $ofta = $self->{ofta}; + + # mappabale_transcripts have identical CDS and exon structure + my @mappable_transcripts = $self->get_mappable_transcripts($t); + my @transcript_acs = map { $_->display_id() } @mappable_transcripts; + my @protein_acs = grep {defined $_} map { $_->translation()->display_id() } @mappable_transcripts; + + # substitute accessions for those in mappable transcripts/proteins + my @hgvsc_rs; + if (defined (my $hgvsc = $tva->hgvs_coding())) { + @hgvsc_rs = map {__subst_hgvs_ac($hgvsc ,$_)} @transcript_acs; + } + my @hgvsp_rs; + if (defined (my $hgvsp = $tva->hgvs_protein())) { + @hgvsp_rs = map {__subst_hgvs_ac($hgvsp,$_)} @protein_acs; + } + + $rv{'HGVSc-RefSeq'} = join(';',@hgvsc_rs) if @hgvsc_rs; + $rv{'HGVSp-RefSeq'} = join(';',@hgvsp_rs) if @hgvsp_rs; + return \%rv; +} + + +sub get_mappable_transcripts { + my ($self,$t) = @_; + my $key = $t->display_id(); + if (not exists $mt_cache{$key}) { + @{$mt_cache{$key}} = $self->_get_mappable_transcripts($t); + } + return @{$mt_cache{$key}}; +} + + +############################################################################ +## internal methods +sub _get_mappable_transcripts { + my ($self,$t) = @_; + my $cds_seq = $t->translateable_seq(); + my $exon_structure = __tx_exon_str($t); + + # cds_seq is empty for pseudogenes; no mapping possible + return () if ($cds_seq eq ''); + + # get overlapping transcripts from other features + my @tx = @{ $self->{ofta}->fetch_all_by_Slice($t->feature_Slice()) }; + + # limit to NMs with standard-format + @tx = grep { $_->display_id() =~ m/^NM_\d+\.\d+$/ } @tx; + + # limit to transcripts with same CDS + @tx = grep { $_->translateable_seq() eq $cds_seq } @tx; + + # limit to transcripts with identical chromosomal exon structure + @tx = grep { __tx_exon_str($_->transform('chromosome')) eq $exon_structure } @tx; + + return @tx; +} + + +############################################################################ +## function (not methods) + +sub __tx_exon_str { + my $t = shift; + return join(',', map {sprintf('[%s,%s]', $_->start(), $_->end())} + @{ $t->get_all_translateable_Exons() } ); +} + +sub __subst_hgvs_ac { + my ($hgvs,$ac) = @_; + $hgvs =~ s/^[^:]+:/$ac:/; + return $hgvs; +} + +sub __uniq { + return keys %{{ map {$_=>1} @_ }}; +} + + +1;
