comparison dir_plugins/RefSeqHGVS.pm @ 3:49397129aec0 draft

Uploaded
author dvanzessen
date Mon, 15 Jul 2019 05:20:39 -0400
parents e545d0a25ffe
children
comparison
equal deleted inserted replaced
2:17c98d091710 3:49397129aec0
1 =head1 NAME
2
3 RefSeqHGVS -- provide RefSeq-based HGVS tags for VEP output
4
5 =head1 SYNOPSIS
6
7 cp RefSeqHGVS.pm ~/.vep/Plugins/ (or elsewhere in PERL5LIB)
8 perl variant_effect_predictor.pl -i variations.vcf --plugin RefSeqHGVS
9
10 Output:
11 Variant lines with the following addtiional tags:
12 HGVSc-RefSeq=NM_198156.2:c.403delA;HGVSp-RefSeq=NP_937799.1:p.Arg135GlyfsX26
13
14 =head1 DESCRIPTION
15
16 RefSeqHGVS is a plugin for Ensembl's Variant Effect Predictor that
17 provides variant annotatoins in HGVS format [1] using RefSeq accessions
18 (typically NM and NP). It provides the analog to VEP's HGVSc and HGVSp
19 annotations, which use Ensembl ENST and ENSP accessions. This module
20 relies RefSeq data in the OtherFeatures database.
21
22 Converting ENST HGVS tags to RefSeq tags is confounded subtle differences
23 between exon structures for the same CDS. The plugin is intended to be
24 conservative by requiring exact matches of both CDS and exon structure
25 when converting variants.
26
27 [1] http://www.hgvs.org/mutnomen/
28
29 =head1 BUGS AND LIMITATIONS
30
31 =over
32
33 =item * RefSeq transcripts limited to those in the OtherFeatures database.
34
35 Archived RefSeq transcripts are not available.
36
37 =item * Discrepancies between RefSeq and GRCh37
38
39 Approximately 20% of RefSeqs differ from the GRCh37 due to substitution
40 (16.2%), insertion/deletion (3.5%), or both (1.2%) differences. [1]
41 Variants are annotated by difference with respect to the genome, not the
42 RefSeq transcript.
43
44 [1] MU2A--reconciling the genome and transcriptome to determine the
45 effects of base substitutions.
46 Garla, V., Kong, Y., Szpakowski, S., & Krauthammer, M. (2011).
47 Bioinformatics, 27(3), 416-8. doi:10.1093/bioinformatics/btq658
48 http://www.ncbi.nlm.nih.gov/pubmed/21149339
49
50 =item * Conservative selection of equivalent RefSeq transcripts.
51
52 Only NM transcripts which are exactly identical to the ENST transcript in
53 both CDS and exon structure are used. In the future, this might be
54 relaxed to exclude only regions of mismatch. Currently, this implies that
55 HGVS tags are not constructued when the RefSeq differs from the reference
56 geneome.
57
58 =back
59
60 =head1 CONTACT
61
62 Reece Hart <reecehart@gmail.com>
63
64 =head1 LICENSE
65
66 This code and all rights to it are hereby donated to EnsEMBL project by
67 Reece Hart and Locus Development.
68
69 Copyright (c) 1999-2011 The European Bioinformatics Institute and
70 Genome Research Limited. All rights reserved.
71
72 This software is distributed under a modified Apache license.
73 For license details, please see
74
75 http://www.ensembl.org/info/about/code_licence.html
76
77 =cut
78
79
80 package RefSeqHGVS;
81
82 use strict;
83 use warnings;
84
85 use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin);
86
87 use Data::Dumper;
88
89 my %mt_cache; # cache of ENST -> NM mappings
90
91 sub version {
92 return '2.3';
93 }
94
95 sub new {
96 my $class = shift;
97 my $self = $class->SUPER::new(@_);
98 my $reg = 'Bio::EnsEMBL::Registry';
99
100 $self->{ofsa} = $reg->get_adaptor('Human', 'otherfeatures', 'Slice')
101 or die "Failed to create transcript adaptor in human otherfeatures database\n";
102 $self->{ofta} = $reg->get_adaptor('Human', 'otherfeatures', 'Transcript')
103 or die "Failed to create transcript adaptor in human otherfeatures database\n";
104
105 return $self;
106 }
107
108 sub feature_types {
109 return ['Transcript'];
110 }
111
112 sub variant_feature_types {
113 return ['VariationFeature'];
114 }
115
116 sub get_header_info {
117 return {
118 'HGVSc-RefSeq' => "HGVSc using RefSeq transcripts accessions",
119 'HGVSp-RefSeq' => "HGVSp using RefSeq protein accessions",
120 };
121 }
122
123 sub run {
124 my ($self, $tva) = @_;
125 my $t = $tva->transcript;
126 my %rv;
127
128 my $ofsa = $self->{ofsa};
129 my $ofta = $self->{ofta};
130
131 # mappabale_transcripts have identical CDS and exon structure
132 my @mappable_transcripts = $self->get_mappable_transcripts($t);
133 my @transcript_acs = map { $_->display_id() } @mappable_transcripts;
134 my @protein_acs = grep {defined $_} map { $_->translation()->display_id() } @mappable_transcripts;
135
136 # substitute accessions for those in mappable transcripts/proteins
137 my @hgvsc_rs;
138 if (defined (my $hgvsc = $tva->hgvs_coding())) {
139 @hgvsc_rs = map {__subst_hgvs_ac($hgvsc ,$_)} @transcript_acs;
140 }
141 my @hgvsp_rs;
142 if (defined (my $hgvsp = $tva->hgvs_protein())) {
143 @hgvsp_rs = map {__subst_hgvs_ac($hgvsp,$_)} @protein_acs;
144 }
145
146 $rv{'HGVSc-RefSeq'} = join(';',@hgvsc_rs) if @hgvsc_rs;
147 $rv{'HGVSp-RefSeq'} = join(';',@hgvsp_rs) if @hgvsp_rs;
148 return \%rv;
149 }
150
151
152 sub get_mappable_transcripts {
153 my ($self,$t) = @_;
154 my $key = $t->display_id();
155 if (not exists $mt_cache{$key}) {
156 @{$mt_cache{$key}} = $self->_get_mappable_transcripts($t);
157 }
158 return @{$mt_cache{$key}};
159 }
160
161
162 ############################################################################
163 ## internal methods
164 sub _get_mappable_transcripts {
165 my ($self,$t) = @_;
166 my $cds_seq = $t->translateable_seq();
167 my $exon_structure = __tx_exon_str($t);
168
169 # cds_seq is empty for pseudogenes; no mapping possible
170 return () if ($cds_seq eq '');
171
172 # get overlapping transcripts from other features
173 my @tx = @{ $self->{ofta}->fetch_all_by_Slice($t->feature_Slice()) };
174
175 # limit to NMs with standard-format
176 @tx = grep { $_->display_id() =~ m/^NM_\d+\.\d+$/ } @tx;
177
178 # limit to transcripts with same CDS
179 @tx = grep { $_->translateable_seq() eq $cds_seq } @tx;
180
181 # limit to transcripts with identical chromosomal exon structure
182 @tx = grep { __tx_exon_str($_->transform('chromosome')) eq $exon_structure } @tx;
183
184 return @tx;
185 }
186
187
188 ############################################################################
189 ## function (not methods)
190
191 sub __tx_exon_str {
192 my $t = shift;
193 return join(',', map {sprintf('[%s,%s]', $_->start(), $_->end())}
194 @{ $t->get_all_translateable_Exons() } );
195 }
196
197 sub __subst_hgvs_ac {
198 my ($hgvs,$ac) = @_;
199 $hgvs =~ s/^[^:]+:/$ac:/;
200 return $hgvs;
201 }
202
203 sub __uniq {
204 return keys %{{ map {$_=>1} @_ }};
205 }
206
207
208 1;