|
0
|
1 =head1 NAME
|
|
|
2
|
|
|
3 RefSeqHGVS -- provide RefSeq-based HGVS tags for VEP output
|
|
|
4
|
|
|
5 =head1 SYNOPSIS
|
|
|
6
|
|
|
7 cp RefSeqHGVS.pm ~/.vep/Plugins/ (or elsewhere in PERL5LIB)
|
|
|
8 perl variant_effect_predictor.pl -i variations.vcf --plugin RefSeqHGVS
|
|
|
9
|
|
|
10 Output:
|
|
|
11 Variant lines with the following addtiional tags:
|
|
|
12 HGVSc-RefSeq=NM_198156.2:c.403delA;HGVSp-RefSeq=NP_937799.1:p.Arg135GlyfsX26
|
|
|
13
|
|
|
14 =head1 DESCRIPTION
|
|
|
15
|
|
|
16 RefSeqHGVS is a plugin for Ensembl's Variant Effect Predictor that
|
|
|
17 provides variant annotatoins in HGVS format [1] using RefSeq accessions
|
|
|
18 (typically NM and NP). It provides the analog to VEP's HGVSc and HGVSp
|
|
|
19 annotations, which use Ensembl ENST and ENSP accessions. This module
|
|
|
20 relies RefSeq data in the OtherFeatures database.
|
|
|
21
|
|
|
22 Converting ENST HGVS tags to RefSeq tags is confounded subtle differences
|
|
|
23 between exon structures for the same CDS. The plugin is intended to be
|
|
|
24 conservative by requiring exact matches of both CDS and exon structure
|
|
|
25 when converting variants.
|
|
|
26
|
|
|
27 [1] http://www.hgvs.org/mutnomen/
|
|
|
28
|
|
|
29 =head1 BUGS AND LIMITATIONS
|
|
|
30
|
|
|
31 =over
|
|
|
32
|
|
|
33 =item * RefSeq transcripts limited to those in the OtherFeatures database.
|
|
|
34
|
|
|
35 Archived RefSeq transcripts are not available.
|
|
|
36
|
|
|
37 =item * Discrepancies between RefSeq and GRCh37
|
|
|
38
|
|
|
39 Approximately 20% of RefSeqs differ from the GRCh37 due to substitution
|
|
|
40 (16.2%), insertion/deletion (3.5%), or both (1.2%) differences. [1]
|
|
|
41 Variants are annotated by difference with respect to the genome, not the
|
|
|
42 RefSeq transcript.
|
|
|
43
|
|
|
44 [1] MU2A--reconciling the genome and transcriptome to determine the
|
|
|
45 effects of base substitutions.
|
|
|
46 Garla, V., Kong, Y., Szpakowski, S., & Krauthammer, M. (2011).
|
|
|
47 Bioinformatics, 27(3), 416-8. doi:10.1093/bioinformatics/btq658
|
|
|
48 http://www.ncbi.nlm.nih.gov/pubmed/21149339
|
|
|
49
|
|
|
50 =item * Conservative selection of equivalent RefSeq transcripts.
|
|
|
51
|
|
|
52 Only NM transcripts which are exactly identical to the ENST transcript in
|
|
|
53 both CDS and exon structure are used. In the future, this might be
|
|
|
54 relaxed to exclude only regions of mismatch. Currently, this implies that
|
|
|
55 HGVS tags are not constructued when the RefSeq differs from the reference
|
|
|
56 geneome.
|
|
|
57
|
|
|
58 =back
|
|
|
59
|
|
|
60 =head1 CONTACT
|
|
|
61
|
|
|
62 Reece Hart <reecehart@gmail.com>
|
|
|
63
|
|
|
64 =head1 LICENSE
|
|
|
65
|
|
|
66 This code and all rights to it are hereby donated to EnsEMBL project by
|
|
|
67 Reece Hart and Locus Development.
|
|
|
68
|
|
|
69 Copyright (c) 1999-2011 The European Bioinformatics Institute and
|
|
|
70 Genome Research Limited. All rights reserved.
|
|
|
71
|
|
|
72 This software is distributed under a modified Apache license.
|
|
|
73 For license details, please see
|
|
|
74
|
|
|
75 http://www.ensembl.org/info/about/code_licence.html
|
|
|
76
|
|
|
77 =cut
|
|
|
78
|
|
|
79
|
|
|
80 package RefSeqHGVS;
|
|
|
81
|
|
|
82 use strict;
|
|
|
83 use warnings;
|
|
|
84
|
|
|
85 use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin);
|
|
|
86
|
|
|
87 use Data::Dumper;
|
|
|
88
|
|
|
89 my %mt_cache; # cache of ENST -> NM mappings
|
|
|
90
|
|
|
91 sub version {
|
|
|
92 return '2.3';
|
|
|
93 }
|
|
|
94
|
|
|
95 sub new {
|
|
|
96 my $class = shift;
|
|
|
97 my $self = $class->SUPER::new(@_);
|
|
|
98 my $reg = 'Bio::EnsEMBL::Registry';
|
|
|
99
|
|
|
100 $self->{ofsa} = $reg->get_adaptor('Human', 'otherfeatures', 'Slice')
|
|
|
101 or die "Failed to create transcript adaptor in human otherfeatures database\n";
|
|
|
102 $self->{ofta} = $reg->get_adaptor('Human', 'otherfeatures', 'Transcript')
|
|
|
103 or die "Failed to create transcript adaptor in human otherfeatures database\n";
|
|
|
104
|
|
|
105 return $self;
|
|
|
106 }
|
|
|
107
|
|
|
108 sub feature_types {
|
|
|
109 return ['Transcript'];
|
|
|
110 }
|
|
|
111
|
|
|
112 sub variant_feature_types {
|
|
|
113 return ['VariationFeature'];
|
|
|
114 }
|
|
|
115
|
|
|
116 sub get_header_info {
|
|
|
117 return {
|
|
|
118 'HGVSc-RefSeq' => "HGVSc using RefSeq transcripts accessions",
|
|
|
119 'HGVSp-RefSeq' => "HGVSp using RefSeq protein accessions",
|
|
|
120 };
|
|
|
121 }
|
|
|
122
|
|
|
123 sub run {
|
|
|
124 my ($self, $tva) = @_;
|
|
|
125 my $t = $tva->transcript;
|
|
|
126 my %rv;
|
|
|
127
|
|
|
128 my $ofsa = $self->{ofsa};
|
|
|
129 my $ofta = $self->{ofta};
|
|
|
130
|
|
|
131 # mappabale_transcripts have identical CDS and exon structure
|
|
|
132 my @mappable_transcripts = $self->get_mappable_transcripts($t);
|
|
|
133 my @transcript_acs = map { $_->display_id() } @mappable_transcripts;
|
|
|
134 my @protein_acs = grep {defined $_} map { $_->translation()->display_id() } @mappable_transcripts;
|
|
|
135
|
|
|
136 # substitute accessions for those in mappable transcripts/proteins
|
|
|
137 my @hgvsc_rs;
|
|
|
138 if (defined (my $hgvsc = $tva->hgvs_coding())) {
|
|
|
139 @hgvsc_rs = map {__subst_hgvs_ac($hgvsc ,$_)} @transcript_acs;
|
|
|
140 }
|
|
|
141 my @hgvsp_rs;
|
|
|
142 if (defined (my $hgvsp = $tva->hgvs_protein())) {
|
|
|
143 @hgvsp_rs = map {__subst_hgvs_ac($hgvsp,$_)} @protein_acs;
|
|
|
144 }
|
|
|
145
|
|
|
146 $rv{'HGVSc-RefSeq'} = join(';',@hgvsc_rs) if @hgvsc_rs;
|
|
|
147 $rv{'HGVSp-RefSeq'} = join(';',@hgvsp_rs) if @hgvsp_rs;
|
|
|
148 return \%rv;
|
|
|
149 }
|
|
|
150
|
|
|
151
|
|
|
152 sub get_mappable_transcripts {
|
|
|
153 my ($self,$t) = @_;
|
|
|
154 my $key = $t->display_id();
|
|
|
155 if (not exists $mt_cache{$key}) {
|
|
|
156 @{$mt_cache{$key}} = $self->_get_mappable_transcripts($t);
|
|
|
157 }
|
|
|
158 return @{$mt_cache{$key}};
|
|
|
159 }
|
|
|
160
|
|
|
161
|
|
|
162 ############################################################################
|
|
|
163 ## internal methods
|
|
|
164 sub _get_mappable_transcripts {
|
|
|
165 my ($self,$t) = @_;
|
|
|
166 my $cds_seq = $t->translateable_seq();
|
|
|
167 my $exon_structure = __tx_exon_str($t);
|
|
|
168
|
|
|
169 # cds_seq is empty for pseudogenes; no mapping possible
|
|
|
170 return () if ($cds_seq eq '');
|
|
|
171
|
|
|
172 # get overlapping transcripts from other features
|
|
|
173 my @tx = @{ $self->{ofta}->fetch_all_by_Slice($t->feature_Slice()) };
|
|
|
174
|
|
|
175 # limit to NMs with standard-format
|
|
|
176 @tx = grep { $_->display_id() =~ m/^NM_\d+\.\d+$/ } @tx;
|
|
|
177
|
|
|
178 # limit to transcripts with same CDS
|
|
|
179 @tx = grep { $_->translateable_seq() eq $cds_seq } @tx;
|
|
|
180
|
|
|
181 # limit to transcripts with identical chromosomal exon structure
|
|
|
182 @tx = grep { __tx_exon_str($_->transform('chromosome')) eq $exon_structure } @tx;
|
|
|
183
|
|
|
184 return @tx;
|
|
|
185 }
|
|
|
186
|
|
|
187
|
|
|
188 ############################################################################
|
|
|
189 ## function (not methods)
|
|
|
190
|
|
|
191 sub __tx_exon_str {
|
|
|
192 my $t = shift;
|
|
|
193 return join(',', map {sprintf('[%s,%s]', $_->start(), $_->end())}
|
|
|
194 @{ $t->get_all_translateable_Exons() } );
|
|
|
195 }
|
|
|
196
|
|
|
197 sub __subst_hgvs_ac {
|
|
|
198 my ($hgvs,$ac) = @_;
|
|
|
199 $hgvs =~ s/^[^:]+:/$ac:/;
|
|
|
200 return $hgvs;
|
|
|
201 }
|
|
|
202
|
|
|
203 sub __uniq {
|
|
|
204 return keys %{{ map {$_=>1} @_ }};
|
|
|
205 }
|
|
|
206
|
|
|
207
|
|
|
208 1;
|