annotate dir_plugins/RefSeqHGVS.pm @ 3:49397129aec0 draft

Uploaded
author dvanzessen
date Mon, 15 Jul 2019 05:20:39 -0400
parents e545d0a25ffe
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
1 =head1 NAME
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
2
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
3 RefSeqHGVS -- provide RefSeq-based HGVS tags for VEP output
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
4
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
5 =head1 SYNOPSIS
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
6
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
7 cp RefSeqHGVS.pm ~/.vep/Plugins/ (or elsewhere in PERL5LIB)
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
8 perl variant_effect_predictor.pl -i variations.vcf --plugin RefSeqHGVS
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
9
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
10 Output:
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
11 Variant lines with the following addtiional tags:
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
12 HGVSc-RefSeq=NM_198156.2:c.403delA;HGVSp-RefSeq=NP_937799.1:p.Arg135GlyfsX26
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
13
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
14 =head1 DESCRIPTION
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
15
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
16 RefSeqHGVS is a plugin for Ensembl's Variant Effect Predictor that
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
17 provides variant annotatoins in HGVS format [1] using RefSeq accessions
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
18 (typically NM and NP). It provides the analog to VEP's HGVSc and HGVSp
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
19 annotations, which use Ensembl ENST and ENSP accessions. This module
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
20 relies RefSeq data in the OtherFeatures database.
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
21
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
22 Converting ENST HGVS tags to RefSeq tags is confounded subtle differences
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
23 between exon structures for the same CDS. The plugin is intended to be
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
24 conservative by requiring exact matches of both CDS and exon structure
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
25 when converting variants.
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
26
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
27 [1] http://www.hgvs.org/mutnomen/
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
28
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
29 =head1 BUGS AND LIMITATIONS
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
30
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
31 =over
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
32
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
33 =item * RefSeq transcripts limited to those in the OtherFeatures database.
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
34
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
35 Archived RefSeq transcripts are not available.
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
36
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
37 =item * Discrepancies between RefSeq and GRCh37
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
38
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
39 Approximately 20% of RefSeqs differ from the GRCh37 due to substitution
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
40 (16.2%), insertion/deletion (3.5%), or both (1.2%) differences. [1]
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
41 Variants are annotated by difference with respect to the genome, not the
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
42 RefSeq transcript.
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
43
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
44 [1] MU2A--reconciling the genome and transcriptome to determine the
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
45 effects of base substitutions.
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
46 Garla, V., Kong, Y., Szpakowski, S., & Krauthammer, M. (2011).
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
47 Bioinformatics, 27(3), 416-8. doi:10.1093/bioinformatics/btq658
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
48 http://www.ncbi.nlm.nih.gov/pubmed/21149339
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
49
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
50 =item * Conservative selection of equivalent RefSeq transcripts.
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
51
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
52 Only NM transcripts which are exactly identical to the ENST transcript in
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
53 both CDS and exon structure are used. In the future, this might be
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
54 relaxed to exclude only regions of mismatch. Currently, this implies that
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
55 HGVS tags are not constructued when the RefSeq differs from the reference
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
56 geneome.
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
57
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
58 =back
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
59
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
60 =head1 CONTACT
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
61
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
62 Reece Hart <reecehart@gmail.com>
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
63
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
64 =head1 LICENSE
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
65
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
66 This code and all rights to it are hereby donated to EnsEMBL project by
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
67 Reece Hart and Locus Development.
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
68
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
69 Copyright (c) 1999-2011 The European Bioinformatics Institute and
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
70 Genome Research Limited. All rights reserved.
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
71
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
72 This software is distributed under a modified Apache license.
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
73 For license details, please see
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
74
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
75 http://www.ensembl.org/info/about/code_licence.html
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
76
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
77 =cut
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
78
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
79
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
80 package RefSeqHGVS;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
81
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
82 use strict;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
83 use warnings;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
84
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
85 use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin);
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
86
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
87 use Data::Dumper;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
88
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
89 my %mt_cache; # cache of ENST -> NM mappings
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
90
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
91 sub version {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
92 return '2.3';
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
93 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
94
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
95 sub new {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
96 my $class = shift;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
97 my $self = $class->SUPER::new(@_);
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
98 my $reg = 'Bio::EnsEMBL::Registry';
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
99
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
100 $self->{ofsa} = $reg->get_adaptor('Human', 'otherfeatures', 'Slice')
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
101 or die "Failed to create transcript adaptor in human otherfeatures database\n";
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
102 $self->{ofta} = $reg->get_adaptor('Human', 'otherfeatures', 'Transcript')
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
103 or die "Failed to create transcript adaptor in human otherfeatures database\n";
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
104
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
105 return $self;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
106 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
107
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
108 sub feature_types {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
109 return ['Transcript'];
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
110 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
111
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
112 sub variant_feature_types {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
113 return ['VariationFeature'];
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
114 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
115
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
116 sub get_header_info {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
117 return {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
118 'HGVSc-RefSeq' => "HGVSc using RefSeq transcripts accessions",
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
119 'HGVSp-RefSeq' => "HGVSp using RefSeq protein accessions",
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
120 };
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
121 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
122
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
123 sub run {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
124 my ($self, $tva) = @_;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
125 my $t = $tva->transcript;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
126 my %rv;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
127
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
128 my $ofsa = $self->{ofsa};
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
129 my $ofta = $self->{ofta};
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
130
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
131 # mappabale_transcripts have identical CDS and exon structure
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
132 my @mappable_transcripts = $self->get_mappable_transcripts($t);
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
133 my @transcript_acs = map { $_->display_id() } @mappable_transcripts;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
134 my @protein_acs = grep {defined $_} map { $_->translation()->display_id() } @mappable_transcripts;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
135
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
136 # substitute accessions for those in mappable transcripts/proteins
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
137 my @hgvsc_rs;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
138 if (defined (my $hgvsc = $tva->hgvs_coding())) {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
139 @hgvsc_rs = map {__subst_hgvs_ac($hgvsc ,$_)} @transcript_acs;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
140 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
141 my @hgvsp_rs;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
142 if (defined (my $hgvsp = $tva->hgvs_protein())) {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
143 @hgvsp_rs = map {__subst_hgvs_ac($hgvsp,$_)} @protein_acs;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
144 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
145
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
146 $rv{'HGVSc-RefSeq'} = join(';',@hgvsc_rs) if @hgvsc_rs;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
147 $rv{'HGVSp-RefSeq'} = join(';',@hgvsp_rs) if @hgvsp_rs;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
148 return \%rv;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
149 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
150
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
151
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
152 sub get_mappable_transcripts {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
153 my ($self,$t) = @_;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
154 my $key = $t->display_id();
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
155 if (not exists $mt_cache{$key}) {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
156 @{$mt_cache{$key}} = $self->_get_mappable_transcripts($t);
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
157 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
158 return @{$mt_cache{$key}};
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
159 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
160
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
161
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
162 ############################################################################
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
163 ## internal methods
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
164 sub _get_mappable_transcripts {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
165 my ($self,$t) = @_;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
166 my $cds_seq = $t->translateable_seq();
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
167 my $exon_structure = __tx_exon_str($t);
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
168
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
169 # cds_seq is empty for pseudogenes; no mapping possible
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
170 return () if ($cds_seq eq '');
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
171
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
172 # get overlapping transcripts from other features
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
173 my @tx = @{ $self->{ofta}->fetch_all_by_Slice($t->feature_Slice()) };
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
174
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
175 # limit to NMs with standard-format
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
176 @tx = grep { $_->display_id() =~ m/^NM_\d+\.\d+$/ } @tx;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
177
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
178 # limit to transcripts with same CDS
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
179 @tx = grep { $_->translateable_seq() eq $cds_seq } @tx;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
180
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
181 # limit to transcripts with identical chromosomal exon structure
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
182 @tx = grep { __tx_exon_str($_->transform('chromosome')) eq $exon_structure } @tx;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
183
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
184 return @tx;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
185 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
186
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
187
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
188 ############################################################################
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
189 ## function (not methods)
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
190
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
191 sub __tx_exon_str {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
192 my $t = shift;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
193 return join(',', map {sprintf('[%s,%s]', $_->start(), $_->end())}
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
194 @{ $t->get_all_translateable_Exons() } );
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
195 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
196
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
197 sub __subst_hgvs_ac {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
198 my ($hgvs,$ac) = @_;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
199 $hgvs =~ s/^[^:]+:/$ac:/;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
200 return $hgvs;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
201 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
202
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
203 sub __uniq {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
204 return keys %{{ map {$_=>1} @_ }};
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
205 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
206
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
207
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
208 1;