Mercurial > repos > dvanzessen > vep_emc
comparison dir_plugins/RefSeqHGVS.pm @ 3:49397129aec0 draft
Uploaded
| author | dvanzessen |
|---|---|
| date | Mon, 15 Jul 2019 05:20:39 -0400 |
| parents | e545d0a25ffe |
| children |
comparison
equal
deleted
inserted
replaced
| 2:17c98d091710 | 3:49397129aec0 |
|---|---|
| 1 =head1 NAME | |
| 2 | |
| 3 RefSeqHGVS -- provide RefSeq-based HGVS tags for VEP output | |
| 4 | |
| 5 =head1 SYNOPSIS | |
| 6 | |
| 7 cp RefSeqHGVS.pm ~/.vep/Plugins/ (or elsewhere in PERL5LIB) | |
| 8 perl variant_effect_predictor.pl -i variations.vcf --plugin RefSeqHGVS | |
| 9 | |
| 10 Output: | |
| 11 Variant lines with the following addtiional tags: | |
| 12 HGVSc-RefSeq=NM_198156.2:c.403delA;HGVSp-RefSeq=NP_937799.1:p.Arg135GlyfsX26 | |
| 13 | |
| 14 =head1 DESCRIPTION | |
| 15 | |
| 16 RefSeqHGVS is a plugin for Ensembl's Variant Effect Predictor that | |
| 17 provides variant annotatoins in HGVS format [1] using RefSeq accessions | |
| 18 (typically NM and NP). It provides the analog to VEP's HGVSc and HGVSp | |
| 19 annotations, which use Ensembl ENST and ENSP accessions. This module | |
| 20 relies RefSeq data in the OtherFeatures database. | |
| 21 | |
| 22 Converting ENST HGVS tags to RefSeq tags is confounded subtle differences | |
| 23 between exon structures for the same CDS. The plugin is intended to be | |
| 24 conservative by requiring exact matches of both CDS and exon structure | |
| 25 when converting variants. | |
| 26 | |
| 27 [1] http://www.hgvs.org/mutnomen/ | |
| 28 | |
| 29 =head1 BUGS AND LIMITATIONS | |
| 30 | |
| 31 =over | |
| 32 | |
| 33 =item * RefSeq transcripts limited to those in the OtherFeatures database. | |
| 34 | |
| 35 Archived RefSeq transcripts are not available. | |
| 36 | |
| 37 =item * Discrepancies between RefSeq and GRCh37 | |
| 38 | |
| 39 Approximately 20% of RefSeqs differ from the GRCh37 due to substitution | |
| 40 (16.2%), insertion/deletion (3.5%), or both (1.2%) differences. [1] | |
| 41 Variants are annotated by difference with respect to the genome, not the | |
| 42 RefSeq transcript. | |
| 43 | |
| 44 [1] MU2A--reconciling the genome and transcriptome to determine the | |
| 45 effects of base substitutions. | |
| 46 Garla, V., Kong, Y., Szpakowski, S., & Krauthammer, M. (2011). | |
| 47 Bioinformatics, 27(3), 416-8. doi:10.1093/bioinformatics/btq658 | |
| 48 http://www.ncbi.nlm.nih.gov/pubmed/21149339 | |
| 49 | |
| 50 =item * Conservative selection of equivalent RefSeq transcripts. | |
| 51 | |
| 52 Only NM transcripts which are exactly identical to the ENST transcript in | |
| 53 both CDS and exon structure are used. In the future, this might be | |
| 54 relaxed to exclude only regions of mismatch. Currently, this implies that | |
| 55 HGVS tags are not constructued when the RefSeq differs from the reference | |
| 56 geneome. | |
| 57 | |
| 58 =back | |
| 59 | |
| 60 =head1 CONTACT | |
| 61 | |
| 62 Reece Hart <reecehart@gmail.com> | |
| 63 | |
| 64 =head1 LICENSE | |
| 65 | |
| 66 This code and all rights to it are hereby donated to EnsEMBL project by | |
| 67 Reece Hart and Locus Development. | |
| 68 | |
| 69 Copyright (c) 1999-2011 The European Bioinformatics Institute and | |
| 70 Genome Research Limited. All rights reserved. | |
| 71 | |
| 72 This software is distributed under a modified Apache license. | |
| 73 For license details, please see | |
| 74 | |
| 75 http://www.ensembl.org/info/about/code_licence.html | |
| 76 | |
| 77 =cut | |
| 78 | |
| 79 | |
| 80 package RefSeqHGVS; | |
| 81 | |
| 82 use strict; | |
| 83 use warnings; | |
| 84 | |
| 85 use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin); | |
| 86 | |
| 87 use Data::Dumper; | |
| 88 | |
| 89 my %mt_cache; # cache of ENST -> NM mappings | |
| 90 | |
| 91 sub version { | |
| 92 return '2.3'; | |
| 93 } | |
| 94 | |
| 95 sub new { | |
| 96 my $class = shift; | |
| 97 my $self = $class->SUPER::new(@_); | |
| 98 my $reg = 'Bio::EnsEMBL::Registry'; | |
| 99 | |
| 100 $self->{ofsa} = $reg->get_adaptor('Human', 'otherfeatures', 'Slice') | |
| 101 or die "Failed to create transcript adaptor in human otherfeatures database\n"; | |
| 102 $self->{ofta} = $reg->get_adaptor('Human', 'otherfeatures', 'Transcript') | |
| 103 or die "Failed to create transcript adaptor in human otherfeatures database\n"; | |
| 104 | |
| 105 return $self; | |
| 106 } | |
| 107 | |
| 108 sub feature_types { | |
| 109 return ['Transcript']; | |
| 110 } | |
| 111 | |
| 112 sub variant_feature_types { | |
| 113 return ['VariationFeature']; | |
| 114 } | |
| 115 | |
| 116 sub get_header_info { | |
| 117 return { | |
| 118 'HGVSc-RefSeq' => "HGVSc using RefSeq transcripts accessions", | |
| 119 'HGVSp-RefSeq' => "HGVSp using RefSeq protein accessions", | |
| 120 }; | |
| 121 } | |
| 122 | |
| 123 sub run { | |
| 124 my ($self, $tva) = @_; | |
| 125 my $t = $tva->transcript; | |
| 126 my %rv; | |
| 127 | |
| 128 my $ofsa = $self->{ofsa}; | |
| 129 my $ofta = $self->{ofta}; | |
| 130 | |
| 131 # mappabale_transcripts have identical CDS and exon structure | |
| 132 my @mappable_transcripts = $self->get_mappable_transcripts($t); | |
| 133 my @transcript_acs = map { $_->display_id() } @mappable_transcripts; | |
| 134 my @protein_acs = grep {defined $_} map { $_->translation()->display_id() } @mappable_transcripts; | |
| 135 | |
| 136 # substitute accessions for those in mappable transcripts/proteins | |
| 137 my @hgvsc_rs; | |
| 138 if (defined (my $hgvsc = $tva->hgvs_coding())) { | |
| 139 @hgvsc_rs = map {__subst_hgvs_ac($hgvsc ,$_)} @transcript_acs; | |
| 140 } | |
| 141 my @hgvsp_rs; | |
| 142 if (defined (my $hgvsp = $tva->hgvs_protein())) { | |
| 143 @hgvsp_rs = map {__subst_hgvs_ac($hgvsp,$_)} @protein_acs; | |
| 144 } | |
| 145 | |
| 146 $rv{'HGVSc-RefSeq'} = join(';',@hgvsc_rs) if @hgvsc_rs; | |
| 147 $rv{'HGVSp-RefSeq'} = join(';',@hgvsp_rs) if @hgvsp_rs; | |
| 148 return \%rv; | |
| 149 } | |
| 150 | |
| 151 | |
| 152 sub get_mappable_transcripts { | |
| 153 my ($self,$t) = @_; | |
| 154 my $key = $t->display_id(); | |
| 155 if (not exists $mt_cache{$key}) { | |
| 156 @{$mt_cache{$key}} = $self->_get_mappable_transcripts($t); | |
| 157 } | |
| 158 return @{$mt_cache{$key}}; | |
| 159 } | |
| 160 | |
| 161 | |
| 162 ############################################################################ | |
| 163 ## internal methods | |
| 164 sub _get_mappable_transcripts { | |
| 165 my ($self,$t) = @_; | |
| 166 my $cds_seq = $t->translateable_seq(); | |
| 167 my $exon_structure = __tx_exon_str($t); | |
| 168 | |
| 169 # cds_seq is empty for pseudogenes; no mapping possible | |
| 170 return () if ($cds_seq eq ''); | |
| 171 | |
| 172 # get overlapping transcripts from other features | |
| 173 my @tx = @{ $self->{ofta}->fetch_all_by_Slice($t->feature_Slice()) }; | |
| 174 | |
| 175 # limit to NMs with standard-format | |
| 176 @tx = grep { $_->display_id() =~ m/^NM_\d+\.\d+$/ } @tx; | |
| 177 | |
| 178 # limit to transcripts with same CDS | |
| 179 @tx = grep { $_->translateable_seq() eq $cds_seq } @tx; | |
| 180 | |
| 181 # limit to transcripts with identical chromosomal exon structure | |
| 182 @tx = grep { __tx_exon_str($_->transform('chromosome')) eq $exon_structure } @tx; | |
| 183 | |
| 184 return @tx; | |
| 185 } | |
| 186 | |
| 187 | |
| 188 ############################################################################ | |
| 189 ## function (not methods) | |
| 190 | |
| 191 sub __tx_exon_str { | |
| 192 my $t = shift; | |
| 193 return join(',', map {sprintf('[%s,%s]', $_->start(), $_->end())} | |
| 194 @{ $t->get_all_translateable_Exons() } ); | |
| 195 } | |
| 196 | |
| 197 sub __subst_hgvs_ac { | |
| 198 my ($hgvs,$ac) = @_; | |
| 199 $hgvs =~ s/^[^:]+:/$ac:/; | |
| 200 return $hgvs; | |
| 201 } | |
| 202 | |
| 203 sub __uniq { | |
| 204 return keys %{{ map {$_=>1} @_ }}; | |
| 205 } | |
| 206 | |
| 207 | |
| 208 1; |
