Mercurial > repos > dvanzessen > vep_emc
comparison dir_plugins/AncestralAllele.pm @ 3:49397129aec0 draft
Uploaded
| author | dvanzessen |
|---|---|
| date | Mon, 15 Jul 2019 05:20:39 -0400 |
| parents | e545d0a25ffe |
| children |
comparison
equal
deleted
inserted
replaced
| 2:17c98d091710 | 3:49397129aec0 |
|---|---|
| 1 =head1 LICENSE | |
| 2 | |
| 3 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute | |
| 4 Copyright [2016-2018] EMBL-European Bioinformatics Institute | |
| 5 | |
| 6 Licensed under the Apache License, Version 2.0 (the "License"); | |
| 7 you may not use this file except in compliance with the License. | |
| 8 You may obtain a copy of the License at | |
| 9 | |
| 10 http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 | |
| 12 Unless required by applicable law or agreed to in writing, software | |
| 13 distributed under the License is distributed on an "AS IS" BASIS, | |
| 14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 15 See the License for the specific language governing permissions and | |
| 16 limitations under the License. | |
| 17 | |
| 18 =head1 CONTACT | |
| 19 | |
| 20 Ensembl <http://www.ensembl.org/info/about/contact/index.html> | |
| 21 | |
| 22 =cut | |
| 23 | |
| 24 =head1 NAME | |
| 25 | |
| 26 AncestralAllele | |
| 27 | |
| 28 =head1 SYNOPSIS | |
| 29 | |
| 30 mv AncestralAllele.pm ~/.vep/Plugins | |
| 31 ./vep -i variations.vcf --plugin AncestralAllele,homo_sapiens_ancestor_GRCh38_e86.fa.gz | |
| 32 | |
| 33 =head1 DESCRIPTION | |
| 34 | |
| 35 A VEP plugin that retrieves ancestral allele sequences from a FASTA file. | |
| 36 | |
| 37 Ensembl produces FASTA file dumps of the ancestral sequences of key species. | |
| 38 They are available from ftp://ftp.ensembl.org/pub/current_fasta/ancestral_alleles/ | |
| 39 | |
| 40 For optimal retrieval speed, you should pre-process the FASTA files into a single | |
| 41 bgzipped file that can be accessed via Bio::DB::HTS::Faidx (installed by VEP's | |
| 42 INSTALL.pl): | |
| 43 | |
| 44 wget ftp://ftp.ensembl.org/pub/release-90/fasta/ancestral_alleles/homo_sapiens_ancestor_GRCh38_e86.tar.gz | |
| 45 tar xfz homo_sapiens_ancestor_GRCh38_e86.tar.gz | |
| 46 cat homo_sapiens_ancestor_GRCh38_e86/*.fa | bgzip -c > homo_sapiens_ancestor_GRCh38_e86.fa.gz | |
| 47 rm -rf homo_sapiens_ancestor_GRCh38_e86/ homo_sapiens_ancestor_GRCh38_e86.tar.gz | |
| 48 ./vep -i variations.vcf --plugin AncestralAllele,homo_sapiens_ancestor_GRCh38_e86.fa.gz | |
| 49 | |
| 50 The plugin is also compatible with Bio::DB::Fasta and an uncompressed FASTA file. | |
| 51 | |
| 52 Note the first time you run the plugin with a newly generated FASTA file it will | |
| 53 spend some time indexing the file. DO NOT INTERRUPT THIS PROCESS, particularly | |
| 54 if you do not have Bio::DB::HTS installed. | |
| 55 | |
| 56 Special cases: | |
| 57 "-" represents an insertion | |
| 58 "?" indicates the chromosome could not be looked up in the FASTA | |
| 59 | |
| 60 =cut | |
| 61 | |
| 62 package AncestralAllele; | |
| 63 | |
| 64 use strict; | |
| 65 use warnings; | |
| 66 | |
| 67 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); | |
| 68 | |
| 69 use Bio::EnsEMBL::Variation::Utils::BaseVepPlugin; | |
| 70 | |
| 71 use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin); | |
| 72 | |
| 73 our $CAN_USE_FAIDX; | |
| 74 our $DEBUG = 0; | |
| 75 | |
| 76 BEGIN { | |
| 77 if (eval { require Bio::DB::HTS::Faidx; 1 }) { | |
| 78 $CAN_USE_FAIDX = 1; | |
| 79 } | |
| 80 } | |
| 81 | |
| 82 sub new { | |
| 83 my $class = shift; | |
| 84 | |
| 85 my $self = $class->SUPER::new(@_); | |
| 86 | |
| 87 # get file | |
| 88 my $fasta_file = $self->params->[0] || ""; | |
| 89 die("ERROR: Ancestral FASTA file $fasta_file not provided or found\n") unless $fasta_file && -e $fasta_file; | |
| 90 $self->fasta_file($fasta_file); | |
| 91 | |
| 92 # get type of index we're creating/using | |
| 93 my $type = $fasta_file =~ /\.gz$/ ? 'Bio::DB::HTS::Faidx' : 'Bio::DB::Fasta'; | |
| 94 die("ERROR: Bio::DB::HTS required to access compressed FASTA file $fasta_file\n") if $type eq 'Bio::DB::HTS::Faidx' && !$CAN_USE_FAIDX; | |
| 95 $self->index_type($type); | |
| 96 | |
| 97 # init DB here so indexing doesn't happen at some point in the middle of VEP run | |
| 98 $self->fasta_db; | |
| 99 | |
| 100 return $self; | |
| 101 } | |
| 102 | |
| 103 sub feature_types { | |
| 104 return ['Feature','Intergenic']; | |
| 105 } | |
| 106 | |
| 107 sub get_header_info { | |
| 108 return { AA => 'Ancestral allele from '.$_[0]->fasta_file } | |
| 109 } | |
| 110 | |
| 111 sub run { | |
| 112 my ($self, $tva) = @_; | |
| 113 | |
| 114 my $vf = $tva->variation_feature; | |
| 115 | |
| 116 my ($start, $end) = ($vf->{start}, $vf->{end}); | |
| 117 | |
| 118 # cache AA on VF, prevents repeat lookups for multiple TVAs | |
| 119 if(!exists($vf->{_vep_plugin_AncestralAllele})) { | |
| 120 | |
| 121 # insertion | |
| 122 return { AA => $vf->{_vep_plugin_AncestralAllele} = '-' } if $start > $end; | |
| 123 | |
| 124 my $sr_name = $self->get_sr_name($vf->{chr}); | |
| 125 | |
| 126 # chr not in FASTA | |
| 127 return { AA => $vf->{_vep_plugin_AncestralAllele} = '?' } unless $sr_name; | |
| 128 | |
| 129 my $fasta_db = $self->fasta_db; | |
| 130 my $type = $self->index_type; | |
| 131 my ($seq, $length); | |
| 132 | |
| 133 if($type eq 'Bio::DB::HTS::Faidx') { | |
| 134 my $location_string = $sr_name.":".$start."-".$end ; | |
| 135 ($seq, $length) = $fasta_db->get_sequence($location_string); | |
| 136 } | |
| 137 elsif($type eq 'Bio::DB::Fasta') { | |
| 138 $seq = $fasta_db->seq($sr_name, $start => $end); | |
| 139 } | |
| 140 else { | |
| 141 throw("ERROR: Don't know how to fetch sequence from a ".ref($fasta_db)."\n"); | |
| 142 } | |
| 143 | |
| 144 reverse_comp(\$seq) if $vf->{strand} < 0; | |
| 145 | |
| 146 $vf->{_vep_plugin_AncestralAllele} = $seq; | |
| 147 } | |
| 148 | |
| 149 return { AA => $vf->{_vep_plugin_AncestralAllele} }; | |
| 150 } | |
| 151 | |
| 152 # getter/setter for fasta file path | |
| 153 sub fasta_file { | |
| 154 my $self = shift; | |
| 155 $self->{_fasta_file} = shift if @_; | |
| 156 return $self->{_fasta_file}; | |
| 157 } | |
| 158 | |
| 159 # getter/setter for fasta index type | |
| 160 sub index_type { | |
| 161 my $self = shift; | |
| 162 $self->{_index_type} = shift if @_; | |
| 163 return $self->{_index_type}; | |
| 164 } | |
| 165 | |
| 166 ## CRIBBED FROM Bio::EnsEMBL::Variation::Utils::FastaSequence | |
| 167 ## MOVE THIS CODE TO SHARED UTILS SPACE? | |
| 168 # creates the FASTA DB object | |
| 169 # returns either a Bio::DB::Fasta or a Bio::DB::HTS::Faidx | |
| 170 # uses a lock file to avoid partial indexes getting used | |
| 171 sub fasta_db { | |
| 172 my $self = shift; | |
| 173 | |
| 174 # if we've forked we need to re-connect | |
| 175 # but make sure we only do this once per fork | |
| 176 my $prev_pid = $self->{_prev_pid} ||= $$; | |
| 177 if($prev_pid ne $$) { | |
| 178 delete $self->{_fasta_db}; | |
| 179 $self->{_prev_pid} = $$; | |
| 180 } | |
| 181 | |
| 182 if(!exists($self->{_fasta_db})) { | |
| 183 my $fasta = $self->fasta_file; | |
| 184 my $type = $self->index_type; | |
| 185 | |
| 186 # check lock file | |
| 187 my $lock_file = $fasta; | |
| 188 $lock_file .= -d $fasta ? '/.vep.lock' : '.vep.lock'; | |
| 189 | |
| 190 # lock file exists, indexing failed | |
| 191 if(-e $lock_file) { | |
| 192 print STDERR "Lock file found, removing to allow re-indexing\n" if $DEBUG; | |
| 193 for(qw(.fai .index .gzi /directory.index /directory.fai .vep.lock)) { | |
| 194 unlink($fasta.$_) if -e $fasta.$_; | |
| 195 } | |
| 196 } | |
| 197 | |
| 198 my $index_exists = 0; | |
| 199 | |
| 200 for my $fn(map {$fasta.$_} qw(.fai .index .gzi /directory.index /directory.fai)) { | |
| 201 if(-e $fn) { | |
| 202 $index_exists = 1; | |
| 203 last; | |
| 204 } | |
| 205 } | |
| 206 | |
| 207 # create lock file | |
| 208 unless($index_exists) { | |
| 209 open LOCK, ">$lock_file" or throw("ERROR: Could not write to FASTA lock file $lock_file\n"); | |
| 210 print LOCK "1\n"; | |
| 211 close LOCK; | |
| 212 print STDERR "Indexing $fasta\n" if $DEBUG; | |
| 213 } | |
| 214 | |
| 215 # run indexing | |
| 216 my $fasta_db; | |
| 217 if($type eq 'Bio::DB::HTS::Faidx' && $CAN_USE_FAIDX) { | |
| 218 $fasta_db = Bio::DB::HTS::Faidx->new($fasta); | |
| 219 } | |
| 220 elsif(!$type || $type eq 'Bio::DB::Fasta') { | |
| 221 $fasta_db = Bio::DB::Fasta->new($fasta); | |
| 222 } | |
| 223 else { | |
| 224 throw("ERROR: Don't know how to index with type $type\n"); | |
| 225 } | |
| 226 print STDERR "Finished indexing\n" if $DEBUG; | |
| 227 | |
| 228 throw("ERROR: Unable to create FASTA DB\n") unless $fasta_db; | |
| 229 | |
| 230 # remove lock file | |
| 231 unlink($lock_file) unless $index_exists; | |
| 232 | |
| 233 $self->{_fasta_db} = $fasta_db; | |
| 234 } | |
| 235 | |
| 236 return $self->{_fasta_db}; | |
| 237 } | |
| 238 | |
| 239 # gets seq region name as it appears in the FASTA | |
| 240 # the file contains names like "ANCESTOR_for_chromosome:GRCh38:10:1:133797422:1" | |
| 241 # so want to map from "10" or "chr10" to "ANCESTOR_for_chromosome:GRCh38:10:1:133797422:1" | |
| 242 sub get_sr_name { | |
| 243 my $self = shift; | |
| 244 my $sr_name = shift; | |
| 245 | |
| 246 my $map = $self->fasta_name_map; | |
| 247 my $fasta_db = $self->fasta_db; | |
| 248 | |
| 249 unless($map->{$sr_name}) { | |
| 250 foreach my $alt( | |
| 251 $sr_name =~ /^chr/i ? (substr($sr_name, 3)) : ('chr'.$sr_name, 'CHR'.$sr_name) | |
| 252 ) { | |
| 253 if($map->{$alt}) { | |
| 254 print STDERR "USING SYNONYM $alt FOR $sr_name\n" if $DEBUG; | |
| 255 $sr_name = $alt; | |
| 256 last; | |
| 257 } | |
| 258 } | |
| 259 } | |
| 260 | |
| 261 return $map->{$sr_name} || undef; | |
| 262 } | |
| 263 | |
| 264 sub fasta_name_map { | |
| 265 my ($self, $chr) = @_; | |
| 266 | |
| 267 if(!exists($self->{_name_map})) { | |
| 268 my $fasta_db = $self->fasta_db; | |
| 269 | |
| 270 my %map = map {(split(':', $_))[2] => $_} | |
| 271 $fasta_db->isa('Bio::DB::Fasta') ? | |
| 272 (grep {$_ !~ /^\_\_/} $fasta_db->get_all_primary_ids) : | |
| 273 ($fasta_db->get_all_sequence_ids); | |
| 274 | |
| 275 $self->{_name_map} = \%map; | |
| 276 } | |
| 277 | |
| 278 return $self->{_name_map}; | |
| 279 } | |
| 280 | |
| 281 1; |
