annotate dir_plugins/AncestralAllele.pm @ 3:49397129aec0 draft

Uploaded
author dvanzessen
date Mon, 15 Jul 2019 05:20:39 -0400
parents e545d0a25ffe
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
1 =head1 LICENSE
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
2
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
3 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
4 Copyright [2016-2018] EMBL-European Bioinformatics Institute
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
5
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
6 Licensed under the Apache License, Version 2.0 (the "License");
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
7 you may not use this file except in compliance with the License.
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
8 You may obtain a copy of the License at
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
9
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
10 http://www.apache.org/licenses/LICENSE-2.0
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
11
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
12 Unless required by applicable law or agreed to in writing, software
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
13 distributed under the License is distributed on an "AS IS" BASIS,
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
15 See the License for the specific language governing permissions and
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
16 limitations under the License.
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
17
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
18 =head1 CONTACT
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
19
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
20 Ensembl <http://www.ensembl.org/info/about/contact/index.html>
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
21
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
22 =cut
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
23
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
24 =head1 NAME
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
25
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
26 AncestralAllele
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
27
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
28 =head1 SYNOPSIS
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
29
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
30 mv AncestralAllele.pm ~/.vep/Plugins
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
31 ./vep -i variations.vcf --plugin AncestralAllele,homo_sapiens_ancestor_GRCh38_e86.fa.gz
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
32
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
33 =head1 DESCRIPTION
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
34
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
35 A VEP plugin that retrieves ancestral allele sequences from a FASTA file.
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
36
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
37 Ensembl produces FASTA file dumps of the ancestral sequences of key species.
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
38 They are available from ftp://ftp.ensembl.org/pub/current_fasta/ancestral_alleles/
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
39
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
40 For optimal retrieval speed, you should pre-process the FASTA files into a single
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
41 bgzipped file that can be accessed via Bio::DB::HTS::Faidx (installed by VEP's
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
42 INSTALL.pl):
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
43
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
44 wget ftp://ftp.ensembl.org/pub/release-90/fasta/ancestral_alleles/homo_sapiens_ancestor_GRCh38_e86.tar.gz
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
45 tar xfz homo_sapiens_ancestor_GRCh38_e86.tar.gz
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
46 cat homo_sapiens_ancestor_GRCh38_e86/*.fa | bgzip -c > homo_sapiens_ancestor_GRCh38_e86.fa.gz
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
47 rm -rf homo_sapiens_ancestor_GRCh38_e86/ homo_sapiens_ancestor_GRCh38_e86.tar.gz
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
48 ./vep -i variations.vcf --plugin AncestralAllele,homo_sapiens_ancestor_GRCh38_e86.fa.gz
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
49
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
50 The plugin is also compatible with Bio::DB::Fasta and an uncompressed FASTA file.
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
51
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
52 Note the first time you run the plugin with a newly generated FASTA file it will
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
53 spend some time indexing the file. DO NOT INTERRUPT THIS PROCESS, particularly
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
54 if you do not have Bio::DB::HTS installed.
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
55
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
56 Special cases:
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
57 "-" represents an insertion
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
58 "?" indicates the chromosome could not be looked up in the FASTA
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
59
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
60 =cut
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
61
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
62 package AncestralAllele;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
63
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
64 use strict;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
65 use warnings;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
66
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
67 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
68
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
69 use Bio::EnsEMBL::Variation::Utils::BaseVepPlugin;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
70
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
71 use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin);
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
72
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
73 our $CAN_USE_FAIDX;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
74 our $DEBUG = 0;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
75
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
76 BEGIN {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
77 if (eval { require Bio::DB::HTS::Faidx; 1 }) {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
78 $CAN_USE_FAIDX = 1;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
79 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
80 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
81
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
82 sub new {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
83 my $class = shift;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
84
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
85 my $self = $class->SUPER::new(@_);
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
86
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
87 # get file
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
88 my $fasta_file = $self->params->[0] || "";
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
89 die("ERROR: Ancestral FASTA file $fasta_file not provided or found\n") unless $fasta_file && -e $fasta_file;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
90 $self->fasta_file($fasta_file);
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
91
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
92 # get type of index we're creating/using
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
93 my $type = $fasta_file =~ /\.gz$/ ? 'Bio::DB::HTS::Faidx' : 'Bio::DB::Fasta';
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
94 die("ERROR: Bio::DB::HTS required to access compressed FASTA file $fasta_file\n") if $type eq 'Bio::DB::HTS::Faidx' && !$CAN_USE_FAIDX;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
95 $self->index_type($type);
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
96
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
97 # init DB here so indexing doesn't happen at some point in the middle of VEP run
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
98 $self->fasta_db;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
99
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
100 return $self;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
101 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
102
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
103 sub feature_types {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
104 return ['Feature','Intergenic'];
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
105 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
106
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
107 sub get_header_info {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
108 return { AA => 'Ancestral allele from '.$_[0]->fasta_file }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
109 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
110
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
111 sub run {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
112 my ($self, $tva) = @_;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
113
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
114 my $vf = $tva->variation_feature;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
115
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
116 my ($start, $end) = ($vf->{start}, $vf->{end});
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
117
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
118 # cache AA on VF, prevents repeat lookups for multiple TVAs
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
119 if(!exists($vf->{_vep_plugin_AncestralAllele})) {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
120
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
121 # insertion
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
122 return { AA => $vf->{_vep_plugin_AncestralAllele} = '-' } if $start > $end;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
123
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
124 my $sr_name = $self->get_sr_name($vf->{chr});
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
125
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
126 # chr not in FASTA
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
127 return { AA => $vf->{_vep_plugin_AncestralAllele} = '?' } unless $sr_name;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
128
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
129 my $fasta_db = $self->fasta_db;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
130 my $type = $self->index_type;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
131 my ($seq, $length);
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
132
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
133 if($type eq 'Bio::DB::HTS::Faidx') {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
134 my $location_string = $sr_name.":".$start."-".$end ;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
135 ($seq, $length) = $fasta_db->get_sequence($location_string);
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
136 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
137 elsif($type eq 'Bio::DB::Fasta') {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
138 $seq = $fasta_db->seq($sr_name, $start => $end);
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
139 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
140 else {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
141 throw("ERROR: Don't know how to fetch sequence from a ".ref($fasta_db)."\n");
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
142 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
143
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
144 reverse_comp(\$seq) if $vf->{strand} < 0;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
145
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
146 $vf->{_vep_plugin_AncestralAllele} = $seq;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
147 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
148
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
149 return { AA => $vf->{_vep_plugin_AncestralAllele} };
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
150 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
151
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
152 # getter/setter for fasta file path
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
153 sub fasta_file {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
154 my $self = shift;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
155 $self->{_fasta_file} = shift if @_;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
156 return $self->{_fasta_file};
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
157 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
158
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
159 # getter/setter for fasta index type
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
160 sub index_type {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
161 my $self = shift;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
162 $self->{_index_type} = shift if @_;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
163 return $self->{_index_type};
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
164 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
165
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
166 ## CRIBBED FROM Bio::EnsEMBL::Variation::Utils::FastaSequence
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
167 ## MOVE THIS CODE TO SHARED UTILS SPACE?
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
168 # creates the FASTA DB object
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
169 # returns either a Bio::DB::Fasta or a Bio::DB::HTS::Faidx
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
170 # uses a lock file to avoid partial indexes getting used
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
171 sub fasta_db {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
172 my $self = shift;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
173
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
174 # if we've forked we need to re-connect
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
175 # but make sure we only do this once per fork
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
176 my $prev_pid = $self->{_prev_pid} ||= $$;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
177 if($prev_pid ne $$) {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
178 delete $self->{_fasta_db};
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
179 $self->{_prev_pid} = $$;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
180 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
181
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
182 if(!exists($self->{_fasta_db})) {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
183 my $fasta = $self->fasta_file;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
184 my $type = $self->index_type;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
185
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
186 # check lock file
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
187 my $lock_file = $fasta;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
188 $lock_file .= -d $fasta ? '/.vep.lock' : '.vep.lock';
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
189
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
190 # lock file exists, indexing failed
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
191 if(-e $lock_file) {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
192 print STDERR "Lock file found, removing to allow re-indexing\n" if $DEBUG;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
193 for(qw(.fai .index .gzi /directory.index /directory.fai .vep.lock)) {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
194 unlink($fasta.$_) if -e $fasta.$_;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
195 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
196 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
197
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
198 my $index_exists = 0;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
199
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
200 for my $fn(map {$fasta.$_} qw(.fai .index .gzi /directory.index /directory.fai)) {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
201 if(-e $fn) {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
202 $index_exists = 1;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
203 last;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
204 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
205 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
206
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
207 # create lock file
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
208 unless($index_exists) {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
209 open LOCK, ">$lock_file" or throw("ERROR: Could not write to FASTA lock file $lock_file\n");
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
210 print LOCK "1\n";
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
211 close LOCK;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
212 print STDERR "Indexing $fasta\n" if $DEBUG;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
213 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
214
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
215 # run indexing
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
216 my $fasta_db;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
217 if($type eq 'Bio::DB::HTS::Faidx' && $CAN_USE_FAIDX) {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
218 $fasta_db = Bio::DB::HTS::Faidx->new($fasta);
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
219 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
220 elsif(!$type || $type eq 'Bio::DB::Fasta') {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
221 $fasta_db = Bio::DB::Fasta->new($fasta);
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
222 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
223 else {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
224 throw("ERROR: Don't know how to index with type $type\n");
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
225 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
226 print STDERR "Finished indexing\n" if $DEBUG;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
227
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
228 throw("ERROR: Unable to create FASTA DB\n") unless $fasta_db;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
229
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
230 # remove lock file
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
231 unlink($lock_file) unless $index_exists;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
232
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
233 $self->{_fasta_db} = $fasta_db;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
234 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
235
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
236 return $self->{_fasta_db};
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
237 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
238
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
239 # gets seq region name as it appears in the FASTA
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
240 # the file contains names like "ANCESTOR_for_chromosome:GRCh38:10:1:133797422:1"
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
241 # so want to map from "10" or "chr10" to "ANCESTOR_for_chromosome:GRCh38:10:1:133797422:1"
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
242 sub get_sr_name {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
243 my $self = shift;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
244 my $sr_name = shift;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
245
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
246 my $map = $self->fasta_name_map;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
247 my $fasta_db = $self->fasta_db;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
248
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
249 unless($map->{$sr_name}) {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
250 foreach my $alt(
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
251 $sr_name =~ /^chr/i ? (substr($sr_name, 3)) : ('chr'.$sr_name, 'CHR'.$sr_name)
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
252 ) {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
253 if($map->{$alt}) {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
254 print STDERR "USING SYNONYM $alt FOR $sr_name\n" if $DEBUG;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
255 $sr_name = $alt;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
256 last;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
257 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
258 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
259 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
260
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
261 return $map->{$sr_name} || undef;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
262 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
263
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
264 sub fasta_name_map {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
265 my ($self, $chr) = @_;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
266
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
267 if(!exists($self->{_name_map})) {
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
268 my $fasta_db = $self->fasta_db;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
269
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
270 my %map = map {(split(':', $_))[2] => $_}
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
271 $fasta_db->isa('Bio::DB::Fasta') ?
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
272 (grep {$_ !~ /^\_\_/} $fasta_db->get_all_primary_ids) :
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
273 ($fasta_db->get_all_sequence_ids);
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
274
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
275 $self->{_name_map} = \%map;
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
276 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
277
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
278 return $self->{_name_map};
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
279 }
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
280
e545d0a25ffe Uploaded
dvanzessen
parents:
diff changeset
281 1;