comparison variant_effect_predictor/Bio/EnsEMBL/Pipeline/FASTA/BlatIndexer.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 =pod
2
3 =head1 LICENSE
4
5 Copyright (c) 1999-2012 The European Bioinformatics Institute and
6 Genome Research Limited. All rights reserved.
7
8 This software is distributed under a modified Apache license.
9 For license details, please see
10
11 http://www.ensembl.org/info/about/code_licence.html
12
13 =head1 CONTACT
14
15 Please email comments or questions to the public Ensembl
16 developers list at <dev@ensembl.org>.
17
18 Questions may also be sent to the Ensembl help desk at
19 <helpdesk@ensembl.org>.
20
21 =head1 NAME
22
23 Bio::EnsEMBL::Pipeline::FASTA::BlastIndexer
24
25 =head1 DESCRIPTION
26
27 Creates 2bit file of the given GZipped file. The resulting index
28 is created under the parameter location I<base_path> in blat/index. The filename
29 is prefixed with the port number of the blat server this file should be
30 run on.
31
32 The module also performs filtering of non-reference sequence regions
33 and can filter the redundant Y chromosome piece for human (as 2bit does
34 not like repeated sequence region names).
35
36 Allowed parameters are:
37
38 =over 8
39
40 =item file - The file to index
41
42 =item program - The location of the faToTwoBit program
43
44 =item port_offset - Value to add onto the species_id from the website DB
45 to name the file correctly
46
47 =item base_path - The base of the dumps
48
49 =item index - The type of file to index; supported values are empty,
50 I<dna>, I<dna_sm> or I<dna_rm>. If specified we will look for this
51 string in the filename surrounded by '.' e.g. .dna.
52
53 =back
54
55 The registry should also have a DBAdaptor for the website schema
56 registered under the species B<multi> and the group B<web> for species_id to
57 Blat port number.
58
59 =cut
60
61
62 package Bio::EnsEMBL::Pipeline::FASTA::BlatIndexer;
63
64 use strict;
65 use warnings;
66 use base qw/Bio::EnsEMBL::Pipeline::FASTA::Indexer/;
67
68 use File::Spec;
69 use File::stat;
70 use Bio::EnsEMBL::Utils::IO qw/work_with_file/;
71 use Bio::EnsEMBL::Utils::Exception qw/throw/;
72 use Bio::EnsEMBL::Registry;
73
74 sub param_defaults {
75 my ($self) = @_;
76 return {
77 program => 'faToTwoBit',
78 port_offset => 30000,
79 'index' => 'dna', #or dna_rm and dna_sm
80 };
81 }
82
83 sub fetch_input {
84 my ($self) = @_;
85 $self->assert_executable($self->param('program'));
86 $self->assert_executable('zcat');
87 $self->assert_executable('gunzip');
88 return;
89 }
90
91 sub run {
92 my ($self) = @_;
93 if($self->run_indexing()) {
94 $self->SUPER::run();
95 }
96 return;
97 }
98
99 sub run_indexing {
100 my ($self) = @_;
101 my $index = $self->param('index');
102 if($index) {
103 my $file = $self->param('file');
104 return (index($file, ".${index}.") > -1) ? 1 : 0;
105 }
106 return 1;
107 }
108
109 sub index_file {
110 my ($self, $file) = @_;
111
112 my $target_file = $self->target_file();
113 my $cmd = sprintf(q{%s %s %s},
114 $self->param('program'), $file, $target_file);
115
116 $self->info('About to run "%s"', $cmd);
117 my $output = `$cmd 2>&1`;
118 my $rc = $? >> 8;
119 throw "Cannot run program '$cmd'. Return code was ${rc}. Program output was $output" if $rc;
120 unlink $file or throw "Cannot remove the file '$file' from the filesystem: $!";
121
122 #Check the file size. If it's 16 bytes then reject as that is an empty file for 2bit
123 my $filesize = stat($target_file)->size();
124 if($filesize <= 16) {
125 unlink $file;
126 my $msg = sprintf(
127 'The file %s produced a 2bit file %d byte(s). Lower than 17 bytes therefore empty 2 bit file',
128 $file, $filesize
129 );
130 $self->throw($msg);
131 }
132
133 return;
134 }
135
136 sub decompress {
137 my ($self) = @_;
138
139 #If we have no non-reference seq regions then use normal decompress
140 if(! $self->has_non_refs()) {
141 return $self->SUPER::decompress();
142 }
143
144 #Filter for non-refs
145 my $source = $self->param('file');
146 my $target_dir = $self->target_dir();
147 my ($vol, $dir, $file) = File::Spec->splitpath($source);
148 $file =~ s/.gz$//;
149 my $target = File::Spec->catdir($target_dir, $file);
150
151 my $allowed_regions = $self->allowed_regions();
152
153 $self->info('Decompressing %s to %s', $source, $target);
154
155 open my $src_fh, '-|', "zcat $source" or throw "Cannot decompress $source to $target";
156 work_with_file($target, 'w', sub {
157 my ($trg_fh) = @_;
158 my $transfer = 0;
159 while(my $line = <$src_fh>) {
160 #HEADER
161 if(index($line, '>') == 0) {
162 #regex is looking for NNN:NNN:NNN:1:80:1 i.e. the name
163 my ($name) = $line =~ />.+\s(.+:.+:.+:\d+:\d+:\d+)/;
164 $transfer = ($allowed_regions->{$name}) ? 1 : 0;
165 if($transfer) {
166 $self->info('%s was an allowed Slice', $name);
167 }
168 else {
169 $self->info('%s will be skipped; not a reference Slice', $name);
170 }
171 }
172 print $trg_fh $line if $transfer;
173 }
174 });
175 close($src_fh);
176
177 return $target;
178 }
179
180 sub allowed_regions {
181 my ($self) = @_;
182 my $filter_human = 1;
183 my @slices = grep { $_->is_reference() } @{$self->get_Slices('core', $filter_human)};
184 my %hash = map { $_->name() => 1 } @slices;
185 return \%hash;
186 }
187
188 #Filename like 30061.Homo_sapiens.GRCh37.2bit
189 sub target_filename {
190 my ($self) = @_;
191 my $port = $self->blat_port();
192 my $name = $self->web_name();
193 my $assembly = $self->assembly();
194 return join(q{.}, $port, $name, $assembly, '2bit');
195 }
196
197 sub target_file {
198 my ($self) = @_;
199 my $target_dir = $self->target_dir();
200 my $target_filename = $self->target_filename();
201 return File::Spec->catfile($target_dir, $target_filename);
202 return;
203 }
204
205 sub target_dir {
206 my ($self) = @_;
207 return $self->get_dir('blat', $self->param('index'));
208 }
209
210 sub blat_port {
211 my ($self) = @_;
212 my $dba = Bio::EnsEMBL::Registry->get_DBAdaptor('multi', 'web');
213 my $id = $dba->dbc()->sql_helper()->execute_single_result(
214 -SQL => 'select species_id from species where name =?',
215 -PARAMS => [$self->web_name()]
216 );
217 return $id + $self->param('port_offset');
218 }
219
220 sub has_non_refs {
221 my ($self) = @_;
222 my $sql = <<'SQL';
223 select count(*)
224 from attrib_type at
225 join seq_region_attrib sra using (attrib_type_id)
226 join seq_region sr using (seq_region_id)
227 join coord_system cs using (coord_system_id)
228 where cs.species_id =?
229 and at.code =?
230 SQL
231 my $dba = $self->get_DBAdaptor();
232 return $dba->dbc()->sql_helper()->execute_single_result(
233 -SQL => $sql, -PARAMS => [$dba->species_id(), 'non_ref']);
234 }
235
236 1;