0
|
1 =pod
|
|
2
|
|
3 =head1 LICENSE
|
|
4
|
|
5 Copyright (c) 1999-2012 The European Bioinformatics Institute and
|
|
6 Genome Research Limited. All rights reserved.
|
|
7
|
|
8 This software is distributed under a modified Apache license.
|
|
9 For license details, please see
|
|
10
|
|
11 http://www.ensembl.org/info/about/code_licence.html
|
|
12
|
|
13 =head1 CONTACT
|
|
14
|
|
15 Please email comments or questions to the public Ensembl
|
|
16 developers list at <dev@ensembl.org>.
|
|
17
|
|
18 Questions may also be sent to the Ensembl help desk at
|
|
19 <helpdesk@ensembl.org>.
|
|
20
|
|
21 =head1 NAME
|
|
22
|
|
23 Bio::EnsEMBL::Pipeline::FASTA::BlastIndexer
|
|
24
|
|
25 =head1 DESCRIPTION
|
|
26
|
|
27 Creates 2bit file of the given GZipped file. The resulting index
|
|
28 is created under the parameter location I<base_path> in blat/index. The filename
|
|
29 is prefixed with the port number of the blat server this file should be
|
|
30 run on.
|
|
31
|
|
32 The module also performs filtering of non-reference sequence regions
|
|
33 and can filter the redundant Y chromosome piece for human (as 2bit does
|
|
34 not like repeated sequence region names).
|
|
35
|
|
36 Allowed parameters are:
|
|
37
|
|
38 =over 8
|
|
39
|
|
40 =item file - The file to index
|
|
41
|
|
42 =item program - The location of the faToTwoBit program
|
|
43
|
|
44 =item port_offset - Value to add onto the species_id from the website DB
|
|
45 to name the file correctly
|
|
46
|
|
47 =item base_path - The base of the dumps
|
|
48
|
|
49 =item index - The type of file to index; supported values are empty,
|
|
50 I<dna>, I<dna_sm> or I<dna_rm>. If specified we will look for this
|
|
51 string in the filename surrounded by '.' e.g. .dna.
|
|
52
|
|
53 =back
|
|
54
|
|
55 The registry should also have a DBAdaptor for the website schema
|
|
56 registered under the species B<multi> and the group B<web> for species_id to
|
|
57 Blat port number.
|
|
58
|
|
59 =cut
|
|
60
|
|
61
|
|
62 package Bio::EnsEMBL::Pipeline::FASTA::BlatIndexer;
|
|
63
|
|
64 use strict;
|
|
65 use warnings;
|
|
66 use base qw/Bio::EnsEMBL::Pipeline::FASTA::Indexer/;
|
|
67
|
|
68 use File::Spec;
|
|
69 use File::stat;
|
|
70 use Bio::EnsEMBL::Utils::IO qw/work_with_file/;
|
|
71 use Bio::EnsEMBL::Utils::Exception qw/throw/;
|
|
72 use Bio::EnsEMBL::Registry;
|
|
73
|
|
74 sub param_defaults {
|
|
75 my ($self) = @_;
|
|
76 return {
|
|
77 program => 'faToTwoBit',
|
|
78 port_offset => 30000,
|
|
79 'index' => 'dna', #or dna_rm and dna_sm
|
|
80 };
|
|
81 }
|
|
82
|
|
83 sub fetch_input {
|
|
84 my ($self) = @_;
|
|
85 $self->assert_executable($self->param('program'));
|
|
86 $self->assert_executable('zcat');
|
|
87 $self->assert_executable('gunzip');
|
|
88 return;
|
|
89 }
|
|
90
|
|
91 sub run {
|
|
92 my ($self) = @_;
|
|
93 if($self->run_indexing()) {
|
|
94 $self->SUPER::run();
|
|
95 }
|
|
96 return;
|
|
97 }
|
|
98
|
|
99 sub run_indexing {
|
|
100 my ($self) = @_;
|
|
101 my $index = $self->param('index');
|
|
102 if($index) {
|
|
103 my $file = $self->param('file');
|
|
104 return (index($file, ".${index}.") > -1) ? 1 : 0;
|
|
105 }
|
|
106 return 1;
|
|
107 }
|
|
108
|
|
109 sub index_file {
|
|
110 my ($self, $file) = @_;
|
|
111
|
|
112 my $target_file = $self->target_file();
|
|
113 my $cmd = sprintf(q{%s %s %s},
|
|
114 $self->param('program'), $file, $target_file);
|
|
115
|
|
116 $self->info('About to run "%s"', $cmd);
|
|
117 my $output = `$cmd 2>&1`;
|
|
118 my $rc = $? >> 8;
|
|
119 throw "Cannot run program '$cmd'. Return code was ${rc}. Program output was $output" if $rc;
|
|
120 unlink $file or throw "Cannot remove the file '$file' from the filesystem: $!";
|
|
121
|
|
122 #Check the file size. If it's 16 bytes then reject as that is an empty file for 2bit
|
|
123 my $filesize = stat($target_file)->size();
|
|
124 if($filesize <= 16) {
|
|
125 unlink $file;
|
|
126 my $msg = sprintf(
|
|
127 'The file %s produced a 2bit file %d byte(s). Lower than 17 bytes therefore empty 2 bit file',
|
|
128 $file, $filesize
|
|
129 );
|
|
130 $self->throw($msg);
|
|
131 }
|
|
132
|
|
133 return;
|
|
134 }
|
|
135
|
|
136 sub decompress {
|
|
137 my ($self) = @_;
|
|
138
|
|
139 #If we have no non-reference seq regions then use normal decompress
|
|
140 if(! $self->has_non_refs()) {
|
|
141 return $self->SUPER::decompress();
|
|
142 }
|
|
143
|
|
144 #Filter for non-refs
|
|
145 my $source = $self->param('file');
|
|
146 my $target_dir = $self->target_dir();
|
|
147 my ($vol, $dir, $file) = File::Spec->splitpath($source);
|
|
148 $file =~ s/.gz$//;
|
|
149 my $target = File::Spec->catdir($target_dir, $file);
|
|
150
|
|
151 my $allowed_regions = $self->allowed_regions();
|
|
152
|
|
153 $self->info('Decompressing %s to %s', $source, $target);
|
|
154
|
|
155 open my $src_fh, '-|', "zcat $source" or throw "Cannot decompress $source to $target";
|
|
156 work_with_file($target, 'w', sub {
|
|
157 my ($trg_fh) = @_;
|
|
158 my $transfer = 0;
|
|
159 while(my $line = <$src_fh>) {
|
|
160 #HEADER
|
|
161 if(index($line, '>') == 0) {
|
|
162 #regex is looking for NNN:NNN:NNN:1:80:1 i.e. the name
|
|
163 my ($name) = $line =~ />.+\s(.+:.+:.+:\d+:\d+:\d+)/;
|
|
164 $transfer = ($allowed_regions->{$name}) ? 1 : 0;
|
|
165 if($transfer) {
|
|
166 $self->info('%s was an allowed Slice', $name);
|
|
167 }
|
|
168 else {
|
|
169 $self->info('%s will be skipped; not a reference Slice', $name);
|
|
170 }
|
|
171 }
|
|
172 print $trg_fh $line if $transfer;
|
|
173 }
|
|
174 });
|
|
175 close($src_fh);
|
|
176
|
|
177 return $target;
|
|
178 }
|
|
179
|
|
180 sub allowed_regions {
|
|
181 my ($self) = @_;
|
|
182 my $filter_human = 1;
|
|
183 my @slices = grep { $_->is_reference() } @{$self->get_Slices('core', $filter_human)};
|
|
184 my %hash = map { $_->name() => 1 } @slices;
|
|
185 return \%hash;
|
|
186 }
|
|
187
|
|
188 #Filename like 30061.Homo_sapiens.GRCh37.2bit
|
|
189 sub target_filename {
|
|
190 my ($self) = @_;
|
|
191 my $port = $self->blat_port();
|
|
192 my $name = $self->web_name();
|
|
193 my $assembly = $self->assembly();
|
|
194 return join(q{.}, $port, $name, $assembly, '2bit');
|
|
195 }
|
|
196
|
|
197 sub target_file {
|
|
198 my ($self) = @_;
|
|
199 my $target_dir = $self->target_dir();
|
|
200 my $target_filename = $self->target_filename();
|
|
201 return File::Spec->catfile($target_dir, $target_filename);
|
|
202 return;
|
|
203 }
|
|
204
|
|
205 sub target_dir {
|
|
206 my ($self) = @_;
|
|
207 return $self->get_dir('blat', $self->param('index'));
|
|
208 }
|
|
209
|
|
210 sub blat_port {
|
|
211 my ($self) = @_;
|
|
212 my $dba = Bio::EnsEMBL::Registry->get_DBAdaptor('multi', 'web');
|
|
213 my $id = $dba->dbc()->sql_helper()->execute_single_result(
|
|
214 -SQL => 'select species_id from species where name =?',
|
|
215 -PARAMS => [$self->web_name()]
|
|
216 );
|
|
217 return $id + $self->param('port_offset');
|
|
218 }
|
|
219
|
|
220 sub has_non_refs {
|
|
221 my ($self) = @_;
|
|
222 my $sql = <<'SQL';
|
|
223 select count(*)
|
|
224 from attrib_type at
|
|
225 join seq_region_attrib sra using (attrib_type_id)
|
|
226 join seq_region sr using (seq_region_id)
|
|
227 join coord_system cs using (coord_system_id)
|
|
228 where cs.species_id =?
|
|
229 and at.code =?
|
|
230 SQL
|
|
231 my $dba = $self->get_DBAdaptor();
|
|
232 return $dba->dbc()->sql_helper()->execute_single_result(
|
|
233 -SQL => $sql, -PARAMS => [$dba->species_id(), 'non_ref']);
|
|
234 }
|
|
235
|
|
236 1;
|