Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/EnsEMBL/Pipeline/FASTA/BlatIndexer.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 =pod | |
2 | |
3 =head1 LICENSE | |
4 | |
5 Copyright (c) 1999-2012 The European Bioinformatics Institute and | |
6 Genome Research Limited. All rights reserved. | |
7 | |
8 This software is distributed under a modified Apache license. | |
9 For license details, please see | |
10 | |
11 http://www.ensembl.org/info/about/code_licence.html | |
12 | |
13 =head1 CONTACT | |
14 | |
15 Please email comments or questions to the public Ensembl | |
16 developers list at <dev@ensembl.org>. | |
17 | |
18 Questions may also be sent to the Ensembl help desk at | |
19 <helpdesk@ensembl.org>. | |
20 | |
21 =head1 NAME | |
22 | |
23 Bio::EnsEMBL::Pipeline::FASTA::BlastIndexer | |
24 | |
25 =head1 DESCRIPTION | |
26 | |
27 Creates 2bit file of the given GZipped file. The resulting index | |
28 is created under the parameter location I<base_path> in blat/index. The filename | |
29 is prefixed with the port number of the blat server this file should be | |
30 run on. | |
31 | |
32 The module also performs filtering of non-reference sequence regions | |
33 and can filter the redundant Y chromosome piece for human (as 2bit does | |
34 not like repeated sequence region names). | |
35 | |
36 Allowed parameters are: | |
37 | |
38 =over 8 | |
39 | |
40 =item file - The file to index | |
41 | |
42 =item program - The location of the faToTwoBit program | |
43 | |
44 =item port_offset - Value to add onto the species_id from the website DB | |
45 to name the file correctly | |
46 | |
47 =item base_path - The base of the dumps | |
48 | |
49 =item index - The type of file to index; supported values are empty, | |
50 I<dna>, I<dna_sm> or I<dna_rm>. If specified we will look for this | |
51 string in the filename surrounded by '.' e.g. .dna. | |
52 | |
53 =back | |
54 | |
55 The registry should also have a DBAdaptor for the website schema | |
56 registered under the species B<multi> and the group B<web> for species_id to | |
57 Blat port number. | |
58 | |
59 =cut | |
60 | |
61 | |
62 package Bio::EnsEMBL::Pipeline::FASTA::BlatIndexer; | |
63 | |
64 use strict; | |
65 use warnings; | |
66 use base qw/Bio::EnsEMBL::Pipeline::FASTA::Indexer/; | |
67 | |
68 use File::Spec; | |
69 use File::stat; | |
70 use Bio::EnsEMBL::Utils::IO qw/work_with_file/; | |
71 use Bio::EnsEMBL::Utils::Exception qw/throw/; | |
72 use Bio::EnsEMBL::Registry; | |
73 | |
74 sub param_defaults { | |
75 my ($self) = @_; | |
76 return { | |
77 program => 'faToTwoBit', | |
78 port_offset => 30000, | |
79 'index' => 'dna', #or dna_rm and dna_sm | |
80 }; | |
81 } | |
82 | |
83 sub fetch_input { | |
84 my ($self) = @_; | |
85 $self->assert_executable($self->param('program')); | |
86 $self->assert_executable('zcat'); | |
87 $self->assert_executable('gunzip'); | |
88 return; | |
89 } | |
90 | |
91 sub run { | |
92 my ($self) = @_; | |
93 if($self->run_indexing()) { | |
94 $self->SUPER::run(); | |
95 } | |
96 return; | |
97 } | |
98 | |
99 sub run_indexing { | |
100 my ($self) = @_; | |
101 my $index = $self->param('index'); | |
102 if($index) { | |
103 my $file = $self->param('file'); | |
104 return (index($file, ".${index}.") > -1) ? 1 : 0; | |
105 } | |
106 return 1; | |
107 } | |
108 | |
109 sub index_file { | |
110 my ($self, $file) = @_; | |
111 | |
112 my $target_file = $self->target_file(); | |
113 my $cmd = sprintf(q{%s %s %s}, | |
114 $self->param('program'), $file, $target_file); | |
115 | |
116 $self->info('About to run "%s"', $cmd); | |
117 my $output = `$cmd 2>&1`; | |
118 my $rc = $? >> 8; | |
119 throw "Cannot run program '$cmd'. Return code was ${rc}. Program output was $output" if $rc; | |
120 unlink $file or throw "Cannot remove the file '$file' from the filesystem: $!"; | |
121 | |
122 #Check the file size. If it's 16 bytes then reject as that is an empty file for 2bit | |
123 my $filesize = stat($target_file)->size(); | |
124 if($filesize <= 16) { | |
125 unlink $file; | |
126 my $msg = sprintf( | |
127 'The file %s produced a 2bit file %d byte(s). Lower than 17 bytes therefore empty 2 bit file', | |
128 $file, $filesize | |
129 ); | |
130 $self->throw($msg); | |
131 } | |
132 | |
133 return; | |
134 } | |
135 | |
136 sub decompress { | |
137 my ($self) = @_; | |
138 | |
139 #If we have no non-reference seq regions then use normal decompress | |
140 if(! $self->has_non_refs()) { | |
141 return $self->SUPER::decompress(); | |
142 } | |
143 | |
144 #Filter for non-refs | |
145 my $source = $self->param('file'); | |
146 my $target_dir = $self->target_dir(); | |
147 my ($vol, $dir, $file) = File::Spec->splitpath($source); | |
148 $file =~ s/.gz$//; | |
149 my $target = File::Spec->catdir($target_dir, $file); | |
150 | |
151 my $allowed_regions = $self->allowed_regions(); | |
152 | |
153 $self->info('Decompressing %s to %s', $source, $target); | |
154 | |
155 open my $src_fh, '-|', "zcat $source" or throw "Cannot decompress $source to $target"; | |
156 work_with_file($target, 'w', sub { | |
157 my ($trg_fh) = @_; | |
158 my $transfer = 0; | |
159 while(my $line = <$src_fh>) { | |
160 #HEADER | |
161 if(index($line, '>') == 0) { | |
162 #regex is looking for NNN:NNN:NNN:1:80:1 i.e. the name | |
163 my ($name) = $line =~ />.+\s(.+:.+:.+:\d+:\d+:\d+)/; | |
164 $transfer = ($allowed_regions->{$name}) ? 1 : 0; | |
165 if($transfer) { | |
166 $self->info('%s was an allowed Slice', $name); | |
167 } | |
168 else { | |
169 $self->info('%s will be skipped; not a reference Slice', $name); | |
170 } | |
171 } | |
172 print $trg_fh $line if $transfer; | |
173 } | |
174 }); | |
175 close($src_fh); | |
176 | |
177 return $target; | |
178 } | |
179 | |
180 sub allowed_regions { | |
181 my ($self) = @_; | |
182 my $filter_human = 1; | |
183 my @slices = grep { $_->is_reference() } @{$self->get_Slices('core', $filter_human)}; | |
184 my %hash = map { $_->name() => 1 } @slices; | |
185 return \%hash; | |
186 } | |
187 | |
188 #Filename like 30061.Homo_sapiens.GRCh37.2bit | |
189 sub target_filename { | |
190 my ($self) = @_; | |
191 my $port = $self->blat_port(); | |
192 my $name = $self->web_name(); | |
193 my $assembly = $self->assembly(); | |
194 return join(q{.}, $port, $name, $assembly, '2bit'); | |
195 } | |
196 | |
197 sub target_file { | |
198 my ($self) = @_; | |
199 my $target_dir = $self->target_dir(); | |
200 my $target_filename = $self->target_filename(); | |
201 return File::Spec->catfile($target_dir, $target_filename); | |
202 return; | |
203 } | |
204 | |
205 sub target_dir { | |
206 my ($self) = @_; | |
207 return $self->get_dir('blat', $self->param('index')); | |
208 } | |
209 | |
210 sub blat_port { | |
211 my ($self) = @_; | |
212 my $dba = Bio::EnsEMBL::Registry->get_DBAdaptor('multi', 'web'); | |
213 my $id = $dba->dbc()->sql_helper()->execute_single_result( | |
214 -SQL => 'select species_id from species where name =?', | |
215 -PARAMS => [$self->web_name()] | |
216 ); | |
217 return $id + $self->param('port_offset'); | |
218 } | |
219 | |
220 sub has_non_refs { | |
221 my ($self) = @_; | |
222 my $sql = <<'SQL'; | |
223 select count(*) | |
224 from attrib_type at | |
225 join seq_region_attrib sra using (attrib_type_id) | |
226 join seq_region sr using (seq_region_id) | |
227 join coord_system cs using (coord_system_id) | |
228 where cs.species_id =? | |
229 and at.code =? | |
230 SQL | |
231 my $dba = $self->get_DBAdaptor(); | |
232 return $dba->dbc()->sql_helper()->execute_single_result( | |
233 -SQL => $sql, -PARAMS => [$dba->species_id(), 'non_ref']); | |
234 } | |
235 | |
236 1; |