0
|
1 package Bio::EnsEMBL::Pipeline::Base;
|
|
2
|
|
3 use strict;
|
|
4 use warnings;
|
|
5 use base qw/Bio::EnsEMBL::Hive::Process/;
|
|
6
|
|
7 use Bio::EnsEMBL::Utils::Exception qw/throw/;
|
|
8 use Bio::EnsEMBL::Utils::IO qw/work_with_file/;
|
|
9 use Bio::EnsEMBL::Utils::Scalar qw/check_ref/;
|
|
10 use File::Find;
|
|
11 use File::Spec;
|
|
12 use File::Path qw/mkpath/;
|
|
13 use POSIX qw/strftime/;
|
|
14
|
|
# Resets a parameter to its default when it was supplied as an empty array.
#
#   Arg[1]     : String; name of the parameter to inspect
#   Returntype : None
#
# The parameter is only replaced when both its current value (from
# $self->param()) and the entry of the same name in $self->param_defaults()
# are array references and the current array holds no elements.
sub reset_empty_array_param {
  my ($self, $key) = @_;
  my $current = $self->param($key);
  # The defaults are looked up exactly once
  my $replacement = $self->param_defaults()->{$key};
  if(check_ref($current, 'ARRAY') && check_ref($replacement, 'ARRAY')) {
    if(! @{$current}) {
      $self->fine('Resetting param %s because the given array was empty', $key);
      $self->param($key, $replacement);
    }
  }
  return;
}
|
|
30
|
|
=head2 get_Slices

  Arg[1]      : String; type of DB to use (defaults to core)
  Arg[2]      : Boolean; should the slices be filtered when the species is human
  Example     : my $slices = $self->get_Slices('core', 1);
  Description : Fetches the toplevel slices for the species and returns them
                ordered from shortest to longest. When filtering is requested
                and the production name is homo_sapiens, slices on Y which end
                before position 2649521 are dropped; this first portion of
                Human Y is a non-informative region (composed solely of N's).
                The filter is only allowed on GRCh37, forcing the developer to
                update the test for other assemblies.
  Returntype  : ArrayRef[Bio::EnsEMBL::Slice]
  Exceptions  : Thrown if no DB adaptor could be found, or if you are
                filtering Human but are not on GRCh37

=cut

sub get_Slices {
  my ($self, $type, $filter_human) = @_;

  my $dba = $self->get_DBAdaptor($type);
  throw('Cannot get a DB adaptor') if ! $dba;

  my $slice_adaptor = $dba->get_SliceAdaptor();
  my @slices = @{$slice_adaptor->fetch_all('toplevel', undef, 1, undef, undef)};

  # The production name is only looked up when filtering was requested
  my $species = $filter_human ? $self->production_name() : q{};
  if($species eq 'homo_sapiens') {
    my ($coord_system) = @{$dba->get_CoordSystem()->fetch_all()};
    my $expected = 'GRCh37';
    if($coord_system->version() ne $expected) {
      throw(sprintf(q{Cannot continue as %s's coordinate system %s is not the expected %s }, $species, $coord_system->version(), $expected));
    }

    # Keep everything apart from the leading non-informative portion of Y
    my @kept;
    foreach my $slice (@slices) {
      if($slice->seq_region_name() eq 'Y' && $slice->end() < 2649521) {
        $self->info('Filtering small Y slice');
        next;
      }
      push(@kept, $slice);
    }
    @slices = @kept;
  }

  my @by_length = sort { $a->length() <=> $b->length() } @slices;
  return \@by_length;
}
|
|
77
|
|
# Returns the DBAdaptor for the species named in the 'species' param.
#
#   Arg[1] : String; DB type to fetch. Any false value means 'core'
#
# Bio::EnsEMBL::Registry is not loaded by this module; the Registry is loaded
# by Hive (see beekeeper_extra_cmdline_options() in conf).
sub get_DBAdaptor {
  my ($self, $type) = @_;
  my $species = $self->param('species');
  my $group = $type ? $type : 'core';
  return Bio::EnsEMBL::Registry->get_DBAdaptor($species, $group);
}
|
|
85
|
|
# Clears the caches of the DBAdaptor of the given type and disconnects its
# database connection if that connection is idle.
#
#   Arg[1]     : String; DB type, passed straight to get_DBAdaptor()
#   Returntype : None
sub cleanup_DBAdaptor {
  my ($self, $type) = @_;
  my $adaptor = $self->get_DBAdaptor($type);
  $adaptor->clear_caches();
  my $connection = $adaptor->dbc();
  $connection->disconnect_if_idle();
  return;
}
|
|
93
|
|
# Builds a directory path below the 'base_path' param, creates it on disk
# (including any missing parent directories) and returns the path.
#
#   Arg[...]   : List of Strings; path elements appended to base_path
#   Returntype : String; the directory path
sub get_dir {
  my ($self, @extras) = @_;
  my $root = $self->param('base_path');
  my $target = File::Spec->catdir($root, @extras);
  mkpath($target);
  return $target;
}
|
|
101
|
|
# Returns the name used for the species on the web: currently the production
# name with its first character upper-cased.
sub web_name {
  my ($self) = @_;
  # The meta table lookup below is disabled for now; change back when ready:
  #   my $mc = $self->get_DBAdaptor()->get_MetaContainer();
  #   my $name = $mc->single_value_by_key('species.url');
  return ucfirst($self->production_name());
}
|
|
109
|
|
# Returns the scientific name held by the MetaContainer of the default
# DBAdaptor. The database connection is disconnected afterwards if idle.
sub scientific_name {
  my ($self) = @_;
  my $adaptor = $self->get_DBAdaptor();
  my $scientific = $adaptor->get_MetaContainer()->get_scientific_name();
  $adaptor->dbc()->disconnect_if_idle();
  return $scientific;
}
|
|
118
|
|
# Returns the version of the first coordinate system reported by the
# CoordSystemAdaptor of the default DBAdaptor.
#
#   Returntype : String; the coordinate system version
#   Exceptions : Thrown if the adaptor reports no coordinate systems
#
# In line with scientific_name() and production_name() the database
# connection is disconnected afterwards if it is idle.
sub assembly {
  my ($self) = @_;
  my $dba = $self->get_DBAdaptor();
  my $coord_systems = $dba->get_CoordSystemAdaptor()->fetch_all();
  my ($first) = @{ $coord_systems || [] };
  if(! defined $first) {
    # Fail with a readable message rather than a method call on undef
    $self->throw('Cannot determine the assembly as no coordinate systems were found');
  }
  my $version = $first->version();
  $dba->dbc()->disconnect_if_idle();
  return $version;
}
|
|
124
|
|
# Returns the production name held by the MetaContainer of a core DBAdaptor.
#
#   Arg[1] : String; optional species name. When true the core adaptor for
#            that name is fetched straight from the Registry, otherwise the
#            default adaptor from get_DBAdaptor() is used
#
# The database connection is disconnected afterwards if it is idle.
sub production_name {
  my ($self, $name) = @_;
  my $adaptor = $name
    ? Bio::EnsEMBL::Registry->get_DBAdaptor($name, 'core')
    : $self->get_DBAdaptor();
  my $meta = $adaptor->get_MetaContainer();
  my $production = $meta->get_production_name();
  $adaptor->dbc()->disconnect_if_idle();
  return $production;
}
|
|
139
|
|
# Closes a file handle and deletes the file stub if no data was written to
# the file handle (detected using tell). We can only close a file handle and
# unlink the file if the handle was open, otherwise we just ignore it.
#
#   Arg[1]     : File handle object to close
#   Arg[2]     : String; path of the file the handle was writing to
#   Returntype : Boolean; 1 only if the file was really deleted, 0 otherwise
sub tidy_file_handle {
  my ($self, $fh, $path) = @_;
  return 0 unless $fh->opened();

  # The position must be read before closing; it is 0 when nothing was written
  my $is_empty = ($fh->tell() == 0) ? 1 : 0;
  $fh->close();

  if($is_empty && -f $path) {
    # unlink returns the number of files removed, so a failure is reported
    return unlink($path) ? 1 : 0;
  }
  return 0;
}
|
|
157
|
|
# Prints an INFO level message to STDERR when $self->debug() is above 1.
#
#   Arg[1]     : String; the message, or a sprintf format if Arg[2..] is given
#   Arg[2..]   : Optional values for the sprintf format
#   Returntype : None
#
# Each line also carries the output of _memory_consumption() and a
# locale formatted time-stamp.
sub info {
  my ($self, $msg, @params) = @_;
  return unless $self->debug() > 1;

  # Only run the message through sprintf when there is something to format
  my $text = scalar(@params) ? sprintf($msg, @params) : $msg;
  my $memory = $self->_memory_consumption();
  my $timestamp = strftime('%c', localtime());
  printf STDERR "INFO [%s]: %s %s\n", $memory, $timestamp, $text;
  return;
}
|
|
172
|
|
# Prints a FINE level message to STDERR when $self->debug() is above 2.
#
#   Arg[1]     : String; the message, or a sprintf format if Arg[2..] is given
#   Arg[2..]   : Optional values for the sprintf format
#   Returntype : None
#
# Each line also carries the output of _memory_consumption() and a
# locale formatted time-stamp.
sub fine {
  my ($self, $msg, @params) = @_;
  return unless $self->debug() > 2;

  # Only run the message through sprintf when there is something to format
  my $text = scalar(@params) ? sprintf($msg, @params) : $msg;
  my $memory = $self->_memory_consumption();
  my $timestamp = strftime('%c', localtime());
  printf STDERR "FINE [%s]: %s %s\n", $memory, $timestamp, $text;
  return;
}
|
|
187
|
|
# Reports the memory use of the current process as a string such as
# '12.34MB', or '?MB' when the shell command exits with a non-zero status.
# The figure is the rss column printed by ps for this process divided
# by 1024.
sub _memory_consumption {
  my ($self) = @_;
  # grep drops the header line which ps prints above the value
  my $rss = `ps -o rss $$ | grep -v RSS`;
  my $exit_code = $? >> 8;
  return q{?MB} if $exit_code != 0;
  $rss =~ s/\s+//g;
  return sprintf('%.2fMB', $rss / 1024);
}
|
|
196
|
|
# Walks a directory tree with File::Find and collects the paths accepted by
# a callback.
#
#   Arg[1]     : String; directory to search
#   Arg[2]     : CodeRef; called with the current entry name ($_ as set by
#                File::Find) and must return true for entries to keep
#   Returntype : ArrayRef of Strings; $File::Find::name of every kept entry
#   Exceptions : Thrown if the directory does not exist
sub find_files {
  my ($self, $dir, $boolean_callback) = @_;
  if(! -d $dir) {
    $self->throw("Cannot find path $dir");
  }

  my @matches;
  my $wanted = sub {
    # Remember the full path before the callback is given the entry name
    my $full_path = $File::Find::name;
    push(@matches, $full_path) if $boolean_callback->($_);
    return;
  };
  find($wanted, $dir);
  return \@matches;
}
|
|
209
|
|
# Removes every file found below a directory; the directories themselves
# are left in place.
#
#   Arg[1]     : String; directory to empty
#   Returntype : None
#
# Paths are gathered with find_files(). Real directories are skipped because
# they cannot be unlinked; symbolic links are kept as targets. A failed
# unlink is logged, and the final message counts only the files which were
# really removed.
sub unlink_all_files {
  my ($self, $dir) = @_;
  $self->info('Removing files from the directory %s', $dir);
  #Ignore the current or higher directory entries
  my $boolean_callback = sub {
    return ( $_[0] =~ /^\.\.?$/) ? 0 : 1;
  };
  my $found = $self->find_files($dir, $boolean_callback);
  #Delete anything which is not a real directory (links are always kept)
  my @targets = grep { -l $_ || ! -d $_ } @{$found};
  my $removed = 0;
  foreach my $file (@targets) {
    $self->fine('Unlinking %s', $file);
    if(unlink $file) {
      $removed++;
    }
    else {
      $self->info('Could not unlink %s: %s', $file, $!);
    }
  }
  $self->info('Removed %d file(s)', $removed);
  return;
}
|
|
225
|
|
# Asserts that an executable can be found.
#
#   Arg[1]     : String; path to, or name of, the executable
#   Returntype : Boolean; always 1 when the method returns
#   Exceptions : Thrown if the executable could not be found
#
# The given value is accepted when it passes -x. Otherwise "which" is tried,
# and if that fails "locate -l 1" is tried; an exception is thrown only
# when both fail.
sub assert_executable {
  my ($self, $exe) = @_;
  return 1 if -x $exe;

  # Single-quote the name so the shell cannot interpret any character in it
  (my $quoted = $exe) =~ s/'/'\\''/g;
  $quoted = "'$quoted'";

  # Only the exit status matters so all command output is thrown away
  system("which $quoted >/dev/null 2>&1");
  return 1 if ($? >> 8) == 0;

  system("locate -l 1 $quoted >/dev/null 2>&1");
  if(($? >> 8) != 0) {
    my $msg = 'Cannot find the executable "%s" after trying "which" and "locate -l 1". Please ensure it is on your PATH or use an absolute location and try again';
    $self->throw(sprintf($msg, $exe));
  }
  return 1;
}
|
|
243
|
|
244 1;
|