0
|
1 package Bio::EnsEMBL::Utils::IO;
|
|
2
|
|
3 =pod
|
|
4
|
|
5 =head1 LICENSE
|
|
6
|
|
7 Copyright (c) 1999-2012 The European Bioinformatics Institute and
|
|
8 Genome Research Limited. All rights reserved.
|
|
9
|
|
10 This software is distributed under a modified Apache license.
|
|
11 For license details, please see
|
|
12
|
|
13 http://www.ensembl.org/info/about/code_licence.html
|
|
14
|
|
15 =head1 CONTACT
|
|
16
|
|
17 Please email comments or questions to the public Ensembl
|
|
18 developers list at <dev@ensembl.org>.
|
|
19
|
|
20 Questions may also be sent to the Ensembl help desk at
|
|
21 <helpdesk@ensembl.org>.
|
|
22
|
|
23 =cut
|
|
24
|
|
25 =pod
|
|
26
|
|
27 =head1 NAME
|
|
28
|
|
29 Bio::EnsEMBL::Utils::IO
|
|
30
|
|
31 =head1 SYNOPSIS
|
|
32
|
|
33 use Bio::EnsEMBL::Utils::IO qw/slurp work_with_file slurp_to_array fh_to_array/;
|
|
34 #or
|
|
35 # use Bio::EnsEMBL::Utils::IO qw/:slurp/; #brings in any method starting with slurp
|
|
36 # use Bio::EnsEMBL::Utils::IO qw/:array/; #brings in any method which ends with _array
|
|
37 # use Bio::EnsEMBL::Utils::IO qw/:gz/; #brings all methods which start with gz_
|
|
38 # use Bio::EnsEMBL::Utils::IO qw/:all/; #brings all methods in
|
|
39
|
|
40 #As a scalar
|
|
41 my $file_contents = slurp('/my/file/location.txt');
|
|
42 print length($file_contents);
|
|
43
|
|
44 #As a ref
|
|
45 my $file_contents_ref = slurp('/my/file/location.txt', 1);
|
|
46 print length($$file_contents_ref);
|
|
47
|
|
48 #Sending it to an array
|
|
49 my $array = slurp_to_array('/my/location');
|
|
50 work_with_file('/my/location', 'r', sub {
|
|
51 $array = process_to_array($_[0], sub {
|
|
52 #Gives us input line by line
|
|
53 return "INPUT: $_";
|
|
54 });
|
|
55 });
|
|
56
|
|
57 #Simplified vesion but without the post processing
|
|
58 $array = fh_to_array($fh);
|
|
59
|
|
60 #Sending this back out to another file
|
|
61 work_with_file('/my/file/newlocation.txt', 'w', sub {
|
|
62 my ($fh) = @_;
|
|
63 print $fh $$file_contents_ref;
|
|
64 return;
|
|
65 });
|
|
66
|
|
67 #Gzipping the data to another file
|
|
68 gz_work_with_file('/my/file.gz', 'w', sub {
|
|
69 my ($fh) = @_;
|
|
70 print $fh $$file_contents_ref;
|
|
71 return;
|
|
72 });
|
|
73
|
|
74 #Working with a set of lines manually
|
|
75 work_with_file('/my/file', 'r', sub {
|
|
76 my ($fh) = @_;
|
|
77 iterate_lines($fh, sub {
|
|
78 my ($line) = @_;
|
|
79 print $line; #Send the line in the file back out
|
|
80 return;
|
|
81 });
|
|
82 return;
|
|
83 });
|
|
84
|
|
85 #Doing the same in one go
|
|
86 iterate_file('/my/file', sub {
|
|
87 my ($line) = @_;
|
|
88 print $line; #Send the line in the file back out
|
|
89 return;
|
|
90 });
|
|
91
|
|
92 #Move all data from one file handle to another. Bit like a copy
|
|
93 move_data($src_fh, $trg_fh);
|
|
94
|
|
95 =head1 DESCRIPTION
|
|
96
|
|
97 A collection of subroutines aimed to helping IO based operations
|
|
98
|
|
99 =head1 METHODS
|
|
100
|
|
101 See subroutines.
|
|
102
|
|
103 =head1 MAINTAINER
|
|
104
|
|
105 $Author: ady $
|
|
106
|
|
107 =head1 VERSION
|
|
108
|
|
109 $Revision: 1.10 $
|
|
110
|
|
111 =cut
|
|
112
|
|
113 use strict;
|
|
114 use warnings;
|
|
115
|
|
116 use base qw(Exporter);
|
|
117
|
|
118 our $GZIP_OK = 0;
|
|
119 our @EXPORT_OK = qw/slurp slurp_to_array fh_to_array process_to_array work_with_file gz_slurp gz_slurp_to_array gz_work_with_file iterate_file iterate_lines move_data/;
|
|
120 our %EXPORT_TAGS = (
|
|
121 all => [@EXPORT_OK],
|
|
122 slurp => [qw/slurp slurp_to_array gz_slurp gz_slurp_to_array/],
|
|
123 array => [qw/fh_to_array process_to_array slurp_to_array gz_slurp_to_array/],
|
|
124 gz => [qw/gz_slurp gz_slurp_to_array gz_work_with_file/],
|
|
125 iterate => [qw/iterate_file iterate_lines/],
|
|
126 );
|
|
127 use Bio::EnsEMBL::Utils::Exception qw(throw);
|
|
128 use Bio::EnsEMBL::Utils::Scalar qw(:assert);
|
|
129 use IO::File;
|
|
130 eval {
|
|
131 require IO::Compress::Gzip;
|
|
132 require IO::Uncompress::Gunzip;
|
|
133 $GZIP_OK = 1;
|
|
134 };
|
|
135
|
|
136 =head2 slurp()
|
|
137
|
|
138 Arg [1] : string $file
|
|
139 Arg [2] : boolean; $want_ref
|
|
140 Arg [3] : boolean; $binary
|
|
141 Indicates if we want to return a scalar reference
|
|
142 Description : Forces the contents of a file into a scalar. This is the
|
|
143 fastest way to get a file into memory in Perl. You can also
|
|
144 get a scalar reference back to avoid copying the file contents
|
|
145 in Scalar references. If the input file is binary then specify
|
|
146 with the binary flag
|
|
147 Returntype : Scalar or reference of the file contents depending on arg 2
|
|
148 Example : my $contents = slurp('/tmp/file.txt');
|
|
149 Exceptions : If the file did not exist or was not readable
|
|
150 Status : Stable
|
|
151
|
|
152 =cut
|
|
153
|
|
154 sub slurp {
|
|
155 my ($file, $want_ref, $binary) = @_;
|
|
156 my $contents = q{};
|
|
157 work_with_file($file, 'r', sub {
|
|
158 my ($fh) = @_;
|
|
159 binmode($fh) if $binary;
|
|
160 my $size_left = -s $file;
|
|
161 while( $size_left > 0 ) {
|
|
162 my $read_cnt = sysread($fh, $contents, $size_left, length($contents));
|
|
163 unless( $read_cnt ) {
|
|
164 throw "read error in file $file: $!" ;
|
|
165 last;
|
|
166 }
|
|
167 $size_left -= $read_cnt ;
|
|
168 }
|
|
169 return;
|
|
170 });
|
|
171 return ($want_ref) ? \$contents : $contents;
|
|
172 }
|
|
173
|
|
174 =head2 gz_slurp()
|
|
175
|
|
176 Arg [1] : string $file
|
|
177 Arg [2] : boolean; $want_ref Indicates if we want to return a scalar reference
|
|
178 Arg [3] : boolean; $binary
|
|
179 Arg [4] : HashRef arguments to pass into IO compression layers
|
|
180 Description : Forces the contents of a file into a scalar. This is the
|
|
181 fastest way to get a file into memory in Perl. You can also
|
|
182 get a scalar reference back to avoid copying the file contents
|
|
183 in Scalar references. If the input file is binary then specify
|
|
184 with the binary flag
|
|
185 Returntype : Scalar or reference of the file contents depending on arg 2
|
|
186 Example : my $contents = slurp('/tmp/file.txt.gz');
|
|
187 Exceptions : If the file did not exist or was not readable
|
|
188 Status : Stable
|
|
189
|
|
190 =cut
|
|
191
|
|
192 sub gz_slurp {
|
|
193 my ($file, $want_ref, $binary, $args) = @_;
|
|
194 my $contents;
|
|
195 gz_work_with_file($file, 'r', sub {
|
|
196 my ($fh) = @_;
|
|
197 local $/ = undef;
|
|
198 binmode($fh) if $binary;
|
|
199 $contents = <$fh>;
|
|
200 return;
|
|
201 }, $args);
|
|
202 return ($want_ref) ? \$contents : $contents;
|
|
203 }
|
|
204
|
|
205 =head2 slurp_to_array()
|
|
206
|
|
207 Arg [1] : string $file
|
|
208 Arg [2] : boolean $chomp
|
|
209 Description : Sends the contents of the given file into an ArrayRef
|
|
210 Returntype : ArrayRef
|
|
211 Example : my $contents_array = slurp_to_array('/tmp/file.txt');
|
|
212 Exceptions : If the file did not exist or was not readable
|
|
213 Status : Stable
|
|
214
|
|
215 =cut
|
|
216
|
|
217 sub slurp_to_array {
|
|
218 my ($file, $chomp) = @_;
|
|
219 my $contents;
|
|
220 work_with_file($file, 'r', sub {
|
|
221 my ($fh) = @_;
|
|
222 $contents = fh_to_array($fh, $chomp);
|
|
223 return;
|
|
224 });
|
|
225 return $contents;
|
|
226 }
|
|
227
|
|
228 =head2 gz_slurp_to_array()
|
|
229
|
|
230 Arg [1] : string $file
|
|
231 Arg [2] : boolean $chomp
|
|
232 Arg [3] : HashRef arguments to pass into IO compression layers
|
|
233 Description : Sends the contents of the given gzipped file into an ArrayRef
|
|
234 Returntype : ArrayRef
|
|
235 Example : my $contents_array = slurp_to_array('/tmp/file.txt.gz');
|
|
236 Exceptions : If the file did not exist or was not readable
|
|
237 Status : Stable
|
|
238
|
|
239 =cut
|
|
240
|
|
241 sub gz_slurp_to_array {
|
|
242 my ($file, $chomp, $args) = @_;
|
|
243 my $contents;
|
|
244 gz_work_with_file($file, 'r', sub {
|
|
245 my ($fh) = @_;
|
|
246 $contents = fh_to_array($fh, $chomp);
|
|
247 return;
|
|
248 }, $args);
|
|
249 return $contents;
|
|
250 }
|
|
251
|
|
252 =head2 fh_to_array()
|
|
253
|
|
254 Arg [1] : Glob/IO::Handle $fh
|
|
255 Arg [2] : boolean $chomp
|
|
256 Description : Sends the contents of the given filehandle into an ArrayRef.
|
|
257 Will perform chomp on each line if specified. If you require
|
|
258 any more advanced line based processing then see
|
|
259 L<process_to_array>.
|
|
260 Returntype : ArrayRef
|
|
261 Example : my $contents_array = fh_to_array($fh);
|
|
262 Exceptions : None
|
|
263 Status : Stable
|
|
264
|
|
265 =cut
|
|
266
|
|
267 sub fh_to_array {
|
|
268 my ($fh, $chomp) = @_;
|
|
269 if($chomp) {
|
|
270 return process_to_array($fh, sub {
|
|
271 my ($line) = @_;
|
|
272 chomp($line);
|
|
273 return $line;
|
|
274 });
|
|
275 }
|
|
276 my @contents = <$fh>;
|
|
277 return \@contents;
|
|
278 }
|
|
279
|
|
280 =head2 process_to_array
|
|
281
|
|
282 Arg [1] : Glob/IO::Handle $fh
|
|
283 Arg [2] : CodeRef $callback
|
|
284 Description : Sends the contents of the given file handle into an ArrayRef
|
|
285 via the processing callback. Assumes line based input.
|
|
286 Returntype : ArrayRef
|
|
287 Example : my $array = process_to_array($fh, sub { return "INPUT: $_"; });
|
|
288 Exceptions : If the fh did not exist or if a callback was not given.
|
|
289 Status : Stable
|
|
290
|
|
291 =cut
|
|
292
|
|
293 sub process_to_array {
|
|
294 my ($fh, $callback) = @_;
|
|
295 assert_file_handle($fh, 'FileHandle');
|
|
296 assert_ref($callback, 'CODE', 'callback');
|
|
297 my @contents;
|
|
298 iterate_lines($fh, sub {
|
|
299 my ($line) = @_;
|
|
300 push(@contents, $callback->($line));
|
|
301 return;
|
|
302 });
|
|
303 return \@contents;
|
|
304 }
|
|
305
|
|
306 =head2 iterate_lines
|
|
307
|
|
308 Arg [1] : Glob/IO::Handle $fh
|
|
309 Arg [2] : CodeRef $callback
|
|
310 Description : Iterates through each line from the given file handle and
|
|
311 hands them to the callback one by one
|
|
312 Returntype : None
|
|
313 Example : iterate_lines($fh, sub { print "INPUT: $_"; });
|
|
314 Exceptions : If the fh did not exist or if a callback was not given.
|
|
315 Status : Stable
|
|
316
|
|
317 =cut
|
|
318
|
|
319 sub iterate_lines {
|
|
320 my ($fh, $callback) = @_;
|
|
321 assert_file_handle($fh, 'FileHandle');
|
|
322 assert_ref($callback, 'CODE', 'callback');
|
|
323 while( my $line = <$fh> ) {
|
|
324 $callback->($line);
|
|
325 }
|
|
326 return;
|
|
327 }
|
|
328
|
|
329 =head2 iterate_file
|
|
330
|
|
331 Arg [1] : string $file
|
|
332 Arg [3] : CodeRef the callback which is used to iterate the lines in
|
|
333 the file
|
|
334 Description : Iterates through each line from the given file and
|
|
335 hands them to the callback one by one
|
|
336 Returntype : None
|
|
337 Example : iterate_file('/my/file', sub { print "INPUT: $_"; });
|
|
338 Exceptions : If the file did not exist or if a callback was not given.
|
|
339 Status : Stable
|
|
340
|
|
341 =cut
|
|
342
|
|
343
|
|
344 sub iterate_file {
|
|
345 my ($file, $callback) = @_;
|
|
346 work_with_file($file, 'r', sub {
|
|
347 my ($fh) = @_;
|
|
348 iterate_lines($fh, $callback);
|
|
349 return;
|
|
350 });
|
|
351 return;
|
|
352 }
|
|
353
|
|
354
|
|
355
|
|
356 =head2 work_with_file()
|
|
357
|
|
358 Arg [1] : string $file
|
|
359 Arg [2] : string; $mode
|
|
360 Supports all modes specified by the C<open()> function as well as those
|
|
361 supported by IO::File
|
|
362 Arg [3] : CodeRef the callback which is given the open file handle as
|
|
363 its only argument
|
|
364 Description : Performs the nitty gritty of checking if a file handle is open
|
|
365 and closing the resulting filehandle down.
|
|
366 Returntype : None
|
|
367 Example : work_with_file('/tmp/out.txt', 'w', sub {
|
|
368 my ($fh) = @_;
|
|
369 print $fh 'hello';
|
|
370 return;
|
|
371 });
|
|
372 Exceptions : If we could not work with the file due to permissions
|
|
373 Status : Stable
|
|
374
|
|
375 =cut
|
|
376
|
|
377 sub work_with_file {
|
|
378 my ($file, $mode, $callback) = @_;
|
|
379 throw "We need a file name to open" if ! $file;
|
|
380 throw "We need a mode to open the requested file with" if ! $mode;
|
|
381 assert_ref($callback, 'CODE', 'callback');
|
|
382 my $fh = IO::File->new($file, $mode) or
|
|
383 throw "Cannot open '${file}' in mode '${mode}': $!";
|
|
384 $callback->($fh);
|
|
385 close($fh) or throw "Cannot close FH from ${file}: $!";
|
|
386 return;
|
|
387 }
|
|
388
|
|
389 =head2 gz_work_with_file()
|
|
390
|
|
391 Arg [1] : string $file
|
|
392 Arg [2] : string; $mode
|
|
393 Supports modes like C<r>, C<w>, C<\>> and C<\<>
|
|
394 Arg [3] : CodeRef the callback which is given the open file handle as
|
|
395 its only argument
|
|
396 Arg [4] : HashRef used to pass options into the IO
|
|
397 compression/uncompression modules
|
|
398 Description : Performs the nitty gritty of checking if a file handle is open
|
|
399 and closing the resulting filehandle down.
|
|
400 Returntype : None
|
|
401 Example : work_with_file('/tmp/out.txt.gz', 'w', sub {
|
|
402 my ($fh) = @_;
|
|
403 print $fh 'hello';
|
|
404 return;
|
|
405 });
|
|
406 Exceptions : If we could not work with the file due to permissions
|
|
407 Status : Stable
|
|
408
|
|
409 =cut
|
|
410
|
|
411 sub gz_work_with_file {
|
|
412 my ($file, $mode, $callback, $args) = @_;
|
|
413 throw "IO::Compress was not available"if ! $GZIP_OK;
|
|
414 throw "We need a file name to open" if ! $file;
|
|
415 throw "We need a mode to open the requested file with" if ! $mode;
|
|
416 assert_ref($callback, 'CODE', 'callback');
|
|
417 $args ||= {};
|
|
418
|
|
419 my $fh;
|
|
420 {
|
|
421 no warnings qw/once/;
|
|
422 if($mode =~ '>$' || $mode eq 'w') {
|
|
423 $args->{Append} = 1 if $mode =~ />>$/;
|
|
424 $fh = IO::Compress::Gzip->new($file, %$args) or throw "Cannot open '$file' for writing: $IO::Compress::Gzip::GzipError";
|
|
425 }
|
|
426 elsif($mode eq '<' || $mode eq 'r') {
|
|
427 $fh = IO::Uncompress::Gunzip->new($file, %$args) or throw "Cannot open '$file' for writing: $IO::Uncompress::Gunzip::GunzipError";
|
|
428 }
|
|
429 else {
|
|
430 throw "Could not decipher a mode from '$mode'";
|
|
431 }
|
|
432 };
|
|
433 $callback->($fh);
|
|
434 close($fh) or throw "Cannot close FH from ${file}: $!";
|
|
435 return;
|
|
436 }
|
|
437
|
|
438 =head2 move_data
|
|
439
|
|
440 Arg [1] : FileHandle $src_fh
|
|
441 Arg [2] : FileHandle $trg_fh
|
|
442 Arg [3] : int $buffer. Defaults to 8KB
|
|
443 Description : Moves data from the given source filehandle to the target one
|
|
444 using a 8KB buffer or user specified buffer
|
|
445 Returntype : None
|
|
446 Example : move_data($src_fh, $trg_fh, 16*1024); # copy in 16KB chunks
|
|
447 Exceptions : If inputs were not as expected
|
|
448
|
|
449 =cut
|
|
450
|
|
451 sub move_data {
|
|
452 my ($src_fh, $trg_fh, $buffer_size) = @_;
|
|
453 assert_file_handle($src_fh, 'SourceFileHandle');
|
|
454 assert_file_handle($trg_fh, 'TargetFileHandle');
|
|
455
|
|
456 $buffer_size ||= 8192; #Default 8KB
|
|
457 my $buffer;
|
|
458 while(1) {
|
|
459 my $read = sysread($src_fh, $buffer, $buffer_size);
|
|
460 if(! defined $read) {
|
|
461 throw "Error whilst reading from filehandle: $!";
|
|
462 }
|
|
463 if($read == 0) {
|
|
464 last;
|
|
465 }
|
|
466 my $written = syswrite($trg_fh, $buffer);
|
|
467 if(!defined $written) {
|
|
468 throw "Error whilst writing to filehandle: $!";
|
|
469 }
|
|
470 }
|
|
471 return;
|
|
472 }
|
|
473
|
|
474 1;
|