Mercurial > repos > willmclaren > ensembl_vep
diff variant_effect_predictor/Bio/EnsEMBL/Utils/IO.pm @ 0:21066c0abaf5 draft
Uploaded
author | willmclaren |
---|---|
date | Fri, 03 Aug 2012 10:04:48 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/IO.pm Fri Aug 03 10:04:48 2012 -0400 @@ -0,0 +1,474 @@ +package Bio::EnsEMBL::Utils::IO; + +=pod + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <dev@ensembl.org>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=cut + +=pod + +=head1 NAME + +Bio::EnsEMBL::Utils::IO + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Utils::IO qw/slurp work_with_file slurp_to_array fh_to_array/; + #or + # use Bio::EnsEMBL::Utils::IO qw/:slurp/; #brings in any method starting with slurp + # use Bio::EnsEMBL::Utils::IO qw/:array/; #brings in any method which ends with _array + # use Bio::EnsEMBL::Utils::IO qw/:gz/; #brings all methods which start with gz_ + # use Bio::EnsEMBL::Utils::IO qw/:all/; #brings all methods in + + #As a scalar + my $file_contents = slurp('/my/file/location.txt'); + print length($file_contents); + + #As a ref + my $file_contents_ref = slurp('/my/file/location.txt', 1); + print length($$file_contents_ref); + + #Sending it to an array + my $array = slurp_to_array('/my/location'); + work_with_file('/my/location', 'r', sub { + $array = process_to_array($_[0], sub { + #Gives us input line by line + return "INPUT: $_"; + }); + }); + + #Simplified vesion but without the post processing + $array = fh_to_array($fh); + + #Sending this back out to another file + work_with_file('/my/file/newlocation.txt', 'w', sub { + my ($fh) = @_; + print $fh $$file_contents_ref; + return; + }); + + #Gzipping the data to another file + gz_work_with_file('/my/file.gz', 'w', sub { + my ($fh) = @_; + print $fh $$file_contents_ref; + return; + }); + + #Working with a set of lines manually + work_with_file('/my/file', 'r', sub { + my ($fh) = @_; + iterate_lines($fh, sub { + my ($line) = @_; + print $line; #Send the line in the file back out + return; + }); + return; + }); + + #Doing the same in one go + iterate_file('/my/file', sub { + my ($line) = @_; + print $line; #Send the line in the file back out + return; + }); + + #Move all data from one file handle to another. Bit like a copy + move_data($src_fh, $trg_fh); + +=head1 DESCRIPTION + +A collection of subroutines aimed to helping IO based operations + +=head1 METHODS + +See subroutines. + +=head1 MAINTAINER + +$Author: ady $ + +=head1 VERSION + +$Revision: 1.10 $ + +=cut + +use strict; +use warnings; + +use base qw(Exporter); + +our $GZIP_OK = 0; +our @EXPORT_OK = qw/slurp slurp_to_array fh_to_array process_to_array work_with_file gz_slurp gz_slurp_to_array gz_work_with_file iterate_file iterate_lines move_data/; +our %EXPORT_TAGS = ( + all => [@EXPORT_OK], + slurp => [qw/slurp slurp_to_array gz_slurp gz_slurp_to_array/], + array => [qw/fh_to_array process_to_array slurp_to_array gz_slurp_to_array/], + gz => [qw/gz_slurp gz_slurp_to_array gz_work_with_file/], + iterate => [qw/iterate_file iterate_lines/], +); +use Bio::EnsEMBL::Utils::Exception qw(throw); +use Bio::EnsEMBL::Utils::Scalar qw(:assert); +use IO::File; +eval { + require IO::Compress::Gzip; + require IO::Uncompress::Gunzip; + $GZIP_OK = 1; +}; + +=head2 slurp() + + Arg [1] : string $file + Arg [2] : boolean; $want_ref + Arg [3] : boolean; $binary + Indicates if we want to return a scalar reference + Description : Forces the contents of a file into a scalar. This is the + fastest way to get a file into memory in Perl. You can also + get a scalar reference back to avoid copying the file contents + in Scalar references. If the input file is binary then specify + with the binary flag + Returntype : Scalar or reference of the file contents depending on arg 2 + Example : my $contents = slurp('/tmp/file.txt'); + Exceptions : If the file did not exist or was not readable + Status : Stable + +=cut + +sub slurp { + my ($file, $want_ref, $binary) = @_; + my $contents = q{}; + work_with_file($file, 'r', sub { + my ($fh) = @_; + binmode($fh) if $binary; + my $size_left = -s $file; + while( $size_left > 0 ) { + my $read_cnt = sysread($fh, $contents, $size_left, length($contents)); + unless( $read_cnt ) { + throw "read error in file $file: $!" ; + last; + } + $size_left -= $read_cnt ; + } + return; + }); + return ($want_ref) ? \$contents : $contents; +} + +=head2 gz_slurp() + + Arg [1] : string $file + Arg [2] : boolean; $want_ref Indicates if we want to return a scalar reference + Arg [3] : boolean; $binary + Arg [4] : HashRef arguments to pass into IO compression layers + Description : Forces the contents of a file into a scalar. This is the + fastest way to get a file into memory in Perl. You can also + get a scalar reference back to avoid copying the file contents + in Scalar references. If the input file is binary then specify + with the binary flag + Returntype : Scalar or reference of the file contents depending on arg 2 + Example : my $contents = slurp('/tmp/file.txt.gz'); + Exceptions : If the file did not exist or was not readable + Status : Stable + +=cut + +sub gz_slurp { + my ($file, $want_ref, $binary, $args) = @_; + my $contents; + gz_work_with_file($file, 'r', sub { + my ($fh) = @_; + local $/ = undef; + binmode($fh) if $binary; + $contents = <$fh>; + return; + }, $args); + return ($want_ref) ? \$contents : $contents; +} + +=head2 slurp_to_array() + + Arg [1] : string $file + Arg [2] : boolean $chomp + Description : Sends the contents of the given file into an ArrayRef + Returntype : ArrayRef + Example : my $contents_array = slurp_to_array('/tmp/file.txt'); + Exceptions : If the file did not exist or was not readable + Status : Stable + +=cut + +sub slurp_to_array { + my ($file, $chomp) = @_; + my $contents; + work_with_file($file, 'r', sub { + my ($fh) = @_; + $contents = fh_to_array($fh, $chomp); + return; + }); + return $contents; +} + +=head2 gz_slurp_to_array() + + Arg [1] : string $file + Arg [2] : boolean $chomp + Arg [3] : HashRef arguments to pass into IO compression layers + Description : Sends the contents of the given gzipped file into an ArrayRef + Returntype : ArrayRef + Example : my $contents_array = slurp_to_array('/tmp/file.txt.gz'); + Exceptions : If the file did not exist or was not readable + Status : Stable + +=cut + +sub gz_slurp_to_array { + my ($file, $chomp, $args) = @_; + my $contents; + gz_work_with_file($file, 'r', sub { + my ($fh) = @_; + $contents = fh_to_array($fh, $chomp); + return; + }, $args); + return $contents; +} + +=head2 fh_to_array() + + Arg [1] : Glob/IO::Handle $fh + Arg [2] : boolean $chomp + Description : Sends the contents of the given filehandle into an ArrayRef. + Will perform chomp on each line if specified. If you require + any more advanced line based processing then see + L<process_to_array>. + Returntype : ArrayRef + Example : my $contents_array = fh_to_array($fh); + Exceptions : None + Status : Stable + +=cut + +sub fh_to_array { + my ($fh, $chomp) = @_; + if($chomp) { + return process_to_array($fh, sub { + my ($line) = @_; + chomp($line); + return $line; + }); + } + my @contents = <$fh>; + return \@contents; +} + +=head2 process_to_array + + Arg [1] : Glob/IO::Handle $fh + Arg [2] : CodeRef $callback + Description : Sends the contents of the given file handle into an ArrayRef + via the processing callback. Assumes line based input. + Returntype : ArrayRef + Example : my $array = process_to_array($fh, sub { return "INPUT: $_"; }); + Exceptions : If the fh did not exist or if a callback was not given. + Status : Stable + +=cut + +sub process_to_array { + my ($fh, $callback) = @_; + assert_file_handle($fh, 'FileHandle'); + assert_ref($callback, 'CODE', 'callback'); + my @contents; + iterate_lines($fh, sub { + my ($line) = @_; + push(@contents, $callback->($line)); + return; + }); + return \@contents; +} + +=head2 iterate_lines + + Arg [1] : Glob/IO::Handle $fh + Arg [2] : CodeRef $callback + Description : Iterates through each line from the given file handle and + hands them to the callback one by one + Returntype : None + Example : iterate_lines($fh, sub { print "INPUT: $_"; }); + Exceptions : If the fh did not exist or if a callback was not given. + Status : Stable + +=cut + +sub iterate_lines { + my ($fh, $callback) = @_; + assert_file_handle($fh, 'FileHandle'); + assert_ref($callback, 'CODE', 'callback'); + while( my $line = <$fh> ) { + $callback->($line); + } + return; +} + +=head2 iterate_file + + Arg [1] : string $file + Arg [3] : CodeRef the callback which is used to iterate the lines in + the file + Description : Iterates through each line from the given file and + hands them to the callback one by one + Returntype : None + Example : iterate_file('/my/file', sub { print "INPUT: $_"; }); + Exceptions : If the file did not exist or if a callback was not given. + Status : Stable + +=cut + + +sub iterate_file { + my ($file, $callback) = @_; + work_with_file($file, 'r', sub { + my ($fh) = @_; + iterate_lines($fh, $callback); + return; + }); + return; +} + + + +=head2 work_with_file() + + Arg [1] : string $file + Arg [2] : string; $mode + Supports all modes specified by the C<open()> function as well as those + supported by IO::File + Arg [3] : CodeRef the callback which is given the open file handle as + its only argument + Description : Performs the nitty gritty of checking if a file handle is open + and closing the resulting filehandle down. + Returntype : None + Example : work_with_file('/tmp/out.txt', 'w', sub { + my ($fh) = @_; + print $fh 'hello'; + return; + }); + Exceptions : If we could not work with the file due to permissions + Status : Stable + +=cut + +sub work_with_file { + my ($file, $mode, $callback) = @_; + throw "We need a file name to open" if ! $file; + throw "We need a mode to open the requested file with" if ! $mode; + assert_ref($callback, 'CODE', 'callback'); + my $fh = IO::File->new($file, $mode) or + throw "Cannot open '${file}' in mode '${mode}': $!"; + $callback->($fh); + close($fh) or throw "Cannot close FH from ${file}: $!"; + return; +} + +=head2 gz_work_with_file() + + Arg [1] : string $file + Arg [2] : string; $mode + Supports modes like C<r>, C<w>, C<\>> and C<\<> + Arg [3] : CodeRef the callback which is given the open file handle as + its only argument + Arg [4] : HashRef used to pass options into the IO + compression/uncompression modules + Description : Performs the nitty gritty of checking if a file handle is open + and closing the resulting filehandle down. + Returntype : None + Example : work_with_file('/tmp/out.txt.gz', 'w', sub { + my ($fh) = @_; + print $fh 'hello'; + return; + }); + Exceptions : If we could not work with the file due to permissions + Status : Stable + +=cut + +sub gz_work_with_file { + my ($file, $mode, $callback, $args) = @_; + throw "IO::Compress was not available"if ! $GZIP_OK; + throw "We need a file name to open" if ! $file; + throw "We need a mode to open the requested file with" if ! $mode; + assert_ref($callback, 'CODE', 'callback'); + $args ||= {}; + + my $fh; + { + no warnings qw/once/; + if($mode =~ '>$' || $mode eq 'w') { + $args->{Append} = 1 if $mode =~ />>$/; + $fh = IO::Compress::Gzip->new($file, %$args) or throw "Cannot open '$file' for writing: $IO::Compress::Gzip::GzipError"; + } + elsif($mode eq '<' || $mode eq 'r') { + $fh = IO::Uncompress::Gunzip->new($file, %$args) or throw "Cannot open '$file' for writing: $IO::Uncompress::Gunzip::GunzipError"; + } + else { + throw "Could not decipher a mode from '$mode'"; + } + }; + $callback->($fh); + close($fh) or throw "Cannot close FH from ${file}: $!"; + return; +} + +=head2 move_data + + Arg [1] : FileHandle $src_fh + Arg [2] : FileHandle $trg_fh + Arg [3] : int $buffer. Defaults to 8KB + Description : Moves data from the given source filehandle to the target one + using a 8KB buffer or user specified buffer + Returntype : None + Example : move_data($src_fh, $trg_fh, 16*1024); # copy in 16KB chunks + Exceptions : If inputs were not as expected + +=cut + +sub move_data { + my ($src_fh, $trg_fh, $buffer_size) = @_; + assert_file_handle($src_fh, 'SourceFileHandle'); + assert_file_handle($trg_fh, 'TargetFileHandle'); + + $buffer_size ||= 8192; #Default 8KB + my $buffer; + while(1) { + my $read = sysread($src_fh, $buffer, $buffer_size); + if(! defined $read) { + throw "Error whilst reading from filehandle: $!"; + } + if($read == 0) { + last; + } + my $written = syswrite($trg_fh, $buffer); + if(!defined $written) { + throw "Error whilst writing to filehandle: $!"; + } + } + return; +} + +1;