Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/EnsEMBL/Utils/IO/FASTASerializer.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/IO/FASTASerializer.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,248 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <dev@ensembl.org>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=cut + +=head1 NAME + +Bio::EnsEMBL::Utils::IO::FASTASerializer + +=head1 SYNOPSIS + + my $serializer = Bio::EnsEMBL::Utils::IO::FASTASerializer->new($filehandle); + $serializer->chunk_factor(1000); + $serializer->line_width(60); + $serializer->print_Seq($slice); + + $serializer = Bio::EnsEMBL::Utils::IO::FASTASerializer->new($filehandle, + sub { + my $slice = shift; + return "Custom header"; + } + ); + +=head1 DESCRIPTION + + Replacement for SeqDumper, making better use of shared code. Outputs FASTA + format with optional custom header and formatting parameters. Set line_width + and chunk_factor to dictate buffer size depending on application. A 60kb + buffer is used by default with a line width of 60 characters. + + Custom headers are set by supplying an anonymous subroutine to new(). Custom + header code must accept a Slice or Bio::PrimarySeqI compliant object as + argument and return a string. + + The custom header method can be overridden later through set_custom_header() + but this is not normally necessary. + +=cut + +package Bio::EnsEMBL::Utils::IO::FASTASerializer; + +use strict; +use warnings; +use Bio::EnsEMBL::Utils::Exception; +use Bio::EnsEMBL::Utils::Scalar qw/assert_ref check_ref/; + +use base qw(Bio::EnsEMBL::Utils::IO::Serializer); + +=head2 new + + Arg [1] : Filehandle (optional) + Arg [2] : CODEREF subroutine for writing custom headers + Arg [3] : [optional] Chunking size (integer) + Arg [4] : [optional] Line width (integer) + Example : $dumper = Bio::EnsEMBL::Utils::IO::FASTASerializer->new($filehandle,$header_function,1000,60); + Description: Constructor + Allows the specification of a custom function for rendering + header lines. + Returntype : Bio::EnsEMBL::Utils::IO::FASTASerializer; + Exceptions : none + Caller : general + +=cut + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + my $filehandle = shift; + my $header_function = shift; + my $chunk_factor = shift; + my $line_width = shift; + + my $self = $class->SUPER::new($filehandle); + + $self->{'header_function'} = $header_function; + $self->{'line_width'} = ($line_width)? $line_width : 60; + $self->{'chunk_factor'} = ($chunk_factor)? $chunk_factor : 1000; + # gives a 60kb buffer by default, increase for higher database and disk efficiency. + + # TODO: Check this error trap works as intended + if ( defined($self->{'header_function'}) ) { + if (ref($self->{'header_function'}) ne "CODE") { + throw("Custom header function must be an anonymous subroutine when instantiating FASTASerializer");} + } + else { + $self->{'header_function'} = sub { + my $slice = shift; + + if(check_ref($slice, 'Bio::EnsEMBL::Slice')) { + my $id = $slice->seq_region_name; + my $seqtype = 'dna'; + my $idtype = $slice->coord_system->name; + my $location = $slice->name; + + return "$id $seqtype:$idtype $location"; + } + else { + # must be a Bio::Seq , or we're doomed + + return $slice->display_id; + } + }; + + } + + return $self; +} + +=head2 print_metadata + + Arg [1] : Bio::EnsEMBL::Slice + Description: Printing header lines into FASTA files. Usually handled + internally to the serializer. + Returntype : None + Caller : print_Seq +=cut + +sub print_metadata { + my $self = shift; + my $slice = shift; + my $fh = $self->{'filehandle'}; + my $function = $self->header_function(); + my $metadata = $function->($slice); + print $fh '>'.$metadata."\n"; +} + +=head2 print_Seq + + Arg [1] : Bio::EnsEMBL::Slice or other Bio::PrimarySeqI compliant object + + Description: Serializes the slice into FASTA format. Buffering is used + While other Bioperl PrimarySeqI implementations can be used, + a custom header function will be required to accommodate it. + + Returntype : None + +=cut + +sub print_Seq { + my $self = shift; + my $slice = shift; + my $fh = $self->{'filehandle'}; + + $self->print_metadata($slice); + my $width = $self->{line_width}; + + # set buffer size + my $chunk_size = $self->{'chunk_factor'} * $width; + + my $start = 1; + my $end = $slice->length(); + + #chunk the sequence to conserve memory, and print + + my $here = $start; + + while($here <= $end) { + my $there = $here + $chunk_size - 1; + $there = $end if($there > $end); + my $seq = $slice->subseq($here, $there); + $seq =~ s/(.{1,$width})/$1\n/g; + print $fh $seq or die "Error writing to file handle"; + $here = $there + 1; + } + + if ($slice->length > 0) {$self->{'achieved_something'} = 1;} + +} + +=head2 line_width + + Arg [1] : Integer e.g. 60 or 80 + Description: Set and get FASTA format line width. Default is 60 + Returntype : Integer + +=cut + +sub line_width { + my $self = shift; + my $line_width = shift; + if ($line_width) { $self->{'line_width'} = $line_width }; + return $self->{'line_width'} +} + +=head2 chunk_factor + Arg [1] : Integer e.g. 1000 + Description: Set and get the multiplier used to dictate buffer size + Chunk factor x line width = buffer size in bases. + Returntype : Integer +=cut + +sub chunk_factor { + my $self = shift; + my $chunk_factor = shift; + if ($chunk_factor) { $self->{'chunk_factor'} = $chunk_factor}; + return $self->{'chunk_factor'} +} + +=head2 set_custom_header + + Arg [1] : CODE reference + Description: Set the custom header function. Normally this is done at + construction time, but can be overridden here. + Example : $serializer->set_custom_header( sub { return 'New header'}); + Returntype : + +=cut + +sub set_custom_header { + my ($self, $new_header_function) = @_; + $self->header_function($new_header_function); + return; +} + +=head2 header_function + + Arg [1] : CODE reference (optional) + Description: Getter/setter for the custom header code + Example : $serializer->header_function( sub { return 'New header'}); + Returntype : CODE + +=cut + +sub header_function { + my ($self, $header_function) = @_; + if($header_function) { + assert_ref($header_function, 'CODE', 'header_function'); + $self->{header_function} = $header_function; + } + return $self->{header_function}; +} + +1;