Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/SeqIO/largefasta.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqIO/largefasta.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,170 @@ +# $Id: largefasta.pm,v 1.18 2002/12/27 19:42:32 birney Exp $ +# BioPerl module for Bio::SeqIO::largefasta +# +# Cared for by Jason Stajich +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself +# _history +# +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqIO::largefasta - method i/o on very large fasta sequence files + +=head1 SYNOPSIS + +Do not use this module directly. Use it via the Bio::SeqIO class. + +=head1 DESCRIPTION + +This object can transform Bio::Seq objects to and from fasta flat +file databases. + +This module handles very large sequence files by using the +Bio::Seq::LargePrimarySeq module to store all the sequence data in +a file. This can be a problem if you have limited disk space on your +computer because this will effectively cause 2 copies of the sequence +file to reside on disk for the life of the +Bio::Seq::LargePrimarySeq object. The default location for this is +specified by the L<File::Spec>-E<gt>tmpdir routine which is usually /tmp +on UNIX. If a sequence file is larger than the swap space (capacity +of the /tmp dir) this could cause problems for the machine. It is +possible to set the directory where the temporary file is located by +adding the following line to your code BEFORE calling next_seq. See +L<Bio::Seq::LargePrimarySeq> for more information. + + $Bio::Seq::LargePrimarySeq::DEFAULT_TEMP_DIR = 'newdir'; + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHORS - Jason Stajich + +Email: jason@bioperl.org + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::SeqIO::largefasta; +use vars qw(@ISA $FASTALINELEN); +use strict; +# Object preamble - inherits from Bio::Root::Object + +use Bio::SeqIO; +use Bio::Seq::SeqFactory; + +$FASTALINELEN = 60; +@ISA = qw(Bio::SeqIO); + +sub _initialize { + my($self,@args) = @_; + $self->SUPER::_initialize(@args); + if( ! defined $self->sequence_factory ) { + $self->sequence_factory(new Bio::Seq::SeqFactory + (-verbose => $self->verbose(), + -type => 'Bio::Seq::LargePrimarySeq')); + } +} + +=head2 next_seq + + Title : next_seq + Usage : $seq = $stream->next_seq() + Function: returns the next sequence in the stream + Returns : Bio::Seq object + Args : NONE + +=cut + +sub next_seq { + my ($self) = @_; +# local $/ = "\n"; + my $largeseq = $self->sequence_factory->create(); + my ($id,$fulldesc,$entry); + my $count = 0; + my $seen = 0; + while( defined ($entry = $self->_readline) ) { + if( $seen == 1 && $entry =~ /^\s*>/ ) { + $self->_pushback($entry); + return $largeseq; + } +# if ( ($entry eq '>') || eof($self->_fh) ) { $seen = 1; next; } + if ( ($entry eq '>') ) { $seen = 1; next; } + elsif( $entry =~ /\s*>(.+?)$/ ) { + $seen = 1; + ($id,$fulldesc) = ($1 =~ /^\s*(\S+)\s*(.*)$/) + or $self->warn("Can't parse fasta header"); + $largeseq->display_id($id); + $largeseq->primary_id($id); + $largeseq->desc($fulldesc); + } else { + $entry =~ s/\s+//g; + $largeseq->add_sequence_as_string($entry); + } + (++$count % 1000 == 0 && $self->verbose() > 0) && print "line $count\n"; + } + if( ! $seen ) { return undef; } + return $largeseq; +} + +=head2 write_seq + + Title : write_seq + Usage : $stream->write_seq(@seq) + Function: writes the $seq object into the stream + Returns : 1 for success and 0 for error + Args : Bio::Seq object + + +=cut + +sub write_seq { + my ($self,@seq) = @_; + foreach my $seq (@seq) { + my $top = $seq->id(); + if ($seq->can('desc') and my $desc = $seq->desc()) { + $desc =~ s/\n//g; + $top .= " $desc"; + } + $self->_print (">",$top,"\n"); + my $end = $seq->length(); + my $start = 1; + while( $start < $end ) { + my $stop = $start + $FASTALINELEN - 1; + $stop = $end if( $stop > $end ); + $self->_print($seq->subseq($start,$stop), "\n"); + $start += $FASTALINELEN; + } + } + + $self->flush if $self->_flush_on_write && defined $self->_fh; + return 1; +} + +1;