diff variant_effect_predictor/Bio/Seq/LargePrimarySeq.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/Seq/LargePrimarySeq.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,292 @@
+# $Id: LargePrimarySeq.pm,v 1.27 2002/12/01 00:05:21 jason Exp $
+#
+# BioPerl module for Bio::Seq::LargePrimarySeq
+#
+# Cared for by Ewan Birney <birney@ebi.ac.uk>
+#
+# Copyright Ewan Birney
+#
+# You may distribute this module under the same terms as perl itself
+#
+# updated to utilize File::Temp - jason 2000-12-12
+# POD documentation - main docs before the code
+
+=head1 NAME
+
+Bio::Seq::LargePrimarySeq - PrimarySeq object that stores sequence as
+files in the tempdir (as found by File::Temp) or the default method in
+Bio::Root::Root
+
+=head1 SYNOPSIS
+
+  # normal primary seq usage
+
+=head1 DESCRIPTION
+
+This object stores a sequence as a series of files in a temporary
+directory. The aim is to allow someone the ability to store very large
+sequences (eg, E<gt> 100MBases) in a file system without running out of memory
+(eg, on a 64 MB real memory machine!). 
+
+Of course, to actually make use of this functionality, the programs
+which use this object B<must> not call $primary_seq-E<gt>seq otherwise the
+entire sequence will come out into memory and probably paste your
+machine. However, calls $primary_seq-E<gt>subseq(10,100) will cause only
+90 characters to be brought into real memory.
+
+=head1 FEEDBACK
+
+=head2 Mailing Lists
+
+User feedback is an integral part of the evolution of this
+and other Bioperl modules. Send your comments and suggestions preferably
+ to one of the Bioperl mailing lists.
+Your participation is much appreciated.
+
+  bioperl-l@bioperl.org               - General discussion
+  http://bio.perl.org/MailList.html   - About the mailing lists
+
+=head2 Reporting Bugs
+
+Report bugs to the Bioperl bug tracking system to help us keep track
+the bugs and their resolution.  Bug reports can be submitted via email
+or the web:
+
+  bioperl-bugs@bio.perl.org
+  http://bugzilla.bioperl.org/
+
+=head1 AUTHOR - Ewan Birney, Jason Stajich
+
+Email birney@ebi.ac.uk
+Email jason@bioperl.org
+
+=head1 APPENDIX
+
+The rest of the documentation details each of the object
+methods. Internal methods are usually preceded with a _
+
+=cut
+
+
+# Let the code begin...
+
+
+package Bio::Seq::LargePrimarySeq;
+use vars qw($AUTOLOAD @ISA);
+use strict;
+
+use Bio::PrimarySeq;
+use Bio::Root::IO;
+
+@ISA = qw(Bio::PrimarySeq Bio::Root::IO);
+
+sub new {
+    my ($class, %params) = @_;
+    
+    # don't let PrimarySeq set seq until we have 
+    # opened filehandle
+
+    my $seq = $params{'-seq'} || $params{'-SEQ'};
+    if($seq ) {
+	delete $params{'-seq'};
+	delete $params{'-SEQ'};
+    }
+    my $self = $class->SUPER::new(%params);
+    $self->_initialize_io(%params);
+    my $tempdir = $self->tempdir( CLEANUP => 1);
+    my ($tfh,$file) = $self->tempfile( DIR => $tempdir );
+
+    $tfh     && $self->_fh($tfh);
+    $file    && $self->_filename($file);    
+    $self->length(0);
+    $seq && $self->seq($seq); 
+
+    return $self;
+}
+
+
+sub length {
+   my ($obj,$value) = @_;
+   if( defined $value) {
+      $obj->{'length'} = $value;
+    }
+   
+    return (defined $obj->{'length'}) ? $obj->{'length'} : 0;
+}
+
+=head2 seq
+
+ Title   : seq
+ Usage   :
+ Function:
+ Example :
+ Returns : 
+ Args    :
+
+
+=cut
+
+sub seq {
+   my ($self, $data) = @_;   
+   if( defined $data ) {
+       if( $self->length() == 0) {
+	   $self->add_sequence_as_string($data);
+       } else { 
+	   $self->warn("Trying to reset the seq string, cannot do this with a LargePrimarySeq - must allocate a new object");
+       }
+   } 
+   return $self->subseq(1,$self->length);
+}
+
+=head2 subseq
+
+ Title   : subseq
+ Usage   :
+ Function:
+ Example :
+ Returns : 
+ Args    :
+
+
+=cut
+
+sub subseq{
+   my ($self,$start,$end) = @_;
+   my $string;
+   my $fh = $self->_fh();
+   
+   if( ref($start) && $start->isa('Bio::LocationI') ) {
+       my $loc = $start;
+       if( $loc->length == 0 ) { 
+	   $self->warn("Expect location lengths to be > 0");
+	   return '';
+       } elsif( $loc->end < $loc->start ) { 
+	   # what about circular seqs
+	   $self->warn("Expect location start to come before location end");
+       }
+       my $seq = '';
+       if( $loc->isa('Bio::Location::SplitLocationI') ) {
+	   foreach my $subloc ( $loc->sub_Location ) {
+	       if(! seek($fh,$subloc->start() - 1,0)) {
+		   $self->throw("Unable to seek on file $start:$end $!");
+	       }
+	       my $ret = read($fh, $string, $subloc->length());
+	       if( !defined $ret ) {
+		   $self->throw("Unable to read $start:$end $!");
+	       }
+	       if( $subloc->strand < 0 ) { 
+		   $string = Bio::PrimarySeq->new(-seq => $string)->revcom()->seq();
+	       }
+	       $seq .= $string;		   
+	   }
+       } else { 
+	   if(! seek($fh,$loc->start()-1,0)) {
+	       $self->throw("Unable to seek on file ".$loc->start.":".
+			    $loc->end ." $!");
+	   }
+	   my $ret = read($fh, $string, $loc->length());
+	   if( !defined $ret ) {
+	       $self->throw("Unable to read ".$loc->start.":".
+			    $loc->end ." $!");
+	   }
+	   $seq = $string;
+       }
+       if( defined $loc->strand && 
+	   $loc->strand < 0 ) { 
+	   $seq = Bio::PrimarySeq->new(-seq => $seq)->revcom()->seq();
+       }
+       return $seq;
+   }
+   if( $start <= 0 || $end > $self->length ) {
+       $self->throw("Attempting to get a subseq out of range $start:$end vs ".
+		    $self->length);
+   }
+   if( $end < $start ) {
+       $self->throw("Attempting to subseq with end ($end) less than start ($start). To revcom use the revcom function with trunc");
+   }
+   
+   if(! seek($fh,$start-1,0)) {
+       $self->throw("Unable to seek on file $start:$end $!");
+   }
+   my $ret = read($fh, $string, $end-$start+1);
+   if( !defined $ret ) {
+       $self->throw("Unable to read $start:$end $!");
+   }
+   return $string;
+}
+
+=head2 add_sequence_as_string
+
+ Title   : add_sequence_as_string
+ Usage   : $seq->add_sequence_as_string("CATGAT");
+ Function: Appends additional residues to an existing LargePrimarySeq object.  
+           This allows one to build up a large sequence without storing
+           entire object in memory.
+ Returns : Current length of sequence
+ Args    : string to append
+
+=cut
+
+sub add_sequence_as_string{
+   my ($self,$str) = @_;
+   my $len = $self->length + CORE::length($str);
+   my $fh = $self->_fh();
+   if(! seek($fh,0,2)) {
+       $self->throw("Unable to seek end of file: $!");
+   }
+   $self->_print($str);
+   $self->length($len);
+}
+
+
+=head2 _filename
+
+ Title   : _filename
+ Usage   : $obj->_filename($newval)
+ Function: 
+ Example : 
+ Returns : value of _filename
+ Args    : newvalue (optional)
+
+
+=cut
+
+sub _filename{
+   my ($obj,$value) = @_;
+   if( defined $value) {
+      $obj->{'_filename'} = $value;
+    }
+    return $obj->{'_filename'};
+
+}
+=head2 alphabet
+
+ Title   : alphabet
+ Usage   : $obj->alphabet($newval)
+ Function: 
+ Example : 
+ Returns : value of alphabet
+ Args    : newvalue (optional)
+
+
+=cut
+
+sub alphabet{
+   my ($self,$value) = @_;
+   if( defined $value) {
+      $self->SUPER::alphabet($value);
+    }
+    return $self->SUPER::alphabet() || 'dna';
+
+}
+
+sub DESTROY {
+    my $self = shift;
+    my $fh = $self->_fh();
+    close($fh) if( defined $fh );
+    # this should be handled by Tempfile removal, but we'll unlink anyways.
+    unlink $self->_filename() if defined $self->_filename() && -e $self->_filename;
+    $self->SUPER::DESTROY();
+}
+
+1;