diff variant_effect_predictor/Bio/Index/AbstractSeq.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/Index/AbstractSeq.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,313 @@
+# $Id: AbstractSeq.pm,v 1.16 2002/10/22 07:38:33 lapp Exp $
+#
+# BioPerl module for Bio::Index::AbstractSeq
+#
+# Cared for by Ewan Birney <birney@ebi.ac.uk>
+#
+# Copyright Ewan Birney
+#
+# You may distribute this module under the same terms as perl itself
+
+# POD documentation - main docs before the code
+
+=head1 NAME
+
+Bio::Index::AbstractSeq - Base class for indexes of sequence files
+
+=head1 SYNOPSIS
+
+  # Make a new sequence file indexing package
+
+  package MyShinyNewIndexer;
+  use Bio::Index::AbstractSeq;
+
+  @ISA = ('Bio::Index::AbstractSeq');
+
+  # Now provide the necessary methods...
+
+=head1 DESCRIPTION
+
+Provides a common base class for multiple
+sequence files built using the
+Bio::Index::Abstract system, and provides a
+Bio::DB::SeqI interface.
+
+=head1 FEEDBACK
+
+=head2 Mailing Lists
+
+User feedback is an integral part of the evolution of this and other
+Bioperl modules. Send your comments and suggestions, preferably to one
+of the Bioperl mailing lists. Your participation is much appreciated.
+
+  bioperl-l@bioperl.org             - General discussion
+  http://bioperl.org/MailList.shtml - About the mailing lists
+
+=head2 Reporting Bugs
+
+Report bugs to the Bioperl bug tracking system to help us keep track
+of the bugs and their resolution. Bug reports can be submitted via
+email or the web:
+
+  bioperl-bugs@bio.perl.org
+  http://bugzilla.bioperl.org/
+
+=head1 AUTHOR - Ewan Birney
+
+Email birney@ebi.ac.uk
+
+Describe contact details here
+
+=head1 APPENDIX
+
+The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
+
+=head1 SEE ALSO
+
+Bio::Index::Abstract - Module which
+Bio::Index::AbstractSeq inherits from, which
+provides dbm indexing for flat files (which are
+not necessarily sequence files).
+
+=cut
+
+# Let's begin the code ...
+
+
+package Bio::Index::AbstractSeq;
+use vars qw(@ISA);
+use strict;
+
+use Bio::SeqIO::MultiFile;
+use Bio::Index::Abstract;
+use Bio::DB::SeqI;
+
+
+@ISA = qw(Bio::Index::Abstract Bio::DB::SeqI);
+
+sub new {
+    my $class     = shift;
+    my @ctor_args = @_;
+
+    # Construction is delegated to the parent class; the caller's
+    # arguments are handed over unchanged.
+    my $self = $class->SUPER::new(@ctor_args);
+
+    # Cache of Bio::SeqIO objects, indexed by file number and filled
+    # on demand by _get_SeqIO_object().
+    $self->{'_seqio_cache'} = [];
+
+    return $self;
+}
+
+=head2 _file_format
+
+ Title   : _file_format
+ Usage   : $self->_file_format
+ Function: Derived classes should override this
+           method (it throws an exception here)
+           to give the file format of the files used
+ Example :
+ Returns : 
+ Args    :
+
+
+=cut
+
+sub _file_format {
+    my $self = shift;
+
+    # Abstract method: this base implementation only reports which
+    # concrete class failed to supply its own _file_format().
+    my $pkg = ref $self;
+    $self->throw("Class '$pkg' must provide a file format method correctly");
+}
+
+=head2 fetch
+
+  Title   : fetch
+  Usage   : $index->fetch( $id )
+  Function: Returns a Bio::Seq object from the index
+  Example : $seq = $index->fetch( 'dJ67B12' )
+  Returns : Bio::Seq object
+  Args    : ID
+
+=cut
+
+sub fetch {
+    my( $self, $id ) = @_;
+
+    # Args    : $id - key to look up in the index
+    # Returns : the object produced by next_seq() for the indexed
+    #           record, or undef when $id has no entry in the index
+    # Throws  : if the file handle cannot be positioned on the record
+    my $db = $self->db();
+    my $seq;
+
+    if (my $rec = $db->{ $id }) {
+        my ($file, $begin) = $self->unpack_record( $rec );
+
+        # Get the (possibly cached) SeqIO object
+        my $seqio = $self->_get_SeqIO_object( $file );
+        my $fh = $seqio->_fh();
+
+        # workaround for Win DB_File bug; the offset is never pushed
+        # below zero because a negative absolute seek always fails
+        $begin-- if( $^O =~ /mswin/i && $begin > 0 );
+
+        # move to start of record; the handle is cached and shared
+        # between calls, so a failed seek would otherwise make
+        # next_seq() silently read from wherever the previous fetch
+        # stopped and return the wrong record
+        seek($fh, $begin, 0)
+            or $self->throw("Unable to seek to offset $begin while fetching '$id': $!");
+
+        $seq = $seqio->next_seq();
+    }
+
+    # we essentially assume that the primary_id for the database
+    # is the display_id
+    if ( defined $seq && ref($seq) && $seq->isa('Bio::PrimarySeqI') ) {
+        $seq->primary_id( $seq->display_id() );
+    }
+
+    return $seq;
+}
+
+=head2 _get_SeqIO_object
+
+  Title   : _get_SeqIO_object
+  Usage   : $index->_get_SeqIO_object( $file )
+  Function: Returns a Bio::SeqIO object for the file
+  Example : $seqio = $index->_get_SeqIO_object( 0 )
+  Returns : Bio::SeqIO object
+  Args    : File number (an integer)
+
+=cut
+
+sub _get_SeqIO_object {
+    my( $self, $i ) = @_;
+
+    # Reuse the SeqIO object already built for this file number, if any.
+    my $cached = $self->{'_seqio_cache'}[$i];
+    return $cached if $cached;
+
+    # Nothing cached yet: wrap the file handle for file number $i in
+    # a new SeqIO object that uses the format named by _file_format().
+    my $fh    = $self->_file_handle($i);
+    my $seqio = Bio::SeqIO->new( -Format => $self->_file_format,
+                                 -fh     => $fh );
+
+    # Remember it so later calls for the same file share one object.
+    $self->{'_seqio_cache'}[$i] = $seqio;
+    return $self->{'_seqio_cache'}[$i];
+}
+
+=head2 get_Seq_by_id
+
+ Title   : get_Seq_by_id
+ Usage   : $seq = $db->get_Seq_by_id($id)
+ Function: retrieves a sequence object, identically to
+           ->fetch, but here behaving as a Bio::DB::SeqI
+ Returns : new Bio::Seq object
+ Args    : string represents the id
+
+
+=cut
+
+sub get_Seq_by_id {
+    my $self = shift;
+    my $id   = shift;
+
+    # Plain alias: the lookup itself is done by fetch().
+    return $self->fetch($id);
+}
+
+=head2 get_Seq_by_acc
+
+ Title   : get_Seq_by_acc
+ Usage   : $seq = $db->get_Seq_by_acc($acc)
+ Function: retrieves a sequence object, identically to
+           ->fetch, but here behaving as a Bio::DB::SeqI
+ Returns : new Bio::Seq object
+ Args    : string represents the accession number
+
+
+=cut
+
+sub get_Seq_by_acc {
+    my $self = shift;
+    my $id   = shift;
+
+    # The accession is used as an ordinary index key, so this is
+    # handled by fetch() exactly like an id lookup.
+    return $self->fetch($id);
+}
+
+=head2 get_PrimarySeq_stream
+
+ Title   : get_PrimarySeq_stream
+ Usage   : $stream = $db->get_PrimarySeq_stream()
+ Function: Makes a Bio::DB::SeqStreamI compliant object
+           which provides a single method, next_primary_seq
+ Returns : Bio::DB::SeqStreamI
+ Args    : none
+
+
+=cut
+
+sub get_PrimarySeq_stream {
+    my $self = shift;
+
+    # Number of files recorded in the index; a false count is
+    # treated as zero.
+    my $num = $self->_file_count() || 0;
+
+    # Each file has an entry under the key "__FILE_<n>"; only the
+    # first field of the unpacked record is needed here.
+    my @file;
+    for my $i (0 .. $num - 1) {
+        my ($file) = $self->unpack_record( $self->db->{"__FILE_$i"} );
+        push @file, $file;
+    }
+
+    # One stream object spanning every collected file, read in the
+    # format named by _file_format().
+    my $out = Bio::SeqIO::MultiFile->new( '-format' => $self->_file_format,
+                                          -files    => \@file );
+    return $out;
+}
+
+=head2 get_all_primary_ids
+
+ Title   : get_all_primary_ids
+ Usage   : @ids = $seqdb->get_all_primary_ids()
+ Function: gives an array of all the primary_ids of the 
+           sequence objects in the database. These
+           may be ids (display style) or accession numbers
+           or something else completely different - they
+           *are not* meaningful outside of this database
+           implementation.
+ Example :
+ Returns : an array of strings
+ Args    : none
+
+
+=cut
+
+sub get_all_primary_ids {
+    my $self = shift;
+    my $db   = $self->db;
+
+    # The same record may be indexed under more than one key (for
+    # example both an accession number and a name). Relying on the
+    # semantics of accession numbers is not an option, because a
+    # database may have been indexed without any. Entries are
+    # therefore made unique by the location they point to: one id is
+    # kept per "file:offset" pair, and when several ids share a
+    # location the one visited last wins.
+    my %id_at;
+    while ( my ($id, $rec) = each %$db ) {
+
+        # keys starting with "__" hold internal info, not sequence ids
+        next if $id =~ /^__/;
+
+        my ($file, $begin) = $self->unpack_record( $rec );
+        $id_at{"$file:$begin"} = $id;
+    }
+
+    return values %id_at;
+}
+
+
+=head2 get_Seq_by_primary_id
+
+ Title   : get_Seq_by_primary_id
+ Usage   : $seq = $db->get_Seq_by_primary_id($primary_id_string);
+ Function: Gets a Bio::Seq object by the primary id. The primary
+           id in these cases has to come from $db->get_all_primary_ids.
+           There is no other way to get (or guess) the primary_ids
+           in a database.
+
+           The other possibility is to get Bio::PrimarySeqI objects
+           via the get_PrimarySeq_stream and the primary_id field
+           on these objects are specified as the ids to use here.
+ Returns : A Bio::Seq object
+ Args    : primary id (as a string)
+ Throws  : "acc does not exist" exception (NOTE: fetch() as implemented
+           in this class returns undef for an id that is not in the index)
+
+
+=cut
+
+sub get_Seq_by_primary_id {
+    my $self = shift;
+    my $id   = shift;
+
+    # Primary ids are ordinary index keys here, so fetch() does the work.
+    return $self->fetch($id);
+}
+
+1;