Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/Index/AbstractSeq.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Index/AbstractSeq.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,313 @@ +# $Id: AbstractSeq.pm,v 1.16 2002/10/22 07:38:33 lapp Exp $ +# +# BioPerl module for Bio::Index::AbstractSeq +# +# Cared for by Ewan Birney <birney@ebi.ac.uk> +# +# Copyright Ewan Birney +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Index::AbstractSeq - Base class for sequence file indexers + +=head1 SYNOPSIS + + # Make a new sequence file indexing package + + package MyShinyNewIndexer; + use Bio::Index::AbstractSeq; + + @ISA = ('Bio::Index::AbstractSeq'); + + # Now provide the necessary methods... + +=head1 DESCRIPTION + +Provides a common base class for multiple +sequence files built using the +Bio::Index::Abstract system, and provides a +Bio::DB::SeqI interface. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track of + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Ewan Birney + +Email birney@ebi.ac.uk + +Describe contact details here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=head1 SEE ALSO + +Bio::Index::Abstract - Module which +Bio::Index::AbstractSeq inherits from, which +provides dbm indexing for flat files (which are +not necessarily sequence files). + +=cut + +# Let's begin the code ... 
package Bio::Index::AbstractSeq;
use vars qw(@ISA);
use strict;

use Bio::SeqIO::MultiFile;
use Bio::Index::Abstract;
use Bio::DB::SeqI;


@ISA = qw(Bio::Index::Abstract Bio::DB::SeqI);

=head2 new

 Title   : new
 Usage   : $index = MyShinyNewIndexer->new( @args )
 Function: Builds the object through the parent class constructor and
           sets up an empty cache of Bio::SeqIO objects (one slot per
           indexed file, filled lazily by _get_SeqIO_object)
 Returns : the new index object
 Args    : passed through unchanged to the parent class constructor

=cut

sub new {
    my ($class, @args) = @_;
    my $self = $class->SUPER::new(@args);

    # Per-file SeqIO objects, indexed by file number.
    $self->{'_seqio_cache'} = [];
    return $self;
}

=head2 _file_format

 Title   : _file_format
 Usage   : $self->_file_format
 Function: Derived classes should override this
           method (it throws an exception here)
           to give the file format of the files used
 Example :
 Returns :
 Args    :


=cut

sub _file_format {
    my ($self) = @_;

    my $pkg = ref($self);
    $self->throw("Class '$pkg' must provide a file format method correctly");
}

=head2 fetch

 Title   : fetch
 Usage   : $index->fetch( $id )
 Function: Returns a Bio::Seq object from the index
 Example : $seq = $index->fetch( 'dJ67B12' )
 Returns : Bio::Seq object, or undef if the id is not in the index
 Args    : ID
 Throws  : an exception if the file handle cannot be positioned at
           the start of the record

=cut

sub fetch {
    my( $self, $id ) = @_;
    my $db = $self->db();
    my $seq;

    # An undefined id cannot be looked up; skipping the hash access avoids
    # an "uninitialized value" warning when warnings are enabled.
    my $rec = defined $id ? $db->{ $id } : undef;

    if ($rec) {
        my ($file, $begin) = $self->unpack_record( $rec );

        # Get the (possibly cached) SeqIO object
        my $seqio = $self->_get_SeqIO_object( $file );
        my $fh    = $seqio->_fh();

        # Workaround for Win DB_File bug.  Never step below offset 0:
        # a negative position makes seek() fail and would leave the
        # cached handle wherever the previous fetch stopped.
        $begin-- if( $begin > 0 && $^O =~ /mswin/i );

        # move to start of record; reading on after a failed seek would
        # silently return the wrong sequence, so fail loudly instead
        seek($fh, $begin, 0)
            or $self->throw("Could not seek to offset $begin in indexed file number $file: $!");

        $seq = $seqio->next_seq();
    }

    # we essentially assume that the primary_id for the database
    # is the display_id
    if( defined $seq && ref($seq) && $seq->isa('Bio::PrimarySeqI') ) {
        $seq->primary_id( $seq->display_id() );
    }

    return $seq;
}

=head2 _get_SeqIO_object

 Title   : _get_SeqIO_object
 Usage   : $index->_get_SeqIO_object( $file )
 Function: Returns a Bio::SeqIO object for the file.  The object is
           created on first use and cached in the index object.
 Example : $seq = $index->_get_SeqIO_object( 0 )
 Returns : Bio::SeqIO object
 Args    : File number (an integer)

=cut

sub _get_SeqIO_object {
    my( $self, $i ) = @_;

    unless ($self->{'_seqio_cache'}[$i]) {
        my $fh = $self->_file_handle($i);
        # make a new SeqIO object
        my $seqio = Bio::SeqIO->new( -Format => $self->_file_format,
                                     -fh     => $fh);
        $self->{'_seqio_cache'}[$i] = $seqio;
    }
    return $self->{'_seqio_cache'}[$i];
}

=head2 get_Seq_by_id

 Title   : get_Seq_by_id
 Usage   : $seq = $db->get_Seq_by_id($id)
 Function: retrieves a sequence object, identically to
           ->fetch, but here behaving as a Bio::DB::SeqI
 Returns : new Bio::Seq object
 Args    : string represents the id


=cut

sub get_Seq_by_id {
    my ($self,$id) = @_;

    return $self->fetch($id);
}

=head2 get_Seq_by_acc

 Title   : get_Seq_by_acc
 Usage   : $seq = $db->get_Seq_by_acc($acc)
 Function: retrieves a sequence object, identically to
           ->fetch, but here behaving as a Bio::DB::SeqI
 Returns : new Bio::Seq object
 Args    : string represents the accession number


=cut

sub get_Seq_by_acc {
    my ($self,$id) = @_;

    return $self->fetch($id);
}

=head2 get_PrimarySeq_stream

 Title   : get_PrimarySeq_stream
 Usage   : $stream = get_PrimarySeq_stream
 Function: Makes a Bio::DB::SeqStreamI compliant object
           which provides a single method, next_primary_seq
 Returns : Bio::DB::SeqStreamI
 Args    : none


=cut

sub get_PrimarySeq_stream {
    my $self = shift;
    my $num  = $self->_file_count() || 0;
    my @file;

    for my $i (0 .. $num - 1) {
        # Only the first field of the "__FILE_$i" record (the file name)
        # is needed here.
        my ($file) = $self->unpack_record( $self->db->{"__FILE_$i"} );
        push(@file, $file);
    }

    my $out = Bio::SeqIO::MultiFile->new( '-format' => $self->_file_format,
                                          -files    => \@file );
    return $out;
}

=head2 get_all_primary_ids

 Title   : get_all_primary_ids
 Usage   : @ids = $seqdb->get_all_primary_ids()
 Function: gives an array of all the primary_ids of the
           sequence objects in the database. These
           maybe ids (display style) or accession numbers
           or something else completely different - they
           *are not* meaningful outside of this database
           implementation.
 Example :
 Returns : an array of strings
 Args    : none


=cut

sub get_all_primary_ids {
    my ($self) = @_;
    my $db = $self->db;

    # the problem is here that we have indexed things both on
    # accession number and name.

    # We could take two options
    # here - loop over the database, returning only one copy of each
    # id that points to the same byte position, or we rely on semantics
    # of accession numbers.

    # someone is going to index a database with no accession numbers.
    # doh!. We have to uniquify the index...

    # Keyed on "file:offset" so that every record contributes exactly one
    # id, whichever of its keys happens to be seen last.
    my( %bytepos );
    while (my($id, $rec) = each %$db) {
        # keys starting with "__" hold internal info
        next if( $id =~ /^__/ );

        my ($file, $begin) = $self->unpack_record( $rec );

        $bytepos{"$file:$begin"} = $id;
    }

    return values %bytepos;
}


=head2 get_Seq_by_primary_id

 Title   : get_Seq_by_primary_id
 Usage   : $seq = $db->get_Seq_by_primary_id($primary_id_string);
 Function: Gets a Bio::Seq object by the primary id. The primary
           id in these cases has to come from $db->get_all_primary_ids.
           There is no other way to get (or guess) the primary_ids
           in a database.

           The other possibility is to get Bio::PrimarySeqI objects
           via the get_PrimarySeq_stream and the primary_id field
           on these objects are specified as the ids to use here.
 Returns : A Bio::Seq object, or undef if the id is not in the index
           (this is a plain call to ->fetch)
 Args    : primary id (as a string)


=cut

sub get_Seq_by_primary_id {
    my ($self,$id) = @_;
    return $self->fetch($id);
}

1;