Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/DB/EMBL.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/EMBL.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,209 @@ +# +# $Id: EMBL.pm,v 1.12.2.1 2003/06/25 13:44:18 heikki Exp $ +# +# BioPerl module for Bio::DB::EMBL +# +# Cared for by Heikki Lehvaslaiho <Heikki@ebi.ac.uk> +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::DB::EMBL - Database object interface for EMBL entry retrieval + +=head1 SYNOPSIS + + use Bio::DB::EMBL; + + $embl = new Bio::DB::EMBL; + + # remember that EMBL_ID does not equal GenBank_ID! + $seq = $embl->get_Seq_by_id('BUM'); # EMBL ID + print "cloneid is ", $seq->id, "\n"; + + # or changeing to accession number and Fasta format ... + $embl->request_format('fasta'); + $seq = $embl->get_Seq_by_acc('J02231'); # EMBL ACC + print "cloneid is ", $seq->id, "\n"; + + # especially when using versions, you better be prepared + # in not getting what what want + eval { + $seq = $embl->get_Seq_by_version('J02231.1'); # EMBL VERSION + }; + print "cloneid is ", $seq->id, "\n" unless $@; + + # or ... best when downloading very large files, prevents + # keeping all of the file in memory + + # also don't want features, just sequence so let's save bandwith + # and request Fasta sequence + $embl = new Bio::DB::EMBL(-retrievaltype => 'tempfile' , + -format => 'fasta'); + my $seqio = $embl->get_Stream_by_batch(['AC013798', 'AC021953'] ); + while( my $clone = $seqio->next_seq ) { + print "cloneid is ", $clone->id, "\n"; + } + +=head1 DESCRIPTION + +Allows the dynamic retrieval of sequence objects L<Bio::Seq> from the +EMBL database using the dbfetch script at EBI: +L<http://www.ebi.ac.uk/cgi-bin/dbfetch>. + +In order to make changes transparent we have host type (currently only +ebi) and location (defaults to ebi) separated out. This allows later +additions of more servers in different geographical locations. + +The functionality of this module is inherited from L<Bio::DB::DBFetch> +which implements L<Bio::DB::WebDBSeqI>. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Heikki Lehvaslaiho + +Email Heikki Lehvaslaiho E<lt>Heikki@ebi.ac.ukE<gt> + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::DB::EMBL; +use strict; +use vars qw(@ISA $MODVERSION %HOSTS %FORMATMAP $DEFAULTFORMAT); + +$MODVERSION = '0.2'; +use Bio::DB::DBFetch; +use Bio::DB::RefSeq; + +@ISA = qw(Bio::DB::DBFetch); + +BEGIN { + # you can add your own here theoretically. + %HOSTS = ( + 'dbfetch' => { + baseurl => 'http://%s/cgi-bin/dbfetch?db=embl&style=raw', + hosts => { + 'ebi' => 'www.ebi.ac.uk' + } + } + ); + %FORMATMAP = ( 'embl' => 'embl', + 'fasta' => 'fasta' + ); + $DEFAULTFORMAT = 'embl'; +} + +=head2 new + + Title : new + Usage : $gb = Bio::DB::GenBank->new(@options) + Function: Creates a new genbank handle + Returns : New genbank handle + Args : -delay number of seconds to delay between fetches (3s) + +NOTE: There are other options that are used internally. + +=cut + +sub new { + my ($class, @args ) = @_; + my $self = $class->SUPER::new(@args); + + $self->{ '_hosts' } = {}; + $self->{ '_formatmap' } = {}; + + $self->hosts(\%HOSTS); + $self->formatmap(\%FORMATMAP); + $self->{'_default_format'} = $DEFAULTFORMAT; + + return $self; +} + + +=head2 Bio::DB::WebDBSeqI methods + +Overriding WebDBSeqI method to help newbies to retrieve sequences. +EMBL database is all too often passed RefSeq accessions. This +redirects those calls. See L<Bio::DB::RefSeq>. + + +=head2 get_Stream_by_acc + + Title : get_Stream_by_acc + Usage : $seq = $db->get_Seq_by_acc([$acc1, $acc2]); + Function: Gets a series of Seq objects by accession numbers + Returns : a Bio::SeqIO stream object + Args : $ref : a reference to an array of accession numbers for + the desired sequence entries + Note : For GenBank, this just calls the same code for get_Stream_by_id() + +=cut + +sub get_Stream_by_acc { + my ($self, $ids ) = @_; + my $newdb = $self->_check_id($ids); + if ($newdb && $newdb->isa('Bio::DB::RefSeq')) { + return $newdb->get_seq_stream('-uids' => $ids, '-mode' => 'single'); + } else { + return $self->get_seq_stream('-uids' => $ids, '-mode' => 'single'); + } +} + + +=head2 _check_id + + Title : _check_id + Usage : + Function: + Returns : A Bio::DB::RefSeq reference or throws + Args : $id(s), $string +=cut + +sub _check_id { + my ($self, $ids) = @_; + + # NT contigs can not be retrieved + $self->throw("NT_ contigs are whole chromosome files which are not part of regular". + "database distributions. Go to ftp://ftp.ncbi.nih.gov/genomes/.") + if $ids =~ /NT_/; + + # Asking for a RefSeq from EMBL/GenBank + + if ($ids =~ /N._/) { + $self->warn("[$ids] is not a normal sequence database but a RefSeq entry.". + " Redirecting the request.\n") + if $self->verbose >= 0; + return new Bio::DB::RefSeq(-verbose => $self->verbose); + } +} + + +1;