Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/DB/GenBank.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/GenBank.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,309 @@ +# $Id: GenBank.pm,v 1.47.2.2 2003/07/03 12:31:31 heikki Exp $ +# +# BioPerl module for Bio::DB::GenBank +# +# Cared for by Aaron Mackey <amackey@virginia.edu> +# +# Copyright Aaron Mackey +# +# You may distribute this module under the same terms as perl itself +# +# POD documentation - main docs before the code +# +# Added LWP support - Jason Stajich 2000-11-6 +# completely reworked by Jason Stajich 2000-12-8 +# to use WebDBSeqI + +# Added batch entrez back when determined that new entrez cgi will +# essentially work (there is a limit to the number of characters in a +# GET request so I am not sure how we can get around this). The NCBI +# Batch Entrez form has changed some and it does not support retrieval +# of text only data. Still should investigate POST-ing (tried and +# failed) a message to the entrez cgi to get around the GET +# limitations. + +=head1 NAME + +Bio::DB::GenBank - Database object interface to GenBank + +=head1 SYNOPSIS + + use Bio::DB::GenBank; + $gb = new Bio::DB::GenBank; + + $seq = $gb->get_Seq_by_id('MUSIGHBA1'); # Unique ID + + # or ... + + $seq = $gb->get_Seq_by_acc('J00522'); # Accession Number + $seq = $gb->get_Seq_by_version('J00522.1'); # Accession.version + $seq = $gb->get_Seq_by_gi('405830'); # GI Number + + # get a stream via a query string + my $query = Bio::DB::Query::GenBank->new + (-query =>'Oryza sativa[Organism] AND EST', + -reldate => '30', + -db => 'nucleotide'); + my $seqio = $gb->get_Stream_by_query($query); + + while( my $seq = $seqio->next_seq ) { + print "seq length is ", $seq->length,"\n"; + } + + # or ... best when downloading very large files, prevents + # keeping all of the file in memory + + # also don't want features, just sequence so let's save bandwith + # and request Fasta sequence + $gb = new Bio::DB::GenBank(-retrievaltype => 'tempfile' , + -format => 'Fasta'); + my $seqio = $gb->get_Stream_by_acc(['AC013798', 'AC021953'] ); + while( my $clone = $seqio->next_seq ) { + print "cloneid is ", $clone->display_id, " ", + $clone->accession_number, "\n"; + } + # note that get_Stream_by_version is not implemented + +=head1 DESCRIPTION + +Allows the dynamic retrieval of Sequence objects (Bio::Seq) from the +GenBank database at NCBI, via an Entrez query. + +WARNING: Please do NOT spam the Entrez web server with multiple +requests. NCBI offers Batch Entrez for this purpose. + +Note that when querying for GenBank accessions starting with 'NT_' you +will need to call $gb-E<gt>request_format('fasta') beforehand, because +in GenBank format (the default) the sequence part will be left out +(the reason is that NT contigs are rather annotation with references +to clones). + +Some work has been done to automatically detect and retrieve whole NT_ +clones when the data is in that format (NCBI RefSeq clones). More +testing and feedback from users is needed to achieve a good fit of +functionality and ease of use. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Aaron Mackey, Jason Stajich + +Email amackey@virginia.edu +Email jason@bioperl.org + +=head1 APPENDIX + +The rest of the documentation details each of the +object methods. Internal methods are usually +preceded with a _ + +=cut + +# Let the code begin... + +package Bio::DB::GenBank; +use strict; +use vars qw(@ISA %PARAMSTRING $DEFAULTFORMAT $DEFAULTMODE); +use Bio::DB::NCBIHelper; + +@ISA = qw(Bio::DB::NCBIHelper); +BEGIN { + $DEFAULTMODE = 'single'; + $DEFAULTFORMAT = 'gp'; + %PARAMSTRING = ( + 'batch' => { 'db' => 'nucleotide', + 'usehistory' => 'n', + 'tool' => 'bioperl', + 'retmode' => 'text'}, + 'query' => { 'usehistory' => 'y', + 'tool' => 'bioperl', + 'retmode' => 'text'}, + 'gi' => { 'db' => 'nucleotide', + 'usehistory' => 'n', + 'tool' => 'bioperl', + 'retmode' => 'text'}, + 'version' => { 'db' => 'nucleotide', + 'usehistory' => 'n', + 'tool' => 'bioperl', + 'retmode' => 'text'}, + 'single' => { 'db' => 'nucleotide', + 'usehistory' => 'n', + 'tool' => 'bioperl', + 'retmode' => 'text'}, + ); +} + +# new is in NCBIHelper + +# helper method to get db specific options + +=head2 new + + Title : new + Usage : $gb = Bio::DB::GenBank->new(@options) + Function: Creates a new genbank handle + Returns : New genbank handle + Args : -delay number of seconds to delay between fetches (3s) + +NOTE: There are other options that are used internally. By NCBI policy, this +module introduces a 3s delay between fetches. If you are fetching multiple genbank +ids, it is a good idea to use get + +=cut + +=head2 get_params + + Title : get_params + Usage : my %params = $self->get_params($mode) + Function: Returns key,value pairs to be passed to NCBI database + for either 'batch' or 'single' sequence retrieval method + Returns : a key,value pair hash + Args : 'single' or 'batch' mode for retrieval + +=cut + +sub get_params { + my ($self, $mode) = @_; + return defined $PARAMSTRING{$mode} ? + %{$PARAMSTRING{$mode}} : %{$PARAMSTRING{$DEFAULTMODE}}; +} + +# from Bio::DB::WebDBSeqI from Bio::DB::RandomAccessI + +=head1 Routines Bio::DB::WebDBSeqI from Bio::DB::RandomAccessI + +=head2 get_Seq_by_id + + Title : get_Seq_by_id + Usage : $seq = $db->get_Seq_by_id('ROA1_HUMAN') + Function: Gets a Bio::Seq object by its name + Returns : a Bio::Seq object + Args : the id (as a string) of a sequence + Throws : "id does not exist" exception + +=head2 get_Seq_by_acc + + Title : get_Seq_by_acc + Usage : $seq = $db->get_Seq_by_acc($acc); + Function: Gets a Seq object by accession numbers + Returns : a Bio::Seq object + Args : the accession number as a string + Note : For GenBank, this just calls the same code for get_Seq_by_id() + Throws : "id does not exist" exception + +=cut + + +sub get_Seq_by_acc { + my ($self,$seqid) = @_; + $self->SUPER::get_Seq_by_acc("gb|$seqid"); +} + +=head2 get_Seq_by_gi + + Title : get_Seq_by_gi + Usage : $seq = $db->get_Seq_by_gi('405830'); + Function: Gets a Bio::Seq object by gi number + Returns : A Bio::Seq object + Args : gi number (as a string) + Throws : "gi does not exist" exception + +=head2 get_Seq_by_version + + Title : get_Seq_by_version + Usage : $seq = $db->get_Seq_by_version('X77802.1'); + Function: Gets a Bio::Seq object by sequence version + Returns : A Bio::Seq object + Args : accession.version (as a string) + Throws : "acc.version does not exist" exception + +=head1 Routines implemented by Bio::DB::NCBIHelper + +=head2 get_Stream_by_query + + Title : get_Stream_by_query + Usage : $seq = $db->get_Stream_by_query($query); + Function: Retrieves Seq objects from Entrez 'en masse', rather than one + at a time. For large numbers of sequences, this is far superior + than get_Stream_by_[id/acc](). + Example : + Returns : a Bio::SeqIO stream object + Args : $query : An Entrez query string or a + Bio::DB::Query::GenBank object. It is suggested that you + create a Bio::DB::Query::GenBank object and get the entry + count before you fetch a potentially large stream. + +=cut + +=head2 get_Stream_by_id + + Title : get_Stream_by_id + Usage : $stream = $db->get_Stream_by_id( [$uid1, $uid2] ); + Function: Gets a series of Seq objects by unique identifiers + Returns : a Bio::SeqIO stream object + Args : $ref : a reference to an array of unique identifiers for + the desired sequence entries + +=head2 get_Stream_by_acc + + Title : get_Stream_by_acc + Usage : $seq = $db->get_Stream_by_acc([$acc1, $acc2]); + Function: Gets a series of Seq objects by accession numbers + Returns : a Bio::SeqIO stream object + Args : $ref : a reference to an array of accession numbers for + the desired sequence entries + Note : For GenBank, this just calls the same code for get_Stream_by_id() + +=cut + +=head2 get_Stream_by_gi + + Title : get_Stream_by_gi + Usage : $seq = $db->get_Seq_by_gi([$gi1, $gi2]); + Function: Gets a series of Seq objects by gi numbers + Returns : a Bio::SeqIO stream object + Args : $ref : a reference to an array of gi numbers for + the desired sequence entries + Note : For GenBank, this just calls the same code for get_Stream_by_id() + +=head2 get_Stream_by_batch + + Title : get_Stream_by_batch + Usage : $seq = $db->get_Stream_by_batch($ref); + Function: Retrieves Seq objects from Entrez 'en masse', rather than one + at a time. + Example : + Returns : a Bio::SeqIO stream object + Args : $ref : either an array reference, a filename, or a filehandle + from which to get the list of unique ids/accession numbers. + +NOTE: This method is redundant and deprecated. Use get_Stream_by_id() +instead. + +=head2 get_request + + Title : get_request + Usage : my $url = $self->get_request + Function: HTTP::Request + Returns : + Args : %qualifiers = a hash of qualifiers (ids, format, etc) + +1; +__END__