comparison variant_effect_predictor/Bio/DB/GenBank.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 # $Id: GenBank.pm,v 1.47.2.2 2003/07/03 12:31:31 heikki Exp $
2 #
3 # BioPerl module for Bio::DB::GenBank
4 #
5 # Cared for by Aaron Mackey <amackey@virginia.edu>
6 #
7 # Copyright Aaron Mackey
8 #
9 # You may distribute this module under the same terms as perl itself
10 #
11 # POD documentation - main docs before the code
12 #
13 # Added LWP support - Jason Stajich 2000-11-6
14 # completely reworked by Jason Stajich 2000-12-8
15 # to use WebDBSeqI
16
17 # Added batch entrez back when determined that new entrez cgi will
18 # essentially work (there is a limit to the number of characters in a
19 # GET request so I am not sure how we can get around this). The NCBI
20 # Batch Entrez form has changed some and it does not support retrieval
21 # of text only data. Still should investigate POST-ing (tried and
22 # failed) a message to the entrez cgi to get around the GET
23 # limitations.
24
25 =head1 NAME
26
27 Bio::DB::GenBank - Database object interface to GenBank
28
29 =head1 SYNOPSIS
30
31 use Bio::DB::GenBank;
32 $gb = new Bio::DB::GenBank;
33
34 $seq = $gb->get_Seq_by_id('MUSIGHBA1'); # Unique ID
35
36 # or ...
37
38 $seq = $gb->get_Seq_by_acc('J00522'); # Accession Number
39 $seq = $gb->get_Seq_by_version('J00522.1'); # Accession.version
40 $seq = $gb->get_Seq_by_gi('405830'); # GI Number
41
42 # get a stream via a query string
43 my $query = Bio::DB::Query::GenBank->new
44 (-query =>'Oryza sativa[Organism] AND EST',
45 -reldate => '30',
46 -db => 'nucleotide');
47 my $seqio = $gb->get_Stream_by_query($query);
48
49 while( my $seq = $seqio->next_seq ) {
50 print "seq length is ", $seq->length,"\n";
51 }
52
53 # or ... best when downloading very large files, prevents
54 # keeping all of the file in memory
55
56 # also don't want features, just sequence so let's save bandwidth
57 # and request Fasta sequence
58 $gb = new Bio::DB::GenBank(-retrievaltype => 'tempfile' ,
59 -format => 'Fasta');
60 my $seqio = $gb->get_Stream_by_acc(['AC013798', 'AC021953'] );
61 while( my $clone = $seqio->next_seq ) {
62 print "cloneid is ", $clone->display_id, " ",
63 $clone->accession_number, "\n";
64 }
65 # note that get_Stream_by_version is not implemented
66
67 =head1 DESCRIPTION
68
69 Allows the dynamic retrieval of Sequence objects (Bio::Seq) from the
70 GenBank database at NCBI, via an Entrez query.
71
72 WARNING: Please do NOT spam the Entrez web server with multiple
73 requests. NCBI offers Batch Entrez for this purpose.
74
75 Note that when querying for GenBank accessions starting with 'NT_' you
76 will need to call $gb-E<gt>request_format('fasta') beforehand, because
77 in GenBank format (the default) the sequence part will be left out
78 (the reason is that NT contigs are essentially annotation with references
79 to clones).
80
81 Some work has been done to automatically detect and retrieve whole NT_
82 clones when the data is in that format (NCBI RefSeq clones). More
83 testing and feedback from users is needed to achieve a good fit of
84 functionality and ease of use.
85
86 =head1 FEEDBACK
87
88 =head2 Mailing Lists
89
90 User feedback is an integral part of the evolution of this and other
91 Bioperl modules. Send your comments and suggestions preferably to one
92 of the Bioperl mailing lists. Your participation is much appreciated.
93
94 bioperl-l@bioperl.org - General discussion
95 http://bioperl.org/MailList.shtml - About the mailing lists
96
97 =head2 Reporting Bugs
98
99 Report bugs to the Bioperl bug tracking system to help us keep track of
100 the bugs and their resolution. Bug reports can be submitted via email
101 or the web:
102
103 bioperl-bugs@bio.perl.org
104 http://bugzilla.bioperl.org/
105
106 =head1 AUTHOR - Aaron Mackey, Jason Stajich
107
108 Email amackey@virginia.edu
109 Email jason@bioperl.org
110
111 =head1 APPENDIX
112
113 The rest of the documentation details each of the
114 object methods. Internal methods are usually
115 preceded with a _
116
117 =cut
118
119 # Let the code begin...
120
121 package Bio::DB::GenBank;
122 use strict;
123 use vars qw(@ISA %PARAMSTRING $DEFAULTFORMAT $DEFAULTMODE);
124 use Bio::DB::NCBIHelper;
125
126 @ISA = qw(Bio::DB::NCBIHelper);
BEGIN {
    # Package-wide defaults. The variables are package globals declared
    # with 'use vars' near the top of this file.
    $DEFAULTFORMAT = 'gp';
    $DEFAULTMODE   = 'single';

    # Key/value pairs handed out by get_params() for each retrieval mode.
    # These four modes carry identical settings, but each one receives
    # its own anonymous hash so that no two modes share storage.
    %PARAMSTRING = ();
    foreach my $mode (qw(batch gi version single)) {
        $PARAMSTRING{$mode} = {
            'db'         => 'nucleotide',
            'usehistory' => 'n',
            'tool'       => 'bioperl',
            'retmode'    => 'text',
        };
    }

    # 'query' mode defines no 'db' key and switches 'usehistory' to 'y'.
    $PARAMSTRING{'query'} = {
        'usehistory' => 'y',
        'tool'       => 'bioperl',
        'retmode'    => 'text',
    };
}
152
153 # new is in NCBIHelper
154
155 # helper method to get db specific options
156
157 =head2 new
158
159 Title : new
160 Usage : $gb = Bio::DB::GenBank->new(@options)
161 Function: Creates a new genbank handle
162 Returns : New genbank handle
163 Args : -delay number of seconds to delay between fetches (3s)
164
165 NOTE: There are other options that are used internally. By NCBI policy, this
166 module introduces a 3s delay between fetches. If you are fetching multiple genbank
167 ids, it is a good idea to use get_Stream_by_id() instead of repeated single-sequence fetches.
168
169 =cut
170
171 =head2 get_params
172
173 Title : get_params
174 Usage : my %params = $self->get_params($mode)
175 Function: Returns key,value pairs to be passed to NCBI database
176 for the requested retrieval mode; unknown modes fall back to 'single'
177 Returns : a key,value pair hash
178 Args : retrieval mode: 'batch', 'query', 'gi', 'version' or 'single'
179
180 =cut
181
sub get_params {
    # Return the key/value pairs registered in %PARAMSTRING for a
    # retrieval mode.
    #
    # Args    : $mode - name of the retrieval mode (a key of %PARAMSTRING)
    # Returns : the flattened key/value list of the matching entry, or of
    #           the $DEFAULTMODE entry when $mode is undefined or has no
    #           entry in %PARAMSTRING
    my ($self, $mode) = @_;

    # Check $mode before using it as a hash key: an undefined key would
    # trigger an "uninitialized value" warning when warnings are enabled.
    my $params = defined $mode ? $PARAMSTRING{$mode} : undef;

    # Anything without a registered entry gets the default mode's settings.
    $params = $PARAMSTRING{$DEFAULTMODE} unless defined $params;

    return %{$params};
}
187
188 # from Bio::DB::WebDBSeqI from Bio::DB::RandomAccessI
189
190 =head1 Routines Bio::DB::WebDBSeqI from Bio::DB::RandomAccessI
191
192 =head2 get_Seq_by_id
193
194 Title : get_Seq_by_id
195 Usage : $seq = $db->get_Seq_by_id('ROA1_HUMAN')
196 Function: Gets a Bio::Seq object by its name
197 Returns : a Bio::Seq object
198 Args : the id (as a string) of a sequence
199 Throws : "id does not exist" exception
200
201 =head2 get_Seq_by_acc
202
203 Title : get_Seq_by_acc
204 Usage : $seq = $db->get_Seq_by_acc($acc);
205 Function: Gets a Seq object by accession numbers
206 Returns : a Bio::Seq object
207 Args : the accession number as a string
208 Note : For GenBank, this prepends 'gb|' to the accession and calls the inherited get_Seq_by_acc()
209 Throws : "id does not exist" exception
210
211 =cut
212
213
sub get_Seq_by_acc {
    # Look up a sequence by accession number.
    #
    # Args    : $seqid - the accession number as a string
    # Returns : whatever the inherited get_Seq_by_acc() returns
    my $self  = shift;
    my $seqid = shift;

    # The inherited method is called with the accession prefixed by 'gb|'.
    my $prefixed_id = 'gb|' . $seqid;

    return $self->SUPER::get_Seq_by_acc($prefixed_id);
}
218
219 =head2 get_Seq_by_gi
220
221 Title : get_Seq_by_gi
222 Usage : $seq = $db->get_Seq_by_gi('405830');
223 Function: Gets a Bio::Seq object by gi number
224 Returns : A Bio::Seq object
225 Args : gi number (as a string)
226 Throws : "gi does not exist" exception
227
228 =head2 get_Seq_by_version
229
230 Title : get_Seq_by_version
231 Usage : $seq = $db->get_Seq_by_version('X77802.1');
232 Function: Gets a Bio::Seq object by sequence version
233 Returns : A Bio::Seq object
234 Args : accession.version (as a string)
235 Throws : "acc.version does not exist" exception
236
237 =head1 Routines implemented by Bio::DB::NCBIHelper
238
239 =head2 get_Stream_by_query
240
241 Title : get_Stream_by_query
242 Usage : $seq = $db->get_Stream_by_query($query);
243 Function: Retrieves Seq objects from Entrez 'en masse', rather than one
244 at a time. For large numbers of sequences, this is far superior
245 to get_Stream_by_[id/acc]().
246 Example :
247 Returns : a Bio::SeqIO stream object
248 Args : $query : An Entrez query string or a
249 Bio::DB::Query::GenBank object. It is suggested that you
250 create a Bio::DB::Query::GenBank object and get the entry
251 count before you fetch a potentially large stream.
252
253 =cut
254
255 =head2 get_Stream_by_id
256
257 Title : get_Stream_by_id
258 Usage : $stream = $db->get_Stream_by_id( [$uid1, $uid2] );
259 Function: Gets a series of Seq objects by unique identifiers
260 Returns : a Bio::SeqIO stream object
261 Args : $ref : a reference to an array of unique identifiers for
262 the desired sequence entries
263
264 =head2 get_Stream_by_acc
265
266 Title : get_Stream_by_acc
267 Usage : $seq = $db->get_Stream_by_acc([$acc1, $acc2]);
268 Function: Gets a series of Seq objects by accession numbers
269 Returns : a Bio::SeqIO stream object
270 Args : $ref : a reference to an array of accession numbers for
271 the desired sequence entries
272 Note : For GenBank, this just calls the same code for get_Stream_by_id()
273
274 =cut
275
276 =head2 get_Stream_by_gi
277
278 Title : get_Stream_by_gi
279 Usage : $seq = $db->get_Stream_by_gi([$gi1, $gi2]);
280 Function: Gets a series of Seq objects by gi numbers
281 Returns : a Bio::SeqIO stream object
282 Args : $ref : a reference to an array of gi numbers for
283 the desired sequence entries
284 Note : For GenBank, this just calls the same code for get_Stream_by_id()
285
286 =head2 get_Stream_by_batch
287
288 Title : get_Stream_by_batch
289 Usage : $seq = $db->get_Stream_by_batch($ref);
290 Function: Retrieves Seq objects from Entrez 'en masse', rather than one
291 at a time.
292 Example :
293 Returns : a Bio::SeqIO stream object
294 Args : $ref : either an array reference, a filename, or a filehandle
295 from which to get the list of unique ids/accession numbers.
296
297 NOTE: This method is redundant and deprecated. Use get_Stream_by_id()
298 instead.
299
300 =head2 get_request
301
302 Title : get_request
303 Usage : my $url = $self->get_request
304 Function: HTTP::Request
305 Returns :
306 Args : %qualifiers = a hash of qualifiers (ids, format, etc)
307
=cut

308 1;
309 __END__