0
|
1 # $Id: GenBank.pm,v 1.47.2.2 2003/07/03 12:31:31 heikki Exp $
|
|
2 #
|
|
3 # BioPerl module for Bio::DB::GenBank
|
|
4 #
|
|
5 # Cared for by Aaron Mackey <amackey@virginia.edu>
|
|
6 #
|
|
7 # Copyright Aaron Mackey
|
|
8 #
|
|
9 # You may distribute this module under the same terms as perl itself
|
|
10 #
|
|
11 # POD documentation - main docs before the code
|
|
12 #
|
|
13 # Added LWP support - Jason Stajich 2000-11-6
|
|
14 # completely reworked by Jason Stajich 2000-12-8
|
|
15 # to use WebDBSeqI
|
|
16
|
|
17 # Added batch entrez back when determined that new entrez cgi will
|
|
18 # essentially work (there is a limit to the number of characters in a
|
|
19 # GET request so I am not sure how we can get around this). The NCBI
|
|
20 # Batch Entrez form has changed some and it does not support retrieval
|
|
21 # of text only data. Still should investigate POST-ing (tried and
|
|
22 # failed) a message to the entrez cgi to get around the GET
|
|
23 # limitations.
|
|
24
|
|
25 =head1 NAME
|
|
26
|
|
27 Bio::DB::GenBank - Database object interface to GenBank
|
|
28
|
|
29 =head1 SYNOPSIS
|
|
30
|
|
31 use Bio::DB::GenBank;
|
|
32 $gb = new Bio::DB::GenBank;
|
|
33
|
|
34 $seq = $gb->get_Seq_by_id('MUSIGHBA1'); # Unique ID
|
|
35
|
|
36 # or ...
|
|
37
|
|
38 $seq = $gb->get_Seq_by_acc('J00522'); # Accession Number
|
|
39 $seq = $gb->get_Seq_by_version('J00522.1'); # Accession.version
|
|
40 $seq = $gb->get_Seq_by_gi('405830'); # GI Number
|
|
41
|
|
42 # get a stream via a query string
|
|
43 my $query = Bio::DB::Query::GenBank->new
|
|
44 (-query =>'Oryza sativa[Organism] AND EST',
|
|
45 -reldate => '30',
|
|
46 -db => 'nucleotide');
|
|
47 my $seqio = $gb->get_Stream_by_query($query);
|
|
48
|
|
49 while( my $seq = $seqio->next_seq ) {
|
|
50 print "seq length is ", $seq->length,"\n";
|
|
51 }
|
|
52
|
|
53 # or ... best when downloading very large files, prevents
|
|
54 # keeping all of the file in memory
|
|
55
|
|
56 # also don't want features, just sequence so let's save bandwidth
|
|
57 # and request Fasta sequence
|
|
58 $gb = new Bio::DB::GenBank(-retrievaltype => 'tempfile' ,
|
|
59 -format => 'Fasta');
|
|
60 my $seqio = $gb->get_Stream_by_acc(['AC013798', 'AC021953'] );
|
|
61 while( my $clone = $seqio->next_seq ) {
|
|
62 print "cloneid is ", $clone->display_id, " ",
|
|
63 $clone->accession_number, "\n";
|
|
64 }
|
|
65 # note that get_Stream_by_version is not implemented
|
|
66
|
|
67 =head1 DESCRIPTION
|
|
68
|
|
69 Allows the dynamic retrieval of Sequence objects (Bio::Seq) from the
|
|
70 GenBank database at NCBI, via an Entrez query.
|
|
71
|
|
72 WARNING: Please do NOT spam the Entrez web server with multiple
|
|
73 requests. NCBI offers Batch Entrez for this purpose.
|
|
74
|
|
75 Note that when querying for GenBank accessions starting with 'NT_' you
|
|
76 will need to call $gb-E<gt>request_format('fasta') beforehand, because
|
|
77 in GenBank format (the default) the sequence part will be left out
|
|
78 (the reason is that NT contigs are rather annotation with references
|
|
79 to clones).
|
|
80
|
|
81 Some work has been done to automatically detect and retrieve whole NT_
|
|
82 clones when the data is in that format (NCBI RefSeq clones). More
|
|
83 testing and feedback from users is needed to achieve a good fit of
|
|
84 functionality and ease of use.
|
|
85
|
|
86 =head1 FEEDBACK
|
|
87
|
|
88 =head2 Mailing Lists
|
|
89
|
|
90 User feedback is an integral part of the evolution of this and other
|
|
91 Bioperl modules. Send your comments and suggestions preferably to one
|
|
92 of the Bioperl mailing lists. Your participation is much appreciated.
|
|
93
|
|
94 bioperl-l@bioperl.org - General discussion
|
|
95 http://bioperl.org/MailList.shtml - About the mailing lists
|
|
96
|
|
97 =head2 Reporting Bugs
|
|
98
|
|
99 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
100 of the bugs and their resolution. Bug reports can be submitted via email
|
|
101 or the web:
|
|
102
|
|
103 bioperl-bugs@bio.perl.org
|
|
104 http://bugzilla.bioperl.org/
|
|
105
|
|
106 =head1 AUTHOR - Aaron Mackey, Jason Stajich
|
|
107
|
|
108 Email amackey@virginia.edu
|
|
109 Email jason@bioperl.org
|
|
110
|
|
111 =head1 APPENDIX
|
|
112
|
|
113 The rest of the documentation details each of the
|
|
114 object methods. Internal methods are usually
|
|
115 preceded with a _
|
|
116
|
|
117 =cut
|
|
118
|
|
119 # Let the code begin...
|
|
120
|
|
121 package Bio::DB::GenBank;
|
|
122 use strict;
|
|
123 use vars qw(@ISA %PARAMSTRING $DEFAULTFORMAT $DEFAULTMODE);
|
|
124 use Bio::DB::NCBIHelper;
|
|
125
|
|
126 @ISA = qw(Bio::DB::NCBIHelper);
|
|
# Compile-time initialisation of the package-wide defaults and of the
# per-mode parameter tables that get_params() hands out.
BEGIN {
    # NOTE(review): $DEFAULTFORMAT is not read anywhere in the visible part
    # of this file; presumably the parent class consumes it -- confirm.
    $DEFAULTFORMAT = 'gp';

    # Mode get_params() falls back to when asked for an unknown mode.
    $DEFAULTMODE = 'single';

    # 'query' is the only mode that keeps the server-side history and that
    # does not pin the database name.
    %PARAMSTRING = (
        'query' => {
            'usehistory' => 'y',
            'tool'       => 'bioperl',
            'retmode'    => 'text',
        },
    );

    # Every other mode uses the same settings.  A fresh anonymous hash is
    # built per mode so that no two modes share one underlying table.
    my @nucleotide_modes = qw(batch gi version single);
    foreach my $mode (@nucleotide_modes) {
        $PARAMSTRING{$mode} = {
            'db'         => 'nucleotide',
            'usehistory' => 'n',
            'tool'       => 'bioperl',
            'retmode'    => 'text',
        };
    }
}
|
|
152
|
|
153 # new is in NCBIHelper
|
|
154
|
|
155 # helper method to get db specific options
|
|
156
|
|
157 =head2 new
|
|
158
|
|
159 Title : new
|
|
160 Usage : $gb = Bio::DB::GenBank->new(@options)
|
|
161 Function: Creates a new genbank handle
|
|
162 Returns : New genbank handle
|
|
163 Args : -delay number of seconds to delay between fetches (3s)
|
|
164
|
|
165 NOTE: There are other options that are used internally. By NCBI policy, this
|
|
166 module introduces a 3s delay between fetches. If you are fetching multiple genbank
|
|
167 ids, it is a good idea to use get_Stream_by_id() rather than fetching them one at a time.
|
|
168
|
|
169 =cut
|
|
170
|
|
171 =head2 get_params
|
|
172
|
|
173 Title : get_params
|
|
174 Usage : my %params = $self->get_params($mode)
|
|
175 Function: Returns key,value pairs to be passed to NCBI database
|
|
176 for either 'batch' or 'single' sequence retrieval method
|
|
177 Returns : a key,value pair hash
|
|
178 Args : retrieval mode ('batch', 'query', 'gi', 'version' or 'single'); unknown modes fall back to 'single'
|
|
179
|
|
180 =cut
|
|
181
|
|
# get_params($mode)
#
# Returns the key/value pairs registered in %PARAMSTRING for the requested
# retrieval mode, as a flat list suitable for assigning to a hash.
#
#   $mode : name of a retrieval mode (a key of %PARAMSTRING).  When it is
#           undefined or has no entry, the parameters of $DEFAULTMODE are
#           returned instead.
#
# The table is dereferenced on return, so the caller receives a copy of the
# top-level pairs rather than a reference to the shared table.
sub get_params {
    my ($self, $mode) = @_;

    # Using an undefined value as a hash key raises an "uninitialized value"
    # warning under -w; check definedness first and treat a missing mode
    # like any other unknown mode.
    my $key = ( defined $mode && defined $PARAMSTRING{$mode} )
            ? $mode
            : $DEFAULTMODE;

    return %{ $PARAMSTRING{$key} };
}
|
|
187
|
|
188 # from Bio::DB::WebDBSeqI from Bio::DB::RandomAccessI
|
|
189
|
|
190 =head1 Routines Bio::DB::WebDBSeqI from Bio::DB::RandomAccessI
|
|
191
|
|
192 =head2 get_Seq_by_id
|
|
193
|
|
194 Title : get_Seq_by_id
|
|
195 Usage : $seq = $db->get_Seq_by_id('ROA1_HUMAN')
|
|
196 Function: Gets a Bio::Seq object by its name
|
|
197 Returns : a Bio::Seq object
|
|
198 Args : the id (as a string) of a sequence
|
|
199 Throws : "id does not exist" exception
|
|
200
|
|
201 =head2 get_Seq_by_acc
|
|
202
|
|
203 Title : get_Seq_by_acc
|
|
204 Usage : $seq = $db->get_Seq_by_acc($acc);
|
|
205 Function: Gets a Seq object by accession numbers
|
|
206 Returns : a Bio::Seq object
|
|
207 Args : the accession number as a string
|
|
208 Note : For GenBank, this just calls the same code for get_Seq_by_id()
|
|
209 Throws : "id does not exist" exception
|
|
210
|
|
211 =cut
|
|
212
|
|
213
|
|
# get_Seq_by_acc($seqid)
#
# Prepends the "gb|" database tag to the given accession and hands the
# qualified identifier to the parent class's get_Seq_by_acc(), returning
# whatever that call returns.
sub get_Seq_by_acc {
    my $self      = shift;
    my $accession = shift;

    my $qualified_id = 'gb|' . $accession;
    return $self->SUPER::get_Seq_by_acc($qualified_id);
}
|
|
218
|
|
219 =head2 get_Seq_by_gi
|
|
220
|
|
221 Title : get_Seq_by_gi
|
|
222 Usage : $seq = $db->get_Seq_by_gi('405830');
|
|
223 Function: Gets a Bio::Seq object by gi number
|
|
224 Returns : A Bio::Seq object
|
|
225 Args : gi number (as a string)
|
|
226 Throws : "gi does not exist" exception
|
|
227
|
|
228 =head2 get_Seq_by_version
|
|
229
|
|
230 Title : get_Seq_by_version
|
|
231 Usage : $seq = $db->get_Seq_by_version('X77802.1');
|
|
232 Function: Gets a Bio::Seq object by sequence version
|
|
233 Returns : A Bio::Seq object
|
|
234 Args : accession.version (as a string)
|
|
235 Throws : "acc.version does not exist" exception
|
|
236
|
|
237 =head1 Routines implemented by Bio::DB::NCBIHelper
|
|
238
|
|
239 =head2 get_Stream_by_query
|
|
240
|
|
241 Title : get_Stream_by_query
|
|
242 Usage : $seq = $db->get_Stream_by_query($query);
|
|
243 Function: Retrieves Seq objects from Entrez 'en masse', rather than one
|
|
244 at a time. For large numbers of sequences, this is far superior
|
|
245 to get_Stream_by_[id/acc]().
|
|
246 Example :
|
|
247 Returns : a Bio::SeqIO stream object
|
|
248 Args : $query : An Entrez query string or a
|
|
249 Bio::DB::Query::GenBank object. It is suggested that you
|
|
250 create a Bio::DB::Query::GenBank object and get the entry
|
|
251 count before you fetch a potentially large stream.
|
|
252
|
|
253 =cut
|
|
254
|
|
255 =head2 get_Stream_by_id
|
|
256
|
|
257 Title : get_Stream_by_id
|
|
258 Usage : $stream = $db->get_Stream_by_id( [$uid1, $uid2] );
|
|
259 Function: Gets a series of Seq objects by unique identifiers
|
|
260 Returns : a Bio::SeqIO stream object
|
|
261 Args : $ref : a reference to an array of unique identifiers for
|
|
262 the desired sequence entries
|
|
263
|
|
264 =head2 get_Stream_by_acc
|
|
265
|
|
266 Title : get_Stream_by_acc
|
|
267 Usage : $seq = $db->get_Stream_by_acc([$acc1, $acc2]);
|
|
268 Function: Gets a series of Seq objects by accession numbers
|
|
269 Returns : a Bio::SeqIO stream object
|
|
270 Args : $ref : a reference to an array of accession numbers for
|
|
271 the desired sequence entries
|
|
272 Note : For GenBank, this just calls the same code for get_Stream_by_id()
|
|
273
|
|
274 =cut
|
|
275
|
|
276 =head2 get_Stream_by_gi
|
|
277
|
|
278 Title : get_Stream_by_gi
|
|
279 Usage : $seq = $db->get_Stream_by_gi([$gi1, $gi2]);
|
|
280 Function: Gets a series of Seq objects by gi numbers
|
|
281 Returns : a Bio::SeqIO stream object
|
|
282 Args : $ref : a reference to an array of gi numbers for
|
|
283 the desired sequence entries
|
|
284 Note : For GenBank, this just calls the same code for get_Stream_by_id()
|
|
285
|
|
286 =head2 get_Stream_by_batch
|
|
287
|
|
288 Title : get_Stream_by_batch
|
|
289 Usage : $seq = $db->get_Stream_by_batch($ref);
|
|
290 Function: Retrieves Seq objects from Entrez 'en masse', rather than one
|
|
291 at a time.
|
|
292 Example :
|
|
293 Returns : a Bio::SeqIO stream object
|
|
294 Args : $ref : either an array reference, a filename, or a filehandle
|
|
295 from which to get the list of unique ids/accession numbers.
|
|
296
|
|
297 NOTE: This method is redundant and deprecated. Use get_Stream_by_id()
|
|
298 instead.
|
|
299
|
|
300 =head2 get_request
|
|
301
|
|
302 Title : get_request
|
|
303 Usage : my $url = $self->get_request
|
|
304 Function: HTTP::Request
|
|
305 Returns :
|
|
306 Args : %qualifiers = a hash of qualifiers (ids, format, etc)

=cut
|
|
307
|
|
308 1;
|
|
309 __END__
|