0
|
1 # $Id: BioFetch.pm,v 1.13.2.1 2003/06/25 13:44:18 heikki Exp $
|
|
2 #
|
|
3 # BioPerl module for Bio::DB::BioFetch
|
|
4 #
|
|
5 # Cared for by Lincoln Stein <lstein@cshl.org>
|
|
6 #
|
|
7 # Copyright Lincoln Stein
|
|
8 #
|
|
9 # You may distribute this module under the same terms as perl itself
|
|
10 #
|
|
11 # POD documentation - main docs before the code
|
|
12 #
|
|
13
|
|
14 package Bio::DB::BioFetch;
|
|
15 use strict;
|
|
16 use Bio::DB::WebDBSeqI;
|
|
17 use HTTP::Request::Common 'POST';
|
|
18
|
|
19 =head1 NAME
|
|
20
|
|
21 Bio::DB::BioFetch - Database object interface to BioFetch retrieval
|
|
22
|
|
23 =head1 SYNOPSIS
|
|
24
|
|
25 use Bio::DB::BioFetch;
|
|
26
|
|
27 $bf = new Bio::DB::BioFetch;
|
|
28
|
|
29 $seq = $sp->get_Seq_by_id('BUM'); # EMBL or SWALL ID
|
|
30
|
|
31 # change formats, storage procedures
|
|
32 $bf = new Bio::DB::BioFetch(-format => 'fasta',
|
|
33 -retrievaltype => 'tempfile',
|
|
34 -db => 'EMBL');
|
|
35
|
|
36 $stream = $bf->get_Stream_by_id(['BUM','J00231']);
|
|
37 while (my $s = $stream->next_seq) {
|
|
38 print $s->seq,"\n";
|
|
39 }
|
|
40 # get a RefSeq entry
|
|
41 $bf->db('refseq');
|
|
42 eval {
|
|
43 $seq = $bf->get_Seq_by_version('NM_006732.1'); # RefSeq VERSION
|
|
44 };
|
|
45 print "accession is ", $seq->accession_number, "\n" unless $@;
|
|
46
|
|
47
|
|
48 =head1 DESCRIPTION
|
|
49
|
|
50 Bio::DB::BioFetch is a guaranteed best effort sequence entry fetching
|
|
51 method. It goes to the Web-based dbfetch server located at the EBI
|
|
52 (http://www.ebi.ac.uk/cgi-bin/dbfetch) to retrieve sequences in the
|
|
53 EMBL or GenBank sequence repositories.
|
|
54
|
|
55 This module implements all the Bio::DB::RandomAccessI interface, plus
|
|
56 the get_Stream_by_id() and get_Stream_by_acc() methods that are found
|
|
57 in the Bio::DB::SwissProt interface.
|
|
58
|
|
59 =head1 FEEDBACK
|
|
60
|
|
61 =head2 Mailing Lists
|
|
62
|
|
63 User feedback is an integral part of the evolution of this and other
|
|
64 Bioperl modules. Send your comments and suggestions preferably to one
|
|
65 of the Bioperl mailing lists. Your participation is much appreciated.
|
|
66
|
|
67
|
|
68 bioperl-l@bioperl.org - General discussion
|
|
69 http://bio.perl.org/MailList.html - About the mailing lists
|
|
70
|
|
71 =head2 Reporting Bugs
|
|
72
|
|
73 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
74 the bugs and their resolution. Bug reports can be submitted via email
|
|
75 or the web:
|
|
76
|
|
77 bioperl-bugs@bio.perl.org
|
|
78 http://bugzilla.bioperl.org/
|
|
79
|
|
80 =head1 AUTHOR - Lincoln Stein
|
|
81
|
|
82 Email Lincoln Stein E<lt>lstein@cshl.orgE<lt>
|
|
83
|
|
84 Also thanks to Heikki Lehvaslaiho E<lt>heikki@ebi.ac.ukE<gt> for the
|
|
85 BioFetch server and interface specification.
|
|
86
|
|
87 =head1 APPENDIX
|
|
88
|
|
89 The rest of the documentation details each of the object
|
|
90 methods. Internal methods are usually preceded with a _
|
|
91
|
|
92 =cut
|
|
93
|
|
94 # Let the code begin...
|
|
95 use vars qw(@ISA $VERSION %FORMATMAP );
|
|
96 use Bio::Root::Root;
|
|
97 @ISA = qw(Bio::DB::WebDBSeqI Bio::Root::Root);
|
|
98 $VERSION = '1.0';
|
|
99
|
|
100 # warning: names used here must map into Bio::SeqIO::* space
|
|
101 use constant DEFAULT_LOCATION => 'http://www.ebi.ac.uk/cgi-bin/dbfetch';
|
|
102
|
|
103 BEGIN {
|
|
104
|
|
105 %FORMATMAP = (
|
|
106 'embl' => {
|
|
107 default => 'embl', # default BioFetch format/SeqIOmodule pair
|
|
108 embl => 'embl', # alternative BioFetch format/module pair
|
|
109 fasta => 'fasta', # alternative BioFetch format/module pair
|
|
110 namespace => 'embl',
|
|
111 },
|
|
112 'swissprot' => {
|
|
113 default => 'swiss',
|
|
114 swissprot => 'swiss',
|
|
115 fasta => 'fasta',
|
|
116 namespace => 'swall',
|
|
117 },
|
|
118 'refseq' => {
|
|
119 default => 'genbank',
|
|
120 genbank => 'genbank',
|
|
121 fasta => 'fasta',
|
|
122 namespace => 'RefSeq',
|
|
123 },
|
|
124 'swall' => {
|
|
125 default => 'swiss',
|
|
126 swissprot => 'swiss',
|
|
127 fasta => 'fasta',
|
|
128 namespace => 'swall',
|
|
129 },
|
|
130 'genbank' => {
|
|
131 default => 'genbank',
|
|
132 genbank => 'genbank',
|
|
133 namespace => 'genbank',
|
|
134 },
|
|
135 'genpep' => {
|
|
136 default => 'genbank',
|
|
137 genbank => 'genbank',
|
|
138 namespace => 'genpep',
|
|
139 },
|
|
140 );
|
|
141 }
|
|
142
|
|
143 =head2 new
|
|
144
|
|
145 Title : new
|
|
146 Usage : $bf = Bio::DB::BioFetch->new(@args)
|
|
147 Function: Construct a new Bio::DB::BioFetch object
|
|
148 Returns : a Bio::DB::BioFetch object
|
|
149 Args : see below
|
|
150 Throws :
|
|
151
|
|
152 @args are standard -name=E<gt>value options as listed in the following
|
|
153 table. If you do not provide any options, the module assumes reasonable
|
|
154 defaults.
|
|
155
|
|
156 Option Value Default
|
|
157 ------ ----- -------
|
|
158
|
|
159 -baseaddress location of dbfetch server http://www.ebi.ac.uk/cgi-bin/dbfetch
|
|
160 -retrievaltype "tempfile" or "io_string" io_string
|
|
161 -format "embl", "fasta", "swissprot", embl
|
|
162 or "genbank"
|
|
163 -db "embl", "genbank" or "swissprot" embl
|
|
164
|
|
165 =cut
|
|
166
|
|
167 #'
|
|
168 sub new {
|
|
169 my ($class,@args) = @_;
|
|
170 my $self = $class->SUPER::new(@args);
|
|
171 my ($db) = $self->_rearrange([qw(DB)],@args);
|
|
172 $db ||= $self->default_db;
|
|
173 $self->db($db);
|
|
174 $self->url_base_address(DEFAULT_LOCATION) unless $self->url_base_address;
|
|
175 $self;
|
|
176 }
|
|
177
|
|
178 =head2 new_from_registry
|
|
179
|
|
180 Title : new_from_registry
|
|
181 Usage : $biofetch = $db->new_from_registry(%config)
|
|
182 Function: Creates a BioFetch object from the registry config hash
|
|
183 Returns : itself
|
|
184 Args : A configuration hash (see Registry.pm)
|
|
185 Throws :
|
|
186
|
|
187
|
|
188 =cut
|
|
189
|
|
190 sub new_from_registry {
|
|
191 my ($class,%config)=@_;
|
|
192
|
|
193 my $self = $class->SUPER::new(
|
|
194 -BASEADDRESS=>$config{'location'}
|
|
195 );
|
|
196 $self->db($config{'dbname'}) if $config{dbname};
|
|
197 return $self;
|
|
198 }
|
|
199
|
|
200 # from Bio::DB::RandomAccessI
|
|
201
|
|
202 =head2 get_Seq_by_id
|
|
203
|
|
204 Title : get_Seq_by_id
|
|
205 Usage : $seq = $db->get_Seq_by_id('ROA1_HUMAN')
|
|
206 Function: Gets a Bio::Seq object by its name
|
|
207 Returns : a Bio::Seq object
|
|
208 Args : the id (as a string) of a sequence
|
|
209 Throws : "id does not exist" exception
|
|
210
|
|
211
|
|
212 =cut
|
|
213
|
|
214 =head2 get_Seq_by_acc
|
|
215
|
|
216 Title : get_Seq_by_acc
|
|
217 Usage : $seq = $db->get_Seq_by_acc('X77802');
|
|
218 Function: Gets a Bio::Seq object by accession number
|
|
219 Returns : A Bio::Seq object
|
|
220 Args : accession number (as a string)
|
|
221 Throws : "acc does not exist" exception
|
|
222
|
|
223 =cut
|
|
224
|
|
225 =head2 get_Seq_by_gi
|
|
226
|
|
227 Title : get_Seq_by_gi
|
|
228 Usage : $seq = $db->get_Seq_by_gi('405830');
|
|
229 Function: Gets a Bio::Seq object by gi number
|
|
230 Returns : A Bio::Seq object
|
|
231 Args : gi number (as a string)
|
|
232 Throws : "gi does not exist" exception
|
|
233
|
|
234 =cut
|
|
235
|
|
236 =head2 get_Seq_by_version
|
|
237
|
|
238 Title : get_Seq_by_version
|
|
239 Usage : $seq = $db->get_Seq_by_version('X77802.1');
|
|
240 Function: Gets a Bio::Seq object by sequence version
|
|
241 Returns : A Bio::Seq object
|
|
242 Args : accession.version (as a string)
|
|
243 Throws : "acc.version does not exist" exception
|
|
244
|
|
245 =cut
|
|
246
|
|
247 sub get_Seq_by_version {
|
|
248 my ($self,$seqid) = @_;
|
|
249 return $self->get_Seq_by_acc($seqid);
|
|
250 }
|
|
251
|
|
252
|
|
253 =head2 get_Stream_by_id
|
|
254
|
|
255 Title : get_Stream_by_id
|
|
256 Usage : $stream = $db->get_Stream_by_id( [$uid1, $uid2] );
|
|
257 Function: Gets a series of Seq objects by unique identifiers
|
|
258 Returns : a Bio::SeqIO stream object
|
|
259 Args : $ref : a reference to an array of unique identifiers for
|
|
260 the desired sequence entries
|
|
261
|
|
262 =cut
|
|
263
|
|
264 =head2 get_Stream_by_gi
|
|
265
|
|
266 Title : get_Stream_by_gi
|
|
267 Usage : $seq = $db->get_Seq_by_gi([$gi1, $gi2]);
|
|
268 Function: Gets a series of Seq objects by gi numbers
|
|
269 Returns : a Bio::SeqIO stream object
|
|
270 Args : $ref : a reference to an array of gi numbers for
|
|
271 the desired sequence entries
|
|
272 Note : For GenBank, this just calls the same code for get_Stream_by_id()
|
|
273
|
|
274 =cut
|
|
275
|
|
276 =head2 get_Stream_by_batch
|
|
277
|
|
278 Title : get_Stream_by_batch
|
|
279 Usage : $seq = $db->get_Stream_by_batch($ref);
|
|
280 Function: Get a series of Seq objects by their IDs
|
|
281 Example :
|
|
282 Returns : a Bio::SeqIO stream object
|
|
283 Args : $ref : an array reference containing a list of unique
|
|
284 ids/accession numbers.
|
|
285
|
|
286 In some of the Bio::DB::* moduels, get_Stream_by_id() is called
|
|
287 get_Stream_by_batch(). Since there seems to be no consensus, this
|
|
288 is provided as an alias.
|
|
289
|
|
290 =cut
|
|
291
|
|
292 *get_Stream_by_batch = \&Bio::DB::WebDBSeqI::get_Stream_by_id;
|
|
293
|
|
294 =head1 The remainder of these methods are for internal use
|
|
295
|
|
296 =head2 get_request
|
|
297
|
|
298 Title : get_request
|
|
299 Usage : my $url = $self->get_request
|
|
300 Function: returns a HTTP::Request object
|
|
301 Returns :
|
|
302 Args : %qualifiers = a hash of qualifiers (ids, format, etc)
|
|
303
|
|
304 =cut
|
|
305
|
|
306
|
|
307 sub get_request {
|
|
308 my ($self, @qualifiers) = @_;
|
|
309 my ($uids, $format) = $self->_rearrange([qw(UIDS FORMAT)],
|
|
310 @qualifiers);
|
|
311 my $db = $self->db;
|
|
312 my $namespace = $self->_namespace;
|
|
313
|
|
314 $self->throw("Must specify a value for UIDs to fetch")
|
|
315 unless defined $uids;
|
|
316 my $tmp;
|
|
317 my $format_string = '';
|
|
318
|
|
319 $format ||= $self->default_format;
|
|
320 ($format, $tmp) = $self->request_format($format);
|
|
321
|
|
322 my $base = $self->url_base_address;
|
|
323 my $uid = join('+', ref $uids ? @$uids : $uids);
|
|
324 $self->debug("\n$base$format_string&id=$uid\n");
|
|
325 return POST($base,
|
|
326 [ db => $namespace,
|
|
327 id => join('+',ref $uids ? @$uids : $uids),
|
|
328 format => $format,
|
|
329 style => 'raw'
|
|
330 ]);
|
|
331 }
|
|
332
|
|
333 =head2 default_format
|
|
334
|
|
335 Title : default_format
|
|
336 Usage : $format = $self->default_format
|
|
337 Function: return the default format
|
|
338 Returns : a string
|
|
339 Args :
|
|
340
|
|
341 =cut
|
|
342
|
|
343 sub default_format {
|
|
344 return 'default';
|
|
345 }
|
|
346
|
|
347 =head2 default_db
|
|
348
|
|
349 Title : default_db
|
|
350 Usage : $db = $self->default_db
|
|
351 Function: return the default database
|
|
352 Returns : a string
|
|
353 Args :
|
|
354
|
|
355 =cut
|
|
356
|
|
357 sub default_db { 'embl' }
|
|
358
|
|
359 =head2 db
|
|
360
|
|
361 Title : db
|
|
362 Usage : $db = $self->db([$db])
|
|
363 Function: get/set the database
|
|
364 Returns : a string
|
|
365 Args : new database
|
|
366
|
|
367 =cut
|
|
368
|
|
369 sub db {
|
|
370 my $self = shift;
|
|
371
|
|
372 if (@_) {
|
|
373
|
|
374 my $db = lc shift;
|
|
375 $FORMATMAP{$db} or $self->throw("invalid db [$db], must be one of [".
|
|
376 join(' ',keys %FORMATMAP). "]");
|
|
377 $self->{_db} = $db;
|
|
378 }
|
|
379 return $self->{_db} || $self->default_db ;
|
|
380 }
|
|
381
|
|
382 sub _namespace {
|
|
383 my $self = shift;
|
|
384 my $db = $self->db;
|
|
385 return $FORMATMAP{$db}{namespace} or $db;
|
|
386 }
|
|
387
|
|
388 =head2 postprocess_data
|
|
389
|
|
390 Title : postprocess_data
|
|
391 Usage : $self->postprocess_data ( 'type' => 'string',
|
|
392 'location' => \$datastr);
|
|
393 Function: process downloaded data before loading into a Bio::SeqIO
|
|
394 Returns : void
|
|
395 Args : hash with two keys - 'type' can be 'string' or 'file'
|
|
396 - 'location' either file location or string
|
|
397 reference containing data
|
|
398
|
|
399 =cut
|
|
400
|
|
401 sub postprocess_data {
|
|
402 my ($self,%args) = @_;
|
|
403
|
|
404 # check for errors in the stream
|
|
405 if ($args{'type'} eq 'string') {
|
|
406 my $stringref = $args{'location'};
|
|
407 if ($$stringref =~ /^ERROR (\d+) (.+)/m) {
|
|
408 $self->throw("BioFetch Error $1: $2");
|
|
409 }
|
|
410 }
|
|
411
|
|
412 elsif ($args{'type'} eq 'file') {
|
|
413 open (F,$args{'location'}) or $self->throw("Couldn't open $args{location}: $!");
|
|
414 # this is dumb, but the error may be anywhere on the first three lines because the
|
|
415 # CGI headers are sometimes printed out by the server...
|
|
416 my @data = (scalar <F>,scalar <F>,scalar <F>);
|
|
417 if (join('',@data) =~ /^ERROR (\d+) (.+)/m) {
|
|
418 $self->throw("BioFetch Error $1: $2");
|
|
419 }
|
|
420 close F;
|
|
421 }
|
|
422
|
|
423 else {
|
|
424 $self->throw("Don't know how to postprocess data of type $args{'type'}");
|
|
425 }
|
|
426 }
|
|
427
|
|
428
|
|
429 =head2 request_format
|
|
430
|
|
431 Title : request_format
|
|
432 Usage : my ($req_format, $ioformat) = $self->request_format;
|
|
433 $self->request_format("genbank");
|
|
434 $self->request_format("fasta");
|
|
435 Function: Get/Set sequence format retrieval. The get-form will normally not
|
|
436 be used outside of this and derived modules.
|
|
437 Returns : Array of two strings, the first representing the format for
|
|
438 retrieval, and the second specifying the corresponding SeqIO format.
|
|
439 Args : $format = sequence format
|
|
440
|
|
441 =cut
|
|
442
|
|
443 sub request_format {
|
|
444 my ($self, $value) = @_;
|
|
445 if ( defined $value ) {
|
|
446 my $db = $self->db;
|
|
447 my $namespace = $self->_namespace;
|
|
448 my $format = lc $value;
|
|
449 print "format:", $format, " module:", $FORMATMAP{$db}->{$format}, " ($namespace)\n"
|
|
450 if $self->verbose > 0;
|
|
451 $self->throw("Invalid format [$format], must be one of [".
|
|
452 join(' ',keys %{$FORMATMAP{$db}}). "]")
|
|
453 unless $format eq 'default' || $FORMATMAP{$db}->{$format};
|
|
454
|
|
455 $self->{'_format'} = [ $format, $FORMATMAP{$db}->{$format}];
|
|
456 }
|
|
457 return @{$self->{'_format'}};
|
|
458 }
|
|
459
|
|
460
|
|
461 =head2 Bio::DB::WebDBSeqI methods
|
|
462
|
|
463 Overriding WebDBSeqI method to help newbies to retrieve sequences.
|
|
464 EMBL database is all too often passed RefSeq accessions. This
|
|
465 redirects those calls. See L<Bio::DB::RefSeq>.
|
|
466
|
|
467
|
|
468 =head2 get_Stream_by_acc
|
|
469
|
|
470 Title : get_Stream_by_acc
|
|
471 Usage : $seq = $db->get_Seq_by_acc([$acc1, $acc2]);
|
|
472 Function: Gets a series of Seq objects by accession numbers
|
|
473 Returns : a Bio::SeqIO stream object
|
|
474 Args : $ref : a reference to an array of accession numbers for
|
|
475 the desired sequence entries
|
|
476
|
|
477 =cut
|
|
478
|
|
479 sub get_Stream_by_acc {
|
|
480 my ($self, $ids ) = @_;
|
|
481 $self->_check_id($ids);
|
|
482 return $self->get_seq_stream('-uids' => $ids, '-mode' => 'single');
|
|
483 }
|
|
484
|
|
485
|
|
486 =head2 _check_id
|
|
487
|
|
488 Title : _check_id
|
|
489 Usage :
|
|
490 Function: Throw on whole chromosome NCBI sequences not in sequence databases
|
|
491 and redirect RefSeq accession requests sent to EMBL.
|
|
492 Returns :
|
|
493 Args : $id(s), $string
|
|
494 Throws : if accessionn number indicates whole chromosome NCBI sequence
|
|
495
|
|
496 =cut
|
|
497
|
|
498 sub _check_id {
|
|
499 my ($self, $id) = @_;
|
|
500
|
|
501 # NT contigs can not be retrieved
|
|
502 $self->throw("NT_ contigs are whole chromosome files which are not part of regular".
|
|
503 "database distributions. Go to ftp://ftp.ncbi.nih.gov/genomes/.")
|
|
504 if $id =~ /NT_/;
|
|
505
|
|
506 # Asking for a RefSeq from EMBL/GenBank
|
|
507
|
|
508 if ($id =~ /N._/ && $self->db ne 'refseq') {
|
|
509 $self->warn("[$id] is not a normal sequence database but a RefSeq entry.".
|
|
510 " Redirecting the request.\n")
|
|
511 if $self->verbose >= 0;
|
|
512 $self->db('RefSeq');
|
|
513 }
|
|
514 }
|
|
515
|
|
516 1;
|