comparison variant_effect_predictor/Bio/DB/BioFetch.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 # $Id: BioFetch.pm,v 1.13.2.1 2003/06/25 13:44:18 heikki Exp $
2 #
3 # BioPerl module for Bio::DB::BioFetch
4 #
5 # Cared for by Lincoln Stein <lstein@cshl.org>
6 #
7 # Copyright Lincoln Stein
8 #
9 # You may distribute this module under the same terms as perl itself
10 #
11 # POD documentation - main docs before the code
12 #
13
14 package Bio::DB::BioFetch;
15 use strict;
16 use Bio::DB::WebDBSeqI;
17 use HTTP::Request::Common 'POST';
18
19 =head1 NAME
20
21 Bio::DB::BioFetch - Database object interface to BioFetch retrieval
22
23 =head1 SYNOPSIS
24
25 use Bio::DB::BioFetch;
26
27 $bf = new Bio::DB::BioFetch;
28
29 $seq = $sp->get_Seq_by_id('BUM'); # EMBL or SWALL ID
30
31 # change formats, storage procedures
32 $bf = new Bio::DB::BioFetch(-format => 'fasta',
33 -retrievaltype => 'tempfile',
34 -db => 'EMBL');
35
36 $stream = $bf->get_Stream_by_id(['BUM','J00231']);
37 while (my $s = $stream->next_seq) {
38 print $s->seq,"\n";
39 }
40 # get a RefSeq entry
41 $bf->db('refseq');
42 eval {
43 $seq = $bf->get_Seq_by_version('NM_006732.1'); # RefSeq VERSION
44 };
45 print "accession is ", $seq->accession_number, "\n" unless $@;
46
47
48 =head1 DESCRIPTION
49
50 Bio::DB::BioFetch is a guaranteed best effort sequence entry fetching
51 method. It goes to the Web-based dbfetch server located at the EBI
52 (http://www.ebi.ac.uk/cgi-bin/dbfetch) to retrieve sequences in the
53 EMBL or GenBank sequence repositories.
54
55 This module implements all the Bio::DB::RandomAccessI interface, plus
56 the get_Stream_by_id() and get_Stream_by_acc() methods that are found
57 in the Bio::DB::SwissProt interface.
58
59 =head1 FEEDBACK
60
61 =head2 Mailing Lists
62
63 User feedback is an integral part of the evolution of this and other
64 Bioperl modules. Send your comments and suggestions preferably to one
65 of the Bioperl mailing lists. Your participation is much appreciated.
66
67
68 bioperl-l@bioperl.org - General discussion
69 http://bio.perl.org/MailList.html - About the mailing lists
70
71 =head2 Reporting Bugs
72
73 Report bugs to the Bioperl bug tracking system to help us keep track
74 the bugs and their resolution. Bug reports can be submitted via email
75 or the web:
76
77 bioperl-bugs@bio.perl.org
78 http://bugzilla.bioperl.org/
79
80 =head1 AUTHOR - Lincoln Stein
81
82 Email Lincoln Stein E<lt>lstein@cshl.orgE<lt>
83
84 Also thanks to Heikki Lehvaslaiho E<lt>heikki@ebi.ac.ukE<gt> for the
85 BioFetch server and interface specification.
86
87 =head1 APPENDIX
88
89 The rest of the documentation details each of the object
90 methods. Internal methods are usually preceded with a _
91
92 =cut
93
94 # Let the code begin...
95 use vars qw(@ISA $VERSION %FORMATMAP );
96 use Bio::Root::Root;
97 @ISA = qw(Bio::DB::WebDBSeqI Bio::Root::Root);
98 $VERSION = '1.0';
99
100 # warning: names used here must map into Bio::SeqIO::* space
101 use constant DEFAULT_LOCATION => 'http://www.ebi.ac.uk/cgi-bin/dbfetch';
102
103 BEGIN {
104
105 %FORMATMAP = (
106 'embl' => {
107 default => 'embl', # default BioFetch format/SeqIOmodule pair
108 embl => 'embl', # alternative BioFetch format/module pair
109 fasta => 'fasta', # alternative BioFetch format/module pair
110 namespace => 'embl',
111 },
112 'swissprot' => {
113 default => 'swiss',
114 swissprot => 'swiss',
115 fasta => 'fasta',
116 namespace => 'swall',
117 },
118 'refseq' => {
119 default => 'genbank',
120 genbank => 'genbank',
121 fasta => 'fasta',
122 namespace => 'RefSeq',
123 },
124 'swall' => {
125 default => 'swiss',
126 swissprot => 'swiss',
127 fasta => 'fasta',
128 namespace => 'swall',
129 },
130 'genbank' => {
131 default => 'genbank',
132 genbank => 'genbank',
133 namespace => 'genbank',
134 },
135 'genpep' => {
136 default => 'genbank',
137 genbank => 'genbank',
138 namespace => 'genpep',
139 },
140 );
141 }
142
143 =head2 new
144
145 Title : new
146 Usage : $bf = Bio::DB::BioFetch->new(@args)
147 Function: Construct a new Bio::DB::BioFetch object
148 Returns : a Bio::DB::BioFetch object
149 Args : see below
150 Throws :
151
152 @args are standard -name=E<gt>value options as listed in the following
153 table. If you do not provide any options, the module assumes reasonable
154 defaults.
155
156 Option Value Default
157 ------ ----- -------
158
159 -baseaddress location of dbfetch server http://www.ebi.ac.uk/cgi-bin/dbfetch
160 -retrievaltype "tempfile" or "io_string" io_string
161 -format "embl", "fasta", "swissprot", embl
162 or "genbank"
163 -db "embl", "genbank" or "swissprot" embl
164
165 =cut
166
167 #'
168 sub new {
169 my ($class,@args) = @_;
170 my $self = $class->SUPER::new(@args);
171 my ($db) = $self->_rearrange([qw(DB)],@args);
172 $db ||= $self->default_db;
173 $self->db($db);
174 $self->url_base_address(DEFAULT_LOCATION) unless $self->url_base_address;
175 $self;
176 }
177
178 =head2 new_from_registry
179
180 Title : new_from_registry
181 Usage : $biofetch = $db->new_from_registry(%config)
182 Function: Creates a BioFetch object from the registry config hash
183 Returns : itself
184 Args : A configuration hash (see Registry.pm)
185 Throws :
186
187
188 =cut
189
190 sub new_from_registry {
191 my ($class,%config)=@_;
192
193 my $self = $class->SUPER::new(
194 -BASEADDRESS=>$config{'location'}
195 );
196 $self->db($config{'dbname'}) if $config{dbname};
197 return $self;
198 }
199
200 # from Bio::DB::RandomAccessI
201
202 =head2 get_Seq_by_id
203
204 Title : get_Seq_by_id
205 Usage : $seq = $db->get_Seq_by_id('ROA1_HUMAN')
206 Function: Gets a Bio::Seq object by its name
207 Returns : a Bio::Seq object
208 Args : the id (as a string) of a sequence
209 Throws : "id does not exist" exception
210
211
212 =cut
213
214 =head2 get_Seq_by_acc
215
216 Title : get_Seq_by_acc
217 Usage : $seq = $db->get_Seq_by_acc('X77802');
218 Function: Gets a Bio::Seq object by accession number
219 Returns : A Bio::Seq object
220 Args : accession number (as a string)
221 Throws : "acc does not exist" exception
222
223 =cut
224
225 =head2 get_Seq_by_gi
226
227 Title : get_Seq_by_gi
228 Usage : $seq = $db->get_Seq_by_gi('405830');
229 Function: Gets a Bio::Seq object by gi number
230 Returns : A Bio::Seq object
231 Args : gi number (as a string)
232 Throws : "gi does not exist" exception
233
234 =cut
235
236 =head2 get_Seq_by_version
237
238 Title : get_Seq_by_version
239 Usage : $seq = $db->get_Seq_by_version('X77802.1');
240 Function: Gets a Bio::Seq object by sequence version
241 Returns : A Bio::Seq object
242 Args : accession.version (as a string)
243 Throws : "acc.version does not exist" exception
244
245 =cut
246
247 sub get_Seq_by_version {
248 my ($self,$seqid) = @_;
249 return $self->get_Seq_by_acc($seqid);
250 }
251
252
253 =head2 get_Stream_by_id
254
255 Title : get_Stream_by_id
256 Usage : $stream = $db->get_Stream_by_id( [$uid1, $uid2] );
257 Function: Gets a series of Seq objects by unique identifiers
258 Returns : a Bio::SeqIO stream object
259 Args : $ref : a reference to an array of unique identifiers for
260 the desired sequence entries
261
262 =cut
263
264 =head2 get_Stream_by_gi
265
266 Title : get_Stream_by_gi
267 Usage : $seq = $db->get_Seq_by_gi([$gi1, $gi2]);
268 Function: Gets a series of Seq objects by gi numbers
269 Returns : a Bio::SeqIO stream object
270 Args : $ref : a reference to an array of gi numbers for
271 the desired sequence entries
272 Note : For GenBank, this just calls the same code for get_Stream_by_id()
273
274 =cut
275
276 =head2 get_Stream_by_batch
277
278 Title : get_Stream_by_batch
279 Usage : $seq = $db->get_Stream_by_batch($ref);
280 Function: Get a series of Seq objects by their IDs
281 Example :
282 Returns : a Bio::SeqIO stream object
283 Args : $ref : an array reference containing a list of unique
284 ids/accession numbers.
285
286 In some of the Bio::DB::* moduels, get_Stream_by_id() is called
287 get_Stream_by_batch(). Since there seems to be no consensus, this
288 is provided as an alias.
289
290 =cut
291
292 *get_Stream_by_batch = \&Bio::DB::WebDBSeqI::get_Stream_by_id;
293
294 =head1 The remainder of these methods are for internal use
295
296 =head2 get_request
297
298 Title : get_request
299 Usage : my $url = $self->get_request
300 Function: returns a HTTP::Request object
301 Returns :
302 Args : %qualifiers = a hash of qualifiers (ids, format, etc)
303
304 =cut
305
306
307 sub get_request {
308 my ($self, @qualifiers) = @_;
309 my ($uids, $format) = $self->_rearrange([qw(UIDS FORMAT)],
310 @qualifiers);
311 my $db = $self->db;
312 my $namespace = $self->_namespace;
313
314 $self->throw("Must specify a value for UIDs to fetch")
315 unless defined $uids;
316 my $tmp;
317 my $format_string = '';
318
319 $format ||= $self->default_format;
320 ($format, $tmp) = $self->request_format($format);
321
322 my $base = $self->url_base_address;
323 my $uid = join('+', ref $uids ? @$uids : $uids);
324 $self->debug("\n$base$format_string&id=$uid\n");
325 return POST($base,
326 [ db => $namespace,
327 id => join('+',ref $uids ? @$uids : $uids),
328 format => $format,
329 style => 'raw'
330 ]);
331 }
332
333 =head2 default_format
334
335 Title : default_format
336 Usage : $format = $self->default_format
337 Function: return the default format
338 Returns : a string
339 Args :
340
341 =cut
342
343 sub default_format {
344 return 'default';
345 }
346
347 =head2 default_db
348
349 Title : default_db
350 Usage : $db = $self->default_db
351 Function: return the default database
352 Returns : a string
353 Args :
354
355 =cut
356
357 sub default_db { 'embl' }
358
359 =head2 db
360
361 Title : db
362 Usage : $db = $self->db([$db])
363 Function: get/set the database
364 Returns : a string
365 Args : new database
366
367 =cut
368
369 sub db {
370 my $self = shift;
371
372 if (@_) {
373
374 my $db = lc shift;
375 $FORMATMAP{$db} or $self->throw("invalid db [$db], must be one of [".
376 join(' ',keys %FORMATMAP). "]");
377 $self->{_db} = $db;
378 }
379 return $self->{_db} || $self->default_db ;
380 }
381
382 sub _namespace {
383 my $self = shift;
384 my $db = $self->db;
385 return $FORMATMAP{$db}{namespace} or $db;
386 }
387
388 =head2 postprocess_data
389
390 Title : postprocess_data
391 Usage : $self->postprocess_data ( 'type' => 'string',
392 'location' => \$datastr);
393 Function: process downloaded data before loading into a Bio::SeqIO
394 Returns : void
395 Args : hash with two keys - 'type' can be 'string' or 'file'
396 - 'location' either file location or string
397 reference containing data
398
399 =cut
400
401 sub postprocess_data {
402 my ($self,%args) = @_;
403
404 # check for errors in the stream
405 if ($args{'type'} eq 'string') {
406 my $stringref = $args{'location'};
407 if ($$stringref =~ /^ERROR (\d+) (.+)/m) {
408 $self->throw("BioFetch Error $1: $2");
409 }
410 }
411
412 elsif ($args{'type'} eq 'file') {
413 open (F,$args{'location'}) or $self->throw("Couldn't open $args{location}: $!");
414 # this is dumb, but the error may be anywhere on the first three lines because the
415 # CGI headers are sometimes printed out by the server...
416 my @data = (scalar <F>,scalar <F>,scalar <F>);
417 if (join('',@data) =~ /^ERROR (\d+) (.+)/m) {
418 $self->throw("BioFetch Error $1: $2");
419 }
420 close F;
421 }
422
423 else {
424 $self->throw("Don't know how to postprocess data of type $args{'type'}");
425 }
426 }
427
428
429 =head2 request_format
430
431 Title : request_format
432 Usage : my ($req_format, $ioformat) = $self->request_format;
433 $self->request_format("genbank");
434 $self->request_format("fasta");
435 Function: Get/Set sequence format retrieval. The get-form will normally not
436 be used outside of this and derived modules.
437 Returns : Array of two strings, the first representing the format for
438 retrieval, and the second specifying the corresponding SeqIO format.
439 Args : $format = sequence format
440
441 =cut
442
443 sub request_format {
444 my ($self, $value) = @_;
445 if ( defined $value ) {
446 my $db = $self->db;
447 my $namespace = $self->_namespace;
448 my $format = lc $value;
449 print "format:", $format, " module:", $FORMATMAP{$db}->{$format}, " ($namespace)\n"
450 if $self->verbose > 0;
451 $self->throw("Invalid format [$format], must be one of [".
452 join(' ',keys %{$FORMATMAP{$db}}). "]")
453 unless $format eq 'default' || $FORMATMAP{$db}->{$format};
454
455 $self->{'_format'} = [ $format, $FORMATMAP{$db}->{$format}];
456 }
457 return @{$self->{'_format'}};
458 }
459
460
461 =head2 Bio::DB::WebDBSeqI methods
462
463 Overriding WebDBSeqI method to help newbies to retrieve sequences.
464 EMBL database is all too often passed RefSeq accessions. This
465 redirects those calls. See L<Bio::DB::RefSeq>.
466
467
468 =head2 get_Stream_by_acc
469
470 Title : get_Stream_by_acc
471 Usage : $seq = $db->get_Seq_by_acc([$acc1, $acc2]);
472 Function: Gets a series of Seq objects by accession numbers
473 Returns : a Bio::SeqIO stream object
474 Args : $ref : a reference to an array of accession numbers for
475 the desired sequence entries
476
477 =cut
478
479 sub get_Stream_by_acc {
480 my ($self, $ids ) = @_;
481 $self->_check_id($ids);
482 return $self->get_seq_stream('-uids' => $ids, '-mode' => 'single');
483 }
484
485
486 =head2 _check_id
487
488 Title : _check_id
489 Usage :
490 Function: Throw on whole chromosome NCBI sequences not in sequence databases
491 and redirect RefSeq accession requests sent to EMBL.
492 Returns :
493 Args : $id(s), $string
494 Throws : if accessionn number indicates whole chromosome NCBI sequence
495
496 =cut
497
498 sub _check_id {
499 my ($self, $id) = @_;
500
501 # NT contigs can not be retrieved
502 $self->throw("NT_ contigs are whole chromosome files which are not part of regular".
503 "database distributions. Go to ftp://ftp.ncbi.nih.gov/genomes/.")
504 if $id =~ /NT_/;
505
506 # Asking for a RefSeq from EMBL/GenBank
507
508 if ($id =~ /N._/ && $self->db ne 'refseq') {
509 $self->warn("[$id] is not a normal sequence database but a RefSeq entry.".
510 " Redirecting the request.\n")
511 if $self->verbose >= 0;
512 $self->db('RefSeq');
513 }
514 }
515
516 1;