0
|
1 # $Id: WebDBSeqI.pm,v 1.30.2.1 2003/06/12 09:29:38 heikki Exp $
|
|
2 #
|
|
3 # BioPerl module for Bio::DB::WebDBSeqI
|
|
4 #
|
|
5 # Cared for by Jason Stajich <jason@bioperl.org>
|
|
6 #
|
|
7 # Copyright Jason Stajich
|
|
8 #
|
|
9 # You may distribute this module under the same terms as perl itself
|
|
10 #
|
|
11 # POD documentation - main docs before the code
|
|
12 #
|
|
13
|
|
14 =head1 NAME
|
|
15
|
|
16 Bio::DB::WebDBSeqI - Object Interface to generalize Web Databases
|
|
17 for retrieving sequences
|
|
18
|
|
19 =head1 SYNOPSIS
|
|
20
|
|
21 # get a WebDBSeqI object somehow
|
|
22 # assuming it is a nucleotide db
|
|
23 my $seq = $db->get_Seq_by_id('ROA1_HUMAN')
|
|
24
|
|
25 =head1 DESCRIPTION
|
|
26
|
|
27
|
|
28
|
|
29
|
|
30 Provides core set of functionality for connecting to a web based
|
|
31 database for retriving sequences.
|
|
32
|
|
33 Users wishing to add another Web Based Sequence Dabatase will need to
|
|
34 extend this class (see Bio::DB::SwissProt or Bio::DB::NCBIHelper for
|
|
35 examples) and implement the get_request method which returns a
|
|
36 HTTP::Request for the specified uids (accessions, ids, etc depending
|
|
37 on what query types the database accepts).
|
|
38
|
|
39
|
|
40
|
|
41 =head1 FEEDBACK
|
|
42
|
|
43 =head2 Mailing Lists
|
|
44
|
|
45 User feedback is an integral part of the
|
|
46 evolution of this and other Bioperl modules. Send
|
|
47 your comments and suggestions preferably to one
|
|
48 of the Bioperl mailing lists. Your participation
|
|
49 is much appreciated.
|
|
50
|
|
51 bioperl-l@bioperl.org - General discussion
|
|
52 http://bioperl.org/MailList.shtml - About the mailing lists
|
|
53
|
|
54 =head2 Reporting Bugs
|
|
55
|
|
56 Report bugs to the Bioperl bug tracking system to
|
|
57 help us keep track the bugs and their resolution.
|
|
58 Bug reports can be submitted via email or the
|
|
59 web:
|
|
60
|
|
61 bioperl-bugs@bio.perl.org
|
|
62 http://bugzilla.bioperl.org/
|
|
63
|
|
64 =head1 AUTHOR - Jason Stajich
|
|
65
|
|
66 Email E<lt> jason@bioperl.org E<gt>
|
|
67
|
|
68 =head1 APPENDIX
|
|
69
|
|
70 The rest of the documentation details each of the
|
|
71 object methods. Internal methods are usually
|
|
72 preceded with a _
|
|
73
|
|
74 =cut
|
|
75
|
|
76 # Let the code begin...
|
|
77
|
|
78 package Bio::DB::WebDBSeqI;
|
|
79 use strict;
|
|
80 use vars qw(@ISA $MODVERSION %RETRIEVAL_TYPES $DEFAULT_RETRIEVAL_TYPE
|
|
81 $DEFAULTFORMAT $LAST_INVOCATION_TIME);
|
|
82
|
|
83 use Bio::DB::RandomAccessI;
|
|
84 use Bio::SeqIO;
|
|
85 use Bio::Root::IO;
|
|
86 use LWP::UserAgent;
|
|
87 use HTTP::Request::Common;
|
|
88 use HTTP::Response;
|
|
89 use File::Spec;
|
|
90 use IO::String;
|
|
91 use Bio::Root::Root;
|
|
92
|
|
93 @ISA = qw(Bio::DB::RandomAccessI);
|
|
94
|
|
95 BEGIN {
|
|
96 $MODVERSION = '0.8';
|
|
97 %RETRIEVAL_TYPES = ( 'io_string' => 1,
|
|
98 'tempfile' => 1,
|
|
99 'pipeline' => 1,
|
|
100 );
|
|
101 $DEFAULT_RETRIEVAL_TYPE = 'pipeline';
|
|
102 $DEFAULTFORMAT = 'fasta';
|
|
103 $LAST_INVOCATION_TIME = 0;
|
|
104 }
|
|
105
|
|
106 sub new {
|
|
107 my ($class, @args) = @_;
|
|
108 my $self = $class->SUPER::new(@args);
|
|
109 my ($baseaddress, $params, $ret_type, $format,$delay,$db) =
|
|
110 $self->_rearrange([qw(BASEADDRESS PARAMS RETRIEVALTYPE FORMAT DELAY DB)],
|
|
111 @args);
|
|
112
|
|
113 $ret_type = $DEFAULT_RETRIEVAL_TYPE unless ( $ret_type);
|
|
114 $baseaddress && $self->url_base_address($baseaddress);
|
|
115 $params && $self->url_params($params);
|
|
116 $db && $self->db($db);
|
|
117 $ret_type && $self->retrieval_type($ret_type);
|
|
118 $delay = $self->delay_policy unless defined $delay;
|
|
119 $self->delay($delay);
|
|
120
|
|
121 # insure we always have a default format set for retrieval
|
|
122 # even though this will be immedietly overwritten by most sub classes
|
|
123 $format = $self->default_format unless ( defined $format &&
|
|
124 $format ne '' );
|
|
125
|
|
126 $self->request_format($format);
|
|
127 my $ua = new LWP::UserAgent;
|
|
128 $ua->agent(ref($self) ."/$MODVERSION");
|
|
129 $self->ua($ua);
|
|
130 $self->{'_authentication'} = [];
|
|
131 return $self;
|
|
132 }
|
|
133
|
|
134 # from Bio::DB::RandomAccessI
|
|
135
|
|
136 =head2 get_Seq_by_id
|
|
137
|
|
138 Title : get_Seq_by_id
|
|
139 Usage : $seq = $db->get_Seq_by_id('ROA1_HUMAN')
|
|
140 Function: Gets a Bio::Seq object by its name
|
|
141 Returns : a Bio::Seq object
|
|
142 Args : the id (as a string) of a sequence
|
|
143 Throws : "id does not exist" exception
|
|
144
|
|
145
|
|
146 =cut
|
|
147
|
|
148 sub get_Seq_by_id {
|
|
149 my ($self,$seqid) = @_;
|
|
150 $self->_sleep;
|
|
151 my $seqio = $self->get_Stream_by_id([$seqid]);
|
|
152 $self->throw("id does not exist") if( !defined $seqio ) ;
|
|
153 my @seqs;
|
|
154 while( my $seq = $seqio->next_seq() ) { push @seqs, $seq; }
|
|
155 $self->throw("id does not exist") unless @seqs;
|
|
156 if( wantarray ) { return @seqs } else { return shift @seqs }
|
|
157 }
|
|
158
|
|
159 =head2 get_Seq_by_acc
|
|
160
|
|
161 Title : get_Seq_by_acc
|
|
162 Usage : $seq = $db->get_Seq_by_acc('X77802');
|
|
163 Function: Gets a Bio::Seq object by accession number
|
|
164 Returns : A Bio::Seq object
|
|
165 Args : accession number (as a string)
|
|
166 Throws : "acc does not exist" exception
|
|
167
|
|
168 =cut
|
|
169
|
|
170 sub get_Seq_by_acc {
|
|
171 my ($self,$seqid) = @_;
|
|
172 $self->_sleep;
|
|
173 my $seqio = $self->get_Stream_by_acc($seqid);
|
|
174 $self->throw("acc does not exist") if( ! defined $seqio );
|
|
175 my @seqs;
|
|
176 while( my $seq = $seqio->next_seq() ) { push @seqs, $seq; }
|
|
177 $self->throw("acc does not exist") unless @seqs;
|
|
178 if( wantarray ) { return @seqs } else { return shift @seqs }
|
|
179 }
|
|
180
|
|
181
|
|
182 =head2 get_Seq_by_gi
|
|
183
|
|
184 Title : get_Seq_by_gi
|
|
185 Usage : $seq = $db->get_Seq_by_gi('405830');
|
|
186 Function: Gets a Bio::Seq object by gi number
|
|
187 Returns : A Bio::Seq object
|
|
188 Args : gi number (as a string)
|
|
189 Throws : "gi does not exist" exception
|
|
190
|
|
191 =cut
|
|
192
|
|
193 sub get_Seq_by_gi {
|
|
194 my ($self,$seqid) = @_;
|
|
195 $self->_sleep;
|
|
196 my $seqio = $self->get_Stream_by_gi($seqid);
|
|
197 $self->throw("gi does not exist") if( !defined $seqio );
|
|
198 my @seqs;
|
|
199 while( my $seq = $seqio->next_seq() ) { push @seqs, $seq; }
|
|
200 $self->throw("gi does not exist") unless @seqs;
|
|
201 if( wantarray ) { return @seqs } else { return shift @seqs }
|
|
202 }
|
|
203
|
|
204 =head2 get_Seq_by_version
|
|
205
|
|
206 Title : get_Seq_by_version
|
|
207 Usage : $seq = $db->get_Seq_by_version('X77802.1');
|
|
208 Function: Gets a Bio::Seq object by sequence version
|
|
209 Returns : A Bio::Seq object
|
|
210 Args : accession.version (as a string)
|
|
211 Throws : "acc.version does not exist" exception
|
|
212
|
|
213 =cut
|
|
214
|
|
215 sub get_Seq_by_version {
|
|
216 my ($self,$seqid) = @_;
|
|
217 $self->_sleep;
|
|
218 my $seqio = $self->get_Stream_by_version($seqid);
|
|
219 $self->throw("accession.version does not exist") if( !defined $seqio );
|
|
220 my @seqs;
|
|
221 while( my $seq = $seqio->next_seq() ) { push @seqs, $seq; }
|
|
222 $self->throw("accession.version does not exist") unless @seqs;
|
|
223 if( wantarray ) { return @seqs } else { return shift @seqs }
|
|
224 }
|
|
225
|
|
226 # implementing class must define these
|
|
227
|
|
228 =head2 get_request
|
|
229
|
|
230 Title : get_request
|
|
231 Usage : my $url = $self->get_request
|
|
232 Function: returns a HTTP::Request object
|
|
233 Returns :
|
|
234 Args : %qualifiers = a hash of qualifiers (ids, format, etc)
|
|
235
|
|
236 =cut
|
|
237
|
|
238 sub get_request {
|
|
239 my ($self) = @_;
|
|
240 my $msg = "Implementing class must define method get_request in class WebDBSeqI";
|
|
241 $self->throw($msg);
|
|
242 }
|
|
243
|
|
244 # class methods
|
|
245
|
|
246 =head2 get_Stream_by_id
|
|
247
|
|
248 Title : get_Stream_by_id
|
|
249 Usage : $stream = $db->get_Stream_by_id( [$uid1, $uid2] );
|
|
250 Function: Gets a series of Seq objects by unique identifiers
|
|
251 Returns : a Bio::SeqIO stream object
|
|
252 Args : $ref : a reference to an array of unique identifiers for
|
|
253 the desired sequence entries
|
|
254
|
|
255
|
|
256 =cut
|
|
257
|
|
258 sub get_Stream_by_id {
|
|
259 my ($self, $ids) = @_;
|
|
260 my ($webfmt,$localfmt) = $self->request_format;
|
|
261 return $self->get_seq_stream('-uids' => $ids, '-mode' => 'single',
|
|
262 '-format' => $webfmt);
|
|
263 }
|
|
264
|
|
265 *get_Stream_by_batch = sub {
|
|
266 my $self = shift;
|
|
267 $self->deprecated('get_Stream_by_batch() is deprecated; use get_Stream_by_id() instead');
|
|
268 $self->get_Stream_by_id(@_)
|
|
269 };
|
|
270
|
|
271
|
|
272 =head2 get_Stream_by_acc
|
|
273
|
|
274 Title : get_Stream_by_acc
|
|
275 Usage : $seq = $db->get_Stream_by_acc([$acc1, $acc2]);
|
|
276 Function: Gets a series of Seq objects by accession numbers
|
|
277 Returns : a Bio::SeqIO stream object
|
|
278 Args : $ref : a reference to an array of accession numbers for
|
|
279 the desired sequence entries
|
|
280 Note : For GenBank, this just calls the same code for get_Stream_by_id()
|
|
281
|
|
282 =cut
|
|
283
|
|
284 sub get_Stream_by_acc {
|
|
285 my ($self, $ids ) = @_;
|
|
286 return $self->get_seq_stream('-uids' => $ids, '-mode' => 'single');
|
|
287 }
|
|
288
|
|
289
|
|
290 =head2 get_Stream_by_gi
|
|
291
|
|
292 Title : get_Stream_by_gi
|
|
293 Usage : $seq = $db->get_Stream_by_gi([$gi1, $gi2]);
|
|
294 Function: Gets a series of Seq objects by gi numbers
|
|
295 Returns : a Bio::SeqIO stream object
|
|
296 Args : $ref : a reference to an array of gi numbers for
|
|
297 the desired sequence entries
|
|
298 Note : For GenBank, this just calls the same code for get_Stream_by_id()
|
|
299
|
|
300 =cut
|
|
301
|
|
302 sub get_Stream_by_gi {
|
|
303 my ($self, $ids ) = @_;
|
|
304 return $self->get_seq_stream('-uids' => $ids, '-mode' => 'gi');
|
|
305 }
|
|
306
|
|
307 =head2 get_Stream_by_version
|
|
308
|
|
309 Title : get_Stream_by_version
|
|
310 Usage : $seq = $db->get_Stream_by_version([$version1, $version2]);
|
|
311 Function: Gets a series of Seq objects by accession.versions
|
|
312 Returns : a Bio::SeqIO stream object
|
|
313 Args : $ref : a reference to an array of accession.version strings for
|
|
314 the desired sequence entries
|
|
315 Note : For GenBank, this is implemeted in NCBIHelper
|
|
316
|
|
317 =cut
|
|
318
|
|
319 sub get_Stream_by_version {
|
|
320 my ($self, $ids ) = @_;
|
|
321 # $self->throw("Implementing class should define this method!");
|
|
322 return $self->get_seq_stream('-uids' => $ids, '-mode' => 'version'); # how it should work
|
|
323 }
|
|
324
|
|
325 =head2 get_Stream_by_query
|
|
326
|
|
327 Title : get_Stream_by_query
|
|
328 Usage : $stream = $db->get_Stream_by_query($query);
|
|
329 Function: Gets a series of Seq objects by way of a query string or oject
|
|
330 Returns : a Bio::SeqIO stream object
|
|
331 Args : $query : A string that uses the appropriate query language
|
|
332 for the database or a Bio::DB::QueryI object. It is suggested
|
|
333 that you create the Bio::DB::Query object first and interrogate
|
|
334 it for the entry count before you fetch a potentially large stream.
|
|
335
|
|
336 =cut
|
|
337
|
|
338 sub get_Stream_by_query {
|
|
339 my ($self, $query ) = @_;
|
|
340 return $self->get_seq_stream('-query' => $query, '-mode'=>'query');
|
|
341 }
|
|
342
|
|
343 =head2 default_format
|
|
344
|
|
345 Title : default_format
|
|
346 Usage : my $format = $self->default_format
|
|
347 Function: Returns default sequence format for this module
|
|
348 Returns : string
|
|
349 Args : none
|
|
350
|
|
351 =cut
|
|
352
|
|
353 sub default_format {
|
|
354 return $DEFAULTFORMAT;
|
|
355 }
|
|
356
|
|
357 # sorry, but this is hacked in because of BioFetch problems...
|
|
358 sub db {
|
|
359 my $self = shift;
|
|
360 my $d = $self->{_db};
|
|
361 $self->{_db} = shift if @_;
|
|
362 $d;
|
|
363 }
|
|
364
|
|
365 =head2 request_format
|
|
366
|
|
367 Title : request_format
|
|
368 Usage : my ($req_format, $ioformat) = $self->request_format;
|
|
369 $self->request_format("genbank");
|
|
370 $self->request_format("fasta");
|
|
371 Function: Get/Set sequence format retrieval. The get-form will normally not
|
|
372 be used outside of this and derived modules.
|
|
373 Returns : Array of two strings, the first representing the format for
|
|
374 retrieval, and the second specifying the corresponding SeqIO format.
|
|
375 Args : $format = sequence format
|
|
376
|
|
377 =cut
|
|
378
|
|
379 sub request_format {
|
|
380 my ($self, $value) = @_;
|
|
381
|
|
382 if( defined $value ) {
|
|
383 $self->{'_format'} = [ $value, $value];
|
|
384 }
|
|
385 return @{$self->{'_format'}};
|
|
386 }
|
|
387
|
|
388 =head2 get_seq_stream
|
|
389
|
|
390 Title : get_seq_stream
|
|
391 Usage : my $seqio = $self->get_seq_sream(%qualifiers)
|
|
392 Function: builds a url and queries a web db
|
|
393 Returns : a Bio::SeqIO stream capable of producing sequence
|
|
394 Args : %qualifiers = a hash qualifiers that the implementing class
|
|
395 will process to make a url suitable for web querying
|
|
396
|
|
397 =cut
|
|
398
|
|
399 sub get_seq_stream {
|
|
400 my ($self, %qualifiers) = @_;
|
|
401 my ($rformat, $ioformat) = $self->request_format();
|
|
402 my $seen = 0;
|
|
403 foreach my $key ( keys %qualifiers ) {
|
|
404 if( $key =~ /format/i ) {
|
|
405 $rformat = $qualifiers{$key};
|
|
406 $seen = 1;
|
|
407 }
|
|
408 }
|
|
409 $qualifiers{'-format'} = $rformat if( !$seen);
|
|
410 ($rformat, $ioformat) = $self->request_format($rformat);
|
|
411
|
|
412 my $request = $self->get_request(%qualifiers);
|
|
413 $request->proxy_authorization_basic($self->authentication)
|
|
414 if ( $self->authentication);
|
|
415 $self->debug("request is ". $request->as_string(). "\n");
|
|
416
|
|
417 # workaround for MSWin systems
|
|
418 $self->retrieval_type('io_string') if $self->retrieval_type =~ /pipeline/ && $^O =~ /^MSWin/;
|
|
419
|
|
420 if ($self->retrieval_type =~ /pipeline/) {
|
|
421 # Try to create a stream using POSIX fork-and-pipe facility.
|
|
422 # this is a *big* win when fetching thousands of sequences from
|
|
423 # a web database because we can return the first entry while
|
|
424 # transmission is still in progress.
|
|
425 # Also, no need to keep sequence in memory or in a temporary file.
|
|
426 # If this fails (Windows, MacOS 9), we fall back to non-pipelined access.
|
|
427
|
|
428 # fork and pipe: _stream_request()=><STREAM>
|
|
429 my $result = eval { open(STREAM,"-|") };
|
|
430
|
|
431 if (defined $result) {
|
|
432 $DB::fork_TTY = '/dev/null'; # prevents complaints from debugger
|
|
433 if (!$result) { # in child process
|
|
434 $self->_stream_request($request);
|
|
435 kill 9=>$$; # to prevent END{} blocks from executing in forked children
|
|
436 exit 0;
|
|
437 }
|
|
438 else {
|
|
439 return Bio::SeqIO->new('-verbose' => $self->verbose,
|
|
440 '-format' => $ioformat,
|
|
441 '-fh' => \*STREAM);
|
|
442 }
|
|
443 }
|
|
444 else {
|
|
445 $self->retrieval_type('io_string');
|
|
446 }
|
|
447 }
|
|
448
|
|
449 if ($self->retrieval_type =~ /temp/i) {
|
|
450 my $dir = $self->io->tempdir( CLEANUP => 1);
|
|
451 my ( $fh, $tmpfile) = $self->io()->tempfile( DIR => $dir );
|
|
452 close $fh;
|
|
453 my $resp = $self->_request($request, $tmpfile);
|
|
454 if( ! -e $tmpfile || -z $tmpfile || ! $resp->is_success() ) {
|
|
455 $self->throw("WebDBSeqI Error - check query sequences!\n");
|
|
456 }
|
|
457 $self->postprocess_data('type' => 'file',
|
|
458 'location' => $tmpfile);
|
|
459 # this may get reset when requesting batch mode
|
|
460 ($rformat,$ioformat) = $self->request_format();
|
|
461 if( $self->verbose > 0 ) {
|
|
462 open(ERR, "<$tmpfile");
|
|
463 while(<ERR>) { $self->debug($_);}
|
|
464 }
|
|
465
|
|
466 return Bio::SeqIO->new('-verbose' => $self->verbose,
|
|
467 '-format' => $ioformat,
|
|
468 '-file' => $tmpfile);
|
|
469 }
|
|
470
|
|
471 if ($self->retrieval_type =~ /io_string/i ) {
|
|
472 my $resp = $self->_request($request);
|
|
473 my $content = $resp->content_ref;
|
|
474 $self->debug( "content is $$content\n");
|
|
475 if (!$resp->is_success() || length($$content) == 0) {
|
|
476 $self->throw("WebDBSeqI Error - check query sequences!\n");
|
|
477 }
|
|
478 ($rformat,$ioformat) = $self->request_format();
|
|
479 $self->postprocess_data('type'=> 'string',
|
|
480 'location' => $content);
|
|
481 $self->debug( "str is $$content\n");
|
|
482 return Bio::SeqIO->new('-verbose' => $self->verbose,
|
|
483 '-format' => $ioformat,
|
|
484 '-fh' => new IO::String($$content));
|
|
485 }
|
|
486
|
|
487 # if we got here, we don't know how to handle the retrieval type
|
|
488 $self->throw("retrieval type " . $self->retrieval_type .
|
|
489 " unsupported\n");
|
|
490 }
|
|
491
|
|
492 =head2 url_base_address
|
|
493
|
|
494 Title : url_base_address
|
|
495 Usage : my $address = $self->url_base_address or
|
|
496 $self->url_base_address($address)
|
|
497 Function: Get/Set the base URL for the Web Database
|
|
498 Returns : Base URL for the Web Database
|
|
499 Args : $address - URL for the WebDatabase
|
|
500
|
|
501 =cut
|
|
502
|
|
503 sub url_base_address {
|
|
504 my $self = shift;
|
|
505 my $d = $self->{'_baseaddress'};
|
|
506 $self->{'_baseaddress'} = shift if @_;
|
|
507 $d;
|
|
508 }
|
|
509
|
|
510
|
|
511 =head2 proxy
|
|
512
|
|
513 Title : proxy
|
|
514 Usage : $httpproxy = $db->proxy('http') or
|
|
515 $db->proxy(['http','ftp'], 'http://myproxy' )
|
|
516 Function: Get/Set a proxy for use of proxy
|
|
517 Returns : a string indicating the proxy
|
|
518 Args : $protocol : an array ref of the protocol(s) to set/get
|
|
519 $proxyurl : url of the proxy to use for the specified protocol
|
|
520 $username : username (if proxy requires authentication)
|
|
521 $password : password (if proxy requires authentication)
|
|
522
|
|
523 =cut
|
|
524
|
|
525 sub proxy {
|
|
526 my ($self,$protocol,$proxy,$username,$password) = @_;
|
|
527 return undef if ( !defined $self->ua || !defined $protocol
|
|
528 || !defined $proxy );
|
|
529 $self->authentication($username, $password)
|
|
530 if ($username && $password);
|
|
531 return $self->ua->proxy($protocol,$proxy);
|
|
532 }
|
|
533
|
|
534 =head2 authentication
|
|
535
|
|
536 Title : authentication
|
|
537 Usage : $db->authentication($user,$pass)
|
|
538 Function: Get/Set authentication credentials
|
|
539 Returns : Array of user/pass
|
|
540 Args : Array or user/pass
|
|
541
|
|
542
|
|
543 =cut
|
|
544
|
|
545 sub authentication{
|
|
546 my ($self,$u,$p) = @_;
|
|
547
|
|
548 if( defined $u && defined $p ) {
|
|
549 $self->{'_authentication'} = [ $u,$p];
|
|
550 }
|
|
551 return @{$self->{'_authentication'}};
|
|
552 }
|
|
553
|
|
554
|
|
555 =head2 retrieval_type
|
|
556
|
|
557 Title : retrieval_type
|
|
558 Usage : $self->retrieval_type($type);
|
|
559 my $type = $self->retrieval_type
|
|
560 Function: Get/Set a proxy for retrieval_type (pipeline, io_string or tempfile)
|
|
561 Returns : string representing retrieval type
|
|
562 Args : $value - the value to store
|
|
563
|
|
564 This setting affects how the data stream from the remote web server is
|
|
565 processed and passed to the Bio::SeqIO layer. Three types of retrieval
|
|
566 types are currently allowed:
|
|
567
|
|
568 pipeline Perform a fork in an attempt to begin streaming
|
|
569 while the data is still downloading from the remote
|
|
570 server. Disk, memory and speed efficient, but will
|
|
571 not work on Windows or MacOS 9 platforms.
|
|
572
|
|
573 io_string Store downloaded database entry(s) in memory. Can be
|
|
574 problematic for batch downloads because entire set
|
|
575 of entries must fit in memory. Alll entries must be
|
|
576 downloaded before processing can begin.
|
|
577
|
|
578 tempfile Store downloaded database entry(s) in a temporary file.
|
|
579 All entries must be downloaded before processing can
|
|
580 begin.
|
|
581
|
|
582 The default is pipeline, with automatic fallback to io_string if
|
|
583 pipelining is not available.
|
|
584
|
|
585 =cut
|
|
586
|
|
587 sub retrieval_type {
|
|
588 my ($self, $value) = @_;
|
|
589 if( defined $value ) {
|
|
590 $value = lc $value;
|
|
591 if( ! $RETRIEVAL_TYPES{$value} ) {
|
|
592 $self->warn("invalid retrieval type $value must be one of (" .
|
|
593 join(",", keys %RETRIEVAL_TYPES), ")");
|
|
594 $value = $DEFAULT_RETRIEVAL_TYPE;
|
|
595 }
|
|
596 $self->{'_retrieval_type'} = $value;
|
|
597 }
|
|
598 return $self->{'_retrieval_type'};
|
|
599 }
|
|
600
|
|
601 =head2 url_params
|
|
602
|
|
603 Title : url_params
|
|
604 Usage : my $params = $self->url_params or
|
|
605 $self->url_params($params)
|
|
606 Function: Get/Set the URL parameters for the Web Database
|
|
607 Returns : url parameters for Web Database
|
|
608 Args : $params - parameters to be appended to the URL for the WebDatabase
|
|
609
|
|
610 =cut
|
|
611
|
|
612 sub url_params {
|
|
613 my ($self, $value) = @_;
|
|
614 if( defined $value ) {
|
|
615 $self->{'_urlparams'} = $value;
|
|
616 }
|
|
617 }
|
|
618
|
|
619 =head2 ua
|
|
620
|
|
621 Title : ua
|
|
622 Usage : my $ua = $self->ua or
|
|
623 $self->ua($ua)
|
|
624 Function: Get/Set a LWP::UserAgent for use
|
|
625 Returns : reference to LWP::UserAgent Object
|
|
626 Args : $ua - must be a LWP::UserAgent
|
|
627
|
|
628 =cut
|
|
629
|
|
630 sub ua {
|
|
631 my ($self, $ua) = @_;
|
|
632 if( defined $ua && $ua->isa("LWP::UserAgent") ) {
|
|
633 $self->{'_ua'} = $ua;
|
|
634 }
|
|
635 return $self->{'_ua'};
|
|
636 }
|
|
637
|
|
638 =head2 postprocess_data
|
|
639
|
|
640 Title : postprocess_data
|
|
641 Usage : $self->postprocess_data ( 'type' => 'string',
|
|
642 'location' => \$datastr);
|
|
643 Function: process downloaded data before loading into a Bio::SeqIO
|
|
644 Returns : void
|
|
645 Args : hash with two keys - 'type' can be 'string' or 'file'
|
|
646 - 'location' either file location or string
|
|
647 reference containing data
|
|
648
|
|
649 =cut
|
|
650
|
|
651 sub postprocess_data {
|
|
652 my ( $self, %args) = @_;
|
|
653 return;
|
|
654 }
|
|
655
|
|
656 # private methods
|
|
657 sub _request {
|
|
658
|
|
659 my ($self, $url,$tmpfile) = @_;
|
|
660 my ($resp);
|
|
661 if( defined $tmpfile && $tmpfile ne '' ) {
|
|
662 $resp = $self->ua->request($url, $tmpfile);
|
|
663 } else { $resp = $self->ua->request($url); }
|
|
664
|
|
665 if( $resp->is_error ) {
|
|
666 $self->throw("WebDBSeqI Request Error:\n".$resp->as_string);
|
|
667 }
|
|
668 return $resp;
|
|
669 }
|
|
670
|
|
671 # send web request to stdout for streaming purposes
|
|
672 sub _stream_request {
|
|
673 my $self = shift;
|
|
674 my $request = shift;
|
|
675
|
|
676 # fork so as to pipe output of fetch process through to
|
|
677 # postprocess_data method call.
|
|
678 my $child = open (FETCH,"-|");
|
|
679 $self->throw("Couldn't fork: $!") unless defined $child;
|
|
680
|
|
681 if ($child) {
|
|
682 local ($/) = "//\n"; # assume genbank/swiss format
|
|
683 $| = 1;
|
|
684 my $records = 0;
|
|
685 while (my $record = <FETCH>) {
|
|
686 $records++;
|
|
687 $self->postprocess_data('type' => 'string',
|
|
688 'location' => \$record);
|
|
689 print STDOUT $record;
|
|
690 }
|
|
691 $/ = "\n"; # reset to be safe;
|
|
692 close(FETCH);
|
|
693 close STDOUT;
|
|
694 close STDERR;
|
|
695 kill 9=>$$; # to prevent END{} blocks from executing in forked children
|
|
696 sleep;
|
|
697 }
|
|
698 else {
|
|
699 $| = 1;
|
|
700 my $resp = $self->ua->request($request,
|
|
701 sub { print shift }
|
|
702 );
|
|
703 if( $resp->is_error ) {
|
|
704 $self->throw("WebDBSeqI Request Error:\n".$resp->as_string);
|
|
705 }
|
|
706
|
|
707 close STDOUT; close STDERR;
|
|
708 kill 9=>$$; # to prevent END{} blocks from executing in forked children
|
|
709 sleep;
|
|
710 }
|
|
711 exit 0;
|
|
712 }
|
|
713
|
|
714 sub io {
|
|
715 my ($self,$io) = @_;
|
|
716
|
|
717 if(defined($io) || (! exists($self->{'_io'}))) {
|
|
718 $io = Bio::Root::IO->new() unless $io;
|
|
719 $self->{'_io'} = $io;
|
|
720 }
|
|
721 return $self->{'_io'};
|
|
722 }
|
|
723
|
|
724
|
|
725 =head2 delay
|
|
726
|
|
727 Title : delay
|
|
728 Usage : $secs = $self->delay([$secs])
|
|
729 Function: get/set number of seconds to delay between fetches
|
|
730 Returns : number of seconds to delay
|
|
731 Args : new value
|
|
732
|
|
733 NOTE: the default is to use the value specified by delay_policy().
|
|
734 This can be overridden by calling this method, or by passing the
|
|
735 -delay argument to new().
|
|
736
|
|
737 =cut
|
|
738
|
|
739 sub delay {
|
|
740 my $self = shift;
|
|
741 my $d = $self->{'_delay'};
|
|
742 $self->{'_delay'} = shift if @_;
|
|
743 $d;
|
|
744 }
|
|
745
|
|
746 =head2 delay_policy
|
|
747
|
|
748 Title : delay_policy
|
|
749 Usage : $secs = $self->delay_policy
|
|
750 Function: return number of seconds to delay between calls to remote db
|
|
751 Returns : number of seconds to delay
|
|
752 Args : none
|
|
753
|
|
754 NOTE: The default delay policy is 0s. Override in subclasses to
|
|
755 implement delays. The timer has only second resolution, so the delay
|
|
756 will actually be +/- 1s.
|
|
757
|
|
758 =cut
|
|
759
|
|
760 sub delay_policy {
|
|
761 my $self = shift;
|
|
762 return 0;
|
|
763 }
|
|
764
|
|
765 =head2 _sleep
|
|
766
|
|
767 Title : _sleep
|
|
768 Usage : $self->_sleep
|
|
769 Function: sleep for a number of seconds indicated by the delay policy
|
|
770 Returns : none
|
|
771 Args : none
|
|
772
|
|
773 NOTE: This method keeps track of the last time it was called and only
|
|
774 imposes a sleep if it was called more recently than the delay_policy()
|
|
775 allows.
|
|
776
|
|
777 =cut
|
|
778
|
|
779 sub _sleep {
|
|
780 my $self = shift;
|
|
781 my $last_invocation = $LAST_INVOCATION_TIME;
|
|
782 if (time - $LAST_INVOCATION_TIME < $self->delay) {
|
|
783 my $delay = $self->delay - (time - $LAST_INVOCATION_TIME);
|
|
784 warn "sleeping for $delay seconds\n" if $self->verbose;
|
|
785 sleep $delay;
|
|
786 }
|
|
787 $LAST_INVOCATION_TIME = time;
|
|
788 }
|
|
789
|
|
790 1;
|