Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/DB/WebDBSeqI.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 # $Id: WebDBSeqI.pm,v 1.30.2.1 2003/06/12 09:29:38 heikki Exp $ | |
| 2 # | |
| 3 # BioPerl module for Bio::DB::WebDBSeqI | |
| 4 # | |
| 5 # Cared for by Jason Stajich <jason@bioperl.org> | |
| 6 # | |
| 7 # Copyright Jason Stajich | |
| 8 # | |
| 9 # You may distribute this module under the same terms as perl itself | |
| 10 # | |
| 11 # POD documentation - main docs before the code | |
| 12 # | |
| 13 | |
| 14 =head1 NAME | |
| 15 | |
| 16 Bio::DB::WebDBSeqI - Object Interface to generalize Web Databases | |
| 17 for retrieving sequences | |
| 18 | |
| 19 =head1 SYNOPSIS | |
| 20 | |
| 21 # get a WebDBSeqI object somehow | |
| 22 # assuming it is a nucleotide db | |
| 23 my $seq = $db->get_Seq_by_id('ROA1_HUMAN') | |
| 24 | |
| 25 =head1 DESCRIPTION | |
| 26 | |
| 27 | |
| 28 | |
| 29 | |
| 30 Provides core set of functionality for connecting to a web based | |
| 31 database for retriving sequences. | |
| 32 | |
| 33 Users wishing to add another Web Based Sequence Dabatase will need to | |
| 34 extend this class (see Bio::DB::SwissProt or Bio::DB::NCBIHelper for | |
| 35 examples) and implement the get_request method which returns a | |
| 36 HTTP::Request for the specified uids (accessions, ids, etc depending | |
| 37 on what query types the database accepts). | |
| 38 | |
| 39 | |
| 40 | |
| 41 =head1 FEEDBACK | |
| 42 | |
| 43 =head2 Mailing Lists | |
| 44 | |
| 45 User feedback is an integral part of the | |
| 46 evolution of this and other Bioperl modules. Send | |
| 47 your comments and suggestions preferably to one | |
| 48 of the Bioperl mailing lists. Your participation | |
| 49 is much appreciated. | |
| 50 | |
| 51 bioperl-l@bioperl.org - General discussion | |
| 52 http://bioperl.org/MailList.shtml - About the mailing lists | |
| 53 | |
| 54 =head2 Reporting Bugs | |
| 55 | |
| 56 Report bugs to the Bioperl bug tracking system to | |
| 57 help us keep track the bugs and their resolution. | |
| 58 Bug reports can be submitted via email or the | |
| 59 web: | |
| 60 | |
| 61 bioperl-bugs@bio.perl.org | |
| 62 http://bugzilla.bioperl.org/ | |
| 63 | |
| 64 =head1 AUTHOR - Jason Stajich | |
| 65 | |
| 66 Email E<lt> jason@bioperl.org E<gt> | |
| 67 | |
| 68 =head1 APPENDIX | |
| 69 | |
| 70 The rest of the documentation details each of the | |
| 71 object methods. Internal methods are usually | |
| 72 preceded with a _ | |
| 73 | |
| 74 =cut | |
| 75 | |
| 76 # Let the code begin... | |
| 77 | |
| 78 package Bio::DB::WebDBSeqI; | |
| 79 use strict; | |
| 80 use vars qw(@ISA $MODVERSION %RETRIEVAL_TYPES $DEFAULT_RETRIEVAL_TYPE | |
| 81 $DEFAULTFORMAT $LAST_INVOCATION_TIME); | |
| 82 | |
| 83 use Bio::DB::RandomAccessI; | |
| 84 use Bio::SeqIO; | |
| 85 use Bio::Root::IO; | |
| 86 use LWP::UserAgent; | |
| 87 use HTTP::Request::Common; | |
| 88 use HTTP::Response; | |
| 89 use File::Spec; | |
| 90 use IO::String; | |
| 91 use Bio::Root::Root; | |
| 92 | |
| 93 @ISA = qw(Bio::DB::RandomAccessI); | |
| 94 | |
| 95 BEGIN { | |
| 96 $MODVERSION = '0.8'; | |
| 97 %RETRIEVAL_TYPES = ( 'io_string' => 1, | |
| 98 'tempfile' => 1, | |
| 99 'pipeline' => 1, | |
| 100 ); | |
| 101 $DEFAULT_RETRIEVAL_TYPE = 'pipeline'; | |
| 102 $DEFAULTFORMAT = 'fasta'; | |
| 103 $LAST_INVOCATION_TIME = 0; | |
| 104 } | |
| 105 | |
| 106 sub new { | |
| 107 my ($class, @args) = @_; | |
| 108 my $self = $class->SUPER::new(@args); | |
| 109 my ($baseaddress, $params, $ret_type, $format,$delay,$db) = | |
| 110 $self->_rearrange([qw(BASEADDRESS PARAMS RETRIEVALTYPE FORMAT DELAY DB)], | |
| 111 @args); | |
| 112 | |
| 113 $ret_type = $DEFAULT_RETRIEVAL_TYPE unless ( $ret_type); | |
| 114 $baseaddress && $self->url_base_address($baseaddress); | |
| 115 $params && $self->url_params($params); | |
| 116 $db && $self->db($db); | |
| 117 $ret_type && $self->retrieval_type($ret_type); | |
| 118 $delay = $self->delay_policy unless defined $delay; | |
| 119 $self->delay($delay); | |
| 120 | |
| 121 # insure we always have a default format set for retrieval | |
| 122 # even though this will be immedietly overwritten by most sub classes | |
| 123 $format = $self->default_format unless ( defined $format && | |
| 124 $format ne '' ); | |
| 125 | |
| 126 $self->request_format($format); | |
| 127 my $ua = new LWP::UserAgent; | |
| 128 $ua->agent(ref($self) ."/$MODVERSION"); | |
| 129 $self->ua($ua); | |
| 130 $self->{'_authentication'} = []; | |
| 131 return $self; | |
| 132 } | |
| 133 | |
| 134 # from Bio::DB::RandomAccessI | |
| 135 | |
| 136 =head2 get_Seq_by_id | |
| 137 | |
| 138 Title : get_Seq_by_id | |
| 139 Usage : $seq = $db->get_Seq_by_id('ROA1_HUMAN') | |
| 140 Function: Gets a Bio::Seq object by its name | |
| 141 Returns : a Bio::Seq object | |
| 142 Args : the id (as a string) of a sequence | |
| 143 Throws : "id does not exist" exception | |
| 144 | |
| 145 | |
| 146 =cut | |
| 147 | |
| 148 sub get_Seq_by_id { | |
| 149 my ($self,$seqid) = @_; | |
| 150 $self->_sleep; | |
| 151 my $seqio = $self->get_Stream_by_id([$seqid]); | |
| 152 $self->throw("id does not exist") if( !defined $seqio ) ; | |
| 153 my @seqs; | |
| 154 while( my $seq = $seqio->next_seq() ) { push @seqs, $seq; } | |
| 155 $self->throw("id does not exist") unless @seqs; | |
| 156 if( wantarray ) { return @seqs } else { return shift @seqs } | |
| 157 } | |
| 158 | |
| 159 =head2 get_Seq_by_acc | |
| 160 | |
| 161 Title : get_Seq_by_acc | |
| 162 Usage : $seq = $db->get_Seq_by_acc('X77802'); | |
| 163 Function: Gets a Bio::Seq object by accession number | |
| 164 Returns : A Bio::Seq object | |
| 165 Args : accession number (as a string) | |
| 166 Throws : "acc does not exist" exception | |
| 167 | |
| 168 =cut | |
| 169 | |
| 170 sub get_Seq_by_acc { | |
| 171 my ($self,$seqid) = @_; | |
| 172 $self->_sleep; | |
| 173 my $seqio = $self->get_Stream_by_acc($seqid); | |
| 174 $self->throw("acc does not exist") if( ! defined $seqio ); | |
| 175 my @seqs; | |
| 176 while( my $seq = $seqio->next_seq() ) { push @seqs, $seq; } | |
| 177 $self->throw("acc does not exist") unless @seqs; | |
| 178 if( wantarray ) { return @seqs } else { return shift @seqs } | |
| 179 } | |
| 180 | |
| 181 | |
| 182 =head2 get_Seq_by_gi | |
| 183 | |
| 184 Title : get_Seq_by_gi | |
| 185 Usage : $seq = $db->get_Seq_by_gi('405830'); | |
| 186 Function: Gets a Bio::Seq object by gi number | |
| 187 Returns : A Bio::Seq object | |
| 188 Args : gi number (as a string) | |
| 189 Throws : "gi does not exist" exception | |
| 190 | |
| 191 =cut | |
| 192 | |
| 193 sub get_Seq_by_gi { | |
| 194 my ($self,$seqid) = @_; | |
| 195 $self->_sleep; | |
| 196 my $seqio = $self->get_Stream_by_gi($seqid); | |
| 197 $self->throw("gi does not exist") if( !defined $seqio ); | |
| 198 my @seqs; | |
| 199 while( my $seq = $seqio->next_seq() ) { push @seqs, $seq; } | |
| 200 $self->throw("gi does not exist") unless @seqs; | |
| 201 if( wantarray ) { return @seqs } else { return shift @seqs } | |
| 202 } | |
| 203 | |
| 204 =head2 get_Seq_by_version | |
| 205 | |
| 206 Title : get_Seq_by_version | |
| 207 Usage : $seq = $db->get_Seq_by_version('X77802.1'); | |
| 208 Function: Gets a Bio::Seq object by sequence version | |
| 209 Returns : A Bio::Seq object | |
| 210 Args : accession.version (as a string) | |
| 211 Throws : "acc.version does not exist" exception | |
| 212 | |
| 213 =cut | |
| 214 | |
| 215 sub get_Seq_by_version { | |
| 216 my ($self,$seqid) = @_; | |
| 217 $self->_sleep; | |
| 218 my $seqio = $self->get_Stream_by_version($seqid); | |
| 219 $self->throw("accession.version does not exist") if( !defined $seqio ); | |
| 220 my @seqs; | |
| 221 while( my $seq = $seqio->next_seq() ) { push @seqs, $seq; } | |
| 222 $self->throw("accession.version does not exist") unless @seqs; | |
| 223 if( wantarray ) { return @seqs } else { return shift @seqs } | |
| 224 } | |
| 225 | |
| 226 # implementing class must define these | |
| 227 | |
| 228 =head2 get_request | |
| 229 | |
| 230 Title : get_request | |
| 231 Usage : my $url = $self->get_request | |
| 232 Function: returns a HTTP::Request object | |
| 233 Returns : | |
| 234 Args : %qualifiers = a hash of qualifiers (ids, format, etc) | |
| 235 | |
| 236 =cut | |
| 237 | |
| 238 sub get_request { | |
| 239 my ($self) = @_; | |
| 240 my $msg = "Implementing class must define method get_request in class WebDBSeqI"; | |
| 241 $self->throw($msg); | |
| 242 } | |
| 243 | |
| 244 # class methods | |
| 245 | |
| 246 =head2 get_Stream_by_id | |
| 247 | |
| 248 Title : get_Stream_by_id | |
| 249 Usage : $stream = $db->get_Stream_by_id( [$uid1, $uid2] ); | |
| 250 Function: Gets a series of Seq objects by unique identifiers | |
| 251 Returns : a Bio::SeqIO stream object | |
| 252 Args : $ref : a reference to an array of unique identifiers for | |
| 253 the desired sequence entries | |
| 254 | |
| 255 | |
| 256 =cut | |
| 257 | |
| 258 sub get_Stream_by_id { | |
| 259 my ($self, $ids) = @_; | |
| 260 my ($webfmt,$localfmt) = $self->request_format; | |
| 261 return $self->get_seq_stream('-uids' => $ids, '-mode' => 'single', | |
| 262 '-format' => $webfmt); | |
| 263 } | |
| 264 | |
| 265 *get_Stream_by_batch = sub { | |
| 266 my $self = shift; | |
| 267 $self->deprecated('get_Stream_by_batch() is deprecated; use get_Stream_by_id() instead'); | |
| 268 $self->get_Stream_by_id(@_) | |
| 269 }; | |
| 270 | |
| 271 | |
| 272 =head2 get_Stream_by_acc | |
| 273 | |
| 274 Title : get_Stream_by_acc | |
| 275 Usage : $seq = $db->get_Stream_by_acc([$acc1, $acc2]); | |
| 276 Function: Gets a series of Seq objects by accession numbers | |
| 277 Returns : a Bio::SeqIO stream object | |
| 278 Args : $ref : a reference to an array of accession numbers for | |
| 279 the desired sequence entries | |
| 280 Note : For GenBank, this just calls the same code for get_Stream_by_id() | |
| 281 | |
| 282 =cut | |
| 283 | |
| 284 sub get_Stream_by_acc { | |
| 285 my ($self, $ids ) = @_; | |
| 286 return $self->get_seq_stream('-uids' => $ids, '-mode' => 'single'); | |
| 287 } | |
| 288 | |
| 289 | |
| 290 =head2 get_Stream_by_gi | |
| 291 | |
| 292 Title : get_Stream_by_gi | |
| 293 Usage : $seq = $db->get_Stream_by_gi([$gi1, $gi2]); | |
| 294 Function: Gets a series of Seq objects by gi numbers | |
| 295 Returns : a Bio::SeqIO stream object | |
| 296 Args : $ref : a reference to an array of gi numbers for | |
| 297 the desired sequence entries | |
| 298 Note : For GenBank, this just calls the same code for get_Stream_by_id() | |
| 299 | |
| 300 =cut | |
| 301 | |
| 302 sub get_Stream_by_gi { | |
| 303 my ($self, $ids ) = @_; | |
| 304 return $self->get_seq_stream('-uids' => $ids, '-mode' => 'gi'); | |
| 305 } | |
| 306 | |
| 307 =head2 get_Stream_by_version | |
| 308 | |
| 309 Title : get_Stream_by_version | |
| 310 Usage : $seq = $db->get_Stream_by_version([$version1, $version2]); | |
| 311 Function: Gets a series of Seq objects by accession.versions | |
| 312 Returns : a Bio::SeqIO stream object | |
| 313 Args : $ref : a reference to an array of accession.version strings for | |
| 314 the desired sequence entries | |
| 315 Note : For GenBank, this is implemeted in NCBIHelper | |
| 316 | |
| 317 =cut | |
| 318 | |
| 319 sub get_Stream_by_version { | |
| 320 my ($self, $ids ) = @_; | |
| 321 # $self->throw("Implementing class should define this method!"); | |
| 322 return $self->get_seq_stream('-uids' => $ids, '-mode' => 'version'); # how it should work | |
| 323 } | |
| 324 | |
| 325 =head2 get_Stream_by_query | |
| 326 | |
| 327 Title : get_Stream_by_query | |
| 328 Usage : $stream = $db->get_Stream_by_query($query); | |
| 329 Function: Gets a series of Seq objects by way of a query string or oject | |
| 330 Returns : a Bio::SeqIO stream object | |
| 331 Args : $query : A string that uses the appropriate query language | |
| 332 for the database or a Bio::DB::QueryI object. It is suggested | |
| 333 that you create the Bio::DB::Query object first and interrogate | |
| 334 it for the entry count before you fetch a potentially large stream. | |
| 335 | |
| 336 =cut | |
| 337 | |
| 338 sub get_Stream_by_query { | |
| 339 my ($self, $query ) = @_; | |
| 340 return $self->get_seq_stream('-query' => $query, '-mode'=>'query'); | |
| 341 } | |
| 342 | |
| 343 =head2 default_format | |
| 344 | |
| 345 Title : default_format | |
| 346 Usage : my $format = $self->default_format | |
| 347 Function: Returns default sequence format for this module | |
| 348 Returns : string | |
| 349 Args : none | |
| 350 | |
| 351 =cut | |
| 352 | |
| 353 sub default_format { | |
| 354 return $DEFAULTFORMAT; | |
| 355 } | |
| 356 | |
| 357 # sorry, but this is hacked in because of BioFetch problems... | |
| 358 sub db { | |
| 359 my $self = shift; | |
| 360 my $d = $self->{_db}; | |
| 361 $self->{_db} = shift if @_; | |
| 362 $d; | |
| 363 } | |
| 364 | |
| 365 =head2 request_format | |
| 366 | |
| 367 Title : request_format | |
| 368 Usage : my ($req_format, $ioformat) = $self->request_format; | |
| 369 $self->request_format("genbank"); | |
| 370 $self->request_format("fasta"); | |
| 371 Function: Get/Set sequence format retrieval. The get-form will normally not | |
| 372 be used outside of this and derived modules. | |
| 373 Returns : Array of two strings, the first representing the format for | |
| 374 retrieval, and the second specifying the corresponding SeqIO format. | |
| 375 Args : $format = sequence format | |
| 376 | |
| 377 =cut | |
| 378 | |
| 379 sub request_format { | |
| 380 my ($self, $value) = @_; | |
| 381 | |
| 382 if( defined $value ) { | |
| 383 $self->{'_format'} = [ $value, $value]; | |
| 384 } | |
| 385 return @{$self->{'_format'}}; | |
| 386 } | |
| 387 | |
| 388 =head2 get_seq_stream | |
| 389 | |
| 390 Title : get_seq_stream | |
| 391 Usage : my $seqio = $self->get_seq_sream(%qualifiers) | |
| 392 Function: builds a url and queries a web db | |
| 393 Returns : a Bio::SeqIO stream capable of producing sequence | |
| 394 Args : %qualifiers = a hash qualifiers that the implementing class | |
| 395 will process to make a url suitable for web querying | |
| 396 | |
| 397 =cut | |
| 398 | |
| 399 sub get_seq_stream { | |
| 400 my ($self, %qualifiers) = @_; | |
| 401 my ($rformat, $ioformat) = $self->request_format(); | |
| 402 my $seen = 0; | |
| 403 foreach my $key ( keys %qualifiers ) { | |
| 404 if( $key =~ /format/i ) { | |
| 405 $rformat = $qualifiers{$key}; | |
| 406 $seen = 1; | |
| 407 } | |
| 408 } | |
| 409 $qualifiers{'-format'} = $rformat if( !$seen); | |
| 410 ($rformat, $ioformat) = $self->request_format($rformat); | |
| 411 | |
| 412 my $request = $self->get_request(%qualifiers); | |
| 413 $request->proxy_authorization_basic($self->authentication) | |
| 414 if ( $self->authentication); | |
| 415 $self->debug("request is ". $request->as_string(). "\n"); | |
| 416 | |
| 417 # workaround for MSWin systems | |
| 418 $self->retrieval_type('io_string') if $self->retrieval_type =~ /pipeline/ && $^O =~ /^MSWin/; | |
| 419 | |
| 420 if ($self->retrieval_type =~ /pipeline/) { | |
| 421 # Try to create a stream using POSIX fork-and-pipe facility. | |
| 422 # this is a *big* win when fetching thousands of sequences from | |
| 423 # a web database because we can return the first entry while | |
| 424 # transmission is still in progress. | |
| 425 # Also, no need to keep sequence in memory or in a temporary file. | |
| 426 # If this fails (Windows, MacOS 9), we fall back to non-pipelined access. | |
| 427 | |
| 428 # fork and pipe: _stream_request()=><STREAM> | |
| 429 my $result = eval { open(STREAM,"-|") }; | |
| 430 | |
| 431 if (defined $result) { | |
| 432 $DB::fork_TTY = '/dev/null'; # prevents complaints from debugger | |
| 433 if (!$result) { # in child process | |
| 434 $self->_stream_request($request); | |
| 435 kill 9=>$$; # to prevent END{} blocks from executing in forked children | |
| 436 exit 0; | |
| 437 } | |
| 438 else { | |
| 439 return Bio::SeqIO->new('-verbose' => $self->verbose, | |
| 440 '-format' => $ioformat, | |
| 441 '-fh' => \*STREAM); | |
| 442 } | |
| 443 } | |
| 444 else { | |
| 445 $self->retrieval_type('io_string'); | |
| 446 } | |
| 447 } | |
| 448 | |
| 449 if ($self->retrieval_type =~ /temp/i) { | |
| 450 my $dir = $self->io->tempdir( CLEANUP => 1); | |
| 451 my ( $fh, $tmpfile) = $self->io()->tempfile( DIR => $dir ); | |
| 452 close $fh; | |
| 453 my $resp = $self->_request($request, $tmpfile); | |
| 454 if( ! -e $tmpfile || -z $tmpfile || ! $resp->is_success() ) { | |
| 455 $self->throw("WebDBSeqI Error - check query sequences!\n"); | |
| 456 } | |
| 457 $self->postprocess_data('type' => 'file', | |
| 458 'location' => $tmpfile); | |
| 459 # this may get reset when requesting batch mode | |
| 460 ($rformat,$ioformat) = $self->request_format(); | |
| 461 if( $self->verbose > 0 ) { | |
| 462 open(ERR, "<$tmpfile"); | |
| 463 while(<ERR>) { $self->debug($_);} | |
| 464 } | |
| 465 | |
| 466 return Bio::SeqIO->new('-verbose' => $self->verbose, | |
| 467 '-format' => $ioformat, | |
| 468 '-file' => $tmpfile); | |
| 469 } | |
| 470 | |
| 471 if ($self->retrieval_type =~ /io_string/i ) { | |
| 472 my $resp = $self->_request($request); | |
| 473 my $content = $resp->content_ref; | |
| 474 $self->debug( "content is $$content\n"); | |
| 475 if (!$resp->is_success() || length($$content) == 0) { | |
| 476 $self->throw("WebDBSeqI Error - check query sequences!\n"); | |
| 477 } | |
| 478 ($rformat,$ioformat) = $self->request_format(); | |
| 479 $self->postprocess_data('type'=> 'string', | |
| 480 'location' => $content); | |
| 481 $self->debug( "str is $$content\n"); | |
| 482 return Bio::SeqIO->new('-verbose' => $self->verbose, | |
| 483 '-format' => $ioformat, | |
| 484 '-fh' => new IO::String($$content)); | |
| 485 } | |
| 486 | |
| 487 # if we got here, we don't know how to handle the retrieval type | |
| 488 $self->throw("retrieval type " . $self->retrieval_type . | |
| 489 " unsupported\n"); | |
| 490 } | |
| 491 | |
| 492 =head2 url_base_address | |
| 493 | |
| 494 Title : url_base_address | |
| 495 Usage : my $address = $self->url_base_address or | |
| 496 $self->url_base_address($address) | |
| 497 Function: Get/Set the base URL for the Web Database | |
| 498 Returns : Base URL for the Web Database | |
| 499 Args : $address - URL for the WebDatabase | |
| 500 | |
| 501 =cut | |
| 502 | |
| 503 sub url_base_address { | |
| 504 my $self = shift; | |
| 505 my $d = $self->{'_baseaddress'}; | |
| 506 $self->{'_baseaddress'} = shift if @_; | |
| 507 $d; | |
| 508 } | |
| 509 | |
| 510 | |
| 511 =head2 proxy | |
| 512 | |
| 513 Title : proxy | |
| 514 Usage : $httpproxy = $db->proxy('http') or | |
| 515 $db->proxy(['http','ftp'], 'http://myproxy' ) | |
| 516 Function: Get/Set a proxy for use of proxy | |
| 517 Returns : a string indicating the proxy | |
| 518 Args : $protocol : an array ref of the protocol(s) to set/get | |
| 519 $proxyurl : url of the proxy to use for the specified protocol | |
| 520 $username : username (if proxy requires authentication) | |
| 521 $password : password (if proxy requires authentication) | |
| 522 | |
| 523 =cut | |
| 524 | |
| 525 sub proxy { | |
| 526 my ($self,$protocol,$proxy,$username,$password) = @_; | |
| 527 return undef if ( !defined $self->ua || !defined $protocol | |
| 528 || !defined $proxy ); | |
| 529 $self->authentication($username, $password) | |
| 530 if ($username && $password); | |
| 531 return $self->ua->proxy($protocol,$proxy); | |
| 532 } | |
| 533 | |
| 534 =head2 authentication | |
| 535 | |
| 536 Title : authentication | |
| 537 Usage : $db->authentication($user,$pass) | |
| 538 Function: Get/Set authentication credentials | |
| 539 Returns : Array of user/pass | |
| 540 Args : Array or user/pass | |
| 541 | |
| 542 | |
| 543 =cut | |
| 544 | |
| 545 sub authentication{ | |
| 546 my ($self,$u,$p) = @_; | |
| 547 | |
| 548 if( defined $u && defined $p ) { | |
| 549 $self->{'_authentication'} = [ $u,$p]; | |
| 550 } | |
| 551 return @{$self->{'_authentication'}}; | |
| 552 } | |
| 553 | |
| 554 | |
| 555 =head2 retrieval_type | |
| 556 | |
| 557 Title : retrieval_type | |
| 558 Usage : $self->retrieval_type($type); | |
| 559 my $type = $self->retrieval_type | |
| 560 Function: Get/Set a proxy for retrieval_type (pipeline, io_string or tempfile) | |
| 561 Returns : string representing retrieval type | |
| 562 Args : $value - the value to store | |
| 563 | |
| 564 This setting affects how the data stream from the remote web server is | |
| 565 processed and passed to the Bio::SeqIO layer. Three types of retrieval | |
| 566 types are currently allowed: | |
| 567 | |
| 568 pipeline Perform a fork in an attempt to begin streaming | |
| 569 while the data is still downloading from the remote | |
| 570 server. Disk, memory and speed efficient, but will | |
| 571 not work on Windows or MacOS 9 platforms. | |
| 572 | |
| 573 io_string Store downloaded database entry(s) in memory. Can be | |
| 574 problematic for batch downloads because entire set | |
| 575 of entries must fit in memory. Alll entries must be | |
| 576 downloaded before processing can begin. | |
| 577 | |
| 578 tempfile Store downloaded database entry(s) in a temporary file. | |
| 579 All entries must be downloaded before processing can | |
| 580 begin. | |
| 581 | |
| 582 The default is pipeline, with automatic fallback to io_string if | |
| 583 pipelining is not available. | |
| 584 | |
| 585 =cut | |
| 586 | |
| 587 sub retrieval_type { | |
| 588 my ($self, $value) = @_; | |
| 589 if( defined $value ) { | |
| 590 $value = lc $value; | |
| 591 if( ! $RETRIEVAL_TYPES{$value} ) { | |
| 592 $self->warn("invalid retrieval type $value must be one of (" . | |
| 593 join(",", keys %RETRIEVAL_TYPES), ")"); | |
| 594 $value = $DEFAULT_RETRIEVAL_TYPE; | |
| 595 } | |
| 596 $self->{'_retrieval_type'} = $value; | |
| 597 } | |
| 598 return $self->{'_retrieval_type'}; | |
| 599 } | |
| 600 | |
| 601 =head2 url_params | |
| 602 | |
| 603 Title : url_params | |
| 604 Usage : my $params = $self->url_params or | |
| 605 $self->url_params($params) | |
| 606 Function: Get/Set the URL parameters for the Web Database | |
| 607 Returns : url parameters for Web Database | |
| 608 Args : $params - parameters to be appended to the URL for the WebDatabase | |
| 609 | |
| 610 =cut | |
| 611 | |
| 612 sub url_params { | |
| 613 my ($self, $value) = @_; | |
| 614 if( defined $value ) { | |
| 615 $self->{'_urlparams'} = $value; | |
| 616 } | |
| 617 } | |
| 618 | |
| 619 =head2 ua | |
| 620 | |
| 621 Title : ua | |
| 622 Usage : my $ua = $self->ua or | |
| 623 $self->ua($ua) | |
| 624 Function: Get/Set a LWP::UserAgent for use | |
| 625 Returns : reference to LWP::UserAgent Object | |
| 626 Args : $ua - must be a LWP::UserAgent | |
| 627 | |
| 628 =cut | |
| 629 | |
| 630 sub ua { | |
| 631 my ($self, $ua) = @_; | |
| 632 if( defined $ua && $ua->isa("LWP::UserAgent") ) { | |
| 633 $self->{'_ua'} = $ua; | |
| 634 } | |
| 635 return $self->{'_ua'}; | |
| 636 } | |
| 637 | |
| 638 =head2 postprocess_data | |
| 639 | |
| 640 Title : postprocess_data | |
| 641 Usage : $self->postprocess_data ( 'type' => 'string', | |
| 642 'location' => \$datastr); | |
| 643 Function: process downloaded data before loading into a Bio::SeqIO | |
| 644 Returns : void | |
| 645 Args : hash with two keys - 'type' can be 'string' or 'file' | |
| 646 - 'location' either file location or string | |
| 647 reference containing data | |
| 648 | |
| 649 =cut | |
| 650 | |
| 651 sub postprocess_data { | |
| 652 my ( $self, %args) = @_; | |
| 653 return; | |
| 654 } | |
| 655 | |
| 656 # private methods | |
| 657 sub _request { | |
| 658 | |
| 659 my ($self, $url,$tmpfile) = @_; | |
| 660 my ($resp); | |
| 661 if( defined $tmpfile && $tmpfile ne '' ) { | |
| 662 $resp = $self->ua->request($url, $tmpfile); | |
| 663 } else { $resp = $self->ua->request($url); } | |
| 664 | |
| 665 if( $resp->is_error ) { | |
| 666 $self->throw("WebDBSeqI Request Error:\n".$resp->as_string); | |
| 667 } | |
| 668 return $resp; | |
| 669 } | |
| 670 | |
| 671 # send web request to stdout for streaming purposes | |
| 672 sub _stream_request { | |
| 673 my $self = shift; | |
| 674 my $request = shift; | |
| 675 | |
| 676 # fork so as to pipe output of fetch process through to | |
| 677 # postprocess_data method call. | |
| 678 my $child = open (FETCH,"-|"); | |
| 679 $self->throw("Couldn't fork: $!") unless defined $child; | |
| 680 | |
| 681 if ($child) { | |
| 682 local ($/) = "//\n"; # assume genbank/swiss format | |
| 683 $| = 1; | |
| 684 my $records = 0; | |
| 685 while (my $record = <FETCH>) { | |
| 686 $records++; | |
| 687 $self->postprocess_data('type' => 'string', | |
| 688 'location' => \$record); | |
| 689 print STDOUT $record; | |
| 690 } | |
| 691 $/ = "\n"; # reset to be safe; | |
| 692 close(FETCH); | |
| 693 close STDOUT; | |
| 694 close STDERR; | |
| 695 kill 9=>$$; # to prevent END{} blocks from executing in forked children | |
| 696 sleep; | |
| 697 } | |
| 698 else { | |
| 699 $| = 1; | |
| 700 my $resp = $self->ua->request($request, | |
| 701 sub { print shift } | |
| 702 ); | |
| 703 if( $resp->is_error ) { | |
| 704 $self->throw("WebDBSeqI Request Error:\n".$resp->as_string); | |
| 705 } | |
| 706 | |
| 707 close STDOUT; close STDERR; | |
| 708 kill 9=>$$; # to prevent END{} blocks from executing in forked children | |
| 709 sleep; | |
| 710 } | |
| 711 exit 0; | |
| 712 } | |
| 713 | |
| 714 sub io { | |
| 715 my ($self,$io) = @_; | |
| 716 | |
| 717 if(defined($io) || (! exists($self->{'_io'}))) { | |
| 718 $io = Bio::Root::IO->new() unless $io; | |
| 719 $self->{'_io'} = $io; | |
| 720 } | |
| 721 return $self->{'_io'}; | |
| 722 } | |
| 723 | |
| 724 | |
| 725 =head2 delay | |
| 726 | |
| 727 Title : delay | |
| 728 Usage : $secs = $self->delay([$secs]) | |
| 729 Function: get/set number of seconds to delay between fetches | |
| 730 Returns : number of seconds to delay | |
| 731 Args : new value | |
| 732 | |
| 733 NOTE: the default is to use the value specified by delay_policy(). | |
| 734 This can be overridden by calling this method, or by passing the | |
| 735 -delay argument to new(). | |
| 736 | |
| 737 =cut | |
| 738 | |
| 739 sub delay { | |
| 740 my $self = shift; | |
| 741 my $d = $self->{'_delay'}; | |
| 742 $self->{'_delay'} = shift if @_; | |
| 743 $d; | |
| 744 } | |
| 745 | |
| 746 =head2 delay_policy | |
| 747 | |
| 748 Title : delay_policy | |
| 749 Usage : $secs = $self->delay_policy | |
| 750 Function: return number of seconds to delay between calls to remote db | |
| 751 Returns : number of seconds to delay | |
| 752 Args : none | |
| 753 | |
| 754 NOTE: The default delay policy is 0s. Override in subclasses to | |
| 755 implement delays. The timer has only second resolution, so the delay | |
| 756 will actually be +/- 1s. | |
| 757 | |
| 758 =cut | |
| 759 | |
| 760 sub delay_policy { | |
| 761 my $self = shift; | |
| 762 return 0; | |
| 763 } | |
| 764 | |
| 765 =head2 _sleep | |
| 766 | |
| 767 Title : _sleep | |
| 768 Usage : $self->_sleep | |
| 769 Function: sleep for a number of seconds indicated by the delay policy | |
| 770 Returns : none | |
| 771 Args : none | |
| 772 | |
| 773 NOTE: This method keeps track of the last time it was called and only | |
| 774 imposes a sleep if it was called more recently than the delay_policy() | |
| 775 allows. | |
| 776 | |
| 777 =cut | |
| 778 | |
| 779 sub _sleep { | |
| 780 my $self = shift; | |
| 781 my $last_invocation = $LAST_INVOCATION_TIME; | |
| 782 if (time - $LAST_INVOCATION_TIME < $self->delay) { | |
| 783 my $delay = $self->delay - (time - $LAST_INVOCATION_TIME); | |
| 784 warn "sleeping for $delay seconds\n" if $self->verbose; | |
| 785 sleep $delay; | |
| 786 } | |
| 787 $LAST_INVOCATION_TIME = time; | |
| 788 } | |
| 789 | |
| 790 1; |
