0
|
1 # $Id: RemoteBlast.pm,v 1.14.2.2 2003/09/03 18:29:50 jason Exp $
|
|
2 #
|
|
3 # BioPerl module for Bio::Tools::Run::RemoteBlast
|
|
4 #
|
|
5 # Cared for by Jason Stajich, Mat Wiepert
|
|
6 #
|
|
7 # Copyright Jason Stajich
|
|
8 #
|
|
9 # You may distribute this module under the same terms as perl itself
|
|
10
|
|
11 # POD documentation - main docs before the code
|
|
12
|
|
13 =head1 NAME
|
|
14
|
|
15 Bio::Tools::Run::RemoteBlast - Object for remote execution of the NCBI Blast
|
|
16 via HTTP
|
|
17
|
|
18 =head1 SYNOPSIS
|
|
19
|
|
20 #Remote-blast "factory object" creation and blast-parameter initialization
|
|
21
|
|
22 use Bio::Tools::Run::RemoteBlast;
|
|
23 use strict;
|
|
24 my $prog = 'blastp';
|
|
25 my $db = 'swissprot';
|
|
26 my $e_val= '1e-10';
|
|
27
|
|
28 my @params = ( '-prog' => $prog,
|
|
29 '-data' => $db,
|
|
30 '-expect' => $e_val,
|
|
31 '-readmethod' => 'SearchIO' );
|
|
32
|
|
33 my $factory = Bio::Tools::Run::RemoteBlast->new(@params);
|
|
34
|
|
35 #change a paramter
|
|
36 $Bio::Tools::Run::RemoteBlast::HEADER{'ENTREZ_QUERY'} = 'Homo sapiens [ORGN]';
|
|
37
|
|
38 #remove a parameter
|
|
39 delete $Bio::Tools::Run::RemoteBlast::HEADER{'FILTER'};
|
|
40
|
|
41 my $v = 1;
|
|
42 #$v is just to turn on and off the messages
|
|
43
|
|
44 my $str = Bio::SeqIO->new(-file=>'amino.fa' , '-format' => 'fasta' );
|
|
45
|
|
46 while (my $input = $str->next_seq()){
|
|
47 #Blast a sequence against a database:
|
|
48
|
|
49 #Alternatively, you could pass in a file with many
|
|
50 #sequences rather than loop through sequence one at a time
|
|
51 #Remove the loop starting 'while (my $input = $str->next_seq())'
|
|
52 #and swap the two lines below for an example of that.
|
|
53 my $r = $factory->submit_blast($input);
|
|
54 #my $r = $factory->submit_blast('amino.fa');
|
|
55
|
|
56 print STDERR "waiting..." if( $v > 0 );
|
|
57 while ( my @rids = $factory->each_rid ) {
|
|
58 foreach my $rid ( @rids ) {
|
|
59 my $rc = $factory->retrieve_blast($rid);
|
|
60 if( !ref($rc) ) {
|
|
61 if( $rc < 0 ) {
|
|
62 $factory->remove_rid($rid);
|
|
63 }
|
|
64 print STDERR "." if ( $v > 0 );
|
|
65 sleep 5;
|
|
66 } else {
|
|
67 my $result = $rc->next_result();
|
|
68 #save the output
|
|
69 my $filename = $result->query_name()."\.out";
|
|
70 $factory->save_output($filename);
|
|
71 $factory->remove_rid($rid);
|
|
72 print "\nQuery Name: ", $result->query_name(), "\n";
|
|
73 while ( my $hit = $result->next_hit ) {
|
|
74 next unless ( $v > 0);
|
|
75 print "\thit name is ", $hit->name, "\n";
|
|
76 while( my $hsp = $hit->next_hsp ) {
|
|
77 print "\t\tscore is ", $hsp->score, "\n";
|
|
78 }
|
|
79 }
|
|
80 }
|
|
81 }
|
|
82 }
|
|
83 }
|
|
84
|
|
85 # This example shows how to change a CGI parameter:
|
|
86 $Bio::Tools::Run::RemoteBlast::HEADER{'MATRIX_NAME'} = 'BLOSUM25';
|
|
87
|
|
88 # And this is how to delete a CGI parameter:
|
|
89 delete $Bio::Tools::Run::RemoteBlast::HEADER{'FILTER'};
|
|
90
|
|
91
|
|
92 =head1 DESCRIPTION
|
|
93
|
|
94 Class for remote execution of the NCBI Blast via HTTP.
|
|
95
|
|
96 For a description of the many CGI parameters see:
|
|
97 http://www.ncbi.nlm.nih.gov/BLAST/Doc/urlapi.html
|
|
98
|
|
99 Various additional options and input formats are available.
|
|
100
|
|
101 =head1 FEEDBACK
|
|
102
|
|
103 =head2 Mailing Lists
|
|
104
|
|
105 User feedback is an integral part of the evolution of this and other
|
|
106 Bioperl modules. Send your comments and suggestions preferably to one
|
|
107 of the Bioperl mailing lists. Your participation is much appreciated.
|
|
108
|
|
109 bioperl-l@bioperl.org - General discussion
|
|
110 http://bio.perl.org/MailList.html - About the mailing lists
|
|
111
|
|
112 =head2 Reporting Bugs
|
|
113
|
|
114 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
115 the bugs and their resolution. Bug reports can be submitted via email
|
|
116 or the web:
|
|
117
|
|
118 bioperl-bugs@bio.perl.org
|
|
119 http://bio.perl.org/bioperl-bugs/
|
|
120
|
|
121 =head1 AUTHOR - Jason Stajich
|
|
122
|
|
123 Email jason@bioperl.org
|
|
124
|
|
125 =head1 APPENDIX
|
|
126
|
|
127 The rest of the documentation details each of the object
|
|
128 methods. Internal methods are usually preceded with a _
|
|
129
|
|
130 =cut
|
|
131
|
|
132 package Bio::Tools::Run::RemoteBlast;
|
|
133
|
|
134 use vars qw($AUTOLOAD @ISA %BLAST_PARAMS $URLBASE %HEADER %RETRIEVALHEADER
|
|
135 $RIDLINE);
|
|
136 use strict;
|
|
137
|
|
138 use Bio::Root::Root;
|
|
139 use Bio::Root::IO;
|
|
140 use Bio::SeqIO;
|
|
141 use IO::String;
|
|
142 use Bio::Tools::BPlite;
|
|
143 use Bio::SearchIO;
|
|
144 use LWP;
|
|
145 use HTTP::Request::Common;
|
|
146 BEGIN {
|
|
147 $URLBASE = 'http://www.ncbi.nlm.nih.gov/blast/Blast.cgi';
|
|
148 %HEADER = ('CMD' => 'Put',
|
|
149 'PROGRAM' => '',
|
|
150 'DATABASE' => '',
|
|
151 'FILTER' => 'L',
|
|
152 'EXPECT' => '',
|
|
153 'QUERY' => '',
|
|
154 'CDD_SEARCH' => 'off',
|
|
155 'COMPOSITION_BASED_STATISTICS' => 'off',
|
|
156 'FORMAT_OBJECT' => 'Alignment',
|
|
157 'SERVICE' => 'plain',
|
|
158 );
|
|
159
|
|
160 %RETRIEVALHEADER = ('CMD' => 'Get',
|
|
161 'RID' => '',
|
|
162 'ALIGNMENT_VIEW' => 'Pairwise',
|
|
163 'DESCRIPTIONS' => 100,
|
|
164 'ALIGNMENTS' => 50,
|
|
165 'FORMAT_TYPE' => 'Text',
|
|
166 );
|
|
167
|
|
168 $RIDLINE = 'RID\s+=\s+(\S+)';
|
|
169
|
|
170 %BLAST_PARAMS = ( 'prog' => 'blastp',
|
|
171 'data' => 'nr',
|
|
172 'expect' => '1e-3',
|
|
173 'readmethod' => 'SearchIO'
|
|
174 );
|
|
175
|
|
176 }
|
|
177
|
|
178 @ISA = qw(Bio::Root::Root Bio::Root::IO);
|
|
179
|
|
180 sub new {
|
|
181 my ($caller, @args) = @_;
|
|
182 # chained new
|
|
183 my $self = $caller->SUPER::new(@args);
|
|
184 # so that tempfiles are cleaned up
|
|
185 $self->_initialize_io();
|
|
186 my ($prog, $data, $expect,
|
|
187 $readmethod) = $self->_rearrange([qw(PROG DATA
|
|
188 EXPECT
|
|
189 READMETHOD)],
|
|
190 @args);
|
|
191
|
|
192 $readmethod = $BLAST_PARAMS{'readmethod'} unless defined $readmethod;
|
|
193 $prog = $BLAST_PARAMS{'prog'} unless defined $prog;
|
|
194 $data = $BLAST_PARAMS{'data'} unless defined $data;
|
|
195 $expect = $BLAST_PARAMS{'expect'} unless defined $expect;
|
|
196 $self->readmethod($readmethod);
|
|
197 $self->program($prog);
|
|
198 $self->database($data);
|
|
199 $self->expect($expect);
|
|
200
|
|
201 return $self;
|
|
202 }
|
|
203
|
|
204 =head2 header
|
|
205
|
|
206 Title : header
|
|
207 Usage : my $header = $self->header
|
|
208 Function: Get/Set HTTP header for blast query
|
|
209 Returns : string
|
|
210 Args : none
|
|
211
|
|
212 =cut
|
|
213
|
|
214 sub header {
|
|
215 my ($self) = @_;
|
|
216 my %h = %HEADER;
|
|
217 $h{'PROGRAM'} = $self->program;
|
|
218 $h{'DATABASE'} = $self->database;
|
|
219 $h{'EXPECT'} = $self->expect;
|
|
220 return %h;
|
|
221 }
|
|
222
|
|
223 =head2 readmethod
|
|
224
|
|
225 Title : readmethod
|
|
226 Usage : my $readmethod = $self->readmethod
|
|
227 Function: Get/Set the method to read the blast report
|
|
228 Returns : string
|
|
229 Args : string [ Blast, BPlite ]
|
|
230
|
|
231 =cut
|
|
232
|
|
233 sub readmethod {
|
|
234 my ($self, $val) = @_;
|
|
235 if( defined $val ) {
|
|
236 $self->{'_readmethod'} = $val;
|
|
237 }
|
|
238 return $self->{'_readmethod'};
|
|
239 }
|
|
240
|
|
241
|
|
242 =head2 program
|
|
243
|
|
244 Title : program
|
|
245 Usage : my $prog = $self->program
|
|
246 Function: Get/Set the program to run
|
|
247 Returns : string
|
|
248 Args : string [ blastp, blastn, blastx, tblastn, tblastx ]
|
|
249
|
|
250 =cut
|
|
251
|
|
252 sub program {
|
|
253 my ($self, $val) = @_;
|
|
254 if( defined $val ) {
|
|
255 $val = lc $val;
|
|
256 if( $val !~ /t?blast[pnx]/ ) {
|
|
257 $self->warn("trying to set program to an invalid program name ($val) -- defaulting to blastp");
|
|
258 $val = 'blastp';
|
|
259 }
|
|
260 # $self->{'_program'} = $val;
|
|
261 $HEADER{'PROGRAM'} = $val;
|
|
262 }
|
|
263 return $HEADER{'PROGRAM'};
|
|
264 }
|
|
265
|
|
266
|
|
267 =head2 database
|
|
268
|
|
269 Title : database
|
|
270 Usage : my $db = $self->database
|
|
271 Function: Get/Set the database to search
|
|
272 Returns : string
|
|
273 Args : string [ swissprot, nr, nt, etc... ]
|
|
274
|
|
275 =cut
|
|
276
|
|
277 sub database {
|
|
278 my ($self, $val) = @_;
|
|
279 if( defined $val ) {
|
|
280 # $self->{'_database'} = $val;
|
|
281 $HEADER{'DATABASE'} = $val;
|
|
282 }
|
|
283 return $HEADER{'DATABASE'};
|
|
284 }
|
|
285
|
|
286
|
|
287 =head2 expect
|
|
288
|
|
289 Title : expect
|
|
290 Usage : my $expect = $self->expect
|
|
291 Function: Get/Set the E value cutoff
|
|
292 Returns : string
|
|
293 Args : string [ '1e-4' ]
|
|
294
|
|
295 =cut
|
|
296
|
|
297 sub expect {
|
|
298 my ($self, $val) = @_;
|
|
299 if( defined $val ) {
|
|
300 # $self->{'_expect'} = $val;
|
|
301 $HEADER{'EXPECT'} = $val;
|
|
302 }
|
|
303 return $HEADER{'EXPECT'};
|
|
304 }
|
|
305
|
|
306 =head2 ua
|
|
307
|
|
308 Title : ua
|
|
309 Usage : my $ua = $self->ua or
|
|
310 $self->ua($ua)
|
|
311 Function: Get/Set a LWP::UserAgent for use
|
|
312 Returns : reference to LWP::UserAgent Object
|
|
313 Args : none
|
|
314 Comments: Will create a UserAgent if none has been requested before.
|
|
315
|
|
316 =cut
|
|
317
|
|
318 sub ua {
|
|
319 my ($self, $value) = @_;
|
|
320 if( ! defined $self->{'_ua'} ) {
|
|
321 $self->{'_ua'} = new LWP::UserAgent;
|
|
322 }
|
|
323 return $self->{'_ua'};
|
|
324 }
|
|
325
|
|
326 =head2 proxy
|
|
327
|
|
328 Title : proxy
|
|
329 Usage : $httpproxy = $db->proxy('http') or
|
|
330 $db->proxy(['http','ftp'], 'http://myproxy' )
|
|
331 Function: Get/Set a proxy for use of proxy
|
|
332 Returns : a string indicating the proxy
|
|
333 Args : $protocol : an array ref of the protocol(s) to set/get
|
|
334 $proxyurl : url of the proxy to use for the specified protocol
|
|
335
|
|
336 =cut
|
|
337
|
|
338 sub proxy {
|
|
339 my ($self,$protocol,$proxy) = @_;
|
|
340 return undef if ( !defined $self->ua || !defined $protocol
|
|
341 || !defined $proxy );
|
|
342 return $self->ua->proxy($protocol,$proxy);
|
|
343 }
|
|
344
|
|
345 sub add_rid {
|
|
346 my ($self, @vals) = @_;
|
|
347 foreach ( @vals ) {
|
|
348 $self->{'_rids'}->{$_} = 1;
|
|
349 }
|
|
350 return scalar keys %{$self->{'_rids'}};
|
|
351 }
|
|
352
|
|
353 sub remove_rid {
|
|
354 my ($self, @vals) = @_;
|
|
355 foreach ( @vals ) {
|
|
356 delete $self->{'_rids'}->{$_};
|
|
357 }
|
|
358 return scalar keys %{$self->{'_rids'}};
|
|
359 }
|
|
360
|
|
361 sub each_rid {
|
|
362 my ($self) = @_;
|
|
363 return keys %{$self->{'_rids'}};
|
|
364 }
|
|
365
|
|
366 =head2 submit_blast
|
|
367
|
|
368 Title : submit_blast
|
|
369 Usage : $self->submit_blast([$seq1,$seq2]);
|
|
370 Function: Submit blast jobs to ncbi blast queue on sequence(s)
|
|
371 Returns : Blast report object as defined by $self->readmethod
|
|
372 Args : input can be:
|
|
373 * sequence object
|
|
374 * array ref of sequence objects
|
|
375 * filename of file containing fasta formatted sequences
|
|
376
|
|
377 =cut
|
|
378
|
|
379 sub submit_blast {
|
|
380 my ($self, $input) = @_;
|
|
381 my @seqs = $self->_load_input($input);
|
|
382 return 0 unless ( @seqs );
|
|
383 my $tcount = 0;
|
|
384 my %header = $self->header;
|
|
385 foreach my $seq ( @seqs ) {
|
|
386 #If query has a fasta header, the output has the query line.
|
|
387 $header{'QUERY'} = ">".(defined $seq->display_id() ? $seq->display_id() : "").
|
|
388 " ".(defined $seq->desc() ? $seq->desc() : "")."\n".$seq->seq();
|
|
389 my $request = POST $URLBASE, [%header];
|
|
390 $self->warn($request->as_string) if ( $self->verbose > 0);
|
|
391 my $response = $self->ua->request( $request);
|
|
392
|
|
393 if( $response->is_success ) {
|
|
394 if( $self->verbose > 0 ) {
|
|
395 my ($tempfh) = $self->tempfile();
|
|
396 # Hmm, what exactly are we trying to do here?
|
|
397 print $tempfh $response->content;
|
|
398 close($tempfh);
|
|
399 undef $tempfh;
|
|
400 }
|
|
401 my @subdata = split(/\n/, $response->content );
|
|
402 my $count = 0;
|
|
403 foreach ( @subdata ) {
|
|
404 if( /$RIDLINE/ ) {
|
|
405 $count++;
|
|
406 print STDERR $_ if( $self->verbose > 0);
|
|
407 $self->add_rid($1);
|
|
408 last;
|
|
409 }
|
|
410 }
|
|
411 if( $count == 0 ) {
|
|
412 $self->warn("req was ". $request->as_string() . "\n");
|
|
413 $self->warn(join('', @subdata));
|
|
414 }
|
|
415 $tcount += $count;
|
|
416 } else {
|
|
417 # should try and be a little more verbose here
|
|
418 $self->warn("req was ". $request->as_string() . "\n" .
|
|
419 $response->error_as_HTML);
|
|
420 $tcount = -1;
|
|
421 }
|
|
422 }
|
|
423 return $tcount;
|
|
424 }
|
|
425
|
|
426 =head2 retrieve_blast
|
|
427
|
|
428 Title : retrieve_blast
|
|
429 Usage : my $blastreport = $blastfactory->retrieve_blast($rid);
|
|
430 Function: Attempts to retrieve a blast report from remote blast queue
|
|
431 Returns : -1 on error,
|
|
432 0 on 'job not finished',
|
|
433 Bio::Tools::BPlite or Bio::Tools::Blast object
|
|
434 (depending on how object was initialized) on success
|
|
435 Args : Remote Blast ID (RID)
|
|
436
|
|
437 =cut
|
|
438
|
|
439 sub retrieve_blast {
|
|
440 my($self, $rid) = @_;
|
|
441 my (undef,$tempfile) = $self->tempfile();
|
|
442 my %hdr = %RETRIEVALHEADER;
|
|
443 $hdr{'RID'} = $rid;
|
|
444 my $req = POST $URLBASE, [%hdr];
|
|
445 if( $self->verbose > 0 ) {
|
|
446 $self->warn("retrieve request is " . $req->as_string());
|
|
447 }
|
|
448 my $response = $self->ua->request($req, $tempfile);
|
|
449 if( $self->verbose > 0 ) {
|
|
450 open(TMP, $tempfile) or $self->throw("cannot open $tempfile");
|
|
451 while(<TMP>) { print $_; }
|
|
452 close TMP;
|
|
453 }
|
|
454 if( $response->is_success ) {
|
|
455 my $size = -s $tempfile;
|
|
456 if( $size > 1000 ) {
|
|
457 my $blastobj;
|
|
458 if( $self->readmethod =~ /BPlite/ ) {
|
|
459 $blastobj = new Bio::Tools::BPlite(-file => $tempfile);
|
|
460 } else {
|
|
461 $blastobj = new Bio::SearchIO(-file => $tempfile,
|
|
462 -format => 'blast');
|
|
463 }
|
|
464 #save tempfile
|
|
465 $self->file($tempfile);
|
|
466 return $blastobj;
|
|
467 } elsif( $size < 500 ) { # search had a problem
|
|
468 open(ERR, "<$tempfile") or $self->throw("cannot open file $tempfile");
|
|
469 $self->warn(join("", <ERR>));
|
|
470 close ERR;
|
|
471 return -1;
|
|
472 } else { # still working
|
|
473 return 0;
|
|
474 }
|
|
475 } else {
|
|
476 $self->warn($response->error_as_HTML);
|
|
477 return -1;
|
|
478 }
|
|
479 }
|
|
480
|
|
481 =head2 save_output
|
|
482
|
|
483 Title : saveoutput
|
|
484 Usage : my $saveoutput = $self->save_output($filename)
|
|
485 Function: Method to save the blast report
|
|
486 Returns : 1 (throws error otherwise)
|
|
487 Args : string [rid, filename]
|
|
488
|
|
489 =cut
|
|
490
|
|
491 sub save_output {
|
|
492 my ($self, $filename) = @_;
|
|
493 if( ! defined $filename ) {
|
|
494 $self->throw("Can't save blast output. You must specify a filename to save to.");
|
|
495 }
|
|
496 #should be set when retrieving blast
|
|
497 my $blastfile = $self->file;
|
|
498 #open temp file and output file, have to filter out some HTML
|
|
499 open(TMP, $blastfile) or $self->throw("cannot open $blastfile");
|
|
500 open(SAVEOUT, ">$filename") or $self->throw("cannot open $filename");
|
|
501 my $seentop=0;
|
|
502 while(<TMP>) {
|
|
503 next if (/<pre>/);
|
|
504 if( /^(?:[T]?BLAST[NPX])\s*.+$/i ||
|
|
505 /^RPS-BLAST\s*.+$/i ) {
|
|
506 $seentop=1;
|
|
507 }
|
|
508 next if !$seentop;
|
|
509 if( $seentop ) {
|
|
510 print SAVEOUT;
|
|
511 }
|
|
512 }
|
|
513 close SAVEOUT;
|
|
514 close TMP;
|
|
515 return 1;
|
|
516 }
|
|
517
|
|
518 sub _load_input {
|
|
519 my ($self, $input) = @_;
|
|
520
|
|
521 if( ! defined $input ) {
|
|
522 $self->throw("Calling remote blast with no input");
|
|
523 }
|
|
524 my @seqs;
|
|
525 if( ! ref $input ) {
|
|
526 if( -e $input ) {
|
|
527 my $seqio = new Bio::SeqIO(-format => 'fasta', -file => $input);
|
|
528 while( my $seq = $seqio->next_seq ) {
|
|
529 push @seqs, $seq;
|
|
530 }
|
|
531 } else {
|
|
532 $self->throw("Input $input was not a valid filename");
|
|
533 }
|
|
534 } elsif( ref($input) =~ /ARRAY/i ) {
|
|
535 foreach ( @$input ) {
|
|
536 if( ref($_) && $_->isa('Bio::PrimarySeqI') ) {
|
|
537 push @seqs, $_;
|
|
538 } else {
|
|
539 $self->warn("Trying to add a " . ref($_) .
|
|
540 " but expected a Bio::PrimarySeqI");
|
|
541 }
|
|
542 }
|
|
543 if( ! @seqs) {
|
|
544 $self->throw("Did not pass in valid input -- no sequence objects found");
|
|
545 }
|
|
546 } elsif( $input->isa('Bio::PrimarySeqI') ) {
|
|
547 push @seqs, $input;
|
|
548 }
|
|
549 return @seqs;
|
|
550 }
|
|
551 1;
|
|
552 __END__
|