Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/Tools/Run/RemoteBlast.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 # $Id: RemoteBlast.pm,v 1.14.2.2 2003/09/03 18:29:50 jason Exp $ | |
2 # | |
3 # BioPerl module for Bio::Tools::Run::RemoteBlast | |
4 # | |
5 # Cared for by Jason Stajich, Mat Wiepert | |
6 # | |
7 # Copyright Jason Stajich | |
8 # | |
9 # You may distribute this module under the same terms as perl itself | |
10 | |
11 # POD documentation - main docs before the code | |
12 | |
13 =head1 NAME | |
14 | |
15 Bio::Tools::Run::RemoteBlast - Object for remote execution of the NCBI Blast | |
16 via HTTP | |
17 | |
18 =head1 SYNOPSIS | |
19 | |
20 #Remote-blast "factory object" creation and blast-parameter initialization | |
21 | |
22 use Bio::Tools::Run::RemoteBlast; | |
23 use strict; | |
24 my $prog = 'blastp'; | |
25 my $db = 'swissprot'; | |
26 my $e_val= '1e-10'; | |
27 | |
28 my @params = ( '-prog' => $prog, | |
29 '-data' => $db, | |
30 '-expect' => $e_val, | |
31 '-readmethod' => 'SearchIO' ); | |
32 | |
33 my $factory = Bio::Tools::Run::RemoteBlast->new(@params); | |
34 | |
35 #change a paramter | |
36 $Bio::Tools::Run::RemoteBlast::HEADER{'ENTREZ_QUERY'} = 'Homo sapiens [ORGN]'; | |
37 | |
38 #remove a parameter | |
39 delete $Bio::Tools::Run::RemoteBlast::HEADER{'FILTER'}; | |
40 | |
41 my $v = 1; | |
42 #$v is just to turn on and off the messages | |
43 | |
44 my $str = Bio::SeqIO->new(-file=>'amino.fa' , '-format' => 'fasta' ); | |
45 | |
46 while (my $input = $str->next_seq()){ | |
47 #Blast a sequence against a database: | |
48 | |
49 #Alternatively, you could pass in a file with many | |
50 #sequences rather than loop through sequence one at a time | |
51 #Remove the loop starting 'while (my $input = $str->next_seq())' | |
52 #and swap the two lines below for an example of that. | |
53 my $r = $factory->submit_blast($input); | |
54 #my $r = $factory->submit_blast('amino.fa'); | |
55 | |
56 print STDERR "waiting..." if( $v > 0 ); | |
57 while ( my @rids = $factory->each_rid ) { | |
58 foreach my $rid ( @rids ) { | |
59 my $rc = $factory->retrieve_blast($rid); | |
60 if( !ref($rc) ) { | |
61 if( $rc < 0 ) { | |
62 $factory->remove_rid($rid); | |
63 } | |
64 print STDERR "." if ( $v > 0 ); | |
65 sleep 5; | |
66 } else { | |
67 my $result = $rc->next_result(); | |
68 #save the output | |
69 my $filename = $result->query_name()."\.out"; | |
70 $factory->save_output($filename); | |
71 $factory->remove_rid($rid); | |
72 print "\nQuery Name: ", $result->query_name(), "\n"; | |
73 while ( my $hit = $result->next_hit ) { | |
74 next unless ( $v > 0); | |
75 print "\thit name is ", $hit->name, "\n"; | |
76 while( my $hsp = $hit->next_hsp ) { | |
77 print "\t\tscore is ", $hsp->score, "\n"; | |
78 } | |
79 } | |
80 } | |
81 } | |
82 } | |
83 } | |
84 | |
85 # This example shows how to change a CGI parameter: | |
86 $Bio::Tools::Run::RemoteBlast::HEADER{'MATRIX_NAME'} = 'BLOSUM25'; | |
87 | |
88 # And this is how to delete a CGI parameter: | |
89 delete $Bio::Tools::Run::RemoteBlast::HEADER{'FILTER'}; | |
90 | |
91 | |
92 =head1 DESCRIPTION | |
93 | |
94 Class for remote execution of the NCBI Blast via HTTP. | |
95 | |
96 For a description of the many CGI parameters see: | |
97 http://www.ncbi.nlm.nih.gov/BLAST/Doc/urlapi.html | |
98 | |
99 Various additional options and input formats are available. | |
100 | |
101 =head1 FEEDBACK | |
102 | |
103 =head2 Mailing Lists | |
104 | |
105 User feedback is an integral part of the evolution of this and other | |
106 Bioperl modules. Send your comments and suggestions preferably to one | |
107 of the Bioperl mailing lists. Your participation is much appreciated. | |
108 | |
109 bioperl-l@bioperl.org - General discussion | |
110 http://bio.perl.org/MailList.html - About the mailing lists | |
111 | |
112 =head2 Reporting Bugs | |
113 | |
114 Report bugs to the Bioperl bug tracking system to help us keep track | |
115 the bugs and their resolution. Bug reports can be submitted via email | |
116 or the web: | |
117 | |
118 bioperl-bugs@bio.perl.org | |
119 http://bio.perl.org/bioperl-bugs/ | |
120 | |
121 =head1 AUTHOR - Jason Stajich | |
122 | |
123 Email jason@bioperl.org | |
124 | |
125 =head1 APPENDIX | |
126 | |
127 The rest of the documentation details each of the object | |
128 methods. Internal methods are usually preceded with a _ | |
129 | |
130 =cut | |
131 | |
132 package Bio::Tools::Run::RemoteBlast; | |
133 | |
134 use vars qw($AUTOLOAD @ISA %BLAST_PARAMS $URLBASE %HEADER %RETRIEVALHEADER | |
135 $RIDLINE); | |
136 use strict; | |
137 | |
138 use Bio::Root::Root; | |
139 use Bio::Root::IO; | |
140 use Bio::SeqIO; | |
141 use IO::String; | |
142 use Bio::Tools::BPlite; | |
143 use Bio::SearchIO; | |
144 use LWP; | |
145 use HTTP::Request::Common; | |
146 BEGIN { | |
147 $URLBASE = 'http://www.ncbi.nlm.nih.gov/blast/Blast.cgi'; | |
148 %HEADER = ('CMD' => 'Put', | |
149 'PROGRAM' => '', | |
150 'DATABASE' => '', | |
151 'FILTER' => 'L', | |
152 'EXPECT' => '', | |
153 'QUERY' => '', | |
154 'CDD_SEARCH' => 'off', | |
155 'COMPOSITION_BASED_STATISTICS' => 'off', | |
156 'FORMAT_OBJECT' => 'Alignment', | |
157 'SERVICE' => 'plain', | |
158 ); | |
159 | |
160 %RETRIEVALHEADER = ('CMD' => 'Get', | |
161 'RID' => '', | |
162 'ALIGNMENT_VIEW' => 'Pairwise', | |
163 'DESCRIPTIONS' => 100, | |
164 'ALIGNMENTS' => 50, | |
165 'FORMAT_TYPE' => 'Text', | |
166 ); | |
167 | |
168 $RIDLINE = 'RID\s+=\s+(\S+)'; | |
169 | |
170 %BLAST_PARAMS = ( 'prog' => 'blastp', | |
171 'data' => 'nr', | |
172 'expect' => '1e-3', | |
173 'readmethod' => 'SearchIO' | |
174 ); | |
175 | |
176 } | |
177 | |
178 @ISA = qw(Bio::Root::Root Bio::Root::IO); | |
179 | |
180 sub new { | |
181 my ($caller, @args) = @_; | |
182 # chained new | |
183 my $self = $caller->SUPER::new(@args); | |
184 # so that tempfiles are cleaned up | |
185 $self->_initialize_io(); | |
186 my ($prog, $data, $expect, | |
187 $readmethod) = $self->_rearrange([qw(PROG DATA | |
188 EXPECT | |
189 READMETHOD)], | |
190 @args); | |
191 | |
192 $readmethod = $BLAST_PARAMS{'readmethod'} unless defined $readmethod; | |
193 $prog = $BLAST_PARAMS{'prog'} unless defined $prog; | |
194 $data = $BLAST_PARAMS{'data'} unless defined $data; | |
195 $expect = $BLAST_PARAMS{'expect'} unless defined $expect; | |
196 $self->readmethod($readmethod); | |
197 $self->program($prog); | |
198 $self->database($data); | |
199 $self->expect($expect); | |
200 | |
201 return $self; | |
202 } | |
203 | |
204 =head2 header | |
205 | |
206 Title : header | |
207 Usage : my $header = $self->header | |
208 Function: Get/Set HTTP header for blast query | |
209 Returns : string | |
210 Args : none | |
211 | |
212 =cut | |
213 | |
214 sub header { | |
215 my ($self) = @_; | |
216 my %h = %HEADER; | |
217 $h{'PROGRAM'} = $self->program; | |
218 $h{'DATABASE'} = $self->database; | |
219 $h{'EXPECT'} = $self->expect; | |
220 return %h; | |
221 } | |
222 | |
223 =head2 readmethod | |
224 | |
225 Title : readmethod | |
226 Usage : my $readmethod = $self->readmethod | |
227 Function: Get/Set the method to read the blast report | |
228 Returns : string | |
229 Args : string [ Blast, BPlite ] | |
230 | |
231 =cut | |
232 | |
233 sub readmethod { | |
234 my ($self, $val) = @_; | |
235 if( defined $val ) { | |
236 $self->{'_readmethod'} = $val; | |
237 } | |
238 return $self->{'_readmethod'}; | |
239 } | |
240 | |
241 | |
242 =head2 program | |
243 | |
244 Title : program | |
245 Usage : my $prog = $self->program | |
246 Function: Get/Set the program to run | |
247 Returns : string | |
248 Args : string [ blastp, blastn, blastx, tblastn, tblastx ] | |
249 | |
250 =cut | |
251 | |
252 sub program { | |
253 my ($self, $val) = @_; | |
254 if( defined $val ) { | |
255 $val = lc $val; | |
256 if( $val !~ /t?blast[pnx]/ ) { | |
257 $self->warn("trying to set program to an invalid program name ($val) -- defaulting to blastp"); | |
258 $val = 'blastp'; | |
259 } | |
260 # $self->{'_program'} = $val; | |
261 $HEADER{'PROGRAM'} = $val; | |
262 } | |
263 return $HEADER{'PROGRAM'}; | |
264 } | |
265 | |
266 | |
267 =head2 database | |
268 | |
269 Title : database | |
270 Usage : my $db = $self->database | |
271 Function: Get/Set the database to search | |
272 Returns : string | |
273 Args : string [ swissprot, nr, nt, etc... ] | |
274 | |
275 =cut | |
276 | |
277 sub database { | |
278 my ($self, $val) = @_; | |
279 if( defined $val ) { | |
280 # $self->{'_database'} = $val; | |
281 $HEADER{'DATABASE'} = $val; | |
282 } | |
283 return $HEADER{'DATABASE'}; | |
284 } | |
285 | |
286 | |
287 =head2 expect | |
288 | |
289 Title : expect | |
290 Usage : my $expect = $self->expect | |
291 Function: Get/Set the E value cutoff | |
292 Returns : string | |
293 Args : string [ '1e-4' ] | |
294 | |
295 =cut | |
296 | |
297 sub expect { | |
298 my ($self, $val) = @_; | |
299 if( defined $val ) { | |
300 # $self->{'_expect'} = $val; | |
301 $HEADER{'EXPECT'} = $val; | |
302 } | |
303 return $HEADER{'EXPECT'}; | |
304 } | |
305 | |
306 =head2 ua | |
307 | |
308 Title : ua | |
309 Usage : my $ua = $self->ua or | |
310 $self->ua($ua) | |
311 Function: Get/Set a LWP::UserAgent for use | |
312 Returns : reference to LWP::UserAgent Object | |
313 Args : none | |
314 Comments: Will create a UserAgent if none has been requested before. | |
315 | |
316 =cut | |
317 | |
318 sub ua { | |
319 my ($self, $value) = @_; | |
320 if( ! defined $self->{'_ua'} ) { | |
321 $self->{'_ua'} = new LWP::UserAgent; | |
322 } | |
323 return $self->{'_ua'}; | |
324 } | |
325 | |
326 =head2 proxy | |
327 | |
328 Title : proxy | |
329 Usage : $httpproxy = $db->proxy('http') or | |
330 $db->proxy(['http','ftp'], 'http://myproxy' ) | |
331 Function: Get/Set a proxy for use of proxy | |
332 Returns : a string indicating the proxy | |
333 Args : $protocol : an array ref of the protocol(s) to set/get | |
334 $proxyurl : url of the proxy to use for the specified protocol | |
335 | |
336 =cut | |
337 | |
338 sub proxy { | |
339 my ($self,$protocol,$proxy) = @_; | |
340 return undef if ( !defined $self->ua || !defined $protocol | |
341 || !defined $proxy ); | |
342 return $self->ua->proxy($protocol,$proxy); | |
343 } | |
344 | |
345 sub add_rid { | |
346 my ($self, @vals) = @_; | |
347 foreach ( @vals ) { | |
348 $self->{'_rids'}->{$_} = 1; | |
349 } | |
350 return scalar keys %{$self->{'_rids'}}; | |
351 } | |
352 | |
353 sub remove_rid { | |
354 my ($self, @vals) = @_; | |
355 foreach ( @vals ) { | |
356 delete $self->{'_rids'}->{$_}; | |
357 } | |
358 return scalar keys %{$self->{'_rids'}}; | |
359 } | |
360 | |
361 sub each_rid { | |
362 my ($self) = @_; | |
363 return keys %{$self->{'_rids'}}; | |
364 } | |
365 | |
366 =head2 submit_blast | |
367 | |
368 Title : submit_blast | |
369 Usage : $self->submit_blast([$seq1,$seq2]); | |
370 Function: Submit blast jobs to ncbi blast queue on sequence(s) | |
371 Returns : Blast report object as defined by $self->readmethod | |
372 Args : input can be: | |
373 * sequence object | |
374 * array ref of sequence objects | |
375 * filename of file containing fasta formatted sequences | |
376 | |
377 =cut | |
378 | |
379 sub submit_blast { | |
380 my ($self, $input) = @_; | |
381 my @seqs = $self->_load_input($input); | |
382 return 0 unless ( @seqs ); | |
383 my $tcount = 0; | |
384 my %header = $self->header; | |
385 foreach my $seq ( @seqs ) { | |
386 #If query has a fasta header, the output has the query line. | |
387 $header{'QUERY'} = ">".(defined $seq->display_id() ? $seq->display_id() : ""). | |
388 " ".(defined $seq->desc() ? $seq->desc() : "")."\n".$seq->seq(); | |
389 my $request = POST $URLBASE, [%header]; | |
390 $self->warn($request->as_string) if ( $self->verbose > 0); | |
391 my $response = $self->ua->request( $request); | |
392 | |
393 if( $response->is_success ) { | |
394 if( $self->verbose > 0 ) { | |
395 my ($tempfh) = $self->tempfile(); | |
396 # Hmm, what exactly are we trying to do here? | |
397 print $tempfh $response->content; | |
398 close($tempfh); | |
399 undef $tempfh; | |
400 } | |
401 my @subdata = split(/\n/, $response->content ); | |
402 my $count = 0; | |
403 foreach ( @subdata ) { | |
404 if( /$RIDLINE/ ) { | |
405 $count++; | |
406 print STDERR $_ if( $self->verbose > 0); | |
407 $self->add_rid($1); | |
408 last; | |
409 } | |
410 } | |
411 if( $count == 0 ) { | |
412 $self->warn("req was ". $request->as_string() . "\n"); | |
413 $self->warn(join('', @subdata)); | |
414 } | |
415 $tcount += $count; | |
416 } else { | |
417 # should try and be a little more verbose here | |
418 $self->warn("req was ". $request->as_string() . "\n" . | |
419 $response->error_as_HTML); | |
420 $tcount = -1; | |
421 } | |
422 } | |
423 return $tcount; | |
424 } | |
425 | |
426 =head2 retrieve_blast | |
427 | |
428 Title : retrieve_blast | |
429 Usage : my $blastreport = $blastfactory->retrieve_blast($rid); | |
430 Function: Attempts to retrieve a blast report from remote blast queue | |
431 Returns : -1 on error, | |
432 0 on 'job not finished', | |
433 Bio::Tools::BPlite or Bio::Tools::Blast object | |
434 (depending on how object was initialized) on success | |
435 Args : Remote Blast ID (RID) | |
436 | |
437 =cut | |
438 | |
439 sub retrieve_blast { | |
440 my($self, $rid) = @_; | |
441 my (undef,$tempfile) = $self->tempfile(); | |
442 my %hdr = %RETRIEVALHEADER; | |
443 $hdr{'RID'} = $rid; | |
444 my $req = POST $URLBASE, [%hdr]; | |
445 if( $self->verbose > 0 ) { | |
446 $self->warn("retrieve request is " . $req->as_string()); | |
447 } | |
448 my $response = $self->ua->request($req, $tempfile); | |
449 if( $self->verbose > 0 ) { | |
450 open(TMP, $tempfile) or $self->throw("cannot open $tempfile"); | |
451 while(<TMP>) { print $_; } | |
452 close TMP; | |
453 } | |
454 if( $response->is_success ) { | |
455 my $size = -s $tempfile; | |
456 if( $size > 1000 ) { | |
457 my $blastobj; | |
458 if( $self->readmethod =~ /BPlite/ ) { | |
459 $blastobj = new Bio::Tools::BPlite(-file => $tempfile); | |
460 } else { | |
461 $blastobj = new Bio::SearchIO(-file => $tempfile, | |
462 -format => 'blast'); | |
463 } | |
464 #save tempfile | |
465 $self->file($tempfile); | |
466 return $blastobj; | |
467 } elsif( $size < 500 ) { # search had a problem | |
468 open(ERR, "<$tempfile") or $self->throw("cannot open file $tempfile"); | |
469 $self->warn(join("", <ERR>)); | |
470 close ERR; | |
471 return -1; | |
472 } else { # still working | |
473 return 0; | |
474 } | |
475 } else { | |
476 $self->warn($response->error_as_HTML); | |
477 return -1; | |
478 } | |
479 } | |
480 | |
481 =head2 save_output | |
482 | |
483 Title : saveoutput | |
484 Usage : my $saveoutput = $self->save_output($filename) | |
485 Function: Method to save the blast report | |
486 Returns : 1 (throws error otherwise) | |
487 Args : string [rid, filename] | |
488 | |
489 =cut | |
490 | |
491 sub save_output { | |
492 my ($self, $filename) = @_; | |
493 if( ! defined $filename ) { | |
494 $self->throw("Can't save blast output. You must specify a filename to save to."); | |
495 } | |
496 #should be set when retrieving blast | |
497 my $blastfile = $self->file; | |
498 #open temp file and output file, have to filter out some HTML | |
499 open(TMP, $blastfile) or $self->throw("cannot open $blastfile"); | |
500 open(SAVEOUT, ">$filename") or $self->throw("cannot open $filename"); | |
501 my $seentop=0; | |
502 while(<TMP>) { | |
503 next if (/<pre>/); | |
504 if( /^(?:[T]?BLAST[NPX])\s*.+$/i || | |
505 /^RPS-BLAST\s*.+$/i ) { | |
506 $seentop=1; | |
507 } | |
508 next if !$seentop; | |
509 if( $seentop ) { | |
510 print SAVEOUT; | |
511 } | |
512 } | |
513 close SAVEOUT; | |
514 close TMP; | |
515 return 1; | |
516 } | |
517 | |
518 sub _load_input { | |
519 my ($self, $input) = @_; | |
520 | |
521 if( ! defined $input ) { | |
522 $self->throw("Calling remote blast with no input"); | |
523 } | |
524 my @seqs; | |
525 if( ! ref $input ) { | |
526 if( -e $input ) { | |
527 my $seqio = new Bio::SeqIO(-format => 'fasta', -file => $input); | |
528 while( my $seq = $seqio->next_seq ) { | |
529 push @seqs, $seq; | |
530 } | |
531 } else { | |
532 $self->throw("Input $input was not a valid filename"); | |
533 } | |
534 } elsif( ref($input) =~ /ARRAY/i ) { | |
535 foreach ( @$input ) { | |
536 if( ref($_) && $_->isa('Bio::PrimarySeqI') ) { | |
537 push @seqs, $_; | |
538 } else { | |
539 $self->warn("Trying to add a " . ref($_) . | |
540 " but expected a Bio::PrimarySeqI"); | |
541 } | |
542 } | |
543 if( ! @seqs) { | |
544 $self->throw("Did not pass in valid input -- no sequence objects found"); | |
545 } | |
546 } elsif( $input->isa('Bio::PrimarySeqI') ) { | |
547 push @seqs, $input; | |
548 } | |
549 return @seqs; | |
550 } | |
551 1; | |
552 __END__ |