Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/Tools/Run/RemoteBlast.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/Run/RemoteBlast.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,552 @@ +# $Id: RemoteBlast.pm,v 1.14.2.2 2003/09/03 18:29:50 jason Exp $ +# +# BioPerl module for Bio::Tools::Run::RemoteBlast +# +# Cared for by Jason Stajich, Mat Wiepert +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::Run::RemoteBlast - Object for remote execution of the NCBI Blast +via HTTP + +=head1 SYNOPSIS + + #Remote-blast "factory object" creation and blast-parameter initialization + + use Bio::Tools::Run::RemoteBlast; + use strict; + my $prog = 'blastp'; + my $db = 'swissprot'; + my $e_val= '1e-10'; + + my @params = ( '-prog' => $prog, + '-data' => $db, + '-expect' => $e_val, + '-readmethod' => 'SearchIO' ); + + my $factory = Bio::Tools::Run::RemoteBlast->new(@params); + + #change a paramter + $Bio::Tools::Run::RemoteBlast::HEADER{'ENTREZ_QUERY'} = 'Homo sapiens [ORGN]'; + + #remove a parameter + delete $Bio::Tools::Run::RemoteBlast::HEADER{'FILTER'}; + + my $v = 1; + #$v is just to turn on and off the messages + + my $str = Bio::SeqIO->new(-file=>'amino.fa' , '-format' => 'fasta' ); + + while (my $input = $str->next_seq()){ + #Blast a sequence against a database: + + #Alternatively, you could pass in a file with many + #sequences rather than loop through sequence one at a time + #Remove the loop starting 'while (my $input = $str->next_seq())' + #and swap the two lines below for an example of that. + my $r = $factory->submit_blast($input); + #my $r = $factory->submit_blast('amino.fa'); + + print STDERR "waiting..." if( $v > 0 ); + while ( my @rids = $factory->each_rid ) { + foreach my $rid ( @rids ) { + my $rc = $factory->retrieve_blast($rid); + if( !ref($rc) ) { + if( $rc < 0 ) { + $factory->remove_rid($rid); + } + print STDERR "." if ( $v > 0 ); + sleep 5; + } else { + my $result = $rc->next_result(); + #save the output + my $filename = $result->query_name()."\.out"; + $factory->save_output($filename); + $factory->remove_rid($rid); + print "\nQuery Name: ", $result->query_name(), "\n"; + while ( my $hit = $result->next_hit ) { + next unless ( $v > 0); + print "\thit name is ", $hit->name, "\n"; + while( my $hsp = $hit->next_hsp ) { + print "\t\tscore is ", $hsp->score, "\n"; + } + } + } + } + } + } + + # This example shows how to change a CGI parameter: + $Bio::Tools::Run::RemoteBlast::HEADER{'MATRIX_NAME'} = 'BLOSUM25'; + + # And this is how to delete a CGI parameter: + delete $Bio::Tools::Run::RemoteBlast::HEADER{'FILTER'}; + + +=head1 DESCRIPTION + +Class for remote execution of the NCBI Blast via HTTP. + +For a description of the many CGI parameters see: +http://www.ncbi.nlm.nih.gov/BLAST/Doc/urlapi.html + +Various additional options and input formats are available. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bio.perl.org/bioperl-bugs/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +package Bio::Tools::Run::RemoteBlast; + +use vars qw($AUTOLOAD @ISA %BLAST_PARAMS $URLBASE %HEADER %RETRIEVALHEADER + $RIDLINE); +use strict; + +use Bio::Root::Root; +use Bio::Root::IO; +use Bio::SeqIO; +use IO::String; +use Bio::Tools::BPlite; +use Bio::SearchIO; +use LWP; +use HTTP::Request::Common; +BEGIN { + $URLBASE = 'http://www.ncbi.nlm.nih.gov/blast/Blast.cgi'; + %HEADER = ('CMD' => 'Put', + 'PROGRAM' => '', + 'DATABASE' => '', + 'FILTER' => 'L', + 'EXPECT' => '', + 'QUERY' => '', + 'CDD_SEARCH' => 'off', + 'COMPOSITION_BASED_STATISTICS' => 'off', + 'FORMAT_OBJECT' => 'Alignment', + 'SERVICE' => 'plain', + ); + + %RETRIEVALHEADER = ('CMD' => 'Get', + 'RID' => '', + 'ALIGNMENT_VIEW' => 'Pairwise', + 'DESCRIPTIONS' => 100, + 'ALIGNMENTS' => 50, + 'FORMAT_TYPE' => 'Text', + ); + + $RIDLINE = 'RID\s+=\s+(\S+)'; + + %BLAST_PARAMS = ( 'prog' => 'blastp', + 'data' => 'nr', + 'expect' => '1e-3', + 'readmethod' => 'SearchIO' + ); + +} + +@ISA = qw(Bio::Root::Root Bio::Root::IO); + +sub new { + my ($caller, @args) = @_; + # chained new + my $self = $caller->SUPER::new(@args); + # so that tempfiles are cleaned up + $self->_initialize_io(); + my ($prog, $data, $expect, + $readmethod) = $self->_rearrange([qw(PROG DATA + EXPECT + READMETHOD)], + @args); + + $readmethod = $BLAST_PARAMS{'readmethod'} unless defined $readmethod; + $prog = $BLAST_PARAMS{'prog'} unless defined $prog; + $data = $BLAST_PARAMS{'data'} unless defined $data; + $expect = $BLAST_PARAMS{'expect'} unless defined $expect; + $self->readmethod($readmethod); + $self->program($prog); + $self->database($data); + $self->expect($expect); + + return $self; +} + +=head2 header + + Title : header + Usage : my $header = $self->header + Function: Get/Set HTTP header for blast query + Returns : string + Args : none + +=cut + +sub header { + my ($self) = @_; + my %h = %HEADER; + $h{'PROGRAM'} = $self->program; + $h{'DATABASE'} = $self->database; + $h{'EXPECT'} = $self->expect; + return %h; +} + +=head2 readmethod + + Title : readmethod + Usage : my $readmethod = $self->readmethod + Function: Get/Set the method to read the blast report + Returns : string + Args : string [ Blast, BPlite ] + +=cut + +sub readmethod { + my ($self, $val) = @_; + if( defined $val ) { + $self->{'_readmethod'} = $val; + } + return $self->{'_readmethod'}; +} + + +=head2 program + + Title : program + Usage : my $prog = $self->program + Function: Get/Set the program to run + Returns : string + Args : string [ blastp, blastn, blastx, tblastn, tblastx ] + +=cut + +sub program { + my ($self, $val) = @_; + if( defined $val ) { + $val = lc $val; + if( $val !~ /t?blast[pnx]/ ) { + $self->warn("trying to set program to an invalid program name ($val) -- defaulting to blastp"); + $val = 'blastp'; + } +# $self->{'_program'} = $val; + $HEADER{'PROGRAM'} = $val; + } + return $HEADER{'PROGRAM'}; +} + + +=head2 database + + Title : database + Usage : my $db = $self->database + Function: Get/Set the database to search + Returns : string + Args : string [ swissprot, nr, nt, etc... ] + +=cut + +sub database { + my ($self, $val) = @_; + if( defined $val ) { +# $self->{'_database'} = $val; + $HEADER{'DATABASE'} = $val; + } + return $HEADER{'DATABASE'}; +} + + +=head2 expect + + Title : expect + Usage : my $expect = $self->expect + Function: Get/Set the E value cutoff + Returns : string + Args : string [ '1e-4' ] + +=cut + +sub expect { + my ($self, $val) = @_; + if( defined $val ) { +# $self->{'_expect'} = $val; + $HEADER{'EXPECT'} = $val; + } + return $HEADER{'EXPECT'}; +} + +=head2 ua + + Title : ua + Usage : my $ua = $self->ua or + $self->ua($ua) + Function: Get/Set a LWP::UserAgent for use + Returns : reference to LWP::UserAgent Object + Args : none + Comments: Will create a UserAgent if none has been requested before. + +=cut + +sub ua { + my ($self, $value) = @_; + if( ! defined $self->{'_ua'} ) { + $self->{'_ua'} = new LWP::UserAgent; + } + return $self->{'_ua'}; +} + +=head2 proxy + + Title : proxy + Usage : $httpproxy = $db->proxy('http') or + $db->proxy(['http','ftp'], 'http://myproxy' ) + Function: Get/Set a proxy for use of proxy + Returns : a string indicating the proxy + Args : $protocol : an array ref of the protocol(s) to set/get + $proxyurl : url of the proxy to use for the specified protocol + +=cut + +sub proxy { + my ($self,$protocol,$proxy) = @_; + return undef if ( !defined $self->ua || !defined $protocol + || !defined $proxy ); + return $self->ua->proxy($protocol,$proxy); +} + +sub add_rid { + my ($self, @vals) = @_; + foreach ( @vals ) { + $self->{'_rids'}->{$_} = 1; + } + return scalar keys %{$self->{'_rids'}}; +} + +sub remove_rid { + my ($self, @vals) = @_; + foreach ( @vals ) { + delete $self->{'_rids'}->{$_}; + } + return scalar keys %{$self->{'_rids'}}; +} + +sub each_rid { + my ($self) = @_; + return keys %{$self->{'_rids'}}; +} + +=head2 submit_blast + + Title : submit_blast + Usage : $self->submit_blast([$seq1,$seq2]); + Function: Submit blast jobs to ncbi blast queue on sequence(s) + Returns : Blast report object as defined by $self->readmethod + Args : input can be: + * sequence object + * array ref of sequence objects + * filename of file containing fasta formatted sequences + +=cut + +sub submit_blast { + my ($self, $input) = @_; + my @seqs = $self->_load_input($input); + return 0 unless ( @seqs ); + my $tcount = 0; + my %header = $self->header; + foreach my $seq ( @seqs ) { + #If query has a fasta header, the output has the query line. + $header{'QUERY'} = ">".(defined $seq->display_id() ? $seq->display_id() : ""). + " ".(defined $seq->desc() ? $seq->desc() : "")."\n".$seq->seq(); + my $request = POST $URLBASE, [%header]; + $self->warn($request->as_string) if ( $self->verbose > 0); + my $response = $self->ua->request( $request); + + if( $response->is_success ) { + if( $self->verbose > 0 ) { + my ($tempfh) = $self->tempfile(); + # Hmm, what exactly are we trying to do here? + print $tempfh $response->content; + close($tempfh); + undef $tempfh; + } + my @subdata = split(/\n/, $response->content ); + my $count = 0; + foreach ( @subdata ) { + if( /$RIDLINE/ ) { + $count++; + print STDERR $_ if( $self->verbose > 0); + $self->add_rid($1); + last; + } + } + if( $count == 0 ) { + $self->warn("req was ". $request->as_string() . "\n"); + $self->warn(join('', @subdata)); + } + $tcount += $count; + } else { + # should try and be a little more verbose here + $self->warn("req was ". $request->as_string() . "\n" . + $response->error_as_HTML); + $tcount = -1; + } + } + return $tcount; +} + +=head2 retrieve_blast + + Title : retrieve_blast + Usage : my $blastreport = $blastfactory->retrieve_blast($rid); + Function: Attempts to retrieve a blast report from remote blast queue + Returns : -1 on error, + 0 on 'job not finished', + Bio::Tools::BPlite or Bio::Tools::Blast object + (depending on how object was initialized) on success + Args : Remote Blast ID (RID) + +=cut + +sub retrieve_blast { + my($self, $rid) = @_; + my (undef,$tempfile) = $self->tempfile(); + my %hdr = %RETRIEVALHEADER; + $hdr{'RID'} = $rid; + my $req = POST $URLBASE, [%hdr]; + if( $self->verbose > 0 ) { + $self->warn("retrieve request is " . $req->as_string()); + } + my $response = $self->ua->request($req, $tempfile); + if( $self->verbose > 0 ) { + open(TMP, $tempfile) or $self->throw("cannot open $tempfile"); + while(<TMP>) { print $_; } + close TMP; + } + if( $response->is_success ) { + my $size = -s $tempfile; + if( $size > 1000 ) { + my $blastobj; + if( $self->readmethod =~ /BPlite/ ) { + $blastobj = new Bio::Tools::BPlite(-file => $tempfile); + } else { + $blastobj = new Bio::SearchIO(-file => $tempfile, + -format => 'blast'); + } + #save tempfile + $self->file($tempfile); + return $blastobj; + } elsif( $size < 500 ) { # search had a problem + open(ERR, "<$tempfile") or $self->throw("cannot open file $tempfile"); + $self->warn(join("", <ERR>)); + close ERR; + return -1; + } else { # still working + return 0; + } + } else { + $self->warn($response->error_as_HTML); + return -1; + } +} + +=head2 save_output + + Title : saveoutput + Usage : my $saveoutput = $self->save_output($filename) + Function: Method to save the blast report + Returns : 1 (throws error otherwise) + Args : string [rid, filename] + +=cut + +sub save_output { + my ($self, $filename) = @_; + if( ! defined $filename ) { + $self->throw("Can't save blast output. You must specify a filename to save to."); + } + #should be set when retrieving blast + my $blastfile = $self->file; + #open temp file and output file, have to filter out some HTML + open(TMP, $blastfile) or $self->throw("cannot open $blastfile"); + open(SAVEOUT, ">$filename") or $self->throw("cannot open $filename"); + my $seentop=0; + while(<TMP>) { + next if (/<pre>/); + if( /^(?:[T]?BLAST[NPX])\s*.+$/i || + /^RPS-BLAST\s*.+$/i ) { + $seentop=1; + } + next if !$seentop; + if( $seentop ) { + print SAVEOUT; + } + } + close SAVEOUT; + close TMP; + return 1; +} + +sub _load_input { + my ($self, $input) = @_; + + if( ! defined $input ) { + $self->throw("Calling remote blast with no input"); + } + my @seqs; + if( ! ref $input ) { + if( -e $input ) { + my $seqio = new Bio::SeqIO(-format => 'fasta', -file => $input); + while( my $seq = $seqio->next_seq ) { + push @seqs, $seq; + } + } else { + $self->throw("Input $input was not a valid filename"); + } + } elsif( ref($input) =~ /ARRAY/i ) { + foreach ( @$input ) { + if( ref($_) && $_->isa('Bio::PrimarySeqI') ) { + push @seqs, $_; + } else { + $self->warn("Trying to add a " . ref($_) . + " but expected a Bio::PrimarySeqI"); + } + } + if( ! @seqs) { + $self->throw("Did not pass in valid input -- no sequence objects found"); + } + } elsif( $input->isa('Bio::PrimarySeqI') ) { + push @seqs, $input; + } + return @seqs; +} +1; +__END__