0
|
1 # $Id: StandAloneBlast.pm,v 1.23.2.3 2003/03/29 20:18:51 jason Exp $
|
|
2 #
|
|
3 # BioPerl module for Bio::Tools::StandAloneBlast
|
|
4 #
|
|
5 # Cared for by Peter Schattner
|
|
6 #
|
|
7 # Copyright Peter Schattner
|
|
8 #
|
|
9 # You may distribute this module under the same terms as perl itself
|
|
10
|
|
11 # POD documentation - main docs before the code
|
|
12
|
|
13 =head1 NAME
|
|
14
|
|
15 Bio::Tools::Run::StandAloneBlast - Object for the local execution of the
|
|
16 NCBI Blast program suite (blastall, blastpgp, bl2seq)
|
|
17
|
|
18 =head1 SYNOPSIS
|
|
19
|
|
20 Local-blast "factory object" creation and blast-parameter initialization:
|
|
21
|
|
22 @params = ('database' => 'swissprot','outfile' => 'blast1.out',
|
|
23 '_READMETHOD' => 'Blast');
|
|
24
|
|
25 $factory = Bio::Tools::Run::StandAloneBlast->new(@params);
|
|
26
|
|
27 Blast a sequence against a database:
|
|
28
|
|
29 $str = Bio::SeqIO->new(-file=>'t/amino.fa' , '-format' => 'Fasta' );
|
|
30 $input = $str->next_seq();
|
|
31 $input2 = $str->next_seq();
|
|
32 $blast_report = $factory->blastall($input);
|
|
33
|
|
34 Run an iterated Blast (psiblast) of a sequence against a database:
|
|
35
|
|
36 $factory->j(3); # 'j' is blast parameter for # of iterations
|
|
37 $factory->outfile('psiblast1.out');
|
|
38 $factory = Bio::Tools::Run::StandAloneBlast->new(@params);
|
|
39 $blast_report = $factory->blastpgp($input);
|
|
40
|
|
41 Use blast to align 2 sequences against each other:
|
|
42
|
|
43 $factory = Bio::Tools::Run::StandAloneBlast->new('outfile' => 'bl2seq.out');
|
|
44 $factory->bl2seq($input, $input2);
|
|
45
|
|
46 Various additional options and input formats are available. See the
|
|
47 DESCRIPTION section for details.
|
|
48
|
|
49 =head1 DESCRIPTION
|
|
50
|
|
51 This DESCRIPTION only documents Bio::Tools::Run::StandAloneBlast: - a
|
|
52 Bioperl object for running the NCBI standAlone BLAST package. Blast,
|
|
53 itself, is a large & complex program - for more information regarding
|
|
54 BLAST, please see the BLAST documentation which accompanies the BLAST
|
|
55 distribution. BLAST is available from ftp://ncbi.nlm.nih.gov/blast/.
|
|
56
|
|
57 (A source of confusion in documenting a BLAST interface is that the
|
|
58 term "program" is used in - at least - three different ways in the
|
|
59 BLAST documentation. In this DESCRIPTION, "program" will refer to the
|
|
60 BLAST routine set by BLAST's C<-p> parameter that can be set to blastn,
|
|
61 blastp, tblastx etc. We will use the term Blast "executable" to refer
|
|
62 to the various different executable files that may be called - ie
|
|
63 blastall, blastpgp or bl2seq. In addition, there are several BLAST
|
|
64 capabilities (which are also referred to as "programs") and are
|
|
65 implemented by using specific combinations of BLAST executables,
|
|
66 programs and parameters. They will be referred by their specific
|
|
67 names - eg PSIBLAST and PHIBLAST. )
|
|
68
|
|
69 StandAloneBlast has been tested so far only under Linux. I expect
|
|
70 that it should also work under other Unix systems. However, since the
|
|
71 module is implemented using (unix) system calls, modification may be
|
|
72 necessary before StandAloneBlast would work under non-Unix
|
|
73 operating systems (eg Windows, MacOS). Before running
|
|
74 StandAloneBlast it is necessary: to install BLAST on your system,
|
|
75 to edit set the environmental variable $BLASTDIR or your $PATH
|
|
76 variable to point to the BLAST directory, and to ensure that users
|
|
77 have execute privileges for the BLAST program. If the databases
|
|
78 which will be searched by BLAST are located in the data subdirectory
|
|
79 of the blast program directory (the default installation location),
|
|
80 StandAloneBlast will find them; however, if the database files are
|
|
81 located in any other location, environmental variable $BLASTDATADIR
|
|
82 will need to be set to point to that directory.
|
|
83
|
|
84 The use of the StandAloneBlast module is as follows: Initially, a
|
|
85 local blast "factory object" is created. The constructor may be passed
|
|
86 an optional array of (non-default) parameters to be used by the
|
|
87 factory, eg:
|
|
88
|
|
89 @params = ('program' => 'blastn', 'database' => 'ecoli.nt');
|
|
90 $factory = Bio::Tools::Run::StandAloneBlast->new(@params);
|
|
91
|
|
92 Any parameters not explicitly set will remain as the defaults of the
|
|
93 BLAST executable. Note each BLAST executable has somewhat different
|
|
94 parameters and options. See the BLAST Documentation for a description
|
|
95 or run the BLAST executable from the command line followed solely with
|
|
96 a "-" to see a list of options and default values for that executable;
|
|
97 eg E<gt>blastall -.
|
|
98
|
|
99 BLAST parameters can be changed and/or examined at any time after the
|
|
100 factory has been created. The program checks that any
|
|
101 parameter/switch being set/read is valid. Except where specifically
|
|
102 noted, StandAloneBlast uses the same single-letter, case-sensitive
|
|
103 parameter names as the actual blast program. Currently no checks are
|
|
104 included to verify that parameters are of the proper type (eg string
|
|
105 or numeric) or that their values are within the proper range.
|
|
106
|
|
107 As an example, to change the value of the Blast parameter 'e' ('e' is
|
|
108 the parameter for expectation-value cutoff)
|
|
109
|
|
110 $expectvalue = 0.01;
|
|
111 $factory->e($expectvalue);
|
|
112
|
|
113 Note that for improved script readibility one can modify the name of
|
|
114 the BLAST parameters as desired as long as the initial letter (and
|
|
115 case) of the parameter are preserved, eg
|
|
116 $factory-E<gt>expectvalue($expectvalue); Unfortunately, some of the BLAST
|
|
117 parameters are not the single letter one might expect (eg "iteration
|
|
118 round" in blastpgp is 'j'). Again one can check by using (eg)
|
|
119
|
|
120 > blastpgp - .
|
|
121
|
|
122 Once the factory has been created and the appropriate parameters set,
|
|
123 one can call one of the supported blast executables. The input
|
|
124 sequence(s) to these executables may be fasta file(s) as described in
|
|
125 the BLAST documentation.
|
|
126
|
|
127 $inputfilename = 't/testquery.fa';
|
|
128 $blast_report = $factory->blastall($inputfilename);
|
|
129
|
|
130 In addition, sequence input may be in the form of either a Bio::Seq
|
|
131 object or or an array of Bio::Seq objects, eg
|
|
132
|
|
133 $input = Bio::Seq->new(-id=>"test query",-seq=>"ACTACCCTTTAAATCAGTGGGGG");
|
|
134 $blast_report = $factory->blastall($input);
|
|
135
|
|
136 For blastall and non-psiblast blastpgp runs, report object is either a
|
|
137 BPlite.pm or Bio::SearchIO object, selected by the user with the
|
|
138 parameter _READMETHOD. (The leading underscore is needed to
|
|
139 distinguish this option from options which are passed to the BLAST
|
|
140 executable.) The default parser is Bio::SearchIO::blast. For
|
|
141 (multiple iteration) psiblast and bl2seq runs the report is
|
|
142 automatically parsed by the BPpsilite.pm and BPbl2seq.pm parsers
|
|
143 respectively, since neither Blast.pm nor BPlite can parse these
|
|
144 reports. In any case, the "raw" blast report is also available. The
|
|
145 filename is set by the in the 'outfile' parameter and has the default
|
|
146 value of "blastreport.out".
|
|
147
|
|
148 For psiblast execution in BLAST's "jumpstart" mode, the program must
|
|
149 be passed (in addition to the query sequence itself) an alignment
|
|
150 containing the query sequence (in the form of a SimpleAlign object) as
|
|
151 well as a "mask" specifying at what residues position-specific scoring
|
|
152 matrices (PSSMs) are to used and at what residues default scoring
|
|
153 matrices (eg BLOSUM) are to be used. See psiblast documentation for
|
|
154 more details. The mask itself is a string of 0's and 1's which is the
|
|
155 same length as each sequence in the alignment and has a "1" at
|
|
156 locations where (PSSMs) are to be used and a "0" at all other
|
|
157 locations. So for example:
|
|
158
|
|
159 $str = Bio::AlignIO->new(-file=> "cysprot.msf", '-format' => 'msf' );
|
|
160 $aln = $str->next_aln();
|
|
161 $len = $aln->length_aln();
|
|
162 $mask = '1' x $len; # simple case where PSSM's to be used at all residues
|
|
163 $report = $factory->blastpgp("cysprot1.fa", $aln, $mask);
|
|
164
|
|
165 For bl2seq execution, StandAloneBlast.pm can be combined with
|
|
166 AlignIO.pm to directly produce a SimpleAlign object from the alignment
|
|
167 of the two sequences produced by bl2seq as in:
|
|
168
|
|
169 #Get 2 sequences
|
|
170 $str = Bio::SeqIO->new(-file=>'t/amino.fa' , '-format' => 'Fasta', );
|
|
171 my $seq3 = $str->next_seq();
|
|
172 my $seq4 = $str->next_seq();
|
|
173
|
|
174 # Run bl2seq on them
|
|
175 $factory = Bio::Tools::Run::StandAloneBlast->new('outfile' => 'bl2seq.out');
|
|
176 my $bl2seq_report = $factory->bl2seq($seq3, $seq4);
|
|
177
|
|
178 # Use AlignIO.pm to create a SimpleAlign object from the bl2seq report
|
|
179 $str = Bio::AlignIO->new(-file=> 'bl2seq.out','-format' => 'bl2seq');
|
|
180 $aln = $str->next_aln();
|
|
181
|
|
182 For more examples of syntax and use of Blast.pm, the user is
|
|
183 encouraged to run the scripts standaloneblast.pl in the bioperl
|
|
184 /examples directory and StandAloneBlast.t in the bioperl /t directory.
|
|
185
|
|
186 Note: There is a similar (but older) perl object interface offered by
|
|
187 nhgri. The nhgri module only supports blastall and does not support
|
|
188 blastpgp, psiblast, phiblast, bl2seq etc. This module can be found at
|
|
189 http://genome.nhgri.nih.gov/blastall/.
|
|
190
|
|
191 =head1 DEVELOPERS NOTES
|
|
192
|
|
193 B<STILL TO BE WRITTEN>
|
|
194
|
|
195 Note: This module is still under development. If you would like that a
|
|
196 specific BLAST feature be added to this perl interface, let me know.
|
|
197
|
|
198 =head1 FEEDBACK
|
|
199
|
|
200 =head2 Mailing Lists
|
|
201
|
|
202 User feedback is an integral part of the evolution of this and other
|
|
203 Bioperl modules. Send your comments and suggestions preferably to one
|
|
204 of the Bioperl mailing lists. Your participation is much appreciated.
|
|
205
|
|
206 bioperl-l@bioperl.org - General discussion
|
|
207 http://bio.perl.org/MailList.html - About the mailing lists
|
|
208
|
|
209 =head2 Reporting Bugs
|
|
210
|
|
211 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
212 the bugs and their resolution. Bug reports can be submitted via email
|
|
213 or the web:
|
|
214
|
|
215 bioperl-bugs@bio.perl.org
|
|
216 http://bio.perl.org/bioperl-bugs/
|
|
217
|
|
218 =head1 AUTHOR - Peter Schattner
|
|
219
|
|
220 Email schattner@alum.mit.edu
|
|
221
|
|
222 =head1 APPENDIX
|
|
223
|
|
224 The rest of the documentation details each of the object
|
|
225 methods. Internal methods are usually preceded with a _
|
|
226
|
|
227 =cut
|
|
228
|
|
229 package Bio::Tools::Run::StandAloneBlast;
|
|
230
|
|
231 use vars qw($AUTOLOAD @ISA $PROGRAMDIR $DATADIR
|
|
232 @BLASTALL_PARAMS @BLASTPGP_PARAMS
|
|
233 @BL2SEQ_PARAMS @OTHER_PARAMS %OK_FIELD
|
|
234 );
|
|
235 use strict;
|
|
236 use Bio::Root::Root;
|
|
237 use Bio::Root::IO;
|
|
238 use Bio::Seq;
|
|
239 use Bio::SeqIO;
|
|
240 use Bio::Tools::BPbl2seq;
|
|
241 use Bio::Tools::BPpsilite;
|
|
242 use Bio::SearchIO;
|
|
243 use Bio::Tools::Run::WrapperBase;
|
|
244 use Bio::Factory::ApplicationFactoryI;
|
|
245
|
|
246 BEGIN {
|
|
247
|
|
248 @BLASTALL_PARAMS = qw( p d i e m o F G E X I q r v b f g Q
|
|
249 D a O J M W z K L Y S T l U y Z);
|
|
250 @BLASTPGP_PARAMS = qw(d i A f e m o y P F G E X N g S H a I h c
|
|
251 j J Z O M v b C R W z K L Y p k T Q B l U);
|
|
252 @BL2SEQ_PARAMS = qw(i j p g o d a G E X W M q r F e S T m);
|
|
253
|
|
254
|
|
255 # Non BLAST parameters start with underscore to differentiate them
|
|
256 # from BLAST parameters
|
|
257 @OTHER_PARAMS = qw(_READMETHOD);
|
|
258
|
|
259 # _READMETHOD = 'BPlite' (default) or 'Blast'
|
|
260 # my @other_switches = qw(QUIET);
|
|
261
|
|
262
|
|
263 # Authorize attribute fields
|
|
264 foreach my $attr (@BLASTALL_PARAMS, @BLASTPGP_PARAMS,
|
|
265 @BL2SEQ_PARAMS, @OTHER_PARAMS )
|
|
266 { $OK_FIELD{$attr}++; }
|
|
267
|
|
268 # You will need to enable Blast to find the Blast program. This can be done
|
|
269 # in (at least) two different ways:
|
|
270 # 1. define an environmental variable blastDIR:
|
|
271 # export BLASTDIR=/home/peter/blast or
|
|
272 # 2. include a definition of an environmental variable BLASTDIR in every script that will
|
|
273 # use StandAloneBlast.pm.
|
|
274 # BEGIN {$ENV{BLASTDIR} = '/home/peter/blast/'; }
|
|
275 $PROGRAMDIR = $ENV{'BLASTDIR'} || '';
|
|
276
|
|
277 # If local BLAST databases are not stored in the standard
|
|
278 # /data directory, the variable BLASTDATADIR will need to be set explicitly
|
|
279 $DATADIR = $ENV{'BLASTDATADIR'} || $ENV{'BLASTDB'} || '';
|
|
280 }
|
|
281
|
|
282 @ISA = qw(Bio::Root::Root
|
|
283 Bio::Tools::Run::WrapperBase
|
|
284 Bio::Factory::ApplicationFactoryI);
|
|
285
|
|
286 =head1 BLAST parameters
|
|
287
|
|
288 Essentially all BLAST parameter can be set via StandAloneBlast.pm.
|
|
289 Some of the most commonly used parameters are listed below. All
|
|
290 parameters have defaults and are optional (I think.) For a complete
|
|
291 listing of settable parameters, run the relevant executable BLAST
|
|
292 program with the option "-" as in blastall -
|
|
293
|
|
294 =head2 Blastall
|
|
295
|
|
296 -p Program Name [String]
|
|
297 Input should be one of "blastp", "blastn", "blastx",
|
|
298 "tblastn", or "tblastx".
|
|
299 -d Database [String] default = nr
|
|
300 The database specified must first be formatted with formatdb.
|
|
301 Multiple database names (bracketed by quotations) will be accepted.
|
|
302 An example would be -d "nr est"
|
|
303 -i Query File [File In] Set by StandAloneBlast.pm from script.
|
|
304 default = stdin. The query should be in FASTA format. If multiple FASTA entries are in the input
|
|
305 file, all queries will be searched.
|
|
306 -e Expectation value (E) [Real] default = 10.0
|
|
307 -o BLAST report Output File [File Out] Optional,
|
|
308 default = ./blastreport.out ; set by StandAloneBlast.pm
|
|
309 -S Query strands to search against database (for blast[nx], and tblastx). 3 is both, 1 is top, 2 is bottom [Integer]
|
|
310 default = 3
|
|
311
|
|
312 =head2 Blastpgp (including Psiblast)
|
|
313
|
|
314 -j is the maximum number of rounds (default 1; i.e., regular BLAST)
|
|
315 -h is the e-value threshold for including sequences in the
|
|
316 score matrix model (default 0.001)
|
|
317 -c is the "constant" used in the pseudocount formula specified in the paper (default 10)
|
|
318 -B Multiple alignment file for PSI-BLAST "jump start mode" Optional
|
|
319 -Q Output File for PSI-BLAST Matrix in ASCII [File Out] Optional
|
|
320
|
|
321 =head2 Bl2seq
|
|
322
|
|
323 -i First sequence [File In]
|
|
324 -j Second sequence [File In]
|
|
325 -p Program name: blastp, blastn, blastx. For blastx 1st argument should be nucleotide [String]
|
|
326 default = blastp
|
|
327 -o alignment output file [File Out] default = stdout
|
|
328 -e Expectation value (E) [Real] default = 10.0
|
|
329 -S Query strands to search against database (blastn only). 3 is both, 1 is top, 2 is bottom [Integer]
|
|
330 default = 3
|
|
331
|
|
332 =cut
|
|
333
|
|
334 sub new {
|
|
335 my ($caller, @args) = @_;
|
|
336 # chained new
|
|
337 my $self = $caller->SUPER::new(@args);
|
|
338
|
|
339 # to facilitiate tempfile cleanup
|
|
340 my ($tfh,$tempfile) = $self->io->tempfile();
|
|
341 close($tfh); # we don't want the filehandle, just a temporary name
|
|
342 $self->outfile($tempfile);
|
|
343 $self->_READMETHOD('Blast');
|
|
344 while (@args) {
|
|
345 my $attr = shift @args;
|
|
346 my $value = shift @args;
|
|
347 next if( $attr eq '-verbose');
|
|
348 # the workaround to deal with initializing
|
|
349 $attr = 'p' if $attr =~ /^\s*program\s*$/;
|
|
350 $self->$attr($value);
|
|
351 }
|
|
352 return $self;
|
|
353 }
|
|
354
|
|
355 sub AUTOLOAD {
|
|
356 my $self = shift;
|
|
357 my $attr = $AUTOLOAD;
|
|
358 $attr =~ s/.*:://;
|
|
359 my $attr_letter = substr($attr, 0, 1) ;
|
|
360
|
|
361 # actual key is first letter of $attr unless first attribute
|
|
362 # letter is underscore (as in _READMETHOD), the $attr is a BLAST
|
|
363 # parameter and should be truncated to its first letter only
|
|
364 $attr = ($attr_letter eq '_') ? $attr : $attr_letter;
|
|
365 $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr};
|
|
366 # $self->throw("Unallowed parameter: $attr !") unless $ok_field{$attr_letter};
|
|
367 $self->{$attr_letter} = shift if @_;
|
|
368 return $self->{$attr_letter};
|
|
369 }
|
|
370
|
|
371 =head1 Methods
|
|
372
|
|
373 =head2 executable
|
|
374
|
|
375 Title : executable
|
|
376 Usage : my $exe = $blastfactory->executable('blastall');
|
|
377 Function: Finds the full path to the 'codeml' executable
|
|
378 Returns : string representing the full path to the exe
|
|
379 Args : [optional] name of executable to set path to
|
|
380 [optional] boolean flag whether or not warn when exe is not found
|
|
381
|
|
382
|
|
383 =cut
|
|
384
|
|
385 sub executable {
|
|
386 my ($self, $exename, $exe,$warn) = @_;
|
|
387 $exename = 'blastall' unless defined $exename;
|
|
388
|
|
389 if( defined $exe && -x $exe ) {
|
|
390 $self->{'_pathtoexe'}->{$exename} = $exe;
|
|
391 }
|
|
392 unless( defined $self->{'_pathtoexe'}->{$exename} ) {
|
|
393 my $f = $self->program_path($exename);
|
|
394 $exe = $self->{'_pathtoexe'}->{$exename} = $f if(-e $f && -x $f );
|
|
395
|
|
396 # This is how I meant to split up these conditionals --jason
|
|
397 # if exe is null we will execute this (handle the case where
|
|
398 # PROGRAMDIR pointed to something invalid)
|
|
399 unless( $exe ) { # we didn't find it in that last conditional
|
|
400 if( ($exe = $self->io->exists_exe($exename)) && -x $exe ) {
|
|
401 $self->{'_pathtoexe'}->{$exename} = $exe;
|
|
402 } else {
|
|
403 $self->warn("Cannot find executable for $exename") if $warn;
|
|
404 $self->{'_pathtoexe'}->{$exename} = undef;
|
|
405 }
|
|
406 }
|
|
407 }
|
|
408 return $self->{'_pathtoexe'}->{$exename};
|
|
409 }
|
|
410
|
|
411
|
|
412 =head2 program_path
|
|
413
|
|
414 Title : program_path
|
|
415 Usage : my $path = $factory->program_path();
|
|
416 Function: Builds path for executable
|
|
417 Returns : string representing the full path to the exe
|
|
418 Args : none
|
|
419
|
|
420 =cut
|
|
421
|
|
422 sub program_path {
|
|
423 my ($self,$program_name) = @_;
|
|
424 my @path;
|
|
425 push @path, $self->program_dir if $self->program_dir;
|
|
426 push @path, $program_name .($^O =~ /mswin/i ?'.exe':'');
|
|
427
|
|
428 return Bio::Root::IO->catfile(@path);
|
|
429 }
|
|
430
|
|
431 =head2 program_dir
|
|
432
|
|
433 Title : program_dir
|
|
434 Usage : my $dir = $factory->program_dir();
|
|
435 Function: Abstract get method for dir of program. To be implemented
|
|
436 by wrapper.
|
|
437 Returns : string representing program directory
|
|
438 Args : none
|
|
439
|
|
440 =cut
|
|
441
|
|
442 sub program_dir {
|
|
443 $PROGRAMDIR;
|
|
444 }
|
|
445
|
|
446 sub program {
|
|
447 my $self = shift;
|
|
448 if( wantarray ) {
|
|
449 return ($self->executable, $self->p());
|
|
450 } else {
|
|
451 return $self->executable(@_);
|
|
452 }
|
|
453 }
|
|
454
|
|
455 =head2 blastall
|
|
456
|
|
457 Title : blastall
|
|
458 Usage : $blast_report = $factory->blastall('t/testquery.fa');
|
|
459 or
|
|
460 $input = Bio::Seq->new(-id=>"test query",
|
|
461 -seq=>"ACTACCCTTTAAATCAGTGGGGG");
|
|
462 $blast_report = $factory->blastall($input);
|
|
463 or
|
|
464 $seq_array_ref = \@seq_array; # where @seq_array is an array of Bio::Seq objects
|
|
465 $blast_report = $factory->blastall(\@seq_array);
|
|
466 Returns : Reference to a Blast object or BPlite object
|
|
467 containing the blast report.
|
|
468 Args : Name of a file or Bio::Seq object or an array of
|
|
469 Bio::Seq object containing the query sequence(s).
|
|
470 Throws an exception if argument is not either a string
|
|
471 (eg a filename) or a reference to a Bio::Seq object
|
|
472 (or to an array of Seq objects). If argument is string,
|
|
473 throws exception if file corresponding to string name can
|
|
474 not be found.
|
|
475
|
|
476 =cut
|
|
477
|
|
478 sub blastall {
|
|
479 my ($self,$input1) = @_;
|
|
480 $self->io->_io_cleanup();
|
|
481 my $executable = 'blastall';
|
|
482 my $input2;
|
|
483 # Create input file pointer
|
|
484 my $infilename1 = $self->_setinput($executable, $input1);
|
|
485 if (! $infilename1) {$self->throw(" $input1 ($infilename1) not Bio::Seq object or array of Bio::Seq objects or file name!");}
|
|
486
|
|
487 $self->i($infilename1); # set file name of sequence to be blasted to inputfilename1 (-i param of blastall)
|
|
488
|
|
489 my $blast_report = &_generic_local_blast($self, $executable,
|
|
490 $input1, $input2);
|
|
491 }
|
|
492
|
|
493 =head2 blastpgp
|
|
494
|
|
495 Title : blastpgp
|
|
496 Usage : $blast_report = $factory-> blastpgp('t/testquery.fa');
|
|
497 or
|
|
498 $input = Bio::Seq->new(-id=>"test query",
|
|
499 -seq=>"ACTADDEEQQPPTCADEEQQQVVGG");
|
|
500 $blast_report = $factory->blastpgp ($input);
|
|
501 or
|
|
502 $seq_array_ref = \@seq_array; # where @seq_array is an array of Bio::Seq objects
|
|
503 $blast_report = $factory-> blastpgp(\@seq_array);
|
|
504 Returns : Reference to a Blast object or BPlite object containing
|
|
505 the blast report.
|
|
506 Args : Name of a file or Bio::Seq object. In psiblast jumpstart
|
|
507 mode two additional arguments are required: a SimpleAlign
|
|
508 object one of whose elements is the query and a "mask" to
|
|
509 determine how BLAST should select scoring matrices see
|
|
510 DESCRIPTION above for more details.
|
|
511
|
|
512 Throws an exception if argument is not either a string
|
|
513 (eg a filename) or a reference to a Bio::Seq object
|
|
514 (or to an array of Seq objects). If argument is string,
|
|
515 throws exception if file corresponding to string name can
|
|
516 not be found.
|
|
517 Returns : Reference to either a BPlite.pm, Blast.pm or BPpsilite.pm
|
|
518 object containing the blast report.
|
|
519
|
|
520 =cut
|
|
521
|
|
522 sub blastpgp {
|
|
523 my $self = shift;
|
|
524 my $executable = 'blastpgp';
|
|
525 my $input1 = shift;
|
|
526 my $input2 = shift;
|
|
527 my $mask = shift; # used by blastpgp's -B option to specify which residues are position aligned
|
|
528
|
|
529 my ($infilename1, $infilename2 ) = $self->_setinput($executable,
|
|
530 $input1, $input2,
|
|
531 $mask);
|
|
532 if (!$infilename1) {$self->throw(" $input1 not Bio::Seq object or array of Bio::Seq objects or file name!");}
|
|
533 $self->i($infilename1); # set file name of sequence to be blasted to inputfilename1 (-i param of blastpgp)
|
|
534 if ($input2) {
|
|
535 unless ($infilename2) {$self->throw("$input2 not SimpleAlign Object in pre-aligned psiblast\n");}
|
|
536 $self->B($infilename2); # set file name of partial alignment to inputfilename2 (-B param of blastpgp)
|
|
537 }
|
|
538 my $blast_report = &_generic_local_blast($self, $executable, $input1, $input2);
|
|
539 }
|
|
540
|
|
541 =head2 bl2seq
|
|
542
|
|
543 Title : bl2seq
|
|
544 Usage : $factory-> blastpgp('t/seq1.fa', 't/seq2.fa');
|
|
545 or
|
|
546 $input1 = Bio::Seq->new(-id=>"test query1",
|
|
547 -seq=>"ACTADDEEQQPPTCADEEQQQVVGG");
|
|
548 $input2 = Bio::Seq->new(-id=>"test query2",
|
|
549 -seq=>"ACTADDEMMMMMMMDEEQQQVVGG");
|
|
550 $blast_report = $factory->bl2seq ($input1, $input2);
|
|
551 Returns : Reference to a BPbl2seq object containing the blast report.
|
|
552 Args : Names of 2 files or 2 Bio::Seq objects containing the
|
|
553 sequences to be aligned by bl2seq.
|
|
554
|
|
555 Throws an exception if argument is not either a pair of
|
|
556 strings (eg filenames) or references to Bio::Seq objects.
|
|
557 If arguments are strings, throws exception if files
|
|
558 corresponding to string names can not be found.
|
|
559
|
|
560 =cut
|
|
561
|
|
562 sub bl2seq {
|
|
563 my $self = shift;
|
|
564 my $executable = 'bl2seq';
|
|
565 my $input1 = shift;
|
|
566 my $input2 = shift;
|
|
567
|
|
568 # Create input file pointer
|
|
569 my ($infilename1, $infilename2 ) = $self->_setinput($executable,
|
|
570 $input1, $input2);
|
|
571 if (!$infilename1){$self->throw(" $input1 not Seq Object or file name!");}
|
|
572 if (!$infilename2){$self->throw("$input2 not Seq Object or file name!");}
|
|
573
|
|
574 $self->i($infilename1); # set file name of first sequence to
|
|
575 # be aligned to inputfilename1
|
|
576 # (-i param of bl2seq)
|
|
577 $self->j($infilename2); # set file name of first sequence to
|
|
578 # be aligned to inputfilename2
|
|
579 # (-j param of bl2seq)
|
|
580
|
|
581 my $blast_report = &_generic_local_blast($self, $executable);
|
|
582 }
|
|
583 #################################################
|
|
584
|
|
585 =head2 _generic_local_blast
|
|
586
|
|
587 Title : _generic_local_blast
|
|
588 Usage : internal function not called directly
|
|
589 Returns : Blast or BPlite object
|
|
590 Args : Reference to calling object and name of BLAST executable
|
|
591
|
|
592 =cut
|
|
593
|
|
594 sub _generic_local_blast {
|
|
595 my $self = shift;
|
|
596 my $executable = shift;
|
|
597
|
|
598 # Create parameter string to pass to Blast program
|
|
599 my $param_string = $self->_setparams($executable);
|
|
600
|
|
601 # run Blast
|
|
602 my $blast_report = &_runblast($self, $executable, $param_string);
|
|
603 }
|
|
604
|
|
605
|
|
606 =head2 _runblast
|
|
607
|
|
608 Title : _runblast
|
|
609 Usage : Internal function, not to be called directly
|
|
610 Function: makes actual system call to Blast program
|
|
611 Example :
|
|
612 Returns : Report object in the appropriate format (BPlite,
|
|
613 BPpsilite, Blast, or BPbl2seq)
|
|
614 Args : Reference to calling object, name of BLAST executable,
|
|
615 and parameter string for executable
|
|
616
|
|
617 =cut
|
|
618
|
|
619 sub _runblast {
|
|
620 my ($self,$executable,$param_string) = @_;
|
|
621 my ($blast_obj,$exe);
|
|
622 if( ! ($exe = $self->executable($executable)) ) {
|
|
623 $self->warn("cannot find path to $executable");
|
|
624 return undef;
|
|
625 }
|
|
626 my $commandstring = $exe. $param_string;
|
|
627
|
|
628 # next line for debugging
|
|
629 $self->debug( "$commandstring \n");
|
|
630
|
|
631 my $status = system($commandstring);
|
|
632
|
|
633 $self->throw("$executable call crashed: $? $commandstring\n") unless ($status==0) ;
|
|
634 my $outfile = $self->o() ; # get outputfilename
|
|
635 my $signif = $self->e() || 1e-5 ;
|
|
636
|
|
637 # set significance cutoff to set expectation value or default value
|
|
638 # (may want to make this value vary for different executables)
|
|
639
|
|
640 # If running bl2seq or psiblast (blastpgp with multiple iterations),
|
|
641 # the specific parsers for these programs must be used (ie BPbl2seq or
|
|
642 # BPpsilite). Otherwise either the Blast parser or the BPlite
|
|
643 # parsers can be selected.
|
|
644
|
|
645 if ($executable =~ /bl2seq/i) {
|
|
646 if( $self->verbose > 0 ) {
|
|
647 open(OUT, $outfile) || $self->throw("cannot open $outfile");
|
|
648 while(<OUT>) { $self->debug($_)}
|
|
649 close(OUT);
|
|
650 }
|
|
651 # Added program info so BPbl2seq can compute strand info
|
|
652 $blast_obj = Bio::Tools::BPbl2seq->new(-file => $outfile,
|
|
653 -REPORT_TYPE => $self->p );
|
|
654 # $blast_obj = Bio::Tools::BPbl2seq->new(-file => $outfile);
|
|
655 }
|
|
656 elsif ($executable =~ /blastpgp/i && defined $self->j() &&
|
|
657 $self->j() > 1) {
|
|
658 print "using psilite parser\n";
|
|
659 $blast_obj = Bio::Tools::BPpsilite->new(-file => $outfile);
|
|
660 }
|
|
661 elsif ($self->_READMETHOD =~ /^Blast/i ) {
|
|
662 $blast_obj = Bio::SearchIO->new(-file=>$outfile,
|
|
663 -format => 'blast' ) ;
|
|
664 }
|
|
665 elsif ($self->_READMETHOD =~ /^BPlite/i ) {
|
|
666 $blast_obj = Bio::Tools::BPlite->new(-file=>$outfile);
|
|
667 } else {
|
|
668 $self->warn("Unrecognized readmethod ".$self->_READMETHOD. " or executable $executable\n");
|
|
669 }
|
|
670 return $blast_obj;
|
|
671 }
|
|
672
|
|
673 =head2 _setinput
|
|
674
|
|
675 Title : _setinput
|
|
676 Usage : Internal function, not to be called directly
|
|
677 Function: Create input file(s) for Blast executable
|
|
678 Example :
|
|
679 Returns : name of file containing Blast data input
|
|
680 Args : Seq object reference or input file name
|
|
681
|
|
682 =cut
|
|
683
|
|
684 sub _setinput {
|
|
685 my ($self, $executable, $input1, $input2) = @_;
|
|
686 my ($seq, $temp, $infilename1, $infilename2,$fh ) ;
|
|
687 # If $input1 is not a reference it better be the name of a file with
|
|
688 # the sequence/ alignment data...
|
|
689 $self->io->_io_cleanup();
|
|
690
|
|
691 SWITCH: {
|
|
692 unless (ref $input1) {
|
|
693 $infilename1 = (-e $input1) ? $input1 : 0 ;
|
|
694 last SWITCH;
|
|
695 }
|
|
696 # $input may be an array of BioSeq objects...
|
|
697 if (ref($input1) =~ /ARRAY/i ) {
|
|
698 ($fh,$infilename1) = $self->io->tempfile();
|
|
699 $temp = Bio::SeqIO->new(-fh=> $fh, '-format' => 'Fasta');
|
|
700 foreach $seq (@$input1) {
|
|
701 unless ($seq->isa("Bio::PrimarySeqI")) {return 0;}
|
|
702 $temp->write_seq($seq);
|
|
703 }
|
|
704 close $fh;
|
|
705 $fh = undef;
|
|
706 last SWITCH;
|
|
707 }
|
|
708 # $input may be a single BioSeq object...
|
|
709 elsif ($input1->isa("Bio::PrimarySeqI")) {
|
|
710 ($fh,$infilename1) = $self->io->tempfile();
|
|
711
|
|
712 # just in case $input1 is taken from an alignment and has spaces (ie
|
|
713 # deletions) indicated within it, we have to remove them - otherwise
|
|
714 # the BLAST programs will be unhappy
|
|
715
|
|
716 my $seq_string = $input1->seq();
|
|
717 $seq_string =~ s/\W+//g; # get rid of spaces in sequence
|
|
718 $input1->seq($seq_string);
|
|
719 $temp = Bio::SeqIO->new(-fh=> $fh, '-format' => 'Fasta');
|
|
720 $temp->write_seq($input1);
|
|
721 close $fh;
|
|
722 undef $fh;
|
|
723 # $temp->write_seq($input1);
|
|
724 last SWITCH;
|
|
725 }
|
|
726 $infilename1 = 0; # Set error flag if you get here
|
|
727 } # End SWITCH
|
|
728 unless ($input2) { return $infilename1; }
|
|
729 SWITCH2: {
|
|
730 unless (ref $input2) {
|
|
731 $infilename2 = (-e $input2) ? $input2 : 0 ;
|
|
732 last SWITCH2;
|
|
733 }
|
|
734 if ($input2->isa("Bio::PrimarySeqI") && $executable eq 'bl2seq' ) {
|
|
735 ($fh,$infilename2) = $self->io->tempfile();
|
|
736
|
|
737 $temp = Bio::SeqIO->new(-fh=> $fh, '-format' => 'Fasta');
|
|
738 $temp->write_seq($input2);
|
|
739 close $fh;
|
|
740 undef $fh;
|
|
741 last SWITCH2;
|
|
742 }
|
|
743 # Option for using psiblast's pre-alignment "jumpstart" feature
|
|
744 elsif ($input2->isa("Bio::SimpleAlign") &&
|
|
745 $executable eq 'blastpgp' ) {
|
|
746 # a bit of a lie since it won't be a fasta file
|
|
747 ($fh,$infilename2) = $self->io->tempfile();
|
|
748
|
|
749 # first we retrieve the "mask" that determines which residues should
|
|
750 # by scored according to their position and which should be scored
|
|
751 # using the non-position-specific matrices
|
|
752
|
|
753 my @mask = split("", shift ); # get mask
|
|
754
|
|
755 # then we have to convert all the residues in every sequence to upper
|
|
756 # case at the positions that we want psiblast to use position specific
|
|
757 # scoring
|
|
758
|
|
759 foreach $seq ( $input2->each_seq() ) {
|
|
760 my @seqstringlist = split("",$seq->seq());
|
|
761 for (my $i = 0; $i < scalar(@mask); $i++) {
|
|
762 unless ( $seqstringlist[$i] =~ /[a-zA-Z]/ ) {next}
|
|
763 $seqstringlist[$i] = $mask[$i] ? uc $seqstringlist[$i]: lc $seqstringlist[$i] ;
|
|
764 }
|
|
765 my $newseqstring = join("", @seqstringlist);
|
|
766 $seq->seq($newseqstring);
|
|
767 }
|
|
768 # Now we need to write out the alignment to a file
|
|
769 # in the "psi format" which psiblast is expecting
|
|
770 $input2->map_chars('\.','-');
|
|
771 $temp = Bio::AlignIO->new(-fh=> $fh, '-format' => 'psi');
|
|
772 $temp->write_aln($input2);
|
|
773 close $fh;
|
|
774 undef $fh;
|
|
775 last SWITCH2;
|
|
776 }
|
|
777 $infilename2 = 0; # Set error flag if you get here
|
|
778 } # End SWITCH2
|
|
779 return ($infilename1, $infilename2);
|
|
780 }
|
|
781
|
|
782 =head2 _setparams
|
|
783
|
|
784 Title : _setparams
|
|
785 Usage : Internal function, not to be called directly
|
|
786 Function: Create parameter inputs for Blast program
|
|
787 Example :
|
|
788 Returns : parameter string to be passed to Blast
|
|
789 Args : Reference to calling object and name of BLAST executable
|
|
790
|
|
791 =cut
|
|
792
|
|
793 sub _setparams {
|
|
794 my ($self,$executable) = @_;
|
|
795 my ($attr, $value, @execparams);
|
|
796
|
|
797 if ($executable eq 'blastall') {@execparams = @BLASTALL_PARAMS; }
|
|
798 if ($executable eq 'blastpgp') {@execparams = @BLASTPGP_PARAMS; }
|
|
799 if ($executable eq 'bl2seq') {@execparams = @BL2SEQ_PARAMS; }
|
|
800
|
|
801 my $param_string = "";
|
|
802 for $attr ( @execparams ) {
|
|
803 $value = $self->$attr();
|
|
804 next unless (defined $value);
|
|
805 # Need to prepend datadirectory to database name
|
|
806 if ($attr eq 'd' && ($executable ne 'bl2seq')) {
|
|
807 # This is added so that you can specify a DB with a full path
|
|
808 if (! (-e $value.".nin" || -e $value.".pin")){
|
|
809 $value = File::Spec->catdir($DATADIR,$value);
|
|
810 }
|
|
811 }
|
|
812 # put params in format expected by Blast
|
|
813 $attr = '-'. $attr ;
|
|
814 $param_string .= " $attr $value ";
|
|
815 }
|
|
816
|
|
817 # if ($self->quiet()) { $param_string .= ' >/dev/null';}
|
|
818
|
|
819 return $param_string;
|
|
820 }
|
|
821
|
|
822
|
|
823 =head1 Bio::Tools::Run::Wrapper methods
|
|
824
|
|
825 =cut
|
|
826
|
|
827 =head2 no_param_checks
|
|
828
|
|
829 Title : no_param_checks
|
|
830 Usage : $obj->no_param_checks($newval)
|
|
831 Function: Boolean flag as to whether or not we should
|
|
832 trust the sanity checks for parameter values
|
|
833 Returns : value of no_param_checks
|
|
834 Args : newvalue (optional)
|
|
835
|
|
836
|
|
837 =cut
|
|
838
|
|
839 =head2 save_tempfiles
|
|
840
|
|
841 Title : save_tempfiles
|
|
842 Usage : $obj->save_tempfiles($newval)
|
|
843 Function:
|
|
844 Returns : value of save_tempfiles
|
|
845 Args : newvalue (optional)
|
|
846
|
|
847
|
|
848 =cut
|
|
849
|
|
850 =head2 outfile_name
|
|
851
|
|
852 Title : outfile_name
|
|
853 Usage : my $outfile = $tcoffee->outfile_name();
|
|
854 Function: Get/Set the name of the output file for this run
|
|
855 (if you wanted to do something special)
|
|
856 Returns : string
|
|
857 Args : [optional] string to set value to
|
|
858
|
|
859
|
|
860 =cut
|
|
861
|
|
862
|
|
863 =head2 tempdir
|
|
864
|
|
865 Title : tempdir
|
|
866 Usage : my $tmpdir = $self->tempdir();
|
|
867 Function: Retrieve a temporary directory name (which is created)
|
|
868 Returns : string which is the name of the temporary directory
|
|
869 Args : none
|
|
870
|
|
871
|
|
872 =cut
|
|
873
|
|
874 =head2 cleanup
|
|
875
|
|
876 Title : cleanup
|
|
877 Usage : $tcoffee->cleanup();
|
|
878 Function: Will cleanup the tempdir directory after a PAML run
|
|
879 Returns : none
|
|
880 Args : none
|
|
881
|
|
882
|
|
883 =cut
|
|
884
|
|
885 =head2 io
|
|
886
|
|
887 Title : io
|
|
888 Usage : $obj->io($newval)
|
|
889 Function: Gets a L<Bio::Root::IO> object
|
|
890 Returns : L<Bio::Root::IO>
|
|
891 Args : none
|
|
892
|
|
893
|
|
894 =cut
|
|
895
|
|
896 sub DESTROY {
|
|
897 my $self= shift;
|
|
898 unless ( $self->save_tempfiles ) {
|
|
899 $self->cleanup();
|
|
900 }
|
|
901 $self->SUPER::DESTROY();
|
|
902 }
|
|
903
|
|
904 1;
|
|
905 __END__
|