0
|
1 ## $Id: pSW.pm,v 1.21 2002/10/22 07:45:22 lapp Exp $
|
|
2
|
|
3 #
|
|
4 # BioPerl module for Bio::Tools::pSW
|
|
5 #
|
|
6 # Cared for by Ewan Birney <birney@sanger.ac.uk>
|
|
7 #
|
|
8 # Copyright Ewan Birney
|
|
9 #
|
|
10 # You may distribute this module under the same terms as perl itself
|
|
11
|
|
12 # POD documentation - main docs before the code
|
|
13
|
|
14 =head1 NAME
|
|
15
|
|
16 Bio::Tools::pSW - pairwise Smith Waterman object
|
|
17
|
|
18 =head1 SYNOPSIS
|
|
19
|
|
20 use Bio::Tools::pSW;
|
|
21 use Bio::AlignIO;
|
|
22 my $factory = new Bio::Tools::pSW( '-matrix' => 'blosum62.bla',
|
|
23 '-gap' => 12,
|
|
24 '-ext' => 2,
|
|
25 );
|
|
26
|
|
27 #use the factory to make some output
|
|
28
|
|
29 $factory->align_and_show($seq1,$seq2,STDOUT);
|
|
30
|
|
31 # make a Bio::SimpleAlign and do something with it
|
|
32
|
|
33 my $aln = $factory->pairwise_alignment($seq1,$seq2);
|
|
34 my $alnout = new Bio::AlignIO(-format => 'msf',
|
|
35 -fh => \*STDOUT);
|
|
36
|
|
37 $alnout->write_aln($aln);
|
|
38
|
|
39 =head1 INSTALLATION
|
|
40
|
|
41 This module is included with the central Bioperl distribution:
|
|
42
|
|
43 http://bio.perl.org/Core/Latest
|
|
44 ftp://bio.perl.org/pub/DIST
|
|
45
|
|
46 Follow the installation instructions included in the INSTALL file.
|
|
47
|
|
48 =head1 DESCRIPTION
|
|
49
|
|
50 pSW is an Alignment Factory for protein sequences. It builds pairwise
|
|
51 alignments using the Smith-Waterman algorithm. The alignment algorithm is
|
|
52 implemented in C and added in using an XS extension. The XS extension basically
|
|
53 comes from the Wise2 package, but has been slimmed down to only be the
|
|
54 alignment part of that (this is a good thing!). The XS extension comes
|
|
55 from the bioperl-ext package which is distributed along with bioperl.
|
|
56 I<Warning:> This package will not work if you have not compiled the
|
|
57 bioperl-ext package.
|
|
58
|
|
59 The mixture of C and Perl is ideal for this sort of
|
|
60 problem. Here are some plus points for this strategy:
|
|
61
|
|
62 =over 2
|
|
63
|
|
64 =item Speed and Memory
|
|
65
|
|
66 The algorithm is actually implemented in C, which means it is faster than
|
|
67 a pure perl implementation (I have never done one, so I have no idea
|
|
68 how faster) and will use considerably less memory, as it efficiently
|
|
69 assigns memory for the calculation.
|
|
70
|
|
71 =item Algorithm efficiency
|
|
72
|
|
73 The algorithm was written using Dynamite, and so contains an automatic
|
|
74 switch to the linear space divide-and-conquer method. This means you
|
|
75 could effectively align very large sequences without killing your machine
|
|
76 (it could take a while though!).
|
|
77
|
|
78 =back
|
|
79
|
|
80 =head1 FEEDBACK
|
|
81
|
|
82 =head2 Mailing Lists
|
|
83
|
|
84 User feedback is an integral part of the evolution of this and other
|
|
85 Bioperl modules. Send your comments and suggestions preferably to one
|
|
86 of the Bioperl mailing lists. Your participation is much appreciated.
|
|
87
|
|
88 bioperl-l@bioperl.org - General discussion
|
|
89 http://bioperl.org/MailList.shtml - About the mailing lists
|
|
90
|
|
91 =head2 Reporting Bugs
|
|
92
|
|
93 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
94 the bugs and their resolution. Bug reports can be submitted via email
|
|
95 or the web:
|
|
96
|
|
97 bioperl-bugs@bio.perl.org
|
|
98 http://bugzilla.bioperl.org/
|
|
99
|
|
100 =head1 AUTHOR
|
|
101
|
|
102 Ewan Birney, birney@sanger.ac.uk or birney@ebi.ac.uk
|
|
103
|
|
104 =head1 CONTRIBUTORS
|
|
105
|
|
106 Jason Stajich, jason@bioperl.org
|
|
107
|
|
108 =head1 APPENDIX
|
|
109
|
|
110 The rest of the documentation details each of the object
|
|
111 methods. Internal methods are usually preceded with an underscore "_".
|
|
112
|
|
113 =cut
|
|
114
|
|
115 # Let the code begin...
|
|
116
|
|
117 package Bio::Tools::pSW;
|
|
118 use vars qw(@ISA);
|
|
119 use strict;
|
|
120 no strict ( 'refs');
|
|
121
|
|
122 BEGIN {
|
|
123 eval {
|
|
124 require Bio::Ext::Align;
|
|
125 };
|
|
126 if ( $@ ) {
|
|
127 die("\nThe C-compiled engine for Smith Waterman alignments (Bio::Ext::Align) has not been installed.\n Please read the install the bioperl-ext package\n\n");
|
|
128 exit(1);
|
|
129 }
|
|
130 }
|
|
131
|
|
132 use Bio::Tools::AlignFactory;
|
|
133 use Bio::SimpleAlign;
|
|
134
|
|
135
|
|
136 @ISA = qw(Bio::Tools::AlignFactory);
|
|
137
|
|
138
|
|
139
|
|
140 sub new {
|
|
141 my($class,@args) = @_;
|
|
142
|
|
143 my $self = $class->SUPER::new(@args);
|
|
144
|
|
145 my($matrix,$gap,$ext) = $self->_rearrange([qw(MATRIX
|
|
146 GAP
|
|
147 EXT
|
|
148 )],@args);
|
|
149
|
|
150 #default values - we have to load matrix into memory, so
|
|
151 # we need to check it out now
|
|
152 if( ! defined $matrix || !($matrix =~ /\w/) ) {
|
|
153 $matrix = 'blosum62.bla';
|
|
154 }
|
|
155
|
|
156 $self->matrix($matrix); # will throw exception if it can't load it
|
|
157 $self->gap(12) unless defined $gap;
|
|
158 $self->ext(2) unless defined $ext;
|
|
159
|
|
160 # I'm pretty sure I am not doing this right... ho hum...
|
|
161 # This was not roght ($gap and $ext could not be 0) It is fixed now /AE
|
|
162 if( defined $gap ) {
|
|
163 if( $gap =~ /^\d+$/ ) {
|
|
164 $self->gap($gap);
|
|
165 } else {
|
|
166 $self->throw("Gap penalty must be a number, not [$gap]");
|
|
167 }
|
|
168 }
|
|
169 if( defined $ext ) {
|
|
170 if( $ext =~ /^\d+$/ ) {
|
|
171 $self->ext($ext);
|
|
172 } else {
|
|
173 $self->throw("Extension penalty must be a number, not [$ext]");
|
|
174 }
|
|
175 }
|
|
176
|
|
177 return $self;
|
|
178 }
|
|
179
|
|
180
|
|
181 =head2 pairwise_alignment
|
|
182
|
|
183 Title : pairwise_alignment
|
|
184 Usage : $aln = $factory->pairwise_alignment($seq1,$seq2)
|
|
185 Function: Makes a SimpleAlign object from two sequences
|
|
186 Returns : A SimpleAlign object
|
|
187 Args :
|
|
188
|
|
189
|
|
190 =cut
|
|
191
|
|
192 sub pairwise_alignment{
|
|
193 my ($self,$seq1,$seq2) = @_;
|
|
194 my($t1,$t2,$aln,$out,@str1,@str2,@ostr1,@ostr2,$alc,$tstr,$tid,$start1,$end1,$start2,$end2,$alctemp);
|
|
195
|
|
196 if( ! defined $seq1 || ! $seq1->isa('Bio::PrimarySeqI') ||
|
|
197 ! defined $seq2 || ! $seq2->isa('Bio::PrimarySeqI') ) {
|
|
198 $self->warn("Cannot call pairwise_alignment without specifing 2 sequences (Bio::PrimarySeqI objects)");
|
|
199 return undef;
|
|
200 }
|
|
201 # fix Jitterbug #1044
|
|
202 if( $seq1->length() < 2 ||
|
|
203 $seq2->length() < 2 ) {
|
|
204 $self->warn("cannot align sequences with length less than 2");
|
|
205 return undef;
|
|
206 }
|
|
207 $self->set_memory_and_report();
|
|
208 # create engine objects
|
|
209 $seq1->display_id('seq1') unless ( defined $seq1->id() );
|
|
210 $seq2->display_id('seq2') unless ( defined $seq2->id() );
|
|
211
|
|
212 $t1 = &Bio::Ext::Align::new_Sequence_from_strings($seq1->id(),
|
|
213 $seq1->seq());
|
|
214 $t2 = &Bio::Ext::Align::new_Sequence_from_strings($seq2->id(),
|
|
215 $seq2->seq());
|
|
216 $aln = &Bio::Ext::Align::Align_Sequences_ProteinSmithWaterman($t1,$t2,$self->{'matrix'},-$self->gap,-$self->ext);
|
|
217 if( ! defined $aln || $aln == 0 ) {
|
|
218 $self->throw("Unable to build an alignment");
|
|
219 }
|
|
220
|
|
221 # free sequence engine objects
|
|
222
|
|
223 $t1 = $t2 = 0;
|
|
224
|
|
225 # now we have to get into the AlnBlock structure and
|
|
226 # figure out what is aligned to what...
|
|
227
|
|
228 # we are going to need the sequences as arrays for convience
|
|
229
|
|
230 @str1 = split(//, $seq1->seq());
|
|
231 @str2 = split(//, $seq2->seq());
|
|
232
|
|
233 # get out start points
|
|
234
|
|
235 # The alignment is in alignment coordinates - ie the first
|
|
236 # residues starts at -1 and ends at 0. (weird I know).
|
|
237 # bio-coordinates are +2 from this...
|
|
238
|
|
239 $start1 = $aln->start()->alu(0)->start +2;
|
|
240 $start2 = $aln->start()->alu(1)->start +2;
|
|
241
|
|
242 # step along the linked list of alc units...
|
|
243
|
|
244 for($alc = $aln->start();$alc->at_end() != 1;$alc = $alc->next()) {
|
|
245 if( $alc->alu(0)->text_label eq 'SEQUENCE' ) {
|
|
246 push(@ostr1,$str1[$alc->alu(0)->start+1]);
|
|
247 } else {
|
|
248 # assumme it is in insert!
|
|
249 push(@ostr1,'-');
|
|
250 }
|
|
251
|
|
252 if( $alc->alu(1)->text_label eq 'SEQUENCE' ) {
|
|
253 push(@ostr2,$str2[$alc->alu(1)->start+1]);
|
|
254 } else {
|
|
255 # assumme it is in insert!
|
|
256 push(@ostr2,'-');
|
|
257 }
|
|
258 $alctemp = $alc;
|
|
259 }
|
|
260
|
|
261 #
|
|
262 # get out end points
|
|
263 #
|
|
264
|
|
265 # end points = real residue end in 'C' coordinates = residue
|
|
266 # end in biocoordinates. Oh... the wonder of coordinate systems!
|
|
267
|
|
268 $end1 = $alctemp->alu(0)->end+1;
|
|
269 $end2 = $alctemp->alu(1)->end+1;
|
|
270
|
|
271 # get rid of the alnblock
|
|
272 $alc = 0;
|
|
273 $aln = 0;
|
|
274
|
|
275 # new SimpleAlignment
|
|
276 $out = Bio::SimpleAlign->new(); # new SimpleAlignment
|
|
277
|
|
278 $tstr = join('',@ostr1);
|
|
279 $tid = $seq1->id();
|
|
280 $out->add_seq(Bio::LocatableSeq->new( -seq=> $tstr,
|
|
281 -start => $start1,
|
|
282 -end => $end1,
|
|
283 -id=>$tid ));
|
|
284
|
|
285 $tstr = join('',@ostr2);
|
|
286 $tid = $seq2->id();
|
|
287 $out->add_seq(Bio::LocatableSeq->new( -seq=> $tstr,
|
|
288 -start => $start2,
|
|
289 -end => $end2,
|
|
290 -id=> $tid ));
|
|
291
|
|
292 # give'm back the alignment
|
|
293
|
|
294 return $out;
|
|
295 }
|
|
296
|
|
297 =head2 align_and_show
|
|
298
|
|
299 Title : align_and_show
|
|
300 Usage : $factory->align_and_show($seq1,$seq2,STDOUT)
|
|
301
|
|
302 =cut
|
|
303
|
|
304 sub align_and_show {
|
|
305 my($self,$seq1,$seq2,$fh) = @_;
|
|
306 my($t1,$t2,$aln,$id,$str);
|
|
307
|
|
308 if( ! defined $seq1 || ! $seq1->isa('Bio::PrimarySeqI') ||
|
|
309 ! defined $seq2 || ! $seq2->isa('Bio::PrimarySeqI') ) {
|
|
310 $self->warn("Cannot call align_and_show without specifing 2 sequences (Bio::PrimarySeqI objects)");
|
|
311 return undef;
|
|
312 }
|
|
313 # fix Jitterbug #1044
|
|
314 if( $seq1->length() < 2 ||
|
|
315 $seq2->length() < 2 ) {
|
|
316 $self->warn("cannot align sequences with length less than 2");
|
|
317 return undef;
|
|
318 }
|
|
319 if( ! defined $fh ) {
|
|
320 $fh = \*STDOUT;
|
|
321 }
|
|
322 $self->set_memory_and_report();
|
|
323 $seq1->display_id('seq1') unless ( defined $seq1->id() );
|
|
324 $seq2->display_id('seq2') unless ( defined $seq2->id() );
|
|
325
|
|
326 $t1 = &Bio::Ext::Align::new_Sequence_from_strings($seq1->id(),$seq1->seq());
|
|
327
|
|
328 $t2 = &Bio::Ext::Align::new_Sequence_from_strings($seq2->id(),$seq2->seq());
|
|
329 $aln = &Bio::Ext::Align::Align_Sequences_ProteinSmithWaterman($t1,$t2,$self->{'matrix'},-$self->gap,-$self->ext);
|
|
330 if( ! defined $aln || $aln == 0 ) {
|
|
331 $self->throw("Unable to build an alignment");
|
|
332 }
|
|
333
|
|
334 &Bio::Ext::Align::write_pretty_seq_align($aln,$t1,$t2,12,50,$fh);
|
|
335
|
|
336 }
|
|
337
|
|
338 =head2 matrix
|
|
339
|
|
340 Title : matrix()
|
|
341 Usage : $factory->matrix('blosum62.bla');
|
|
342 Function : Reads in comparison matrix based on name
|
|
343 :
|
|
344 Returns :
|
|
345 Argument : comparison matrix
|
|
346
|
|
347 =cut
|
|
348
|
|
349 sub matrix {
|
|
350 my($self,$comp) = @_;
|
|
351 my $temp;
|
|
352
|
|
353 if( !defined $comp ) {
|
|
354 $self->throw("You must have a comparison matrix to set!");
|
|
355 }
|
|
356
|
|
357 # talking to the engine here...
|
|
358
|
|
359 $temp = &Bio::Ext::Align::CompMat::read_Blast_file_CompMat($comp);
|
|
360
|
|
361 if( !(defined $temp) || $temp == 0 ) {
|
|
362 $self->throw("$comp cannot be read as a BLAST comparison matrix file");
|
|
363 }
|
|
364
|
|
365 $self->{'matrix'} = $temp;
|
|
366 }
|
|
367
|
|
368
|
|
369
|
|
370 =head2 gap
|
|
371
|
|
372 Title : gap
|
|
373 Usage : $gap = $factory->gap() #get
|
|
374 : $factory->gap($value) #set
|
|
375 Function : the set get for the gap penalty
|
|
376 Example :
|
|
377 Returns : gap value
|
|
378 Arguments : new value
|
|
379
|
|
380 =cut
|
|
381
|
|
382 sub gap {
|
|
383 my ($self,$val) = @_;
|
|
384
|
|
385
|
|
386 if( defined $val ) {
|
|
387 if( $val < 0 ) { # Fixed so that gap==0 is allowed /AE
|
|
388 $self->throw("Can't have a gap penalty less than 0");
|
|
389 }
|
|
390 $self->{'gap'} = $val;
|
|
391 }
|
|
392 return $self->{'gap'};
|
|
393 }
|
|
394
|
|
395
|
|
396 =head2 ext
|
|
397
|
|
398 Title : ext
|
|
399 Usage : $ext = $factory->ext() #get
|
|
400 : $factory->ext($value) #set
|
|
401 Function : the set get for the ext penalty
|
|
402 Example :
|
|
403 Returns : ext value
|
|
404 Arguments : new value
|
|
405
|
|
406 =cut
|
|
407
|
|
408 sub ext {
|
|
409 my ($self,$val) = @_;
|
|
410
|
|
411 if( defined $val ) {
|
|
412 if( $val < 0 ) { # Fixed so that gap==0 is allowed /AE
|
|
413 $self->throw("Can't have a gap penalty less than 0");
|
|
414 }
|
|
415 $self->{'ext'} = $val;
|
|
416 }
|
|
417 return $self->{'ext'};
|
|
418 }
|
|
419
|
|
420
|
|
421
|