comparison variant_effect_predictor/Bio/Tools/pSW.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 ## $Id: pSW.pm,v 1.21 2002/10/22 07:45:22 lapp Exp $
2
3 #
4 # BioPerl module for Bio::Tools::pSW
5 #
6 # Cared for by Ewan Birney <birney@sanger.ac.uk>
7 #
8 # Copyright Ewan Birney
9 #
10 # You may distribute this module under the same terms as perl itself
11
12 # POD documentation - main docs before the code
13
14 =head1 NAME
15
16 Bio::Tools::pSW - pairwise Smith Waterman object
17
18 =head1 SYNOPSIS
19
20 use Bio::Tools::pSW;
21 use Bio::AlignIO;
22 my $factory = new Bio::Tools::pSW( '-matrix' => 'blosum62.bla',
23 '-gap' => 12,
24 '-ext' => 2,
25 );
26
27 #use the factory to make some output
28
29 $factory->align_and_show($seq1,$seq2,STDOUT);
30
31 # make a Bio::SimpleAlign and do something with it
32
33 my $aln = $factory->pairwise_alignment($seq1,$seq2);
34 my $alnout = new Bio::AlignIO(-format => 'msf',
35 -fh => \*STDOUT);
36
37 $alnout->write_aln($aln);
38
39 =head1 INSTALLATION
40
41 This module is included with the central Bioperl distribution:
42
43 http://bio.perl.org/Core/Latest
44 ftp://bio.perl.org/pub/DIST
45
46 Follow the installation instructions included in the INSTALL file.
47
48 =head1 DESCRIPTION
49
50 pSW is an Alignment Factory for protein sequences. It builds pairwise
51 alignments using the Smith-Waterman algorithm. The alignment algorithm is
52 implemented in C and added in using an XS extension. The XS extension basically
53 comes from the Wise2 package, but has been slimmed down to only be the
54 alignment part of that (this is a good thing!). The XS extension comes
55 from the bioperl-ext package which is distributed along with bioperl.
56 I<Warning:> This package will not work if you have not compiled the
57 bioperl-ext package.
58
59 The mixture of C and Perl is ideal for this sort of
60 problem. Here are some plus points for this strategy:
61
62 =over 2
63
64 =item Speed and Memory
65
66 The algorithm is actually implemented in C, which means it is faster than
67 a pure perl implementation (I have never done one, so I have no idea
68 how faster) and will use considerably less memory, as it efficiently
69 assigns memory for the calculation.
70
71 =item Algorithm efficiency
72
73 The algorithm was written using Dynamite, and so contains an automatic
74 switch to the linear space divide-and-conquer method. This means you
75 could effectively align very large sequences without killing your machine
76 (it could take a while though!).
77
78 =back
79
80 =head1 FEEDBACK
81
82 =head2 Mailing Lists
83
84 User feedback is an integral part of the evolution of this and other
85 Bioperl modules. Send your comments and suggestions preferably to one
86 of the Bioperl mailing lists. Your participation is much appreciated.
87
88 bioperl-l@bioperl.org - General discussion
89 http://bioperl.org/MailList.shtml - About the mailing lists
90
91 =head2 Reporting Bugs
92
93 Report bugs to the Bioperl bug tracking system to help us keep track
94 the bugs and their resolution. Bug reports can be submitted via email
95 or the web:
96
97 bioperl-bugs@bio.perl.org
98 http://bugzilla.bioperl.org/
99
100 =head1 AUTHOR
101
102 Ewan Birney, birney@sanger.ac.uk or birney@ebi.ac.uk
103
104 =head1 CONTRIBUTORS
105
106 Jason Stajich, jason@bioperl.org
107
108 =head1 APPENDIX
109
110 The rest of the documentation details each of the object
111 methods. Internal methods are usually preceded with an underscore "_".
112
113 =cut
114
115 # Let the code begin...
116
117 package Bio::Tools::pSW;
118 use vars qw(@ISA);
119 use strict;
120 no strict ( 'refs');
121
122 BEGIN {
123 eval {
124 require Bio::Ext::Align;
125 };
126 if ( $@ ) {
127 die("\nThe C-compiled engine for Smith Waterman alignments (Bio::Ext::Align) has not been installed.\n Please read the install the bioperl-ext package\n\n");
128 exit(1);
129 }
130 }
131
132 use Bio::Tools::AlignFactory;
133 use Bio::SimpleAlign;
134
135
136 @ISA = qw(Bio::Tools::AlignFactory);
137
138
139
140 sub new {
141 my($class,@args) = @_;
142
143 my $self = $class->SUPER::new(@args);
144
145 my($matrix,$gap,$ext) = $self->_rearrange([qw(MATRIX
146 GAP
147 EXT
148 )],@args);
149
150 #default values - we have to load matrix into memory, so
151 # we need to check it out now
152 if( ! defined $matrix || !($matrix =~ /\w/) ) {
153 $matrix = 'blosum62.bla';
154 }
155
156 $self->matrix($matrix); # will throw exception if it can't load it
157 $self->gap(12) unless defined $gap;
158 $self->ext(2) unless defined $ext;
159
160 # I'm pretty sure I am not doing this right... ho hum...
161 # This was not roght ($gap and $ext could not be 0) It is fixed now /AE
162 if( defined $gap ) {
163 if( $gap =~ /^\d+$/ ) {
164 $self->gap($gap);
165 } else {
166 $self->throw("Gap penalty must be a number, not [$gap]");
167 }
168 }
169 if( defined $ext ) {
170 if( $ext =~ /^\d+$/ ) {
171 $self->ext($ext);
172 } else {
173 $self->throw("Extension penalty must be a number, not [$ext]");
174 }
175 }
176
177 return $self;
178 }
179
180
181 =head2 pairwise_alignment
182
183 Title : pairwise_alignment
184 Usage : $aln = $factory->pairwise_alignment($seq1,$seq2)
185 Function: Makes a SimpleAlign object from two sequences
186 Returns : A SimpleAlign object
187 Args :
188
189
190 =cut
191
192 sub pairwise_alignment{
193 my ($self,$seq1,$seq2) = @_;
194 my($t1,$t2,$aln,$out,@str1,@str2,@ostr1,@ostr2,$alc,$tstr,$tid,$start1,$end1,$start2,$end2,$alctemp);
195
196 if( ! defined $seq1 || ! $seq1->isa('Bio::PrimarySeqI') ||
197 ! defined $seq2 || ! $seq2->isa('Bio::PrimarySeqI') ) {
198 $self->warn("Cannot call pairwise_alignment without specifing 2 sequences (Bio::PrimarySeqI objects)");
199 return undef;
200 }
201 # fix Jitterbug #1044
202 if( $seq1->length() < 2 ||
203 $seq2->length() < 2 ) {
204 $self->warn("cannot align sequences with length less than 2");
205 return undef;
206 }
207 $self->set_memory_and_report();
208 # create engine objects
209 $seq1->display_id('seq1') unless ( defined $seq1->id() );
210 $seq2->display_id('seq2') unless ( defined $seq2->id() );
211
212 $t1 = &Bio::Ext::Align::new_Sequence_from_strings($seq1->id(),
213 $seq1->seq());
214 $t2 = &Bio::Ext::Align::new_Sequence_from_strings($seq2->id(),
215 $seq2->seq());
216 $aln = &Bio::Ext::Align::Align_Sequences_ProteinSmithWaterman($t1,$t2,$self->{'matrix'},-$self->gap,-$self->ext);
217 if( ! defined $aln || $aln == 0 ) {
218 $self->throw("Unable to build an alignment");
219 }
220
221 # free sequence engine objects
222
223 $t1 = $t2 = 0;
224
225 # now we have to get into the AlnBlock structure and
226 # figure out what is aligned to what...
227
228 # we are going to need the sequences as arrays for convience
229
230 @str1 = split(//, $seq1->seq());
231 @str2 = split(//, $seq2->seq());
232
233 # get out start points
234
235 # The alignment is in alignment coordinates - ie the first
236 # residues starts at -1 and ends at 0. (weird I know).
237 # bio-coordinates are +2 from this...
238
239 $start1 = $aln->start()->alu(0)->start +2;
240 $start2 = $aln->start()->alu(1)->start +2;
241
242 # step along the linked list of alc units...
243
244 for($alc = $aln->start();$alc->at_end() != 1;$alc = $alc->next()) {
245 if( $alc->alu(0)->text_label eq 'SEQUENCE' ) {
246 push(@ostr1,$str1[$alc->alu(0)->start+1]);
247 } else {
248 # assumme it is in insert!
249 push(@ostr1,'-');
250 }
251
252 if( $alc->alu(1)->text_label eq 'SEQUENCE' ) {
253 push(@ostr2,$str2[$alc->alu(1)->start+1]);
254 } else {
255 # assumme it is in insert!
256 push(@ostr2,'-');
257 }
258 $alctemp = $alc;
259 }
260
261 #
262 # get out end points
263 #
264
265 # end points = real residue end in 'C' coordinates = residue
266 # end in biocoordinates. Oh... the wonder of coordinate systems!
267
268 $end1 = $alctemp->alu(0)->end+1;
269 $end2 = $alctemp->alu(1)->end+1;
270
271 # get rid of the alnblock
272 $alc = 0;
273 $aln = 0;
274
275 # new SimpleAlignment
276 $out = Bio::SimpleAlign->new(); # new SimpleAlignment
277
278 $tstr = join('',@ostr1);
279 $tid = $seq1->id();
280 $out->add_seq(Bio::LocatableSeq->new( -seq=> $tstr,
281 -start => $start1,
282 -end => $end1,
283 -id=>$tid ));
284
285 $tstr = join('',@ostr2);
286 $tid = $seq2->id();
287 $out->add_seq(Bio::LocatableSeq->new( -seq=> $tstr,
288 -start => $start2,
289 -end => $end2,
290 -id=> $tid ));
291
292 # give'm back the alignment
293
294 return $out;
295 }
296
297 =head2 align_and_show
298
299 Title : align_and_show
300 Usage : $factory->align_and_show($seq1,$seq2,STDOUT)
301
302 =cut
303
304 sub align_and_show {
305 my($self,$seq1,$seq2,$fh) = @_;
306 my($t1,$t2,$aln,$id,$str);
307
308 if( ! defined $seq1 || ! $seq1->isa('Bio::PrimarySeqI') ||
309 ! defined $seq2 || ! $seq2->isa('Bio::PrimarySeqI') ) {
310 $self->warn("Cannot call align_and_show without specifing 2 sequences (Bio::PrimarySeqI objects)");
311 return undef;
312 }
313 # fix Jitterbug #1044
314 if( $seq1->length() < 2 ||
315 $seq2->length() < 2 ) {
316 $self->warn("cannot align sequences with length less than 2");
317 return undef;
318 }
319 if( ! defined $fh ) {
320 $fh = \*STDOUT;
321 }
322 $self->set_memory_and_report();
323 $seq1->display_id('seq1') unless ( defined $seq1->id() );
324 $seq2->display_id('seq2') unless ( defined $seq2->id() );
325
326 $t1 = &Bio::Ext::Align::new_Sequence_from_strings($seq1->id(),$seq1->seq());
327
328 $t2 = &Bio::Ext::Align::new_Sequence_from_strings($seq2->id(),$seq2->seq());
329 $aln = &Bio::Ext::Align::Align_Sequences_ProteinSmithWaterman($t1,$t2,$self->{'matrix'},-$self->gap,-$self->ext);
330 if( ! defined $aln || $aln == 0 ) {
331 $self->throw("Unable to build an alignment");
332 }
333
334 &Bio::Ext::Align::write_pretty_seq_align($aln,$t1,$t2,12,50,$fh);
335
336 }
337
338 =head2 matrix
339
340 Title : matrix()
341 Usage : $factory->matrix('blosum62.bla');
342 Function : Reads in comparison matrix based on name
343 :
344 Returns :
345 Argument : comparison matrix
346
347 =cut
348
349 sub matrix {
350 my($self,$comp) = @_;
351 my $temp;
352
353 if( !defined $comp ) {
354 $self->throw("You must have a comparison matrix to set!");
355 }
356
357 # talking to the engine here...
358
359 $temp = &Bio::Ext::Align::CompMat::read_Blast_file_CompMat($comp);
360
361 if( !(defined $temp) || $temp == 0 ) {
362 $self->throw("$comp cannot be read as a BLAST comparison matrix file");
363 }
364
365 $self->{'matrix'} = $temp;
366 }
367
368
369
370 =head2 gap
371
372 Title : gap
373 Usage : $gap = $factory->gap() #get
374 : $factory->gap($value) #set
375 Function : the set get for the gap penalty
376 Example :
377 Returns : gap value
378 Arguments : new value
379
380 =cut
381
382 sub gap {
383 my ($self,$val) = @_;
384
385
386 if( defined $val ) {
387 if( $val < 0 ) { # Fixed so that gap==0 is allowed /AE
388 $self->throw("Can't have a gap penalty less than 0");
389 }
390 $self->{'gap'} = $val;
391 }
392 return $self->{'gap'};
393 }
394
395
396 =head2 ext
397
398 Title : ext
399 Usage : $ext = $factory->ext() #get
400 : $factory->ext($value) #set
401 Function : the set get for the ext penalty
402 Example :
403 Returns : ext value
404 Arguments : new value
405
406 =cut
407
408 sub ext {
409 my ($self,$val) = @_;
410
411 if( defined $val ) {
412 if( $val < 0 ) { # Fixed so that gap==0 is allowed /AE
413 $self->throw("Can't have a gap penalty less than 0");
414 }
415 $self->{'ext'} = $val;
416 }
417 return $self->{'ext'};
418 }
419
420
421