Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/Tools/pSW.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 ## $Id: pSW.pm,v 1.21 2002/10/22 07:45:22 lapp Exp $ | |
2 | |
3 # | |
4 # BioPerl module for Bio::Tools::pSW | |
5 # | |
6 # Cared for by Ewan Birney <birney@sanger.ac.uk> | |
7 # | |
8 # Copyright Ewan Birney | |
9 # | |
10 # You may distribute this module under the same terms as perl itself | |
11 | |
12 # POD documentation - main docs before the code | |
13 | |
14 =head1 NAME | |
15 | |
16 Bio::Tools::pSW - pairwise Smith Waterman object | |
17 | |
18 =head1 SYNOPSIS | |
19 | |
20 use Bio::Tools::pSW; | |
21 use Bio::AlignIO; | |
22 my $factory = new Bio::Tools::pSW( '-matrix' => 'blosum62.bla', | |
23 '-gap' => 12, | |
24 '-ext' => 2, | |
25 ); | |
26 | |
27 #use the factory to make some output | |
28 | |
29 $factory->align_and_show($seq1,$seq2,STDOUT); | |
30 | |
31 # make a Bio::SimpleAlign and do something with it | |
32 | |
33 my $aln = $factory->pairwise_alignment($seq1,$seq2); | |
34 my $alnout = new Bio::AlignIO(-format => 'msf', | |
35 -fh => \*STDOUT); | |
36 | |
37 $alnout->write_aln($aln); | |
38 | |
39 =head1 INSTALLATION | |
40 | |
41 This module is included with the central Bioperl distribution: | |
42 | |
43 http://bio.perl.org/Core/Latest | |
44 ftp://bio.perl.org/pub/DIST | |
45 | |
46 Follow the installation instructions included in the INSTALL file. | |
47 | |
48 =head1 DESCRIPTION | |
49 | |
50 pSW is an Alignment Factory for protein sequences. It builds pairwise | |
51 alignments using the Smith-Waterman algorithm. The alignment algorithm is | |
52 implemented in C and added in using an XS extension. The XS extension basically | |
53 comes from the Wise2 package, but has been slimmed down to only be the | |
54 alignment part of that (this is a good thing!). The XS extension comes | |
55 from the bioperl-ext package which is distributed along with bioperl. | |
56 I<Warning:> This package will not work if you have not compiled the | |
57 bioperl-ext package. | |
58 | |
59 The mixture of C and Perl is ideal for this sort of | |
60 problem. Here are some plus points for this strategy: | |
61 | |
62 =over 2 | |
63 | |
64 =item Speed and Memory | |
65 | |
66 The algorithm is actually implemented in C, which means it is faster than | |
67 a pure perl implementation (I have never done one, so I have no idea | |
68 how faster) and will use considerably less memory, as it efficiently | |
69 assigns memory for the calculation. | |
70 | |
71 =item Algorithm efficiency | |
72 | |
73 The algorithm was written using Dynamite, and so contains an automatic | |
74 switch to the linear space divide-and-conquer method. This means you | |
75 could effectively align very large sequences without killing your machine | |
76 (it could take a while though!). | |
77 | |
78 =back | |
79 | |
80 =head1 FEEDBACK | |
81 | |
82 =head2 Mailing Lists | |
83 | |
84 User feedback is an integral part of the evolution of this and other | |
85 Bioperl modules. Send your comments and suggestions preferably to one | |
86 of the Bioperl mailing lists. Your participation is much appreciated. | |
87 | |
88 bioperl-l@bioperl.org - General discussion | |
89 http://bioperl.org/MailList.shtml - About the mailing lists | |
90 | |
91 =head2 Reporting Bugs | |
92 | |
93 Report bugs to the Bioperl bug tracking system to help us keep track | |
94 the bugs and their resolution. Bug reports can be submitted via email | |
95 or the web: | |
96 | |
97 bioperl-bugs@bio.perl.org | |
98 http://bugzilla.bioperl.org/ | |
99 | |
100 =head1 AUTHOR | |
101 | |
102 Ewan Birney, birney@sanger.ac.uk or birney@ebi.ac.uk | |
103 | |
104 =head1 CONTRIBUTORS | |
105 | |
106 Jason Stajich, jason@bioperl.org | |
107 | |
108 =head1 APPENDIX | |
109 | |
110 The rest of the documentation details each of the object | |
111 methods. Internal methods are usually preceded with an underscore "_". | |
112 | |
113 =cut | |
114 | |
115 # Let the code begin... | |
116 | |
117 package Bio::Tools::pSW; | |
118 use vars qw(@ISA); | |
119 use strict; | |
120 no strict ( 'refs'); | |
121 | |
122 BEGIN { | |
123 eval { | |
124 require Bio::Ext::Align; | |
125 }; | |
126 if ( $@ ) { | |
127 die("\nThe C-compiled engine for Smith Waterman alignments (Bio::Ext::Align) has not been installed.\n Please read the install the bioperl-ext package\n\n"); | |
128 exit(1); | |
129 } | |
130 } | |
131 | |
132 use Bio::Tools::AlignFactory; | |
133 use Bio::SimpleAlign; | |
134 | |
135 | |
136 @ISA = qw(Bio::Tools::AlignFactory); | |
137 | |
138 | |
139 | |
140 sub new { | |
141 my($class,@args) = @_; | |
142 | |
143 my $self = $class->SUPER::new(@args); | |
144 | |
145 my($matrix,$gap,$ext) = $self->_rearrange([qw(MATRIX | |
146 GAP | |
147 EXT | |
148 )],@args); | |
149 | |
150 #default values - we have to load matrix into memory, so | |
151 # we need to check it out now | |
152 if( ! defined $matrix || !($matrix =~ /\w/) ) { | |
153 $matrix = 'blosum62.bla'; | |
154 } | |
155 | |
156 $self->matrix($matrix); # will throw exception if it can't load it | |
157 $self->gap(12) unless defined $gap; | |
158 $self->ext(2) unless defined $ext; | |
159 | |
160 # I'm pretty sure I am not doing this right... ho hum... | |
161 # This was not roght ($gap and $ext could not be 0) It is fixed now /AE | |
162 if( defined $gap ) { | |
163 if( $gap =~ /^\d+$/ ) { | |
164 $self->gap($gap); | |
165 } else { | |
166 $self->throw("Gap penalty must be a number, not [$gap]"); | |
167 } | |
168 } | |
169 if( defined $ext ) { | |
170 if( $ext =~ /^\d+$/ ) { | |
171 $self->ext($ext); | |
172 } else { | |
173 $self->throw("Extension penalty must be a number, not [$ext]"); | |
174 } | |
175 } | |
176 | |
177 return $self; | |
178 } | |
179 | |
180 | |
181 =head2 pairwise_alignment | |
182 | |
183 Title : pairwise_alignment | |
184 Usage : $aln = $factory->pairwise_alignment($seq1,$seq2) | |
185 Function: Makes a SimpleAlign object from two sequences | |
186 Returns : A SimpleAlign object | |
187 Args : | |
188 | |
189 | |
190 =cut | |
191 | |
192 sub pairwise_alignment{ | |
193 my ($self,$seq1,$seq2) = @_; | |
194 my($t1,$t2,$aln,$out,@str1,@str2,@ostr1,@ostr2,$alc,$tstr,$tid,$start1,$end1,$start2,$end2,$alctemp); | |
195 | |
196 if( ! defined $seq1 || ! $seq1->isa('Bio::PrimarySeqI') || | |
197 ! defined $seq2 || ! $seq2->isa('Bio::PrimarySeqI') ) { | |
198 $self->warn("Cannot call pairwise_alignment without specifing 2 sequences (Bio::PrimarySeqI objects)"); | |
199 return undef; | |
200 } | |
201 # fix Jitterbug #1044 | |
202 if( $seq1->length() < 2 || | |
203 $seq2->length() < 2 ) { | |
204 $self->warn("cannot align sequences with length less than 2"); | |
205 return undef; | |
206 } | |
207 $self->set_memory_and_report(); | |
208 # create engine objects | |
209 $seq1->display_id('seq1') unless ( defined $seq1->id() ); | |
210 $seq2->display_id('seq2') unless ( defined $seq2->id() ); | |
211 | |
212 $t1 = &Bio::Ext::Align::new_Sequence_from_strings($seq1->id(), | |
213 $seq1->seq()); | |
214 $t2 = &Bio::Ext::Align::new_Sequence_from_strings($seq2->id(), | |
215 $seq2->seq()); | |
216 $aln = &Bio::Ext::Align::Align_Sequences_ProteinSmithWaterman($t1,$t2,$self->{'matrix'},-$self->gap,-$self->ext); | |
217 if( ! defined $aln || $aln == 0 ) { | |
218 $self->throw("Unable to build an alignment"); | |
219 } | |
220 | |
221 # free sequence engine objects | |
222 | |
223 $t1 = $t2 = 0; | |
224 | |
225 # now we have to get into the AlnBlock structure and | |
226 # figure out what is aligned to what... | |
227 | |
228 # we are going to need the sequences as arrays for convience | |
229 | |
230 @str1 = split(//, $seq1->seq()); | |
231 @str2 = split(//, $seq2->seq()); | |
232 | |
233 # get out start points | |
234 | |
235 # The alignment is in alignment coordinates - ie the first | |
236 # residues starts at -1 and ends at 0. (weird I know). | |
237 # bio-coordinates are +2 from this... | |
238 | |
239 $start1 = $aln->start()->alu(0)->start +2; | |
240 $start2 = $aln->start()->alu(1)->start +2; | |
241 | |
242 # step along the linked list of alc units... | |
243 | |
244 for($alc = $aln->start();$alc->at_end() != 1;$alc = $alc->next()) { | |
245 if( $alc->alu(0)->text_label eq 'SEQUENCE' ) { | |
246 push(@ostr1,$str1[$alc->alu(0)->start+1]); | |
247 } else { | |
248 # assumme it is in insert! | |
249 push(@ostr1,'-'); | |
250 } | |
251 | |
252 if( $alc->alu(1)->text_label eq 'SEQUENCE' ) { | |
253 push(@ostr2,$str2[$alc->alu(1)->start+1]); | |
254 } else { | |
255 # assumme it is in insert! | |
256 push(@ostr2,'-'); | |
257 } | |
258 $alctemp = $alc; | |
259 } | |
260 | |
261 # | |
262 # get out end points | |
263 # | |
264 | |
265 # end points = real residue end in 'C' coordinates = residue | |
266 # end in biocoordinates. Oh... the wonder of coordinate systems! | |
267 | |
268 $end1 = $alctemp->alu(0)->end+1; | |
269 $end2 = $alctemp->alu(1)->end+1; | |
270 | |
271 # get rid of the alnblock | |
272 $alc = 0; | |
273 $aln = 0; | |
274 | |
275 # new SimpleAlignment | |
276 $out = Bio::SimpleAlign->new(); # new SimpleAlignment | |
277 | |
278 $tstr = join('',@ostr1); | |
279 $tid = $seq1->id(); | |
280 $out->add_seq(Bio::LocatableSeq->new( -seq=> $tstr, | |
281 -start => $start1, | |
282 -end => $end1, | |
283 -id=>$tid )); | |
284 | |
285 $tstr = join('',@ostr2); | |
286 $tid = $seq2->id(); | |
287 $out->add_seq(Bio::LocatableSeq->new( -seq=> $tstr, | |
288 -start => $start2, | |
289 -end => $end2, | |
290 -id=> $tid )); | |
291 | |
292 # give'm back the alignment | |
293 | |
294 return $out; | |
295 } | |
296 | |
297 =head2 align_and_show | |
298 | |
299 Title : align_and_show | |
300 Usage : $factory->align_and_show($seq1,$seq2,STDOUT) | |
301 | |
302 =cut | |
303 | |
304 sub align_and_show { | |
305 my($self,$seq1,$seq2,$fh) = @_; | |
306 my($t1,$t2,$aln,$id,$str); | |
307 | |
308 if( ! defined $seq1 || ! $seq1->isa('Bio::PrimarySeqI') || | |
309 ! defined $seq2 || ! $seq2->isa('Bio::PrimarySeqI') ) { | |
310 $self->warn("Cannot call align_and_show without specifing 2 sequences (Bio::PrimarySeqI objects)"); | |
311 return undef; | |
312 } | |
313 # fix Jitterbug #1044 | |
314 if( $seq1->length() < 2 || | |
315 $seq2->length() < 2 ) { | |
316 $self->warn("cannot align sequences with length less than 2"); | |
317 return undef; | |
318 } | |
319 if( ! defined $fh ) { | |
320 $fh = \*STDOUT; | |
321 } | |
322 $self->set_memory_and_report(); | |
323 $seq1->display_id('seq1') unless ( defined $seq1->id() ); | |
324 $seq2->display_id('seq2') unless ( defined $seq2->id() ); | |
325 | |
326 $t1 = &Bio::Ext::Align::new_Sequence_from_strings($seq1->id(),$seq1->seq()); | |
327 | |
328 $t2 = &Bio::Ext::Align::new_Sequence_from_strings($seq2->id(),$seq2->seq()); | |
329 $aln = &Bio::Ext::Align::Align_Sequences_ProteinSmithWaterman($t1,$t2,$self->{'matrix'},-$self->gap,-$self->ext); | |
330 if( ! defined $aln || $aln == 0 ) { | |
331 $self->throw("Unable to build an alignment"); | |
332 } | |
333 | |
334 &Bio::Ext::Align::write_pretty_seq_align($aln,$t1,$t2,12,50,$fh); | |
335 | |
336 } | |
337 | |
338 =head2 matrix | |
339 | |
340 Title : matrix() | |
341 Usage : $factory->matrix('blosum62.bla'); | |
342 Function : Reads in comparison matrix based on name | |
343 : | |
344 Returns : | |
345 Argument : comparison matrix | |
346 | |
347 =cut | |
348 | |
349 sub matrix { | |
350 my($self,$comp) = @_; | |
351 my $temp; | |
352 | |
353 if( !defined $comp ) { | |
354 $self->throw("You must have a comparison matrix to set!"); | |
355 } | |
356 | |
357 # talking to the engine here... | |
358 | |
359 $temp = &Bio::Ext::Align::CompMat::read_Blast_file_CompMat($comp); | |
360 | |
361 if( !(defined $temp) || $temp == 0 ) { | |
362 $self->throw("$comp cannot be read as a BLAST comparison matrix file"); | |
363 } | |
364 | |
365 $self->{'matrix'} = $temp; | |
366 } | |
367 | |
368 | |
369 | |
370 =head2 gap | |
371 | |
372 Title : gap | |
373 Usage : $gap = $factory->gap() #get | |
374 : $factory->gap($value) #set | |
375 Function : the set get for the gap penalty | |
376 Example : | |
377 Returns : gap value | |
378 Arguments : new value | |
379 | |
380 =cut | |
381 | |
382 sub gap { | |
383 my ($self,$val) = @_; | |
384 | |
385 | |
386 if( defined $val ) { | |
387 if( $val < 0 ) { # Fixed so that gap==0 is allowed /AE | |
388 $self->throw("Can't have a gap penalty less than 0"); | |
389 } | |
390 $self->{'gap'} = $val; | |
391 } | |
392 return $self->{'gap'}; | |
393 } | |
394 | |
395 | |
396 =head2 ext | |
397 | |
398 Title : ext | |
399 Usage : $ext = $factory->ext() #get | |
400 : $factory->ext($value) #set | |
401 Function : the set get for the ext penalty | |
402 Example : | |
403 Returns : ext value | |
404 Arguments : new value | |
405 | |
406 =cut | |
407 | |
408 sub ext { | |
409 my ($self,$val) = @_; | |
410 | |
411 if( defined $val ) { | |
412 if( $val < 0 ) { # Fixed so that gap==0 is allowed /AE | |
413 $self->throw("Can't have a gap penalty less than 0"); | |
414 } | |
415 $self->{'ext'} = $val; | |
416 } | |
417 return $self->{'ext'}; | |
418 } | |
419 | |
420 | |
421 |