Mercurial > repos > mahtabm > ensemb_rep_gvl
diff variant_effect_predictor/Bio/Tools/pSW.pm @ 0:2bc9b66ada89 draft default tip
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 06:29:17 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/pSW.pm Thu Apr 11 06:29:17 2013 -0400 @@ -0,0 +1,421 @@ +## $Id: pSW.pm,v 1.21 2002/10/22 07:45:22 lapp Exp $ + +# +# BioPerl module for Bio::Tools::pSW +# +# Cared for by Ewan Birney <birney@sanger.ac.uk> +# +# Copyright Ewan Birney +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::pSW - pairwise Smith Waterman object + +=head1 SYNOPSIS + + use Bio::Tools::pSW; + use Bio::AlignIO; + my $factory = new Bio::Tools::pSW( '-matrix' => 'blosum62.bla', + '-gap' => 12, + '-ext' => 2, + ); + + #use the factory to make some output + + $factory->align_and_show($seq1,$seq2,STDOUT); + + # make a Bio::SimpleAlign and do something with it + + my $aln = $factory->pairwise_alignment($seq1,$seq2); + my $alnout = new Bio::AlignIO(-format => 'msf', + -fh => \*STDOUT); + + $alnout->write_aln($aln); + +=head1 INSTALLATION + +This module is included with the central Bioperl distribution: + + http://bio.perl.org/Core/Latest + ftp://bio.perl.org/pub/DIST + +Follow the installation instructions included in the INSTALL file. + +=head1 DESCRIPTION + +pSW is an Alignment Factory for protein sequences. It builds pairwise +alignments using the Smith-Waterman algorithm. The alignment algorithm is +implemented in C and added in using an XS extension. The XS extension basically +comes from the Wise2 package, but has been slimmed down to only be the +alignment part of that (this is a good thing!). The XS extension comes +from the bioperl-ext package which is distributed along with bioperl. +I<Warning:> This package will not work if you have not compiled the +bioperl-ext package. + +The mixture of C and Perl is ideal for this sort of +problem. Here are some plus points for this strategy: + +=over 2 + +=item Speed and Memory + +The algorithm is actually implemented in C, which means it is faster than +a pure perl implementation (I have never done one, so I have no idea +how faster) and will use considerably less memory, as it efficiently +assigns memory for the calculation. + +=item Algorithm efficiency + +The algorithm was written using Dynamite, and so contains an automatic +switch to the linear space divide-and-conquer method. This means you +could effectively align very large sequences without killing your machine +(it could take a while though!). + +=back + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Ewan Birney, birney@sanger.ac.uk or birney@ebi.ac.uk + +=head1 CONTRIBUTORS + +Jason Stajich, jason@bioperl.org + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with an underscore "_". + +=cut + +# Let the code begin... + +package Bio::Tools::pSW; +use vars qw(@ISA); +use strict; +no strict ( 'refs'); + +BEGIN { + eval { + require Bio::Ext::Align; + }; + if ( $@ ) { + die("\nThe C-compiled engine for Smith Waterman alignments (Bio::Ext::Align) has not been installed.\n Please read the install the bioperl-ext package\n\n"); + exit(1); + } +} + +use Bio::Tools::AlignFactory; +use Bio::SimpleAlign; + + +@ISA = qw(Bio::Tools::AlignFactory); + + + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + + my($matrix,$gap,$ext) = $self->_rearrange([qw(MATRIX + GAP + EXT + )],@args); + + #default values - we have to load matrix into memory, so + # we need to check it out now + if( ! defined $matrix || !($matrix =~ /\w/) ) { + $matrix = 'blosum62.bla'; + } + + $self->matrix($matrix); # will throw exception if it can't load it + $self->gap(12) unless defined $gap; + $self->ext(2) unless defined $ext; + + # I'm pretty sure I am not doing this right... ho hum... + # This was not roght ($gap and $ext could not be 0) It is fixed now /AE + if( defined $gap ) { + if( $gap =~ /^\d+$/ ) { + $self->gap($gap); + } else { + $self->throw("Gap penalty must be a number, not [$gap]"); + } + } + if( defined $ext ) { + if( $ext =~ /^\d+$/ ) { + $self->ext($ext); + } else { + $self->throw("Extension penalty must be a number, not [$ext]"); + } + } + + return $self; +} + + +=head2 pairwise_alignment + + Title : pairwise_alignment + Usage : $aln = $factory->pairwise_alignment($seq1,$seq2) + Function: Makes a SimpleAlign object from two sequences + Returns : A SimpleAlign object + Args : + + +=cut + +sub pairwise_alignment{ + my ($self,$seq1,$seq2) = @_; + my($t1,$t2,$aln,$out,@str1,@str2,@ostr1,@ostr2,$alc,$tstr,$tid,$start1,$end1,$start2,$end2,$alctemp); + + if( ! defined $seq1 || ! $seq1->isa('Bio::PrimarySeqI') || + ! defined $seq2 || ! $seq2->isa('Bio::PrimarySeqI') ) { + $self->warn("Cannot call pairwise_alignment without specifing 2 sequences (Bio::PrimarySeqI objects)"); + return undef; + } + # fix Jitterbug #1044 + if( $seq1->length() < 2 || + $seq2->length() < 2 ) { + $self->warn("cannot align sequences with length less than 2"); + return undef; + } + $self->set_memory_and_report(); + # create engine objects + $seq1->display_id('seq1') unless ( defined $seq1->id() ); + $seq2->display_id('seq2') unless ( defined $seq2->id() ); + + $t1 = &Bio::Ext::Align::new_Sequence_from_strings($seq1->id(), + $seq1->seq()); + $t2 = &Bio::Ext::Align::new_Sequence_from_strings($seq2->id(), + $seq2->seq()); + $aln = &Bio::Ext::Align::Align_Sequences_ProteinSmithWaterman($t1,$t2,$self->{'matrix'},-$self->gap,-$self->ext); + if( ! defined $aln || $aln == 0 ) { + $self->throw("Unable to build an alignment"); + } + + # free sequence engine objects + + $t1 = $t2 = 0; + + # now we have to get into the AlnBlock structure and + # figure out what is aligned to what... + + # we are going to need the sequences as arrays for convience + + @str1 = split(//, $seq1->seq()); + @str2 = split(//, $seq2->seq()); + + # get out start points + + # The alignment is in alignment coordinates - ie the first + # residues starts at -1 and ends at 0. (weird I know). + # bio-coordinates are +2 from this... + + $start1 = $aln->start()->alu(0)->start +2; + $start2 = $aln->start()->alu(1)->start +2; + + # step along the linked list of alc units... + + for($alc = $aln->start();$alc->at_end() != 1;$alc = $alc->next()) { + if( $alc->alu(0)->text_label eq 'SEQUENCE' ) { + push(@ostr1,$str1[$alc->alu(0)->start+1]); + } else { + # assumme it is in insert! + push(@ostr1,'-'); + } + + if( $alc->alu(1)->text_label eq 'SEQUENCE' ) { + push(@ostr2,$str2[$alc->alu(1)->start+1]); + } else { + # assumme it is in insert! + push(@ostr2,'-'); + } + $alctemp = $alc; + } + + # + # get out end points + # + + # end points = real residue end in 'C' coordinates = residue + # end in biocoordinates. Oh... the wonder of coordinate systems! + + $end1 = $alctemp->alu(0)->end+1; + $end2 = $alctemp->alu(1)->end+1; + + # get rid of the alnblock + $alc = 0; + $aln = 0; + + # new SimpleAlignment + $out = Bio::SimpleAlign->new(); # new SimpleAlignment + + $tstr = join('',@ostr1); + $tid = $seq1->id(); + $out->add_seq(Bio::LocatableSeq->new( -seq=> $tstr, + -start => $start1, + -end => $end1, + -id=>$tid )); + + $tstr = join('',@ostr2); + $tid = $seq2->id(); + $out->add_seq(Bio::LocatableSeq->new( -seq=> $tstr, + -start => $start2, + -end => $end2, + -id=> $tid )); + + # give'm back the alignment + + return $out; +} + +=head2 align_and_show + + Title : align_and_show + Usage : $factory->align_and_show($seq1,$seq2,STDOUT) + +=cut + +sub align_and_show { + my($self,$seq1,$seq2,$fh) = @_; + my($t1,$t2,$aln,$id,$str); + +if( ! defined $seq1 || ! $seq1->isa('Bio::PrimarySeqI') || + ! defined $seq2 || ! $seq2->isa('Bio::PrimarySeqI') ) { + $self->warn("Cannot call align_and_show without specifing 2 sequences (Bio::PrimarySeqI objects)"); + return undef; + } + # fix Jitterbug #1044 + if( $seq1->length() < 2 || + $seq2->length() < 2 ) { + $self->warn("cannot align sequences with length less than 2"); + return undef; + } + if( ! defined $fh ) { + $fh = \*STDOUT; + } + $self->set_memory_and_report(); + $seq1->display_id('seq1') unless ( defined $seq1->id() ); + $seq2->display_id('seq2') unless ( defined $seq2->id() ); + + $t1 = &Bio::Ext::Align::new_Sequence_from_strings($seq1->id(),$seq1->seq()); + + $t2 = &Bio::Ext::Align::new_Sequence_from_strings($seq2->id(),$seq2->seq()); + $aln = &Bio::Ext::Align::Align_Sequences_ProteinSmithWaterman($t1,$t2,$self->{'matrix'},-$self->gap,-$self->ext); + if( ! defined $aln || $aln == 0 ) { + $self->throw("Unable to build an alignment"); + } + + &Bio::Ext::Align::write_pretty_seq_align($aln,$t1,$t2,12,50,$fh); + +} + +=head2 matrix + + Title : matrix() + Usage : $factory->matrix('blosum62.bla'); + Function : Reads in comparison matrix based on name + : + Returns : + Argument : comparison matrix + +=cut + +sub matrix { + my($self,$comp) = @_; + my $temp; + + if( !defined $comp ) { + $self->throw("You must have a comparison matrix to set!"); + } + + # talking to the engine here... + + $temp = &Bio::Ext::Align::CompMat::read_Blast_file_CompMat($comp); + + if( !(defined $temp) || $temp == 0 ) { + $self->throw("$comp cannot be read as a BLAST comparison matrix file"); + } + + $self->{'matrix'} = $temp; +} + + + +=head2 gap + + Title : gap + Usage : $gap = $factory->gap() #get + : $factory->gap($value) #set + Function : the set get for the gap penalty + Example : + Returns : gap value + Arguments : new value + +=cut + +sub gap { + my ($self,$val) = @_; + + + if( defined $val ) { + if( $val < 0 ) { # Fixed so that gap==0 is allowed /AE + $self->throw("Can't have a gap penalty less than 0"); + } + $self->{'gap'} = $val; + } + return $self->{'gap'}; +} + + +=head2 ext + + Title : ext + Usage : $ext = $factory->ext() #get + : $factory->ext($value) #set + Function : the set get for the ext penalty + Example : + Returns : ext value + Arguments : new value + +=cut + +sub ext { + my ($self,$val) = @_; + + if( defined $val ) { + if( $val < 0 ) { # Fixed so that gap==0 is allowed /AE + $self->throw("Can't have a gap penalty less than 0"); + } + $self->{'ext'} = $val; + } + return $self->{'ext'}; +} + + +