diff variant_effect_predictor/Bio/AlignIO/emboss.pm @ 0:2bc9b66ada89 draft default tip

Uploaded
author mahtabm
date Thu, 11 Apr 2013 06:29:17 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/AlignIO/emboss.pm	Thu Apr 11 06:29:17 2013 -0400
@@ -0,0 +1,251 @@
+# $Id: emboss.pm,v 1.11 2002/10/22 07:45:10 lapp Exp $
+#
+# BioPerl module for Bio::AlignIO::emboss
+#
+# Cared for by Jason Stajich <jason@bioperl.org>
+#
+# Copyright Jason Stajich
+#
+# You may distribute this module under the same terms as perl itself
+
+# POD documentation - main docs before the code
+
+=head1 NAME
+
+Bio::AlignIO::emboss - Parse EMBOSS alignment output (from applications water and needle)
+
+=head1 SYNOPSIS
+
+    # do not use the object directly
+    use Bio::AlignIO;
+    # read in an alignment from the EMBOSS program water
+    my $in = new Bio::AlignIO(-format => 'emboss',
+			      -file   => 'seq.water');
+    while( my $aln = $in->next_aln ) {
+	# do something with the alignment
+    }
+
+=head1 DESCRIPTION
+
+This object handles parsing and writing pairwise sequence alignments
+from the EMBOSS suite.
+
+=head1 FEEDBACK
+
+=head2 Mailing Lists
+
+User feedback is an integral part of the evolution of this and other
+Bioperl modules. Send your comments and suggestions preferably to
+the Bioperl mailing list.  Your participation is much appreciated.
+
+  bioperl-l@bioperl.org              - General discussion
+  http://bioperl.org/MailList.shtml  - About the mailing lists
+
+=head2 Reporting Bugs
+
+Report bugs to the Bioperl bug tracking system to help us keep track
+of the bugs and their resolution. Bug reports can be submitted via
+email or the web:
+
+  bioperl-bugs@bioperl.org
+  http://bugzilla.bioperl.org/
+
+=head1 AUTHOR - Jason Stajich
+
+Email jason@bioperl.org
+
+Describe contact details here
+
+=head1 CONTRIBUTORS
+
+Additional contributors names and emails here
+
+=head1 APPENDIX
+
+The rest of the documentation details each of the object methods.
+Internal methods are usually preceded with a _
+
+=cut
+
+
+# Let the code begin...
+
+
+package Bio::AlignIO::emboss;
+use vars qw(@ISA $EMBOSSTitleLen $EMBOSSLineLen);
+use strict;
+
+use Bio::AlignIO;
+use Bio::LocatableSeq;
+
+@ISA = qw(Bio::AlignIO );
+
+BEGIN { 
+    $EMBOSSTitleLen    = 13;
+    $EMBOSSLineLen     = 50;
+}
+
+sub _initialize {
+    my($self,@args) = @_;
+    $self->SUPER::_initialize(@args);
+    $self->{'_type'} = undef;
+}
+
+=head2 next_aln
+
+ Title   : next_aln
+ Usage   : $aln = $stream->next_aln()
+ Function: returns the next alignment in the stream.
+ Returns : L<Bio::Align::AlignI> object - returns 0 on end of file
+	    or on error
+ Args    : NONE
+
+=cut
+
+sub next_aln {
+    my ($self) = @_;
+    my $seenbegin = 0;
+    my %data = ( 'seq1' => { 
+		     'start'=> undef,
+		     'end'=> undef,		
+		     'name' => '',
+		     'data' => '' },
+		 'seq2' => { 
+		     'start'=> undef,
+		     'end'=> undef,
+		     'name' => '',
+		     'data' => '' },
+		 'align' => '',
+		 'type'  => $self->{'_type'},  # to restore type from 
+		                                     # previous aln if possible
+		 );
+    my %names;
+    while( defined($_ = $self->_readline) ) {
+	next if( /^\#?\s+$/ || /^\#+\s*$/ );
+	if( /^\#(\=|\-)+\s*$/) {
+	    last if( $seenbegin);
+	} elsif( /(Local|Global):\s*(\S+)\s+vs\s+(\S+)/ ||
+		 /^\#\s+Program:\s+(\S+)/ )
+	{
+	    my ($name1,$name2) = ($2,$3);
+	    if( ! defined $name1 ) { # Handle EMBOSS 2.2.X
+		$data{'type'} = $1;
+		$name1 = $name2 = '';
+	    } else { 
+		$data{'type'} = $1 eq 'Local' ? 'water' : 'needle';
+	    }	    
+	    $data{'seq1'}->{'name'} = $name1;
+	    $data{'seq2'}->{'name'} = $name2;
+
+	    $self->{'_type'} = $data{'type'};
+
+	} elsif( /Score:\s+(\S+)/ ) {
+	    $data{'score'} = $1;		
+	} elsif( /^\#\s+(1|2):\s+(\S+)/ && !  $data{"seq$1"}->{'name'} ) {
+	    my $nm = $2;
+	    $nm = substr($nm,0,$EMBOSSTitleLen); # emboss has a max seq length
+	    if( $names{$nm} ) {
+		$nm .= "-". $names{$nm};
+	    }
+	    $names{$nm}++;
+	    $data{"seq$1"}->{'name'} = $nm;	
+	} elsif( $data{'seq1'}->{'name'} &&
+		 /^$data{'seq1'}->{'name'}/ ) {	    
+	    my $count = 0;
+	    $seenbegin = 1;
+	    my @current;
+	    while( defined ($_) ) {
+		my $align_other = '';
+		my $delayed;		
+		if($count == 0 || $count == 2 ) {
+		    my @l = split;
+		    my ($seq,$align,$start,$end);
+		    if( $count == 2 && $data{'seq2'}->{'name'} eq '' ) {
+			# weird boundary condition 
+			($start,$align,$end) = @l;
+		    } elsif( @l == 3 ) {
+			$align = '';
+			($seq,$start,$end) = @l
+		    } else { 
+			($seq,$start,$align,$end) = @l;
+ 		    }
+
+		    my $seqname = sprintf("seq%d", ($count == 0) ? '1' : '2'); 
+		    $data{$seqname}->{'data'} .= $align;
+		    $data{$seqname}->{'start'} ||= $start;
+		    $data{$seqname}->{'end'} = $end;
+		    $current[$count] = [ $start,$align || ''];
+		} else { 
+		    s/^\s+//;
+		    s/\s+$//;
+		    $data{'align'} .= $_;
+		}
+
+	      BOTTOM:
+		last if( $count++ == 2);
+		$_ = $self->_readline();
+	    }
+
+	    if( $data{'type'} eq 'needle' ) {
+		# which ever one is shorter we want to bring it up to 
+		# length.  Man this stinks.
+		my ($s1,$s2) =  ($data{'seq1'}, $data{'seq2'});
+		
+		my $d = length($current[0]->[1]) - length($current[2]->[1]);
+		if( $d < 0 ) { # s1 is smaller, need to add some
+		    # compare the starting points for this alignment line
+		    if( $current[0]->[0] <= 1 && $current[2]->[0] > 1) { 
+			$s1->{'data'} = ('-' x abs($d)) . $s1->{'data'};
+			$data{'align'} = (' 'x abs($d)).$data{'align'};
+		    } else { 
+			$s1->{'data'} .= '-' x abs($d);
+			$data{'align'} .= ' 'x abs($d);
+		    }
+		} elsif( $d > 0) { # s2 is smaller, need to add some  
+		    if( $current[2]->[0] <= 1 && $current[0]->[0] > 1) { 
+			$s2->{'data'} = ('-' x abs($d)) . $s2->{'data'};
+			$data{'align'} = (' 'x abs($d)).$data{'align'};
+		    } else { 
+			$s2->{'data'} .= '-' x abs($d);
+			$data{'align'} .= ' 'x abs($d);
+		    }
+		}
+	    }
+	    
+	}
+    }
+    return undef unless $seenbegin;
+    my $aln =  Bio::SimpleAlign->new(-verbose => $self->verbose(),
+				     -source => "EMBOSS-".$data{'type'});
+    
+    foreach my $seqname ( qw(seq1 seq2) ) { 
+	return undef unless ( defined $data{$seqname} );	
+	$data{$seqname}->{'name'} ||= $seqname;
+	my $seq = new Bio::LocatableSeq('-seq' => $data{$seqname}->{'data'},
+					'-id'  => $data{$seqname}->{'name'},
+					'-start'=> $data{$seqname}->{'start'},
+					'-end' => $data{$seqname}->{'end'},
+					);
+	$aln->add_seq($seq);
+    }
+    return $aln;
+}
+
+=head2 write_aln
+
+ Title   : write_aln
+ Usage   : $stream->write_aln(@aln)
+ Function: writes the $aln object into the stream in emboss format
+ Returns : 1 for success and 0 for error
+ Args    : L<Bio::Align::AlignI> object
+
+
+=cut
+
+sub write_aln {
+    my ($self,@aln) = @_;
+
+    $self->throw("Sorry: writing emboss output is not currently available! \n");
+}
+
+1;