view variant_effect_predictor/Bio/AlignIO/phylip.pm @ 3:d30fa12e4cc5 default tip

Merge heads 2:a5976b2dce6f and 1:09613ce8151e which were created as a result of a recently fixed bug.
author devteam <devteam@galaxyproject.org>
date Mon, 13 Jan 2014 10:38:30 -0500
parents 1f6dce3d34e0
children
line wrap: on
line source

# $Id: phylip.pm,v 1.24.2.1 2003/01/26 15:52:30 jason Exp $
#
# BioPerl module for Bio::AlignIO::phylip
#
# Copyright Heikki Lehvaslaiho
#

=head1 NAME

Bio::AlignIO::phylip - PHYLIP format sequence input/output stream

=head1 SYNOPSIS

# Do not use this module directly.  Use it via the Bio::AlignIO class.

    use Bio::AlignIO;
    use Bio::SimpleAlign;
	#you can set the name length to something other than the default 10
	#if you use a version of phylip (hacked) that accepts ids > 10
    my $phylipstream = new Bio::AlignIO(-format => 'phylip',
					-fh   => \*STDOUT,
					-idlength=>30);
    # convert data from one format to another
    my $gcgstream     =  new Bio::AlignIO(-format => 'msf',
					  -file   => 't/data/cysprot1a.msf');

    while( my $aln = $gcgstream->next_aln ) {
	$phylipstream->write_aln($aln);
    }

    # do it again with phylip sequential format format 
    $phylipstream->interleaved(0);
    # can also initialize the object like this
    $phylipstream = new Bio::AlignIO(-interleaved => 0,
				     -format => 'phylip',
				     -fh   => \*STDOUT,
				     -idlength=>10);
    $gcgstream     =  new Bio::AlignIO(-format => 'msf',
				       -file   => 't/data/cysprot1a.msf');    

    while( my $aln = $gcgstream->next_aln ) {
	$phylipstream->write_aln($aln);
    }

=head1 DESCRIPTION

This object can transform Bio::SimpleAlign objects to and from PHYLIP
interleaved format. It will not work with PHYLIP sequencial format.

This module will output PHYLIP sequential format.  By specifying the
flag -interleaved =E<gt> 0 in the initialization the module can output
data in interleaved format.

=head1 FEEDBACK

=head2 Reporting Bugs

Report bugs to the Bioperl bug tracking system to help us keep track
 the bugs and their resolution.
 Bug reports can be submitted via email or the web:

  bioperl-bugs@bio.perl.org
  http://bugzilla.bioperl.org/

=head1 AUTHORS - Heikki Lehvaslaiho and Jason Stajich

Email: heikki@ebi.ac.uk
Email: jason@bioperl.org

=head1 APPENDIX

The rest of the documentation details each of the object
methods. Internal methods are usually preceded with a _

=cut

# Let the code begin...

package Bio::AlignIO::phylip;
use vars qw(@ISA $DEFAULTIDLENGTH $DEFAULTLINELEN);
use strict;

use Bio::SimpleAlign;
use Bio::AlignIO;

@ISA = qw(Bio::AlignIO);

BEGIN { 
    $DEFAULTIDLENGTH = 10;
    $DEFAULTLINELEN = 60;
}

=head2 new

 Title   : new
 Usage   : my $alignio = new Bio::AlignIO(-format => 'phylip'
					  -file   => '>file',
					  -idlength => 10,
					  -idlinebreak => 1);
 Function: Initialize a new L<Bio::AlignIO::phylip> reader or writer
 Returns : L<Bio::AlignIO> object
 Args    : [specific for writing of phylip format files]
           -idlength => integer - length of the id (will pad w/ 
						    spaces if needed) 
           -interleaved => boolean - whether or not write as interleaved 
                                     or sequential format
           -linelength  => integer of how long a sequence lines should be 
           -idlinebreak => insert a line break after the sequence id
                           so that sequence starts on the next line 

=cut

sub _initialize {
  my($self,@args) = @_;
  $self->SUPER::_initialize(@args);

  my ($interleave,$linelen,$idlinebreak,
      $idlength) = $self->_rearrange([qw(INTERLEAVED 
					 LINELENGTH
					 IDLINEBREAK
					 IDLENGTH)],@args);
  $self->interleaved(1) if( $interleave || ! defined $interleave);
  $self->idlength($idlength || $DEFAULTIDLENGTH);
  $self->id_linebreak(1) if( $idlinebreak );
  $self->line_length($linelen) if defined $linelen && $linelen > 0;
  1;
}

=head2 next_aln

 Title   : next_aln
 Usage   : $aln = $stream->next_aln()
 Function: returns the next alignment in the stream.
           Throws an exception if trying to read in PHYLIP
           sequential format.
 Returns : L<Bio::SimpleAlign> object
 Args    : 

=cut

sub next_aln {
    my $self = shift;
    my $entry;
    my ($seqcount, $residuecount, %hash, $name,$str,
	@names,$seqname,$start,$end,$count,$seq);
    
    my $aln =  Bio::SimpleAlign->new(-source => 'phylip');
    $entry = $self->_readline and 
        ($seqcount, $residuecount) = $entry =~ /\s*(\d+)\s+(\d+)/;
    return 0 unless $seqcount and $residuecount;
    
    # first alignment section
    my $idlen = $self->idlength;
    $count = 0;
    my $non_interleaved = ! $self->interleaved ;
    while( $entry = $self->_readline) {
	last if( $entry =~ /^\s?$/ && ! $non_interleaved );

	if( $entry =~ /^\s+(.+)$/ ) {
	    $str = $1;
	    $non_interleaved = 1;
	    $str =~ s/\s//g;
	    $count = scalar @names;
	    $hash{$count} .= $str;
	} elsif( $entry =~ /^(.{$idlen})\s+(.*)\s$/ ) {
	    $name = $1;
	    $str = $2;
	    $name =~ s/[\s\/]/_/g;
	    $name =~ s/_+$//; # remove any trailing _'s
	    push @names, $name;
	    $str =~ s/\s//g;
	    $count = scalar @names;
	    $hash{$count} = $str;
	} 
	$self->throw("Not a valid interleaved PHYLIP file!") if $count > $seqcount; 
    }
    
    unless( $non_interleaved ) {    
	# interleaved sections
	$count = 0;
	while( $entry = $self->_readline) {
	    # finish current entry
	    if($entry =~/\s*\d+\s+\d+/){
		$self->_pushback($entry);
		last;
	    }
	    $count = 0, next if $entry =~ /^\s$/;
	    
	    $entry =~ /\s*(.*)$/ && do {
		$str = $1;
		$str =~ s/\s//g;
		$count++;
		$hash{$count} .= $str;
	    };
	    $self->throw("Not a valid interleaved PHYLIP file!") if $count > $seqcount; 
	}
    }
    return 0 if scalar @names < 1;
    
    # sequence creation
    $count = 0;
    foreach $name ( @names ) {
	$count++;
	if( $name =~ /(\S+)\/(\d+)-(\d+)/ ) {
	    $seqname = $1;
	    $start = $2;
	    $end = $3;
	} else {
	    $seqname=$name;
	    $start = 1;
	    $str = $hash{$count};
	    $str =~ s/[^A-Za-z]//g;
	    $end = length($str);
	}
	# consistency test
	$self->throw("Length of sequence [$seqname] is not [$residuecount]! ") 
	    unless CORE::length($hash{$count}) == $residuecount; 

       $seq = new Bio::LocatableSeq('-seq'=>$hash{$count},
				    '-id'=>$seqname,
				    '-start'=>$start,
				    '-end'=>$end,
				    );

       $aln->add_seq($seq);

   }
   return $aln;
}


=head2 write_aln

 Title   : write_aln
 Usage   : $stream->write_aln(@aln)
 Function: writes the $aln object into the stream in MSF format
 Returns : 1 for success and 0 for error
 Args    : L<Bio::Align::AlignI> object

=cut

sub write_aln {
    my ($self,@aln) = @_;
    my $count = 0;
    my $wrapped = 0;
    my $maxname;
    my ($length,$date,$name,$seq,$miss,$pad,
	%hash,@arr,$tempcount,$index,$idlength);
    
    foreach my $aln (@aln) {
	if( ! $aln || ! $aln->isa('Bio::Align::AlignI')  ) { 
	    $self->warn("Must provide a Bio::Align::AlignI object when calling write_aln");
	    next;
	}
	$self->throw("All sequences in the alignment must be the same length") 
	    unless $aln->is_flush(1) ;

	$aln->set_displayname_flat(); # plain
	$length  = $aln->length();
	$self->_print (sprintf(" %s %s\n", $aln->no_sequences, $aln->length));

	$idlength = $self->idlength();	
	foreach $seq ( $aln->each_seq() ) {
	    $name = $aln->displayname($seq->get_nse);
	    $name = substr($name, 0, $idlength) if length($name) > $idlength;
	    $name = sprintf("%-".$idlength."s",$name);	    
	    if( $self->interleaved() ) {
		$name .= '   ' ;
	    } elsif( $self->id_linebreak) { 
		$name .= "\n"; 
	    }

      #phylip needs dashes not dots 
      my $seq = $seq->seq();
      $seq=~s/\./-/g;
	    $hash{$name} = $seq;
	    push(@arr,$name);
	}

	if( $self->interleaved() ) {
	    while( $count < $length ) {	
		
		# there is another block to go!
		foreach $name ( @arr ) {
		    my $dispname = $name;
		    $dispname = '' if $wrapped;
		    $self->_print (sprintf("%".($idlength+3)."s",$dispname));
		    $tempcount = $count;
		    $index = 0;
		    while( ($tempcount + $idlength < $length) && ($index < 5)  ) {
			$self->_print (sprintf("%s ",substr($hash{$name},
							    $tempcount,
							    $idlength)));
			$tempcount += $idlength;
			$index++;
		    }
		    # last
		    if( $index < 5) {
			# space to print!
			$self->_print (sprintf("%s ",substr($hash{$name},
							    $tempcount)));
			$tempcount += $idlength;
		    }
		    $self->_print ("\n");
		}
		$self->_print ("\n");
		$count = $tempcount;
		$wrapped = 1;
	    } 			
	} else {
	    foreach $name ( @arr ) {
		my $dispname = $name;
		$dispname = '' if $wrapped;
		$self->_print (sprintf("%s%s\n",$dispname,$hash{$name}));
	    }	
	}
    }
    $self->flush if $self->_flush_on_write && defined $self->_fh;
    return 1;
}

=head2 interleaved

 Title   : interleaved
 Usage   : my $interleaved = $obj->interleaved
 Function: Get/Set Interleaved status
 Returns : boolean
 Args    : boolean


=cut

sub interleaved{
   my ($self,$value) = @_;
   my $previous = $self->{'_interleaved'};
   if( defined $value ) { 
       $self->{'_interleaved'} = $value;
   }
   return $previous;
}

=head2 idlength

 Title   : idlength
 Usage   : my $idlength = $obj->interleaved
 Function: Get/Set value of id length 
 Returns : string 
 Args    : string 


=cut

sub idlength {
	my($self,$value) = @_;
	if (defined $value){
	   $self->{'_idlength'} = $value;
	}
	return $self->{'_idlength'};
}

=head2 line_length

 Title   : line_length
 Usage   : $obj->line_length($newval)
 Function: 
 Returns : value of line_length
 Args    : newvalue (optional)


=cut

sub line_length{
   my ($self,$value) = @_;
   if( defined $value) {
      $self->{'line_length'} = $value;
    }
    return $self->{'line_length'} || $DEFAULTLINELEN;

}

=head2 id_linebreak

 Title   : id_linebreak
 Usage   : $obj->id_linebreak($newval)
 Function: 
 Returns : value of id_linebreak
 Args    : newvalue (optional)


=cut

sub id_linebreak{
   my ($self,$value) = @_;
   if( defined $value) {
      $self->{'_id_linebreak'} = $value;
    }
    return $self->{'_id_linebreak'} || 0;
}

1;