Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/AlignIO/phylip.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/AlignIO/phylip.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,400 @@ +# $Id: phylip.pm,v 1.24.2.1 2003/01/26 15:52:30 jason Exp $ +# +# BioPerl module for Bio::AlignIO::phylip +# +# Copyright Heikki Lehvaslaiho +# + +=head1 NAME + +Bio::AlignIO::phylip - PHYLIP format sequence input/output stream + +=head1 SYNOPSIS + +# Do not use this module directly. Use it via the Bio::AlignIO class. + + use Bio::AlignIO; + use Bio::SimpleAlign; + #you can set the name length to something other than the default 10 + #if you use a version of phylip (hacked) that accepts ids > 10 + my $phylipstream = new Bio::AlignIO(-format => 'phylip', + -fh => \*STDOUT, + -idlength=>30); + # convert data from one format to another + my $gcgstream = new Bio::AlignIO(-format => 'msf', + -file => 't/data/cysprot1a.msf'); + + while( my $aln = $gcgstream->next_aln ) { + $phylipstream->write_aln($aln); + } + + # do it again with phylip sequential format format + $phylipstream->interleaved(0); + # can also initialize the object like this + $phylipstream = new Bio::AlignIO(-interleaved => 0, + -format => 'phylip', + -fh => \*STDOUT, + -idlength=>10); + $gcgstream = new Bio::AlignIO(-format => 'msf', + -file => 't/data/cysprot1a.msf'); + + while( my $aln = $gcgstream->next_aln ) { + $phylipstream->write_aln($aln); + } + +=head1 DESCRIPTION + +This object can transform Bio::SimpleAlign objects to and from PHYLIP +interleaved format. It will not work with PHYLIP sequencial format. + +This module will output PHYLIP sequential format. By specifying the +flag -interleaved =E<gt> 0 in the initialization the module can output +data in interleaved format. + +=head1 FEEDBACK + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHORS - Heikki Lehvaslaiho and Jason Stajich + +Email: heikki@ebi.ac.uk +Email: jason@bioperl.org + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::AlignIO::phylip; +use vars qw(@ISA $DEFAULTIDLENGTH $DEFAULTLINELEN); +use strict; + +use Bio::SimpleAlign; +use Bio::AlignIO; + +@ISA = qw(Bio::AlignIO); + +BEGIN { + $DEFAULTIDLENGTH = 10; + $DEFAULTLINELEN = 60; +} + +=head2 new + + Title : new + Usage : my $alignio = new Bio::AlignIO(-format => 'phylip' + -file => '>file', + -idlength => 10, + -idlinebreak => 1); + Function: Initialize a new L<Bio::AlignIO::phylip> reader or writer + Returns : L<Bio::AlignIO> object + Args : [specific for writing of phylip format files] + -idlength => integer - length of the id (will pad w/ + spaces if needed) + -interleaved => boolean - whether or not write as interleaved + or sequential format + -linelength => integer of how long a sequence lines should be + -idlinebreak => insert a line break after the sequence id + so that sequence starts on the next line + +=cut + +sub _initialize { + my($self,@args) = @_; + $self->SUPER::_initialize(@args); + + my ($interleave,$linelen,$idlinebreak, + $idlength) = $self->_rearrange([qw(INTERLEAVED + LINELENGTH + IDLINEBREAK + IDLENGTH)],@args); + $self->interleaved(1) if( $interleave || ! defined $interleave); + $self->idlength($idlength || $DEFAULTIDLENGTH); + $self->id_linebreak(1) if( $idlinebreak ); + $self->line_length($linelen) if defined $linelen && $linelen > 0; + 1; +} + +=head2 next_aln + + Title : next_aln + Usage : $aln = $stream->next_aln() + Function: returns the next alignment in the stream. + Throws an exception if trying to read in PHYLIP + sequential format. + Returns : L<Bio::SimpleAlign> object + Args : + +=cut + +sub next_aln { + my $self = shift; + my $entry; + my ($seqcount, $residuecount, %hash, $name,$str, + @names,$seqname,$start,$end,$count,$seq); + + my $aln = Bio::SimpleAlign->new(-source => 'phylip'); + $entry = $self->_readline and + ($seqcount, $residuecount) = $entry =~ /\s*(\d+)\s+(\d+)/; + return 0 unless $seqcount and $residuecount; + + # first alignment section + my $idlen = $self->idlength; + $count = 0; + my $non_interleaved = ! $self->interleaved ; + while( $entry = $self->_readline) { + last if( $entry =~ /^\s?$/ && ! $non_interleaved ); + + if( $entry =~ /^\s+(.+)$/ ) { + $str = $1; + $non_interleaved = 1; + $str =~ s/\s//g; + $count = scalar @names; + $hash{$count} .= $str; + } elsif( $entry =~ /^(.{$idlen})\s+(.*)\s$/ ) { + $name = $1; + $str = $2; + $name =~ s/[\s\/]/_/g; + $name =~ s/_+$//; # remove any trailing _'s + push @names, $name; + $str =~ s/\s//g; + $count = scalar @names; + $hash{$count} = $str; + } + $self->throw("Not a valid interleaved PHYLIP file!") if $count > $seqcount; + } + + unless( $non_interleaved ) { + # interleaved sections + $count = 0; + while( $entry = $self->_readline) { + # finish current entry + if($entry =~/\s*\d+\s+\d+/){ + $self->_pushback($entry); + last; + } + $count = 0, next if $entry =~ /^\s$/; + + $entry =~ /\s*(.*)$/ && do { + $str = $1; + $str =~ s/\s//g; + $count++; + $hash{$count} .= $str; + }; + $self->throw("Not a valid interleaved PHYLIP file!") if $count > $seqcount; + } + } + return 0 if scalar @names < 1; + + # sequence creation + $count = 0; + foreach $name ( @names ) { + $count++; + if( $name =~ /(\S+)\/(\d+)-(\d+)/ ) { + $seqname = $1; + $start = $2; + $end = $3; + } else { + $seqname=$name; + $start = 1; + $str = $hash{$count}; + $str =~ s/[^A-Za-z]//g; + $end = length($str); + } + # consistency test + $self->throw("Length of sequence [$seqname] is not [$residuecount]! ") + unless CORE::length($hash{$count}) == $residuecount; + + $seq = new Bio::LocatableSeq('-seq'=>$hash{$count}, + '-id'=>$seqname, + '-start'=>$start, + '-end'=>$end, + ); + + $aln->add_seq($seq); + + } + return $aln; +} + + +=head2 write_aln + + Title : write_aln + Usage : $stream->write_aln(@aln) + Function: writes the $aln object into the stream in MSF format + Returns : 1 for success and 0 for error + Args : L<Bio::Align::AlignI> object + +=cut + +sub write_aln { + my ($self,@aln) = @_; + my $count = 0; + my $wrapped = 0; + my $maxname; + my ($length,$date,$name,$seq,$miss,$pad, + %hash,@arr,$tempcount,$index,$idlength); + + foreach my $aln (@aln) { + if( ! $aln || ! $aln->isa('Bio::Align::AlignI') ) { + $self->warn("Must provide a Bio::Align::AlignI object when calling write_aln"); + next; + } + $self->throw("All sequences in the alignment must be the same length") + unless $aln->is_flush(1) ; + + $aln->set_displayname_flat(); # plain + $length = $aln->length(); + $self->_print (sprintf(" %s %s\n", $aln->no_sequences, $aln->length)); + + $idlength = $self->idlength(); + foreach $seq ( $aln->each_seq() ) { + $name = $aln->displayname($seq->get_nse); + $name = substr($name, 0, $idlength) if length($name) > $idlength; + $name = sprintf("%-".$idlength."s",$name); + if( $self->interleaved() ) { + $name .= ' ' ; + } elsif( $self->id_linebreak) { + $name .= "\n"; + } + + #phylip needs dashes not dots + my $seq = $seq->seq(); + $seq=~s/\./-/g; + $hash{$name} = $seq; + push(@arr,$name); + } + + if( $self->interleaved() ) { + while( $count < $length ) { + + # there is another block to go! + foreach $name ( @arr ) { + my $dispname = $name; + $dispname = '' if $wrapped; + $self->_print (sprintf("%".($idlength+3)."s",$dispname)); + $tempcount = $count; + $index = 0; + while( ($tempcount + $idlength < $length) && ($index < 5) ) { + $self->_print (sprintf("%s ",substr($hash{$name}, + $tempcount, + $idlength))); + $tempcount += $idlength; + $index++; + } + # last + if( $index < 5) { + # space to print! + $self->_print (sprintf("%s ",substr($hash{$name}, + $tempcount))); + $tempcount += $idlength; + } + $self->_print ("\n"); + } + $self->_print ("\n"); + $count = $tempcount; + $wrapped = 1; + } + } else { + foreach $name ( @arr ) { + my $dispname = $name; + $dispname = '' if $wrapped; + $self->_print (sprintf("%s%s\n",$dispname,$hash{$name})); + } + } + } + $self->flush if $self->_flush_on_write && defined $self->_fh; + return 1; +} + +=head2 interleaved + + Title : interleaved + Usage : my $interleaved = $obj->interleaved + Function: Get/Set Interleaved status + Returns : boolean + Args : boolean + + +=cut + +sub interleaved{ + my ($self,$value) = @_; + my $previous = $self->{'_interleaved'}; + if( defined $value ) { + $self->{'_interleaved'} = $value; + } + return $previous; +} + +=head2 idlength + + Title : idlength + Usage : my $idlength = $obj->interleaved + Function: Get/Set value of id length + Returns : string + Args : string + + +=cut + +sub idlength { + my($self,$value) = @_; + if (defined $value){ + $self->{'_idlength'} = $value; + } + return $self->{'_idlength'}; +} + +=head2 line_length + + Title : line_length + Usage : $obj->line_length($newval) + Function: + Returns : value of line_length + Args : newvalue (optional) + + +=cut + +sub line_length{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'line_length'} = $value; + } + return $self->{'line_length'} || $DEFAULTLINELEN; + +} + +=head2 id_linebreak + + Title : id_linebreak + Usage : $obj->id_linebreak($newval) + Function: + Returns : value of id_linebreak + Args : newvalue (optional) + + +=cut + +sub id_linebreak{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'_id_linebreak'} = $value; + } + return $self->{'_id_linebreak'} || 0; +} + +1;