Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/AlignIO/nexus.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/AlignIO/nexus.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,343 @@ +# $Id: nexus.pm,v 1.12.2.1 2003/04/07 15:17:17 heikki Exp $ +# +# BioPerl module for Bio::AlignIO::nexus +# +# Copyright Heikki Lehvaslaiho +# + +=head1 NAME + +Bio::AlignIO::nexus - NEXUS format sequence input/output stream + +=head1 SYNOPSIS + +Do not use this module directly. Use it via the L<Bio::AlignIO> class. + +=head1 DESCRIPTION + +This object can transform L<Bio::Align::AlignI> objects to and from NEXUS +data blocks. See method documentation for supported NEXUS features. + +=head1 ACKNOWLEDGEMENTS + +Will Fisher has written an excellent standalone NEXUS format parser in +perl, readnexus. A number of tricks were adapted from it. + +=head1 FEEDBACK + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHORS - Heikki Lehvaslaiho + +Email: heikki@ebi.ac.uk + + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::AlignIO::nexus; +use vars qw(@ISA %valid_type); +use strict; +no strict "refs"; + +use Bio::AlignIO; + +@ISA = qw(Bio::AlignIO); + +BEGIN { + %valid_type = map {$_, 1} qw( dna rna protein standard); +} + +=head2 next_aln + + Title : next_aln + Usage : $aln = $stream->next_aln() + Function: Returns the next alignment in the stream. + + Supports the following NEXUS format features: + - The file has to start with '#NEXUS' + - Reads in the name of the alignment from a comment + (anything after 'TITLE: ') . + - Sequence names can be given in a taxa block, too. + - If matchchar notation is used, converts + them back to sequence characters. + - Does character conversions specified in the + NEXUS equate command. + - Sequence names of type 'Homo sapiens' and + Homo_sapiens are treated identically. + + Returns : L<Bio::Align::AlignI> object + Args : + +=cut + +sub next_aln { + my $self = shift; + my $entry; + my ($aln_name, $seqcount, $residuecount, %hash, $alphabet, + $match, $gap, $missing, $equate, $interleave, + $name,$str,@names,$seqname,$start,$end,$count,$seq); + + my $aln = Bio::SimpleAlign->new(-source => 'nexus'); + + # file starts with '#NEXUS' but we allow white space only lines before it + $entry = $self->_readline; + $entry = $self->_readline while $entry =~ /^\s+$/; + + return unless $entry; + $self->throw("Not a valid interleaved NEXUS file! [#NEXUS] not starting the file\n$entry") + unless $entry =~ /^#NEXUS/i; + + # skip anything before either the taxa or data block + # but read in the optional title in a comment + while (defined($entry = $self->_readline)) { + local ($_) = $entry; + /\[TITLE. *([^\]]+)]\s+/i and $aln_name = $1; + last if /^begin +data/i || /^begin +taxa/i; + } + $aln_name =~ s/\s/_/g and $aln->id($aln_name) if $aln_name; + + # data and taxa blocks + my $taxlabels; + while ($entry = $self->_readline) { + local ($_) = $entry; + + # read in seq names if in taxa block + $taxlabels = 1 if /taxlabels/i; + if ($taxlabels) { + @names = $self->_read_taxlabels; + $taxlabels = 0; + } + + /ntax ?= ?(\d+)/i and $seqcount = $1; + /nchar ?= ?(\d+)/i and $residuecount = $1; + /matchchar ?= ?(.)/i and $match = $1; + /gap ?= ?(.)/i and $gap = $1; + /missing ?= ?(.)/i and $missing = $1; + /equate ?= ?"([^\"]+)/i and $equate = $1; # "e.g. equate="T=C G=A"; + /datatype ?= ?(\w+)/i and $alphabet = lc $1; + /interleave/i and $interleave = 1 ; + + last if /matrix/i; + } + $self->throw("Not a valid NEXUS sequence file. Datatype not specified") + unless $alphabet; + $self->throw("Not a valid NEXUS sequence file. Datatype should not be [$alphabet]") + unless $valid_type{$alphabet}; + + $aln->gap_char($gap); + $aln->missing_char($missing); + + # + # if data is not right after the matrix line + # read the empty lines out + # + while ($entry = $self->_readline) { + unless ($entry =~ /^\s+$/) { + $self->_pushback($entry); + last; + } + } + + # + # matrix command + # + # first alignment section + if (@names == 0) { # taxa block did not exist + while ($entry = $self->_readline) { + local ($_) = $entry; + + s/\[[^[]+\]//g; #] remove comments + if ($interleave) { + /^\s+$/ and last; + } else { + /^\s+$/ and next; + } + /^\s*;\s*$/ and last; + if (/^\s*('([^']*?)'|([^']\S*))\s+(.*)\s$/) { #' + $name = ($2 || $3); + $str = $4; + $name =~ s/ /_/g; + push @names, $name; + + $str =~ s/\s//g; + $count = @names; + $hash{$count} = $str; + }; + $self->throw("Not a valid interleaved NEXUS file! +seqcount [$count] > predeclared [$seqcount] in the first section") if $count > $seqcount; + } + } + + # interleaved sections + $count = 0; + while( $entry = $self->_readline) { + local ($_) = $entry; + s/\[[^[]+\]//g; #] remove comments + last if /^\s*;/; + + $count = 0, next if $entry =~ /^\s*$/; + if (/^\s*('([^']*?)'|([^']\S*))\s+(.*)\s$/) { #' + $str = $4; + $str =~ s/\s//g; + $count++; + $hash{$count} .= $str; + }; + $self->throw("Not a valid interleaved NEXUS file! +seqcount [$count] > predeclared [$seqcount] ") if $count > $seqcount; + + } + + return 0 if @names < 1; + + # sequence creation + $count = 0; + foreach $name ( @names ) { + $count++; + if( $name =~ /(\S+)\/(\d+)-(\d+)/ ) { + $seqname = $1; + $start = $2; + $end = $3; + } else { + $seqname=$name; + $start = 1; + $str = $hash{$count}; + $str =~ s/[^A-Za-z]//g; + $end = length($str); + } + + # consistency test + $self->throw("Length of sequence [$seqname] is not [$residuecount]! ") + unless CORE::length($hash{$count}) == $residuecount; + + $seq = new Bio::LocatableSeq('-seq'=>$hash{$count}, + '-id'=>$seqname, + '-start'=>$start, + '-end'=>$end, + 'alphabet'=>$alphabet + ); + $aln->add_seq($seq); + } + + # if matchchar is used + $aln->unmatch($match) if $match; + + # if equate ( e.g. equate="T=C G=A") is used + if ($equate) { + $aln->map_chars($1, $2) while $equate =~ /(\S)=(\S)/g; + } + + while ($entry !~ /endblock/i) { + $entry = $self->_readline; + } + + return $aln; +} + +sub _read_taxlabels { + my ($self) = @_; + my ($name, @names); + while (my $entry = $self->_readline) { + ($name) = $entry =~ /\s*(\S+)\s+/; + $name =~ s/\[[^\[]+\]//g; + $name =~ s/\W/_/g; + push @names, $name; + last if /^\s*;/; + } + return @names; +} + +=head2 write_aln + + Title : write_aln + Usage : $stream->write_aln(@aln) + Function: Writes the $aln object into the stream in interleaved NEXUS + format. Everything is written into a data block. + SimpleAlign methods match_char, missing_char and gap_char must be set + if you want to see them in the output. + Returns : 1 for success and 0 for error + Args : L<Bio::Align::AlignI> object + +=cut + +sub write_aln { + my ($self,@aln) = @_; + my $count = 0; + my $wrapped = 0; + my $maxname; + my ($length,$date,$name,$seq,$miss,$pad,%hash,@arr,$tempcount,$index ); + my ($match, $missing, $gap,$symbols) = ('', '', '',''); + + foreach my $aln (@aln) { + if( ! $aln || ! $aln->isa('Bio::Align::AlignI') ) { + $self->warn("Must provide a Bio::Align::AlignI object when calling write_aln"); + next; + } + $self->throw("All sequences in the alignment must be the same length") + unless $aln->is_flush($self->verbose); + + $length = $aln->length(); + + $self->_print (sprintf("#NEXUS\n[TITLE: %s]\n\nbegin data;\ndimensions ntax=%s nchar=%s;\n", + $aln->id, $aln->no_sequences, $length)); + $match = "match=". $aln->match_char if $aln->match_char; + $missing = "missing=". $aln->missing_char if $aln->missing_char; + $gap = "gap=". $aln->gap_char if $aln->gap_char; + $symbols = 'symbols="'.join('',$aln->symbol_chars). '"' if( $aln->symbol_chars); + $self->_print (sprintf("format interleave datatype=%s %s %s %s %s;\n\nmatrix\n", + $aln->get_seq_by_pos(1)->alphabet, $match, $missing, $gap, $symbols)); + + my $indent = $aln->maxdisplayname_length; + $aln->set_displayname_flat(); + foreach $seq ( $aln->each_seq() ) { + $name = $aln->displayname($seq->get_nse()); + $name = sprintf("%-${indent}s", $name); + $hash{$name} = $seq->seq(); + push(@arr,$name); + } + + while( $count < $length ) { + # there is another block to go! + foreach $name ( @arr ) { + my $dispname = $name; +# $dispname = '' if $wrapped; + $self->_print (sprintf("%${indent}s ",$dispname)); + $tempcount = $count; + $index = 0; + while( ($tempcount + 10 < $length) && ($index < 5) ) { + $self->_print (sprintf("%s ",substr($hash{$name},$tempcount,10))); + $tempcount += 10; + $index++; + } + # last + if( $index < 5) { + # space to print! + $self->_print (sprintf("%s ",substr($hash{$name},$tempcount))); + $tempcount += 10; + } + $self->_print ("\n"); + } + $self->_print ("\n\n"); + $count = $tempcount; + $wrapped = 1; + } + $self->_print (";\n\nendblock;\n"); + } + $self->flush if $self->_flush_on_write && defined $self->_fh; + return 1; +} + +1;