Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/AlignIO/mega.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/AlignIO/mega.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,229 @@ +# $Id: mega.pm,v 1.8 2002/10/22 07:45:10 lapp Exp $ +# +# BioPerl module for Bio::AlignIO::mega +# +# Cared for by Jason Stajich <jason@bioperl.org> +# +# Copyright Jason Stajich +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::AlignIO::mega - Parse and Create MEGA format data files + +=head1 SYNOPSIS + + use Bio::AlignIO; + my $alignio = new Bio::AlignIO(-format => 'mega', + -file => 't/data/hemoglobinA.meg'); + + while( my $aln = $alignio->next_aln ) { + # process each alignment or convert to another format like NEXUS + } + +=head1 DESCRIPTION + +This object handles reading and writing data streams in the MEGA +format (Kumar and Nei). + + +=head1 FEEDBACK + + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich + +Email jason@bioperl.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::AlignIO::mega; +use vars qw(@ISA $MEGANAMELEN %VALID_TYPES $LINELEN $BLOCKLEN); +use strict; + +use Bio::AlignIO; +use Bio::SimpleAlign; +use Bio::LocatableSeq; + +BEGIN { + $MEGANAMELEN = 10; + $LINELEN = 60; + $BLOCKLEN = 10; + %VALID_TYPES = map {$_, 1} qw( dna rna protein standard); +} +@ISA = qw(Bio::AlignIO ); + + +=head2 next_aln + + Title : next_aln + Usage : $aln = $stream->next_aln() + Function: returns the next alignment in the stream. + Supports the following MEGA format features: + - The file has to start with '#mega' + - Reads in the name of the alignment from a comment + (anything after '!TITLE: ') . + - Reads in the format parameters datatype + + Returns : L<Bio::Align::AlignI> object - returns 0 on end of file + or on error + Args : NONE + + +=cut + +sub next_aln{ + my ($self) = @_; + my $entry; + my ($alphabet,%seqs); + + my $aln = Bio::SimpleAlign->new(-source => 'mega'); + + while( defined($entry = $self->_readline()) && ($entry =~ /^\s+$/) ) {} + + $self->throw("Not a valid MEGA file! [#mega] not starting the file!") + unless $entry =~ /^#mega/i; + + while( defined($entry = $self->_readline() ) ) { + local($_) = $entry; + if(/\!Title:\s*([^\;]+)\s*/i) { $aln->id($1)} + elsif( s/\!Format\s+([^\;]+)\s*/$1/ ) { + my (@fields) = split(/\s+/,$1); + foreach my $f ( @fields ) { + my ($name,$value) = split(/\=/,$f); + if( $name eq 'datatype' ) { + $alphabet = $value; + } elsif( $name eq 'identical' ) { + $aln->match_char($value); + } elsif( $name eq 'indel' ) { + $aln->gap_char($value); + } + } + } elsif( /^\#/ ) { + last; + } + } + my @order; + while( defined($entry) ) { + if( $entry !~ /^\s+$/ ) { + # this is to skip the leading '#' + my $seqname = substr($entry,1,$MEGANAMELEN-1); + $seqname =~ s/(\S+)\s+$/$1/g; + my $line = substr($entry,$MEGANAMELEN); + $line =~ s/\s+//g; + if( ! defined $seqs{$seqname} ) {push @order, $seqname; } + $seqs{$seqname} .= $line; + } + $entry = $self->_readline(); + } + + foreach my $seqname ( @order ) { + my $s = $seqs{$seqname}; + $s =~ s/\-//g; + my $end = length($s); + my $seq = new Bio::LocatableSeq(-alphabet => $alphabet, + -id => $seqname, + -seq => $seqs{$seqname}, + -start => 1, + -end => $end); + + $aln->add_seq($seq); + } + return $aln; +} + +=head2 write_aln + + Title : write_aln + Usage : $stream->write_aln(@aln) + Function: writes the $aln object into the stream in MEGA format + Returns : 1 for success and 0 for error + Args : L<Bio::Align::AlignI> object + +=cut + +sub write_aln{ + my ($self,@aln) = @_; + my $count = 0; + my $wrapped = 0; + my $maxname; + + foreach my $aln ( @aln ) { + if( ! $aln || ! $aln->isa('Bio::Align::AlignI') ) { + $self->warn("Must provide a Bio::Align::AlignI object when calling write_aln"); + return 0; + } elsif( ! $aln->is_flush($self->verbose) ) { + $self->warn("All Sequences in the alignment must be the same length"); + return 0; + } + $aln->match(); + my $len = $aln->length(); + my $format = sprintf('datatype=%s identical=%s indel=%s;', + $aln->get_seq_by_pos(1)->alphabet(), + $aln->match_char, $aln->gap_char); + + $self->_print(sprintf("#mega\n!Title: %s;\n!Format %s\n\n\n", + $aln->id, $format)); + + my ($count, $blockcount,$length) = ( 0,0,$aln->length()); + $aln->set_displayname_flat(); + while( $count < $length ) { + foreach my $seq ( $aln->each_seq ) { + my $seqchars = $seq->seq(); + $blockcount = 0; + my $substring = substr($seqchars, $count, $LINELEN); + my @blocks; + while( $blockcount < length($substring) ) { + push @blocks, substr($substring, $blockcount,$BLOCKLEN); + $blockcount += $BLOCKLEN; + } + $self->_print(sprintf("#%-".($MEGANAMELEN-1)."s%s\n", + substr($aln->displayname($seq->get_nse()), + 0,$MEGANAMELEN-2), + join(' ', @blocks))); + } + $self->_print("\n"); + $count += $LINELEN; + } + } + $self->flush if $self->_flush_on_write && defined $self->_fh; + return 1; +} + + +1;