Mercurial > repos > willmclaren > ensembl_vep
diff variant_effect_predictor/Bio/Tools/Phylo/PAML.pm @ 0:21066c0abaf5 draft
Uploaded
author | willmclaren |
---|---|
date | Fri, 03 Aug 2012 10:04:48 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/Phylo/PAML.pm Fri Aug 03 10:04:48 2012 -0400 @@ -0,0 +1,647 @@ +# PAML.pm,v 1.3 2002/06/20 18:50:37 amackey Exp +# +# BioPerl module for Bio::Tools::Phylo::PAML +# +# Cared for by Jason Stajich <jason@bioperl.org> +# +# Copyright Jason Stajich, Aaron J Mackey +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::Phylo::PAML - Parses output from the PAML programs codeml, +baseml, basemlg, codemlsites and yn00 + +=head1 SYNOPSIS + + #!/usr/bin/perl -Tw + use strict; + + use Bio::Tools::Phylo::PAML; + + # need to specify the output file name (or a fh) (defaults to + # -file => "codeml.mlc"); also, optionally, the directory in which + # the other result files (rst, 2ML.dS, etc) may be found (defaults + # to "./") + my $parser = new Bio::Tools::Phylo::PAML + (-file => "./results/mlc", -dir => "./results/"); + + # get the first/next result; a Bio::Tools::Phylo::PAML::Result object, + # which isa Bio::SeqAnalysisResultI object. + my $result = $parser->next_result(); + + # get the sequences used in the analysis; returns Bio::PrimarySeq + # objects (OTU = Operational Taxonomic Unit). + my @otus = $result->get_seqs(); + + # codon summary: codon usage of each sequence [ arrayref of { + # hashref of counts for each codon } for each sequence and the + # overall sum ], and positional nucleotide distribution [ arrayref + # of { hashref of frequencies for each nucleotide } for each + # sequence and overall frequencies ]: + my ($codonusage, $ntdist) = $result->get_codon_summary(); + + # example manipulations of $codonusage and $ntdist: + printf "There were %d '%s' codons in the first seq (%s)\n", + $codonusage->[0]->{AAA}, 'AAA', $otus[0]->id(); + printf "There were %d '%s' codons used in all the sequences\n", + $codonusage->[$#{$codonusage}]->{AAA}, 'AAA'; + printf "Nucleotide '%c' was present %g of the time in seq %s\n", + 'A', $ntdist->[1]->{A}, $otus[1]->id(); + + # get Nei & Gojobori dN/dS matrix: + my $NGmatrix = $result->get_NGmatrix(); + + # get ML-estimated dN/dS matrix, if calculated; this corresponds to + # the runmode = -2, pairwise comparison usage of codeml + my $MLmatrix = $result->get_MLmatrix(); + + # These matrices are length(@otu) x length(@otu) "strict lower + # triangle" 2D-matrices, which means that the diagonal and + # everything above it is undefined. Each of the defined cells is a + # hashref of estimates for "dN", "dS", "omega" (dN/dS ratio), "t", + # "S" and "N". If a ML matrix, "lnL" will also be defined. + printf "The omega ratio for sequences %s vs %s was: %g\n", + $otus[0]->id, $otus[1]->id, $MLmatrix->[0]->[1]->{omega}; + + # with a little work, these matrices could also be passed to + # Bio::Tools::Run::Phylip::Neighbor, or other similar tree-building + # method that accepts a matrix of "distances" (using the LOWTRI + # option): + my $distmat = [ map { [ map { $$_{omega} } @$_ ] } @$MLmatrix ]; + + # for runmode's other than -2, get tree topology with estimated + # branch lengths; returns a Bio::Tree::TreeI-based tree object with + # added PAML parameters at each node + my $tree = $result->get_tree(); + for my $node ($tree->get_nodes()) { + # inspect the tree: the "t" (time) parameter is available via + # $node->branch_length(); all other branch-specific parameters + # ("omega", "dN", etc.) are available via $node->param('omega'); + } + + # get any general model parameters: kappa (the + # transition/transversion ratio), NSsites model parameters ("p0", + # "p1", "w0", "w1", etc.), etc. + my $params = $result->get_model_params(); + printf "M1 params: p0 = %g\tp1 = %g\n", $params->{p0}, $params->{p1}; + + # for NSsites models, obtain arrayrefs of posterior probabilities + # for membership in each class for every position; probabilities + # correspond to classes w0, w1, ... etc. + my @probs = $result->get_posteriors(); + + # find, say, positively selected sites! + if ($params->{w2} > 1) { + for (my $i = 0; $i < @probs ; $i++) { + if ($probs[$i]->[2] > 0.5) { + # assumes model M1: three w's, w0, w1 and w2 (positive selection) + printf "position %d: (%g prob, %g omega, %g mean w)\n", + $i, $probs[$i]->[2], $params->{w2}, $probs[$i]->[3]; + } + } + } else { print "No positive selection found!\n"; } + +=head1 DESCRIPTION + +This module is used to parse the output from the PAML programs codeml, +baseml, basemlg, codemlsites and yn00. You can use the +Bio::Tools::Run::Phylo::PAML::* modules to actually run some of the +PAML programs, but this module is only useful to parse the output. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Jason Stajich, Aaron Mackey + +Email jason@bioperl.org +Email amackey@virginia.edu + +=head1 TODO + +check output from pre 1.12 + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::Tools::Phylo::PAML; +use vars qw(@ISA); +use strict; + +# Object preamble - inherits from Bio::Root::Root + +use Bio::Root::Root; +use Bio::AnalysisParserI; +use Bio::Root::IO; + +@ISA = qw(Bio::Root::Root Bio::Root::IO Bio::AnalysisParserI); + +# other objects used: +use IO::String; +use Bio::TreeIO; +use Bio::Tools::Phylo::PAML::Result; +use Bio::PrimarySeq; + +=head2 new + + Title : new + Usage : my $obj = new Bio::Tools::Phylo::PAML(%args); + Function: Builds a new Bio::Tools::Phylo::PAML object + Returns : Bio::Tools::Phylo::PAML + Args : Hash of options: -file, -fh, -dir + -file (or -fh) should contain the contents of the PAML + outfile; -dir is the (optional) name of the directory in + which the PAML program was run (and includes other + PAML-generated files from which we can try to gather data) + +=cut + +sub new { + + my ($class, @args) = @_; + + my $self = $class->SUPER::new(@args); + $self->_initialize_io(@args); + + my ($dir) = $self->_rearrange([qw(DIR)], @args); + $self->{_dir} = $dir if defined $dir; + + return $self; +} + +=head2 Implement Bio::AnalysisParserI interface + +=cut + +=head2 next_result + + Title : next_result + Usage : $result = $obj->next_result(); + Function: Returns the next result available from the input, or + undef if there are no more results. + Example : + Returns : a Bio::Tools::Phylo::PAML::Result object + Args : none + +=cut + +sub next_result { + + my ($self) = @_; + + my %data; + # get the various codon and other sequence summary data, if necessary: + $self->_parse_summary + unless ($self->{'_summary'} && !$self->{'_summary'}->{'multidata'}); + + # OK, depending on seqtype and runmode now, one of a few things can happen: + my $seqtype = $self->{'_summary'}->{'seqtype'}; + if ($seqtype eq 'CODONML' || $seqtype eq 'AAML') { + while ($_ = $self->_readline) { + if ($seqtype eq 'CODONML' && + m/^pairwise comparison, codon frequencies:/o) { + + # runmode = -2, CODONML + $self->_pushback($_); + %data = $self->_parse_PairwiseCodon; + last; + + } elsif ($seqtype eq 'AAML' && m/^ML distances of aa seqs\.$/o) { + + # runmode = -2, AAML + $self->throw( -class => 'Bio::Root::NotImplemented', + -text => "Pairwise AA not yet implemented!" + ); + + # $self->_pushback($_); + # %data = $self->_parse_PairwiseAA; + # last; + } elsif (m/^Model \d+: /o) { + + # NSSitesBatch + $self->throw( -class => 'Bio::Root::NotImplemented', + -text => "NSsitesBatch not yet implemented!" + ); + + # $self->_pushback($_); + # %data = $self->_parse_NSsitesBatch; + # last; + + } elsif (m/^TREE/) { + + # runmode = 0 + $self->_pushback($_); + %data = $self->_parse_Forestry; + last; + + } elsif (m/Heuristic tree search by stepwise addition$/o) { + + # runmode = 3 + $self->throw( -class => 'Bio::Root::NotImplemented', + -text => "StepwiseAddition not yet implemented!" + ); + + # $self->_pushback($_); + # %data = $self->_parse_StepwiseAddition; + # last; + + } elsif (m/Heuristic tree search by NNI perturbation$/o) { + + # runmode = 4 + $self->throw( -class => 'Bio::Root::NotImplemented', + -text => "NNI Perturbation not yet implemented!" + ); + + # $self->_pushback($_); + # %data = $self->_parse_Perturbation; + # last; + + } elsif (m/^stage 0:/o) { + + # runmode = (1 or 2) + $self->throw( -class => 'Bio::Root::NotImplemented', + -text => "StarDecomposition not yet implemented!" + ); + + # $self->_pushback($_); + # %data = $self->_parse_StarDecomposition; + # last; + + } + } + } elsif ($seqtype eq 'BASEML') { + } elsif ($seqtype eq 'YN00') { + while ($_ = $self->_readline) { + if( m/^Estimation by the method/ ) { + $self->_pushback($_); + %data = $self->_parse_YN_Pairwise; + last; + } + } + } + if (%data) { + $data{'-version'} = $self->{'_summary'}->{'version'}; + $data{'-seqs'} = $self->{'_summary'}->{'seqs'}; + $data{'-patterns'} = $self->{'_summary'}->{'patterns'}; + $data{'-ngmatrix'} = $self->{'_summary'}->{'ngmatrix'}; + $data{'-codonpos'} = $self->{'_summary'}->{'codonposition'}; + $data{'-codonfreq'} = $self->{'_summary'}->{'codonfreqs'}; + return new Bio::Tools::Phylo::PAML::Result %data; + } else { + return undef; + } +} + + +sub _parse_summary { + + my ($self) = @_; + + # Depending on whether verbose > 0 or not, and whether the result + # set comes from a multi-data run, the first few lines could be + # various things; we're going to throw away any sequence data + # here, since we'll get it later anyways + + # multidata ? : \n\nData set 1\n + # verbose ? : cleandata ? : \nBefore deleting alignment gaps. \d sites\n + # [ sequence printout ] + # \nAfter deleting gaps. \d sites\n" + # : [ sequence printout ] + # CODONML (in paml 3.12 February 2002) <<-- what we want to see! + + my $SEQTYPES = qr( (?: (?: CODON | AA | BASE | CODON2AA ) ML ) | YN00 )x; + while ($_ = $self->_readline) { + if ( m/^($SEQTYPES) \s+ # seqtype: CODONML, AAML, BASEML, CODON2AAML, YN00, etc + (?: \(in \s+ ([^\)]+?) \s* \) \s* )? # version: "paml 3.12 February 2002"; not present < 3.1 or YN00 + (\S+) \s* # tree filename + (?: (.+?) )? # model description (not there in YN00) + \s* $ # trim any trailing space + /ox + ) { + + @{$self->{_summary}}{qw(seqtype version treefile model)} = ($1, + $2, + $3, + $4); + last; + + } elsif (m/^Data set \d$/o) { + $self->{'_summary'} = {}; + $self->{'_summary'}->{'multidata'}++; + } + } + + unless (defined $self->{'_summary'}->{'seqtype'}) { + $self->throw( -class => 'Bio::Root::NotImplemented', + -text => 'Unknown format of PAML output'); + } + + + my $seqtype = $self->{'_summary'}->{'seqtype'}; + $self->debug( "seqtype is $seqtype\n"); + if ($seqtype eq "CODONML") { + + $self->_parse_inputparams(); # settings from the .ctl file that get printed + $self->_parse_patterns(); # codon patterns - not very interesting + $self->_parse_seqs(); # the sequences data used for analysis + $self->_parse_codoncts(); # counts and distributions of codon/nt usage + $self->_parse_codon_freqs(); # codon frequencies + $self->_parse_distmat(); # NG distance matrices + + + } elsif ($seqtype eq "AAML") { + $self->throw( -class => 'Bio::Root::NotImplemented', + -text => 'AAML parsing not yet implemented!'); + } elsif ($seqtype eq "CODON2AAML") { + $self->throw( -class => 'Bio::Root::NotImplemented', + -text => 'CODON2AAML parsing not yet implemented!'); + } elsif ($seqtype eq "BASEML") { + $self->throw( -class => 'Bio::Root::NotImplemented', + -text => 'BASEML parsing not yet implemented!'); + } elsif ($seqtype eq "YN00") { + $self->_parse_codon_freqs(); + $self->_parse_codoncts(); + $self->_parse_distmat(); # NG distance matrices + + } else { + $self->throw( -class => 'Bio::Root::NotImplemented', + -text => 'Unknown seqtype, not yet implemented!', + -value => $seqtype + ); + } + +} + + +sub _parse_inputparams { + my ($self) = @_; + +} + +sub _parse_codon_freqs { + my ($self) = @_; + my ($okay,$done) = (0,0); + while( defined($_ = $self->_readline ) ) { + if( /^Nei/ ) { $self->_pushback($_); last } + last if( $done); + next if ( /^\s+/); + next unless($okay || /^Codon position x base \(3x4\) table\, overall/ ); + $okay = 1; + if( s/^position\s+(\d+):\s+// ) { + my $pos = $1; + s/\s+$//; + my @bases = split; + foreach my $str ( @bases ) { + my ( $base,$freq) = split(/:/,$str,2); + $self->{'_summary'}->{'codonposition'}->[$pos-1]->{$base} = $freq; + } + $done = 1 if $pos == 3; + } + } + $done = 0; + while( defined( $_ = $self->_readline) ) { + if( /^Nei\s\&\sGojobori/ ) { $self->_pushback($_); last } + last if ( $done ); + if( /^Codon frequencies under model, for use in evolver:/ ){ + while( defined( $_ = $self->_readline) ) { + last if( /^\s+$/ ); + s/^\s+//; + s/\s+$//; + push @{$self->{'_summary'}->{'codonfreqs'}},[split]; + } + $done = 1; + } + } +} + +sub _parse_patterns { + my ($self) = @_; + my ($patternct,@patterns,$ns,$ls); + while( defined($_ = $self->_readline) ) { + if( $patternct ) { +# last unless ( @patterns == $patternct ); + last if( /^\s+$/ ); + s/^\s+//; + push @patterns, split; + } elsif( /^ns\s+\=\s*(\d+)\s+ls\s+\=\s*(\d+)/ ) { + ($ns,$ls) = ($1,$2); + } elsif( /^\# site patterns \=\s*(\d+)/ ) { + $patternct = $1; + } else { +# $self->debug("Unknown line: $_"); + } + } + $self->{'_summary'}->{'patterns'} = { -patterns => \@patterns, + -ns => $ns, + -ls => $ls}; +} + +sub _parse_seqs { + + # this should in fact be packed into a Bio::SimpleAlign object instead of + # an array but we'll stay with this for now + my ($self) = @_; + my (@firstseq,@seqs); + while( defined ($_ = $self->_readline) ) { + last if( /^\s+$/ && @seqs > 0 ); + next if ( /^\s+$/ ); + next if( /^\d+\s+$/ ); + + my ($name,$seqstr) = split(/\s+/,$_,2); + $seqstr =~ s/\s+//g; # remove whitespace + unless( @firstseq) { + @firstseq = split(//,$seqstr); + push @seqs, new Bio::PrimarySeq(-id => $name, + -seq => $seqstr); + } else { + + my $i = 0; + my $v; + while(($v = index($seqstr,'.',$i)) >= $i ) { + # replace the '.' with the correct seq from the + substr($seqstr,$v,1,$firstseq[$v]); + $i = $v; + } + $self->debug( "adding seq $seqstr\n"); + push @seqs, new Bio::PrimarySeq(-id => $name, + -seq => $seqstr); + } + } + $self->{'_summary'}->{'seqs'} = \@seqs; +} + +sub _parse_codoncts { } + +sub _parse_distmat { + my ($self) = @_; + my @results; + while( defined ($_ = $self->_readline) ) { + next if/^\s+$/; + last; + } + return unless (/^Nei\s*\&\s*Gojobori/); + # skip the next 3 lines + if( $self->{'_summary'}->{'seqtype'} eq 'CODONML' ) { + $self->_readline; + $self->_readline; + $self->_readline; + } + my $seqct = 0; + while( defined ($_ = $self->_readline ) ) { + last if( /^\s+$/ && exists $self->{'_summary'}->{'ngmatrix'} ); + next if( /^\s+$/ ); + chomp; + my ($seq,$rest) = split(/\s+/,$_,2); + my $j = 0; + while( $rest =~ + /(\-?\d+(\.\d+)?)\s*\(\-?(\d+(\.\d+)?)\s+(\-?\d+(\.\d+)?)\)/g ) { + $self->{'_summary'}->{'ngmatrix'}->[$j++]->[$seqct] = + { 'omega' => $1, + 'dN' => $3, + 'dS' => $5 }; + } + $seqct++; + } +} + +sub _parse_PairwiseCodon { + my ($self) = @_; + my @result; + my ($a,$b,$log,$model); + while( defined( $_ = $self->_readline) ) { + if( /^pairwise comparison, codon frequencies\:\s*(\S+)\./) { + $model = $1; + } elsif( /^(\d+)\s+\((\S+)\)\s+\.\.\.\s+(\d+)\s+\((\S+)\)/ ) { + ($a,$b) = ($1,$3); + } elsif( /^lnL\s+\=\s*(\-?\d+(\.\d+)?)/ ) { + $log = $1; + } elsif( m/^t\=\s*(\d+(\.\d+)?)\s+ + S\=\s*(\d+(\.\d+)?)\s+ + N\=\s*(\d+(\.\d+)?)\s+ + dN\/dS\=\s*(\d+(\.\d+)?)\s+ + dN\=\s*(\d+(\.\d+)?)\s+ + dS\=\s*(\d+(\.\d+)?)/ox ) { + $result[$b-1]->[$a-1] = { + 'lnL' => $log, + 't' => $1, + 'S' => $3, + 'N' => $5, + 'omega' => $7, + 'dN' => $9, + 'dS' => $11 }; + } elsif( /^\s+$/ ) { + next; + } elsif( /^\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)/ ) { + } else { + $self->debug( "unknown line: $_"); + } + } + return ( -mlmatrix => \@result); +} + +sub _parse_YN_Pairwise { + my ($self) = @_; + my @result; + while( defined( $_ = $self->_readline) ) { + last if( /^seq\.\s+seq\./); + } + while( defined( $_ = $self->_readline) ) { + if( m/^\s+(\d+)\s+ # seq # + (\d+)\s+ # seq # + (\d+(\.\d+))\s+ # S + (\d+(\.\d+))\s+ # N + (\d+(\.\d+))\s+ # t + (\d+(\.\d+))\s+ # kappa + (\d+(\.\d+))\s+ # omega + (\d+(\.\d+))\s+ # dN + \+\-\s+ + (\d+(\.\d+))\s+ # dN SE + (\d+(\.\d+))\s+ # dS + \+\-\s+ + (\d+(\.\d+))\s+ # dS SE + /ox + ) { + + $result[$2-1]->[$1-1] = { + 'S' => $3, + 'N' => $5, + 't' => $7, + 'kappa' => $9, + 'omega' => $11, + 'dN' => $13, + 'dN_SE' => $15, + 'dS' => $17, + 'dS_SE' => $19, + }; + } elsif( /^\s+$/ ) { + next; + } + } + return ( -mlmatrix => \@result); +} + +sub _parse_Forestry { + + my ($self) = @_; + my %data = (-trees => []); + + + return %data +}; + +# parse the mlc file + +sub _parse_mlc { + my ($self) = @_; + my %data; + while( defined( $_ = $self->_readline) ) { + $self->debug( "mlc parse: $_"); + # Aaron this is where the parsing should begin + + # I'll do the Tree objects if you like - I'd do it by building + # an IO::String for the the tree data or does it make more + # sense to parse this out of a collection of files? + if( /^TREE/ ) { + # ... + while( defined($_ = $self->_readline) ) { + if( /^\(/) { + my $treestr = new IO::String($_); + my $treeio = new Bio::TreeIO(-fh => $treestr, + -format => 'newick'); + # this is very tenative here!! + push @{$self->{'_trees'}}, $treeio->next_tree; + } + } + } + } +} + +1;