comparison variant_effect_predictor/Bio/Tools/Phylo/PAML.pm @ 0:21066c0abaf5 draft

Uploaded
author willmclaren
date Fri, 03 Aug 2012 10:04:48 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:21066c0abaf5
1 # PAML.pm,v 1.3 2002/06/20 18:50:37 amackey Exp
2 #
3 # BioPerl module for Bio::Tools::Phylo::PAML
4 #
5 # Cared for by Jason Stajich <jason@bioperl.org>
6 #
7 # Copyright Jason Stajich, Aaron J Mackey
8 #
9 # You may distribute this module under the same terms as perl itself
10
11 # POD documentation - main docs before the code
12
13 =head1 NAME
14
15 Bio::Tools::Phylo::PAML - Parses output from the PAML programs codeml,
16 baseml, basemlg, codemlsites and yn00
17
18 =head1 SYNOPSIS
19
20 #!/usr/bin/perl -Tw
21 use strict;
22
23 use Bio::Tools::Phylo::PAML;
24
25 # need to specify the output file name (or a fh) (defaults to
26 # -file => "codeml.mlc"); also, optionally, the directory in which
27 # the other result files (rst, 2ML.dS, etc) may be found (defaults
28 # to "./")
29 my $parser = new Bio::Tools::Phylo::PAML
30 (-file => "./results/mlc", -dir => "./results/");
31
32 # get the first/next result; a Bio::Tools::Phylo::PAML::Result object,
33 # which isa Bio::SeqAnalysisResultI object.
34 my $result = $parser->next_result();
35
36 # get the sequences used in the analysis; returns Bio::PrimarySeq
37 # objects (OTU = Operational Taxonomic Unit).
38 my @otus = $result->get_seqs();
39
40 # codon summary: codon usage of each sequence [ arrayref of {
41 # hashref of counts for each codon } for each sequence and the
42 # overall sum ], and positional nucleotide distribution [ arrayref
43 # of { hashref of frequencies for each nucleotide } for each
44 # sequence and overall frequencies ]:
45 my ($codonusage, $ntdist) = $result->get_codon_summary();
46
47 # example manipulations of $codonusage and $ntdist:
48 printf "There were %d '%s' codons in the first seq (%s)\n",
49 $codonusage->[0]->{AAA}, 'AAA', $otus[0]->id();
50 printf "There were %d '%s' codons used in all the sequences\n",
51 $codonusage->[$#{$codonusage}]->{AAA}, 'AAA';
52 printf "Nucleotide '%c' was present %g of the time in seq %s\n",
53 'A', $ntdist->[1]->{A}, $otus[1]->id();
54
55 # get Nei & Gojobori dN/dS matrix:
56 my $NGmatrix = $result->get_NGmatrix();
57
58 # get ML-estimated dN/dS matrix, if calculated; this corresponds to
59 # the runmode = -2, pairwise comparison usage of codeml
60 my $MLmatrix = $result->get_MLmatrix();
61
62 # These matrices are length(@otu) x length(@otu) "strict lower
63 # triangle" 2D-matrices, which means that the diagonal and
64 # everything above it is undefined. Each of the defined cells is a
65 # hashref of estimates for "dN", "dS", "omega" (dN/dS ratio), "t",
66 # "S" and "N". If a ML matrix, "lnL" will also be defined.
67 printf "The omega ratio for sequences %s vs %s was: %g\n",
68 $otus[0]->id, $otus[1]->id, $MLmatrix->[0]->[1]->{omega};
69
70 # with a little work, these matrices could also be passed to
71 # Bio::Tools::Run::Phylip::Neighbor, or other similar tree-building
72 # method that accepts a matrix of "distances" (using the LOWTRI
73 # option):
74 my $distmat = [ map { [ map { $$_{omega} } @$_ ] } @$MLmatrix ];
75
76 # for runmode's other than -2, get tree topology with estimated
77 # branch lengths; returns a Bio::Tree::TreeI-based tree object with
78 # added PAML parameters at each node
79 my $tree = $result->get_tree();
80 for my $node ($tree->get_nodes()) {
81 # inspect the tree: the "t" (time) parameter is available via
82 # $node->branch_length(); all other branch-specific parameters
83 # ("omega", "dN", etc.) are available via $node->param('omega');
84 }
85
86 # get any general model parameters: kappa (the
87 # transition/transversion ratio), NSsites model parameters ("p0",
88 # "p1", "w0", "w1", etc.), etc.
89 my $params = $result->get_model_params();
90 printf "M1 params: p0 = %g\tp1 = %g\n", $params->{p0}, $params->{p1};
91
92 # for NSsites models, obtain arrayrefs of posterior probabilities
93 # for membership in each class for every position; probabilities
94 # correspond to classes w0, w1, ... etc.
95 my @probs = $result->get_posteriors();
96
97 # find, say, positively selected sites!
98 if ($params->{w2} > 1) {
99 for (my $i = 0; $i < @probs ; $i++) {
100 if ($probs[$i]->[2] > 0.5) {
101 # assumes model M1: three w's, w0, w1 and w2 (positive selection)
102 printf "position %d: (%g prob, %g omega, %g mean w)\n",
103 $i, $probs[$i]->[2], $params->{w2}, $probs[$i]->[3];
104 }
105 }
106 } else { print "No positive selection found!\n"; }
107
108 =head1 DESCRIPTION
109
110 This module is used to parse the output from the PAML programs codeml,
111 baseml, basemlg, codemlsites and yn00. You can use the
112 Bio::Tools::Run::Phylo::PAML::* modules to actually run some of the
113 PAML programs, but this module is only useful to parse the output.
114
115 =head1 FEEDBACK
116
117 =head2 Mailing Lists
118
119 User feedback is an integral part of the evolution of this and other
120 Bioperl modules. Send your comments and suggestions preferably to
121 the Bioperl mailing list. Your participation is much appreciated.
122
123 bioperl-l@bioperl.org - General discussion
124 http://bioperl.org/MailList.shtml - About the mailing lists
125
126 =head2 Reporting Bugs
127
128 Report bugs to the Bioperl bug tracking system to help us keep track
129 of the bugs and their resolution. Bug reports can be submitted via
130 email or the web:
131
132 bioperl-bugs@bioperl.org
133 http://bugzilla.bioperl.org/
134
135 =head1 AUTHOR - Jason Stajich, Aaron Mackey
136
137 Email jason@bioperl.org
138 Email amackey@virginia.edu
139
140 =head1 TODO
141
142 check output from pre 1.12
143
144 =head1 APPENDIX
145
146 The rest of the documentation details each of the object methods.
147 Internal methods are usually preceded with a _
148
149 =cut
150
151
152 # Let the code begin...
153
154
155 package Bio::Tools::Phylo::PAML;
156 use vars qw(@ISA);
157 use strict;
158
159 # Object preamble - inherits from Bio::Root::Root
160
161 use Bio::Root::Root;
162 use Bio::AnalysisParserI;
163 use Bio::Root::IO;
164
165 @ISA = qw(Bio::Root::Root Bio::Root::IO Bio::AnalysisParserI);
166
167 # other objects used:
168 use IO::String;
169 use Bio::TreeIO;
170 use Bio::Tools::Phylo::PAML::Result;
171 use Bio::PrimarySeq;
172
173 =head2 new
174
175 Title : new
176 Usage : my $obj = new Bio::Tools::Phylo::PAML(%args);
177 Function: Builds a new Bio::Tools::Phylo::PAML object
178 Returns : Bio::Tools::Phylo::PAML
179 Args : Hash of options: -file, -fh, -dir
180 -file (or -fh) should contain the contents of the PAML
181 outfile; -dir is the (optional) name of the directory in
182 which the PAML program was run (and includes other
183 PAML-generated files from which we can try to gather data)
184
185 =cut
186
187 sub new {
188
189 my ($class, @args) = @_;
190
191 my $self = $class->SUPER::new(@args);
192 $self->_initialize_io(@args);
193
194 my ($dir) = $self->_rearrange([qw(DIR)], @args);
195 $self->{_dir} = $dir if defined $dir;
196
197 return $self;
198 }
199
200 =head2 Implement Bio::AnalysisParserI interface
201
202 =cut
203
204 =head2 next_result
205
206 Title : next_result
207 Usage : $result = $obj->next_result();
208 Function: Returns the next result available from the input, or
209 undef if there are no more results.
210 Example :
211 Returns : a Bio::Tools::Phylo::PAML::Result object
212 Args : none
213
214 =cut
215
216 sub next_result {
217
218 my ($self) = @_;
219
220 my %data;
221 # get the various codon and other sequence summary data, if necessary:
222 $self->_parse_summary
223 unless ($self->{'_summary'} && !$self->{'_summary'}->{'multidata'});
224
225 # OK, depending on seqtype and runmode now, one of a few things can happen:
226 my $seqtype = $self->{'_summary'}->{'seqtype'};
227 if ($seqtype eq 'CODONML' || $seqtype eq 'AAML') {
228 while ($_ = $self->_readline) {
229 if ($seqtype eq 'CODONML' &&
230 m/^pairwise comparison, codon frequencies:/o) {
231
232 # runmode = -2, CODONML
233 $self->_pushback($_);
234 %data = $self->_parse_PairwiseCodon;
235 last;
236
237 } elsif ($seqtype eq 'AAML' && m/^ML distances of aa seqs\.$/o) {
238
239 # runmode = -2, AAML
240 $self->throw( -class => 'Bio::Root::NotImplemented',
241 -text => "Pairwise AA not yet implemented!"
242 );
243
244 # $self->_pushback($_);
245 # %data = $self->_parse_PairwiseAA;
246 # last;
247 } elsif (m/^Model \d+: /o) {
248
249 # NSSitesBatch
250 $self->throw( -class => 'Bio::Root::NotImplemented',
251 -text => "NSsitesBatch not yet implemented!"
252 );
253
254 # $self->_pushback($_);
255 # %data = $self->_parse_NSsitesBatch;
256 # last;
257
258 } elsif (m/^TREE/) {
259
260 # runmode = 0
261 $self->_pushback($_);
262 %data = $self->_parse_Forestry;
263 last;
264
265 } elsif (m/Heuristic tree search by stepwise addition$/o) {
266
267 # runmode = 3
268 $self->throw( -class => 'Bio::Root::NotImplemented',
269 -text => "StepwiseAddition not yet implemented!"
270 );
271
272 # $self->_pushback($_);
273 # %data = $self->_parse_StepwiseAddition;
274 # last;
275
276 } elsif (m/Heuristic tree search by NNI perturbation$/o) {
277
278 # runmode = 4
279 $self->throw( -class => 'Bio::Root::NotImplemented',
280 -text => "NNI Perturbation not yet implemented!"
281 );
282
283 # $self->_pushback($_);
284 # %data = $self->_parse_Perturbation;
285 # last;
286
287 } elsif (m/^stage 0:/o) {
288
289 # runmode = (1 or 2)
290 $self->throw( -class => 'Bio::Root::NotImplemented',
291 -text => "StarDecomposition not yet implemented!"
292 );
293
294 # $self->_pushback($_);
295 # %data = $self->_parse_StarDecomposition;
296 # last;
297
298 }
299 }
300 } elsif ($seqtype eq 'BASEML') {
301 } elsif ($seqtype eq 'YN00') {
302 while ($_ = $self->_readline) {
303 if( m/^Estimation by the method/ ) {
304 $self->_pushback($_);
305 %data = $self->_parse_YN_Pairwise;
306 last;
307 }
308 }
309 }
310 if (%data) {
311 $data{'-version'} = $self->{'_summary'}->{'version'};
312 $data{'-seqs'} = $self->{'_summary'}->{'seqs'};
313 $data{'-patterns'} = $self->{'_summary'}->{'patterns'};
314 $data{'-ngmatrix'} = $self->{'_summary'}->{'ngmatrix'};
315 $data{'-codonpos'} = $self->{'_summary'}->{'codonposition'};
316 $data{'-codonfreq'} = $self->{'_summary'}->{'codonfreqs'};
317 return new Bio::Tools::Phylo::PAML::Result %data;
318 } else {
319 return undef;
320 }
321 }
322
323
324 sub _parse_summary {
325
326 my ($self) = @_;
327
328 # Depending on whether verbose > 0 or not, and whether the result
329 # set comes from a multi-data run, the first few lines could be
330 # various things; we're going to throw away any sequence data
331 # here, since we'll get it later anyways
332
333 # multidata ? : \n\nData set 1\n
334 # verbose ? : cleandata ? : \nBefore deleting alignment gaps. \d sites\n
335 # [ sequence printout ]
336 # \nAfter deleting gaps. \d sites\n"
337 # : [ sequence printout ]
338 # CODONML (in paml 3.12 February 2002) <<-- what we want to see!
339
340 my $SEQTYPES = qr( (?: (?: CODON | AA | BASE | CODON2AA ) ML ) | YN00 )x;
341 while ($_ = $self->_readline) {
342 if ( m/^($SEQTYPES) \s+ # seqtype: CODONML, AAML, BASEML, CODON2AAML, YN00, etc
343 (?: \(in \s+ ([^\)]+?) \s* \) \s* )? # version: "paml 3.12 February 2002"; not present < 3.1 or YN00
344 (\S+) \s* # tree filename
345 (?: (.+?) )? # model description (not there in YN00)
346 \s* $ # trim any trailing space
347 /ox
348 ) {
349
350 @{$self->{_summary}}{qw(seqtype version treefile model)} = ($1,
351 $2,
352 $3,
353 $4);
354 last;
355
356 } elsif (m/^Data set \d$/o) {
357 $self->{'_summary'} = {};
358 $self->{'_summary'}->{'multidata'}++;
359 }
360 }
361
362 unless (defined $self->{'_summary'}->{'seqtype'}) {
363 $self->throw( -class => 'Bio::Root::NotImplemented',
364 -text => 'Unknown format of PAML output');
365 }
366
367
368 my $seqtype = $self->{'_summary'}->{'seqtype'};
369 $self->debug( "seqtype is $seqtype\n");
370 if ($seqtype eq "CODONML") {
371
372 $self->_parse_inputparams(); # settings from the .ctl file that get printed
373 $self->_parse_patterns(); # codon patterns - not very interesting
374 $self->_parse_seqs(); # the sequences data used for analysis
375 $self->_parse_codoncts(); # counts and distributions of codon/nt usage
376 $self->_parse_codon_freqs(); # codon frequencies
377 $self->_parse_distmat(); # NG distance matrices
378
379
380 } elsif ($seqtype eq "AAML") {
381 $self->throw( -class => 'Bio::Root::NotImplemented',
382 -text => 'AAML parsing not yet implemented!');
383 } elsif ($seqtype eq "CODON2AAML") {
384 $self->throw( -class => 'Bio::Root::NotImplemented',
385 -text => 'CODON2AAML parsing not yet implemented!');
386 } elsif ($seqtype eq "BASEML") {
387 $self->throw( -class => 'Bio::Root::NotImplemented',
388 -text => 'BASEML parsing not yet implemented!');
389 } elsif ($seqtype eq "YN00") {
390 $self->_parse_codon_freqs();
391 $self->_parse_codoncts();
392 $self->_parse_distmat(); # NG distance matrices
393
394 } else {
395 $self->throw( -class => 'Bio::Root::NotImplemented',
396 -text => 'Unknown seqtype, not yet implemented!',
397 -value => $seqtype
398 );
399 }
400
401 }
402
403
404 sub _parse_inputparams {
405 my ($self) = @_;
406
407 }
408
409 sub _parse_codon_freqs {
410 my ($self) = @_;
411 my ($okay,$done) = (0,0);
412 while( defined($_ = $self->_readline ) ) {
413 if( /^Nei/ ) { $self->_pushback($_); last }
414 last if( $done);
415 next if ( /^\s+/);
416 next unless($okay || /^Codon position x base \(3x4\) table\, overall/ );
417 $okay = 1;
418 if( s/^position\s+(\d+):\s+// ) {
419 my $pos = $1;
420 s/\s+$//;
421 my @bases = split;
422 foreach my $str ( @bases ) {
423 my ( $base,$freq) = split(/:/,$str,2);
424 $self->{'_summary'}->{'codonposition'}->[$pos-1]->{$base} = $freq;
425 }
426 $done = 1 if $pos == 3;
427 }
428 }
429 $done = 0;
430 while( defined( $_ = $self->_readline) ) {
431 if( /^Nei\s\&\sGojobori/ ) { $self->_pushback($_); last }
432 last if ( $done );
433 if( /^Codon frequencies under model, for use in evolver:/ ){
434 while( defined( $_ = $self->_readline) ) {
435 last if( /^\s+$/ );
436 s/^\s+//;
437 s/\s+$//;
438 push @{$self->{'_summary'}->{'codonfreqs'}},[split];
439 }
440 $done = 1;
441 }
442 }
443 }
444
445 sub _parse_patterns {
446 my ($self) = @_;
447 my ($patternct,@patterns,$ns,$ls);
448 while( defined($_ = $self->_readline) ) {
449 if( $patternct ) {
450 # last unless ( @patterns == $patternct );
451 last if( /^\s+$/ );
452 s/^\s+//;
453 push @patterns, split;
454 } elsif( /^ns\s+\=\s*(\d+)\s+ls\s+\=\s*(\d+)/ ) {
455 ($ns,$ls) = ($1,$2);
456 } elsif( /^\# site patterns \=\s*(\d+)/ ) {
457 $patternct = $1;
458 } else {
459 # $self->debug("Unknown line: $_");
460 }
461 }
462 $self->{'_summary'}->{'patterns'} = { -patterns => \@patterns,
463 -ns => $ns,
464 -ls => $ls};
465 }
466
467 sub _parse_seqs {
468
469 # this should in fact be packed into a Bio::SimpleAlign object instead of
470 # an array but we'll stay with this for now
471 my ($self) = @_;
472 my (@firstseq,@seqs);
473 while( defined ($_ = $self->_readline) ) {
474 last if( /^\s+$/ && @seqs > 0 );
475 next if ( /^\s+$/ );
476 next if( /^\d+\s+$/ );
477
478 my ($name,$seqstr) = split(/\s+/,$_,2);
479 $seqstr =~ s/\s+//g; # remove whitespace
480 unless( @firstseq) {
481 @firstseq = split(//,$seqstr);
482 push @seqs, new Bio::PrimarySeq(-id => $name,
483 -seq => $seqstr);
484 } else {
485
486 my $i = 0;
487 my $v;
488 while(($v = index($seqstr,'.',$i)) >= $i ) {
489 # replace the '.' with the correct seq from the
490 substr($seqstr,$v,1,$firstseq[$v]);
491 $i = $v;
492 }
493 $self->debug( "adding seq $seqstr\n");
494 push @seqs, new Bio::PrimarySeq(-id => $name,
495 -seq => $seqstr);
496 }
497 }
498 $self->{'_summary'}->{'seqs'} = \@seqs;
499 }
500
501 sub _parse_codoncts { }
502
503 sub _parse_distmat {
504 my ($self) = @_;
505 my @results;
506 while( defined ($_ = $self->_readline) ) {
507 next if/^\s+$/;
508 last;
509 }
510 return unless (/^Nei\s*\&\s*Gojobori/);
511 # skip the next 3 lines
512 if( $self->{'_summary'}->{'seqtype'} eq 'CODONML' ) {
513 $self->_readline;
514 $self->_readline;
515 $self->_readline;
516 }
517 my $seqct = 0;
518 while( defined ($_ = $self->_readline ) ) {
519 last if( /^\s+$/ && exists $self->{'_summary'}->{'ngmatrix'} );
520 next if( /^\s+$/ );
521 chomp;
522 my ($seq,$rest) = split(/\s+/,$_,2);
523 my $j = 0;
524 while( $rest =~
525 /(\-?\d+(\.\d+)?)\s*\(\-?(\d+(\.\d+)?)\s+(\-?\d+(\.\d+)?)\)/g ) {
526 $self->{'_summary'}->{'ngmatrix'}->[$j++]->[$seqct] =
527 { 'omega' => $1,
528 'dN' => $3,
529 'dS' => $5 };
530 }
531 $seqct++;
532 }
533 }
534
535 sub _parse_PairwiseCodon {
536 my ($self) = @_;
537 my @result;
538 my ($a,$b,$log,$model);
539 while( defined( $_ = $self->_readline) ) {
540 if( /^pairwise comparison, codon frequencies\:\s*(\S+)\./) {
541 $model = $1;
542 } elsif( /^(\d+)\s+\((\S+)\)\s+\.\.\.\s+(\d+)\s+\((\S+)\)/ ) {
543 ($a,$b) = ($1,$3);
544 } elsif( /^lnL\s+\=\s*(\-?\d+(\.\d+)?)/ ) {
545 $log = $1;
546 } elsif( m/^t\=\s*(\d+(\.\d+)?)\s+
547 S\=\s*(\d+(\.\d+)?)\s+
548 N\=\s*(\d+(\.\d+)?)\s+
549 dN\/dS\=\s*(\d+(\.\d+)?)\s+
550 dN\=\s*(\d+(\.\d+)?)\s+
551 dS\=\s*(\d+(\.\d+)?)/ox ) {
552 $result[$b-1]->[$a-1] = {
553 'lnL' => $log,
554 't' => $1,
555 'S' => $3,
556 'N' => $5,
557 'omega' => $7,
558 'dN' => $9,
559 'dS' => $11 };
560 } elsif( /^\s+$/ ) {
561 next;
562 } elsif( /^\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)/ ) {
563 } else {
564 $self->debug( "unknown line: $_");
565 }
566 }
567 return ( -mlmatrix => \@result);
568 }
569
570 sub _parse_YN_Pairwise {
571 my ($self) = @_;
572 my @result;
573 while( defined( $_ = $self->_readline) ) {
574 last if( /^seq\.\s+seq\./);
575 }
576 while( defined( $_ = $self->_readline) ) {
577 if( m/^\s+(\d+)\s+ # seq #
578 (\d+)\s+ # seq #
579 (\d+(\.\d+))\s+ # S
580 (\d+(\.\d+))\s+ # N
581 (\d+(\.\d+))\s+ # t
582 (\d+(\.\d+))\s+ # kappa
583 (\d+(\.\d+))\s+ # omega
584 (\d+(\.\d+))\s+ # dN
585 \+\-\s+
586 (\d+(\.\d+))\s+ # dN SE
587 (\d+(\.\d+))\s+ # dS
588 \+\-\s+
589 (\d+(\.\d+))\s+ # dS SE
590 /ox
591 ) {
592
593 $result[$2-1]->[$1-1] = {
594 'S' => $3,
595 'N' => $5,
596 't' => $7,
597 'kappa' => $9,
598 'omega' => $11,
599 'dN' => $13,
600 'dN_SE' => $15,
601 'dS' => $17,
602 'dS_SE' => $19,
603 };
604 } elsif( /^\s+$/ ) {
605 next;
606 }
607 }
608 return ( -mlmatrix => \@result);
609 }
610
611 sub _parse_Forestry {
612
613 my ($self) = @_;
614 my %data = (-trees => []);
615
616
617 return %data
618 };
619
620 # parse the mlc file
621
622 sub _parse_mlc {
623 my ($self) = @_;
624 my %data;
625 while( defined( $_ = $self->_readline) ) {
626 $self->debug( "mlc parse: $_");
627 # Aaron this is where the parsing should begin
628
629 # I'll do the Tree objects if you like - I'd do it by building
630 # an IO::String for the the tree data or does it make more
631 # sense to parse this out of a collection of files?
632 if( /^TREE/ ) {
633 # ...
634 while( defined($_ = $self->_readline) ) {
635 if( /^\(/) {
636 my $treestr = new IO::String($_);
637 my $treeio = new Bio::TreeIO(-fh => $treestr,
638 -format => 'newick');
639 # this is very tenative here!!
640 push @{$self->{'_trees'}}, $treeio->next_tree;
641 }
642 }
643 }
644 }
645 }
646
647 1;