diff bin/ess.py @ 0:d67268158946 draft

planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
author bcclaywell
date Mon, 12 Oct 2015 17:43:33 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/ess.py	Mon Oct 12 17:43:33 2015 -0400
@@ -0,0 +1,122 @@
+#!/usr/bin/env python
+
+import csv
+import numpy
+from biopy import bayesianStats as bs
+import argparse
+import re
+
+
+commented_regex = re.compile("^\s*\#.*")
+
+doc_template = """
+<html>
+<body>
+    <h1>Effective Sample Size Results</h1>
+    <p>Recommendation: {status}</p>
+
+    <h2>Explanation</h2>
+    <p>
+        Ideally, you want all these values to be greater than or equal to 200.
+        If any of them aren't, it's probably a good idea to perform a Resume run with BEAST.
+        On average, running 3x as long will increase your ESS by 3.
+        This should help give you some sense of how much longer you should run BEAST.
+    </p>
+    <p>
+        Note that while ESS statistics are valuable for getting a rough sense of whether to run or not, it is
+        <em>always</em> recommended that you manually look at your logfile traces using
+        <a href="http://tree.bio.ed.ac.uk/software/tracer/">Tracer</a>, or another trace analysis software before
+        accepting your analysis as "final".
+    </p>
+
+    <h2>Detailed Results</h2>
+    <table>
+        <tr><th>Parameter</th><th>ESS</th></tr>
+        {table_content}
+    </table>
+</body>
+</html>
+"""
+
+tr_template = "<tr><td>{parameter}</td><td {style}>{ess}</td></tr>"
+
+
+def format_tr(result, threshold=200):
+    "Takes a result [param, ess] pair and makes the appropriate tr html"
+    param, ess = result
+    passes = ess >= threshold
+    style = "" if passes else 'style="color: red;"'
+    return tr_template.format(parameter=param, ess=ess, style=style)
+
+
+def dict_reader_to_cols(dict_reader):
+    d = dict()
+    for row in dict_reader:
+        for k, v in row.iteritems():
+            try:
+                v = float(v)
+                try:
+                    d[k].append(v)
+                except KeyError:
+                    d[k] = [v]
+            except ValueError:
+                if k == "" and v == "":
+                    # Don't worry about this, there is a tailing \t on each line
+                    pass
+                else:
+                    # Otherwise... something weird is happening
+                    print "Could not cast to float for kv pair:", k, v
+    return d
+
+
+def table_contents(results, threshold=200):
+    trs = [format_tr(result, threshold) for result in sorted(results)]
+    return "\n".join(trs)
+
+
+def html_contents(results, threshold):
+    table_content = table_contents(results, threshold)
+    within_tol = [x[1] > threshold for x in results]
+    ess_good = all(within_tol)
+    status = "Good to go" if all(within_tol) else "Should run longer"
+    return doc_template.format(status=status, table_content=table_content)
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('logfile', type=argparse.FileType('r'))
+    parser.add_argument('ess_out', type=argparse.FileType('w'))
+    parser.add_argument('-t', '--threshold', type=float, default=200,
+            help="""For formatting html, what's the minimal ESS score to be considered high enough?
+            (default: 200""")
+    parser.add_argument('--html-out', action="store_true",
+        help="Default is CSV output")
+    return parser.parse_args()
+
+
+def main(args):
+    non_comment_lines = (row for row in args.logfile if not commented_regex.match(row))
+    reader = csv.DictReader(non_comment_lines, delimiter="\t")
+    data_columns = dict_reader_to_cols(reader)
+    results = []
+    for colname, data in data_columns.iteritems():
+        if colname != "Sample":
+            results.append([colname, bs.effectiveSampleSize(data)])
+
+    if args.html_out:
+        html_content = html_contents(results, args.threshold)
+        args.ess_out.write(html_content)
+    else:
+        writer = csv.writer(args.ess_out)
+        writer.writerow(["statistic", "ess"])
+        for result in results:
+            writer.writerow(result)
+
+    args.logfile.close()
+    args.ess_out.close()
+
+
+if __name__ == '__main__':
+    main(get_args())
+
+