Mercurial > repos > bcclaywell > argo_navis
diff bin/ess.py @ 0:d67268158946 draft
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
author | bcclaywell |
---|---|
date | Mon, 12 Oct 2015 17:43:33 -0400 |
parents | |
children |
line wrap: on
line diff
#!/usr/bin/env python
"""Compute effective sample size (ESS) statistics from a BEAST trace logfile.

Reads a tab-delimited BEAST logfile (skipping '#'-comment lines), computes the
ESS of every numeric column except "Sample" using biopy's bayesianStats, and
writes the results either as CSV (default) or as a small HTML report.
"""

import argparse
import csv
import re


# Matches logfile comment lines: optional leading whitespace, then '#'.
commented_regex = re.compile(r"^\s*#.*")

doc_template = """
<html>
<body>
  <h1>Effective Sample Size Results</h1>
  <p>Recommendation: {status}</p>

  <h2>Explanation</h2>
  <p>
    Ideally, you want all these values to be greater than or equal to 200.
    If any of them aren't, it's probably a good idea to perform a Resume run with BEAST.
    On average, running 3x as long will increase your ESS by a factor of 3.
    This should help give you some sense of how much longer you should run BEAST.
  </p>
  <p>
    Note that while ESS statistics are valuable for getting a rough sense of whether to run or not, it is
    <em>always</em> recommended that you manually look at your logfile traces using
    <a href="http://tree.bio.ed.ac.uk/software/tracer/">Tracer</a>, or another trace analysis software before
    accepting your analysis as "final".
  </p>

  <h2>Detailed Results</h2>
  <table>
    <tr><th>Parameter</th><th>ESS</th></tr>
    {table_content}
  </table>
</body>
</html>
"""

tr_template = "<tr><td>{parameter}</td><td {style}>{ess}</td></tr>"


def format_tr(result, threshold=200):
    """Take a result [param, ess] pair and make the appropriate tr html.

    ESS values below ``threshold`` are highlighted in red.
    """
    param, ess = result
    passes = ess >= threshold
    style = "" if passes else 'style="color: red;"'
    return tr_template.format(parameter=param, ess=ess, style=style)


def dict_reader_to_cols(dict_reader):
    """Transpose a csv.DictReader into a {column_name: [float, ...]} dict.

    Values that cannot be cast to float are skipped with a warning, except
    for the empty trailing column produced by the tab at the end of each
    logfile line, which is silently ignored.
    """
    d = dict()
    for row in dict_reader:
        for k, v in row.items():
            try:
                # Cast first so a failed cast never creates the column key.
                fv = float(v)
                d.setdefault(k, []).append(fv)
            except ValueError:
                if k == "" and v == "":
                    # Don't worry about this, there is a trailing \t on each line
                    pass
                else:
                    # Otherwise... something weird is happening
                    print("Could not cast to float for kv pair:", k, v)
    return d


def table_contents(results, threshold=200):
    """Render sorted [param, ess] results as newline-joined HTML table rows."""
    trs = [format_tr(result, threshold) for result in sorted(results)]
    return "\n".join(trs)


def html_contents(results, threshold):
    """Build the full HTML report (recommendation + table) for the results."""
    table_content = table_contents(results, threshold)
    # Use the same >= comparison as format_tr so the overall recommendation
    # always agrees with the per-row highlighting.
    ess_good = all(ess >= threshold for _, ess in results)
    status = "Good to go" if ess_good else "Should run longer"
    return doc_template.format(status=status, table_content=table_content)


def get_args():
    """Parse command-line arguments: logfile in, ESS results out."""
    parser = argparse.ArgumentParser()
    parser.add_argument('logfile', type=argparse.FileType('r'))
    parser.add_argument('ess_out', type=argparse.FileType('w'))
    parser.add_argument('-t', '--threshold', type=float, default=200,
                        help="""For formatting html, what's the minimal ESS score to be considered high enough?
                        (default: 200)""")
    parser.add_argument('--html-out', action="store_true",
                        help="Default is CSV output")
    return parser.parse_args()


def main(args):
    """Read the logfile, compute per-column ESS, and write CSV or HTML."""
    # Imported here so the rest of the module can be used and tested
    # without biopy installed.
    from biopy import bayesianStats as bs

    non_comment_lines = (row for row in args.logfile if not commented_regex.match(row))
    reader = csv.DictReader(non_comment_lines, delimiter="\t")
    data_columns = dict_reader_to_cols(reader)
    # "Sample" is the MCMC iteration counter, not a sampled parameter.
    results = [[colname, bs.effectiveSampleSize(data)]
               for colname, data in data_columns.items()
               if colname != "Sample"]

    if args.html_out:
        args.ess_out.write(html_contents(results, args.threshold))
    else:
        writer = csv.writer(args.ess_out)
        writer.writerow(["statistic", "ess"])
        writer.writerows(results)

    args.logfile.close()
    args.ess_out.close()


if __name__ == '__main__':
    main(get_args())