diff bin/extract_nexus_translations.py @ 0:d67268158946 draft

planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
author bcclaywell
date Mon, 12 Oct 2015 17:43:33 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/extract_nexus_translations.py	Mon Oct 12 17:43:33 2015 -0400
@@ -0,0 +1,69 @@
+#!/usr/bin/env python
+
+import argparse
+import csv
+import re
+
+
+begin_trees_re = re.compile('^Begin trees\;')
+trans_token_re = re.compile('^\s*Translate')
+trans_line_re = re.compile('^\s*(\d+)\s*([^\s\,]+)\,')
+end_trans_re = re.compile("^\;$")
+
+
+def translation_line_extraction_reducer(lines_and_state, next_line):
+    "The reducer guts behind the extract_translation_lines state machine"
+    lines, state = lines_and_state
+    if state == "init":
+        if begin_trees_re.match(next_line):
+            return (lines, "begin_trees")
+        else:
+            return lines_and_state
+    elif state == "begin_trees":
+        if trans_token_re.match(next_line):
+            return (lines, "in_trans")
+        else:
+            raise Exception, "Translation line should immediately procede Begin Trees"
+    elif state == "in_trans":
+        m = trans_line_re.match(next_line)
+        if m:
+            return (lines + [m.groups()], "in_trans")
+        else:
+            return (lines, "end_of_trans")
+    elif state == "end_of_trans":
+        if end_trans_re.match(next_line):
+            return (lines, "finished")
+        else:
+            raise Exception, "Next line should have been an end of trans line"
+    elif state == "finished":
+        return lines_and_state
+    else:
+        raise Exception, "Unknown state: %s" % state
+
+
+def extract_translation_lines(line_reader):
+    """Basically a little state machine that marches through the nexus file, figures out when we're in the
+    translation section, extracts those lines as 2-tuples of (int, seqname)."""
+    lines, _ = reduce(translation_line_extraction_reducer, line_reader, ([], "init"))
+    return lines
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('in_trees', type=argparse.FileType('r'))
+    parser.add_argument('out_translation', type=argparse.FileType('w'))
+    return parser.parse_args()
+
+
+def main(args):
+    writer = csv.writer(args.out_translation)
+    writer.writerow(['id', 'sequence'])
+    writer.writerows(extract_translation_lines(args.in_trees))
+    args.in_trees.close()
+    args.out_translation.close()
+
+
+if __name__ == '__main__':
+    main(get_args())
+
+