comparison bin/extract_nexus_translations.py @ 0:d67268158946 draft

planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
author bcclaywell
date Mon, 12 Oct 2015 17:43:33 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:d67268158946
1 #!/usr/bin/env python
2
3 import argparse
4 import csv
5 import re
6
7
# Patterns for the pieces of a NEXUS trees block that carry the translation
# table. Raw strings keep the backslashes for the regex engine instead of
# relying on Python passing unknown string escapes through unchanged.
begin_trees_re = re.compile(r'^Begin trees\;')
trans_token_re = re.compile(r'^\s*Translate')
trans_line_re = re.compile(r'^\s*(\d+)\s*([^\s\,]+)\,')
end_trans_re = re.compile(r"^\;$")
# The last entry of a translate table has no trailing comma (the table is
# closed by a ';' line instead), so trans_line_re never matches it.
_last_trans_line_re = re.compile(r'^\s*(\d+)\s+([^\s\,]+)\s*$')


def translation_line_extraction_reducer(lines_and_state, next_line):
    """Advance the translation-extraction state machine by one input line.

    Designed to be folded over the lines of a nexus file with ``reduce``.

    States, in order: ``init`` (looking for ``Begin trees;``),
    ``begin_trees`` (the next line must be the ``Translate`` token),
    ``in_trans`` (collecting ``<id> <name>,`` entries), ``end_of_trans``
    (the next line must be a bare ``;``) and ``finished`` (all remaining
    lines are ignored).

    Args:
        lines_and_state: 2-tuple ``(lines, state)`` where ``lines`` is the
            list of ``(id, name)`` string tuples collected so far and
            ``state`` is one of the state names above.
        next_line: the next line of the file.

    Returns:
        A new ``(lines, state)`` tuple. The incoming list is never mutated.

    Raises:
        Exception: if the line after ``Begin trees;`` is not the
            ``Translate`` token, if the line after the last entry is not a
            bare ``;``, or if ``state`` is not a known state name.
    """
    lines, state = lines_and_state
    if state == "init":
        if begin_trees_re.match(next_line):
            return (lines, "begin_trees")
        return lines_and_state
    elif state == "begin_trees":
        if trans_token_re.match(next_line):
            return (lines, "in_trans")
        raise Exception("Translation line should immediately procede Begin Trees")
    elif state == "in_trans":
        m = trans_line_re.match(next_line)
        if m:
            return (lines + [m.groups()], "in_trans")
        # First line without a trailing comma ends the table. If it is the
        # final comma-less entry, keep it rather than silently dropping it.
        last = _last_trans_line_re.match(next_line)
        if last:
            return (lines + [last.groups()], "end_of_trans")
        return (lines, "end_of_trans")
    elif state == "end_of_trans":
        if end_trans_re.match(next_line):
            return (lines, "finished")
        raise Exception("Next line should have been an end of trans line")
    elif state == "finished":
        return lines_and_state
    else:
        raise Exception("Unknown state: %s" % state)
42
43
def extract_translation_lines(line_reader):
    """Extract the translation table from the lines of a nexus file.

    Folds ``translation_line_extraction_reducer`` over ``line_reader``,
    starting from an empty list in the ``init`` state, and returns the
    collected entries.

    Args:
        line_reader: any iterable of lines (e.g. an open file).

    Returns:
        A list of ``(id, seqname)`` tuples. Both elements are strings taken
        from the regex match groups; the id is not converted to ``int``.
    """
    # `reduce` is a builtin only on Python 2; functools provides it on both
    # Python 2.6+ and Python 3.
    from functools import reduce
    lines, _ = reduce(translation_line_extraction_reducer, line_reader, ([], "init"))
    return lines
49
50
def get_args():
    """Parse the command line.

    Two positional arguments are expected, in this order: ``in_trees``
    (opened for reading) and ``out_translation`` (opened for writing).
    Returns the parsed namespace holding both open file objects.
    """
    arg_parser = argparse.ArgumentParser()
    for dest, mode in (('in_trees', 'r'), ('out_translation', 'w')):
        arg_parser.add_argument(dest, type=argparse.FileType(mode))
    return arg_parser.parse_args()
56
57
def main(args):
    """Write the nexus translation table to a CSV file.

    Emits a header row ``id,sequence`` followed by one row per entry
    returned by ``extract_translation_lines(args.in_trees)``.

    Args:
        args: namespace with open file objects ``in_trees`` (readable) and
            ``out_translation`` (writable). Both are closed before
            returning, including when extraction or writing raises.
    """
    try:
        writer = csv.writer(args.out_translation)
        writer.writerow(['id', 'sequence'])
        writer.writerows(extract_translation_lines(args.in_trees))
    finally:
        # Close both handles even on a malformed input file; nested so a
        # failure closing the input cannot leak the output handle.
        try:
            args.in_trees.close()
        finally:
            args.out_translation.close()
64
65
# Script entry point: parse the command line and run the extraction only when
# this file is executed directly, not when it is imported as a module.
if __name__ == '__main__':
    main(get_args())
68
69