annotate bin/extract_nexus_translations.py @ 2:7eaf6f9abd28 draft default tip

planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b-dirty
author bcclaywell
date Mon, 12 Oct 2015 17:57:38 -0400
parents d67268158946
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
1 #!/usr/bin/env python
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
2
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
3 import argparse
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
4 import csv
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
5 import re
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
6
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
7
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
8 begin_trees_re = re.compile('^Begin trees\;')
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
9 trans_token_re = re.compile('^\s*Translate')
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
10 trans_line_re = re.compile('^\s*(\d+)\s*([^\s\,]+)\,')
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
11 end_trans_re = re.compile("^\;$")
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
12
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
13
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
14 def translation_line_extraction_reducer(lines_and_state, next_line):
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
15 "The reducer guts behind the extract_translation_lines state machine"
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
16 lines, state = lines_and_state
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
17 if state == "init":
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
18 if begin_trees_re.match(next_line):
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
19 return (lines, "begin_trees")
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
20 else:
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
21 return lines_and_state
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
22 elif state == "begin_trees":
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
23 if trans_token_re.match(next_line):
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
24 return (lines, "in_trans")
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
25 else:
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
26 raise Exception, "Translation line should immediately procede Begin Trees"
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
27 elif state == "in_trans":
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
28 m = trans_line_re.match(next_line)
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
29 if m:
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
30 return (lines + [m.groups()], "in_trans")
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
31 else:
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
32 return (lines, "end_of_trans")
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
33 elif state == "end_of_trans":
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
34 if end_trans_re.match(next_line):
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
35 return (lines, "finished")
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
36 else:
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
37 raise Exception, "Next line should have been an end of trans line"
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
38 elif state == "finished":
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
39 return lines_and_state
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
40 else:
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
41 raise Exception, "Unknown state: %s" % state
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
42
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
43
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
44 def extract_translation_lines(line_reader):
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
45 """Basically a little state machine that marches through the nexus file, figures out when we're in the
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
46 translation section, extracts those lines as 2-tuples of (int, seqname)."""
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
47 lines, _ = reduce(translation_line_extraction_reducer, line_reader, ([], "init"))
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
48 return lines
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
49
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
50
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
51 def get_args():
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
52 parser = argparse.ArgumentParser()
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
53 parser.add_argument('in_trees', type=argparse.FileType('r'))
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
54 parser.add_argument('out_translation', type=argparse.FileType('w'))
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
55 return parser.parse_args()
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
56
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
57
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
58 def main(args):
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
59 writer = csv.writer(args.out_translation)
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
60 writer.writerow(['id', 'sequence'])
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
61 writer.writerows(extract_translation_lines(args.in_trees))
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
62 args.in_trees.close()
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
63 args.out_translation.close()
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
64
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
65
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
66 if __name__ == '__main__':
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
67 main(get_args())
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
68
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
69