diff query_textgrids.py @ 0:e9c8e6204679 draft

planemo upload commit 0fca33c3b7285bd31f6c7380393d08bbdad4e4d6
author stevecassidy
date Mon, 15 Aug 2016 23:47:30 -0400
parents
children be28ced5c4e0
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/query_textgrids.py	Mon Aug 15 23:47:30 2016 -0400
@@ -0,0 +1,46 @@
+from __future__ import print_function
+import json
+import argparse
+import pyalveo
+import sys
+import os
+import tgt
+
+API_URL = 'https://app.alveo.edu.au' # TODO: export constants to a separate module
+
+def parser():
+    """Build the command-line parser and return the parsed arguments; all five options are required strings."""
+    cli = argparse.ArgumentParser(description="Find matching segments in a TextGrid")
+    options = (('--textgrid', "TextGrid files (comma separated)"), ('--identifier', "Dataset identifiers (comma separated)"),
+               ('--tier', "TextGrid Tier to search"), ('--regex', "Regular expression matching segments"), ('--output_path', "Path to output file"))
+    for flag, description in options:
+        cli.add_argument(flag, required=True, action="store", type=str, help=description)
+    return cli.parse_args()
+
+def main():
+    """Write a tab-separated table of the annotations on tier --tier whose text matches --regex.
+
+    Files in --textgrid are paired by position with --identifier; exits with an error if the counts differ.
+    """
+    args = parser()
+    tgfiles = args.textgrid.split(',')
+    identifiers = args.identifier.split(',')
+    # Explicit check instead of `assert`: assertions are stripped under `python -O` and zip() would then truncate.
+    if len(tgfiles) != len(identifiers):
+        sys.exit("number of textgrids must match number of identifiers")
+
+    rows = []
+    for tgfile, identifier in zip(tgfiles, identifiers):
+        tier = tgt.read_textgrid(tgfile).get_tier_by_name(args.tier)
+        for m in tier.get_annotations_with_text(args.regex, regex=True):
+            rows.append((str(m.start_time), str(m.end_time), str(m.duration()), m.text, identifier))
+
+    with open(args.output_path, 'w') as out:
+        out.write("start\tend\tduration\tlabel\tidentifier\n")
+        for row in rows:
+            out.write('\t'.join(row) + '\n')
+
+
+
+if __name__ == '__main__':
+    main()