# HG changeset patch
# User stevecassidy
# Date 1479326424 18000
# Node ID d4c27fdc928bdf857c8737a75f66e82dcb93384e
planemo upload commit 7b5663b41b2dc11f9e375b8f386bc31855800bcf-dirty
diff -r 000000000000 -r d4c27fdc928b __init__.py
diff -r 000000000000 -r d4c27fdc928b __init__.pyc
Binary file __init__.pyc has changed
diff -r 000000000000 -r d4c27fdc928b items_to_bpf.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/items_to_bpf.py Wed Nov 16 15:00:24 2016 -0500
@@ -0,0 +1,145 @@
+from __future__ import print_function
+import json
+import argparse
+import pyalveo
+import sys
+import os
+from fnmatch import fnmatch
+import csv
+import re
+
+
+def parser():
+ parser = argparse.ArgumentParser(description="Generate BPF Orthographic Transcription from Item List")
+ parser.add_argument('--item_list', required=True, action="store", type=str, help="File containing list of item URLs")
+ parser.add_argument('--lexicon', required=True, action="store", type=str, help="File containing lexicon (tsv)")
+ parser.add_argument('--output_path', required=True, action="store", type=str, help="Path to output file")
+ return parser.parse_args()
+
+def read_item_list(filename):
+ """Read an item list from a file
+ which should be a tabular formatted file
+ with one column header ItemURL.
+ Return an instance of ItemGroup"""
+
+ with open(filename) as fd:
+ csvreader = csv.DictReader(fd, dialect='excel-tab')
+ print("CSV", csvreader.fieldnames)
+ if 'ItemURL' not in csvreader.fieldnames:
+ return None
+ if 'Prompt' not in csvreader.fieldnames:
+ return None
+ itemurls = []
+ for row in csvreader:
+ itemurls.append((row['Prompt'], row['ItemURL']))
+
+ return itemurls
+
+# this file name pattern allows galaxy to discover the dataset designation and type
+FNPAT = "%(designation)s#%(ext)s"
+
+def galaxy_name(itemurl, ext):
+ """Construct a filename suitable for dataset discovery
+ by Galaxy.
+
+ @type itemurl: C{String}
+ @param itemurl: the item URL from Alveo
+
+ @type ext: C{String}
+ @param ext: the datatype extension for the resulting file
+ """
+
+ itemname = itemurl.split('/')[-1]
+ fname = FNPAT % {'designation': itemname, 'ext': ext}
+
+ return fname
+
+
+def build_bpf(ortho_trans, lexicon):
+ """ Given an orthographic transcript, generate a BPF-format phonetic
+ transcription for passing to MAUS, using the specified lexicon.
+
+ @type ortho_trans: C{String}
+ @param ortho_trans: the (space-separated) orthographic transcript
+ @type lex: C{Dict}
+ @param lex: the lexicon to use to translate words to phonetic sybmols
+
+ @rtype: C{String}
+ @returns: the BPF-formatted transcript
+
+ @raises IncompleteLexiconError: if there is a word appearing in the
+ orthographic transcript that is not covered by the lexicon
+
+ """
+
+ spl = re.compile(r'[\s.,!?"\-]')
+ words = [w.lower() for w in spl.split(ortho_trans) if w]
+ ort = []
+ kan = []
+
+ for n, word in enumerate(words):
+ try:
+ ort.append("ORT: %d %s" % (n, word))
+ kan.append("KAN: %d %s" % (n, lexicon[word]))
+ except KeyError:
+ raise IncompleteLexiconError("'" + word +
+ "' not present in lexicon")
+
+ nl = u"\n"
+ return nl.join(ort) + nl + nl.join(kan)
+
+
+def load_lexicon(lexiconfile):
+ """ Load the given file as a lexicon dictionary.
+ Should be a tsv file with two columns, first column
+ is orthography, second is phonetic transcription.
+
+ @type lexiconfile: C{String}
+ @param lexiconfile: the filename of the lexicon file
+
+ @rtype: C{Dict}
+ @returns: the lexicon, as a dictionary with orthographic entries as keys
+
+ """
+ lex = {}
+
+ with open(lexiconfile) as f:
+ for line in f:
+ orth, pron = line.split('\t')
+ lex[orth] = pron
+
+ return lex
+
+
+def list_to_bpf(item_list, lexicon, output_path):
+ """
+ Generate a BPF file for each item in this item list.
+ Items consist of (prompt, ItemURL). URL is used to generate output
+ file name.
+
+ :type documents: list of pyalveo.Document
+ :param documents: Documents to download
+
+ :type output_path: String
+ :param output_path: directory to download to the documents to
+ """
+ if not os.path.exists(output_path):
+ os.makedirs(output_path)
+
+ for prompt, itemURL in item_list:
+
+ fname = galaxy_name(itemURL, 'par')
+ bpftext = build_bpf(prompt, lexicon)
+ with open(os.path.join(output_path, fname), 'w') as out:
+ out.write(bpftext)
+
+
+def main():
+ args = parser()
+ item_list = read_item_list(args.item_list)
+ lexicon = load_lexicon(args.lexicon)
+ list_to_bpf(item_list, lexicon, args.output_path)
+
+
+if __name__ == '__main__':
+ main()
diff -r 000000000000 -r d4c27fdc928b items_to_bpf.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/items_to_bpf.xml Wed Nov 16 15:00:24 2016 -0500
@@ -0,0 +1,25 @@
+
+
+
+
+ items_to_bpf.py --lexicon ${lexicon} --item_list ${item_list} --output_path BPFData
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Generate a collection of BPF annotation files from an item list
+ that contains the item orthography, these are then suitable for input
+ to the MAUS forced alignment system.
+
diff -r 000000000000 -r d4c27fdc928b maus.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/maus.xml Wed Nov 16 15:00:24 2016 -0500
@@ -0,0 +1,96 @@
+
+ from BPF transcription
+
+
+ stevecassidy/maus
+
+
+
+ ln -s "${signal}" input.wav &&
+ /home/maus/maus OUTFORMAT=mau LANGUAGE=$language
+ BPF=$bpf INSKANTEXTGRID=$inskantextgrid INSORTTEXTGRID=$insorttextgrid
+ MODUS=$modus MAUSSHIFT=$mausshift MINPAUSLEN=$minpauslen WEIGHT=$weight
+ INSPROB=$insprob NOINITIALFINALSILENCE=$noinitialfinalsilence OUTSYMBOL=$outsymbol
+ OUT=$output SIGNAL=input.wav
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Run the MAUS forced aligner on an audio file given a transcription in BPF format.
+ Output is a TextGrid file containing the phonemic annotation with start/end times for each
+ segment.
+
+
+
+ @inproceedings{Strunk2011,
+ booktitle = {Proceedings of the Ninth International Conference on Language Resources and Evaluation},
+ publisher = {European Language Resources Association (ELRA)},
+ isbn = {978-2-9517408-8-4},
+ keywords = {forced alignment,language documentation corpora,word times},
+ title = {{Untrained Forced Alignment of Transcriptions and Audio for Language Documentation Corpora using WebMAUS}},
+ url = {http://www.bas.uni-muenchen.de/forschung/publikationen/Schiel-LREC2014.pdf},
+ year = {2014}
+ }
+
+
+