Mercurial > repos > stevecassidy > maus
changeset 0:d4c27fdc928b draft
planemo upload commit 7b5663b41b2dc11f9e375b8f386bc31855800bcf-dirty
| author | stevecassidy |
|---|---|
| date | Wed, 16 Nov 2016 15:00:24 -0500 |
| parents | |
| children | 4162c1e2ad5f |
| files | __init__.py __init__.pyc items_to_bpf.py items_to_bpf.xml maus.xml |
| diffstat | 4 files changed, 266 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
"""items_to_bpf.py -- generate BPF orthographic transcriptions from an item list.

Reads a tab-separated item list (columns ``Prompt`` and ``ItemURL``) and a
two-column tab-separated lexicon, then writes one BPF file per item into an
output directory, using file names that Galaxy's dataset discovery can parse.
"""
from __future__ import print_function
import json
import argparse
import sys
import os
from fnmatch import fnmatch
import csv
import re

try:
    # Nothing in this script calls pyalveo; the import is kept so the name
    # stays available, but its absence must not stop the tool from running.
    import pyalveo
except ImportError:
    pyalveo = None


class IncompleteLexiconError(Exception):
    """Raised when a word in a transcript has no entry in the lexicon."""


def parser():
    """Parse the command line and return the populated argparse namespace."""
    parser = argparse.ArgumentParser(description="Generate BPF Orthographic Transcription from Item List")
    parser.add_argument('--item_list', required=True, action="store", type=str, help="File containing list of item URLs")
    parser.add_argument('--lexicon', required=True, action="store", type=str, help="File containing lexicon (tsv)")
    parser.add_argument('--output_path', required=True, action="store", type=str, help="Path to output file")
    return parser.parse_args()


def read_item_list(filename):
    """Read an item list from a tab-separated file.

    The file must start with a header row containing at least the columns
    ``Prompt`` and ``ItemURL``.

    @type filename: C{String}
    @param filename: path of the item list file

    @rtype: C{list} or C{None}
    @returns: a list of (prompt, item URL) tuples, one per data row, or
        None if the file is empty or lacks one of the required columns
    """
    with open(filename) as fd:
        csvreader = csv.DictReader(fd, dialect='excel-tab')
        print("CSV", csvreader.fieldnames)
        # DictReader.fieldnames is None for an empty file; treat that the
        # same as a header with no usable columns instead of crashing.
        fieldnames = csvreader.fieldnames or []
        if 'ItemURL' not in fieldnames or 'Prompt' not in fieldnames:
            return None
        return [(row['Prompt'], row['ItemURL']) for row in csvreader]


# this file name pattern allows galaxy to discover the dataset designation and type
FNPAT = "%(designation)s#%(ext)s"

# characters that separate words in an orthographic transcript
_WORD_SPLIT = re.compile(r'[\s.,!?"\-]')


def galaxy_name(itemurl, ext):
    """Construct a filename suitable for dataset discovery
    by Galaxy.

    @type itemurl: C{String}
    @param itemurl: the item URL from Alveo

    @type ext: C{String}
    @param ext: the datatype extension for the resulting file

    @rtype: C{String}
    @returns: the last path segment of the URL and the extension,
        combined according to FNPAT
    """
    itemname = itemurl.split('/')[-1]
    return FNPAT % {'designation': itemname, 'ext': ext}


def build_bpf(ortho_trans, lexicon):
    """ Given an orthographic transcript, generate a BPF-format phonetic
    transcription for passing to MAUS, using the specified lexicon.

    The transcript is split on whitespace and basic punctuation and
    lower-cased before lookup, so lexicon keys must be lower case.

    @type ortho_trans: C{String}
    @param ortho_trans: the (space-separated) orthographic transcript
    @type lexicon: C{Dict}
    @param lexicon: the lexicon to use to translate words to phonetic symbols

    @rtype: C{String}
    @returns: the BPF-formatted transcript: all ORT lines followed by all
        KAN lines, newline separated

    @raises IncompleteLexiconError: if there is a word appearing in the
    orthographic transcript that is not covered by the lexicon

    """
    words = [w.lower() for w in _WORD_SPLIT.split(ortho_trans) if w]
    ort = []
    kan = []

    for n, word in enumerate(words):
        try:
            pron = lexicon[word]
        except KeyError:
            raise IncompleteLexiconError("'" + word +
                                         "' not present in lexicon")
        ort.append("ORT: %d %s" % (n, word))
        kan.append("KAN: %d %s" % (n, pron))

    nl = u"\n"
    return nl.join(ort) + nl + nl.join(kan)


def load_lexicon(lexiconfile):
    """ Load the given file as a lexicon dictionary.
    Should be a tsv file with two columns, first column
    is orthography, second is phonetic transcription.
    Blank lines are skipped and any columns after the second are ignored.

    @type lexiconfile: C{String}
    @param lexiconfile: the filename of the lexicon file

    @rtype: C{Dict}
    @returns: the lexicon, as a dictionary with orthographic entries as keys

    @raises ValueError: if a non-blank line has fewer than two columns

    """
    lex = {}

    with open(lexiconfile) as f:
        for lineno, line in enumerate(f, 1):
            # Drop the line terminator; otherwise it ends up inside the
            # pronunciation and produces blank lines in the BPF output.
            line = line.rstrip('\r\n')
            if not line.strip():
                continue
            fields = line.split('\t')
            if len(fields) < 2:
                raise ValueError("%s, line %d: expected two tab-separated columns"
                                 % (lexiconfile, lineno))
            lex[fields[0].strip()] = fields[1].strip()

    return lex


def list_to_bpf(item_list, lexicon, output_path):
    """
    Generate a BPF file for each item in this item list.
    Items consist of (prompt, ItemURL). URL is used to generate output
    file name.

    :type item_list: list of (String, String)
    :param item_list: (prompt, ItemURL) pairs to convert

    :type lexicon: Dict
    :param lexicon: maps lower-case orthographic words to pronunciations

    :type output_path: String
    :param output_path: directory to write the BPF files to; created if
        it does not exist

    :raises IncompleteLexiconError: if a prompt contains a word that is
        not in the lexicon
    """
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    for prompt, itemURL in item_list:
        fname = galaxy_name(itemURL, 'par')
        bpftext = build_bpf(prompt, lexicon)
        with open(os.path.join(output_path, fname), 'w') as out:
            out.write(bpftext)


def main():
    """Command line entry point: convert an item list to BPF files."""
    args = parser()
    item_list = read_item_list(args.item_list)
    if item_list is None:
        # sys.exit with a string prints it to stderr and exits with status 1
        sys.exit("Item list must have 'Prompt' and 'ItemURL' columns: "
                 + args.item_list)
    lexicon = load_lexicon(args.lexicon)
    try:
        list_to_bpf(item_list, lexicon, args.output_path)
    except IncompleteLexiconError as err:
        sys.exit(str(err))


if __name__ == '__main__':
    main()
<!-- items_to_bpf.xml: Galaxy tool wrapper around items_to_bpf.py.
     Writes one BPF (.par) file per item into BPFData/; Galaxy then collects
     them into a list collection via the discover_datasets pattern below. -->
<tool id="items_to_bpf" name="Item List to BPF" version="0.01" force_history_refresh="True">
    <description></description>

    <!-- Dataset paths are quoted so that paths containing spaces or shell
         metacharacters are passed through as single arguments. -->
    <command interpreter="python">
        items_to_bpf.py --lexicon '${lexicon}' --item_list '${item_list}' --output_path BPFData
    </command>

    <inputs>
        <param name="item_list" type="data" format="item_list" label="Item List" help=""/>
        <param name="lexicon" type="data" format="tabular" label="Lexicon" help=""/>

        <param name="job_name" type="text" size="25"
               label="Supply a name for the output to remind you what it contains" value="BPF Files"/>
    </inputs>

    <outputs>
        <collection type="list" label="$job_name" name="output1">
            <!-- Must match FNPAT ("designation#ext") in items_to_bpf.py.
                 The angle brackets of the named groups are escaped because a
                 raw '<' is not allowed inside an XML attribute value. -->
            <discover_datasets pattern="(?P&lt;designation&gt;[^#]+)#(?P&lt;ext&gt;.+)" directory="BPFData"/>
        </collection>
    </outputs>

    <help>Generate a collection of BPF annotation files from an item list
        that contains the item orthography, these are then suitable for input
        to the MAUS forced alignment system.</help>
</tool>
<!-- maus.xml: Galaxy tool wrapper that runs the MAUS forced aligner (inside
     the stevecassidy/maus docker container) on one audio file and its BPF
     transcription, producing a TextGrid. -->
<tool id="maus_bpf" name="MAUS Align" version="0.01" force_history_refresh="True">
    <description>from BPF transcription</description>

    <requirements>
        <container type="docker">stevecassidy/maus</container>
    </requirements>

    <!-- The command is wrapped in CDATA because a raw '&&' is not well-formed
         XML. The signal is symlinked to input.wav so MAUS sees a .wav file
         name. OUTFORMAT is TextGrid to match the declared output format, the
         help text and the test fixture. Dataset paths are quoted. -->
    <command><![CDATA[
        ln -s "${signal}" input.wav &&
        /home/maus/maus OUTFORMAT=TextGrid LANGUAGE=$language
            BPF='$bpf' INSKANTEXTGRID=$inskantextgrid INSORTTEXTGRID=$insorttextgrid
            MODUS=$modus MAUSSHIFT=$mausshift MINPAUSLEN=$minpauslen WEIGHT=$weight
            INSPROB=$insprob NOINITIALFINALSILENCE=$noinitialfinalsilence OUTSYMBOL=$outsymbol
            OUT='$output' SIGNAL=input.wav
    ]]></command>

    <inputs>
        <param name="signal" type="data" format="wav" label="Audio File"/>
        <param name="bpf" type="data" format="par" label="BPF File" help="Orthographic Transcript as BPF"/>
        <param name="language" type="select" label="Language">
            <option value="aus">Australian English</option>
            <option value="sampa">SAMPA</option>
            <option value="deu">German</option>
            <option value="gsw-CH">Swiss German</option>
            <option value="eng">British English</option>
            <option value="fin">Finnish</option>
            <option value="fra">French</option>
            <option value="eng-US">US English</option>
            <option value="nld">Dutch</option>
            <option value="spa">Spanish</option>
            <option value="ita">Italian</option>
            <option value="por">Portuguese</option>
            <option value="hun">Hungarian</option>
            <option value="ekk">Estonian</option>
            <option value="pol">Polish</option>
            <option value="nze">NZ English</option>
            <option value="kat">Georgian</option>
            <option value="rus">Russian</option>
        </param>
        <param name="insorttextgrid" type="boolean" value="true" label="Add Orthography to TextGrid output"/>
        <param name="inskantextgrid" type="boolean" value="true" label="Add canonical phonemic transcription to TextGrid output"/>
        <param name="modus" type="select" label="Mode">
            <option value="standard">Normal MAUS technique</option>
            <option value="align">Do not model pronunciation, align the given phonemic transcript</option>
            <option value="bigram">Use phone recognition constrained by a Bigram model</option>
        </param>
        <param name="mausshift" type="integer" value="10" label="Shift found segment boundaries by this many milliseconds"/>
        <param name="minpauslen" type="integer" label="suppress inter-word silence less than n*10ms, ie. enter 5 here to suppress silence less than 50ms" value='5'/>
        <param name="weight" type="float" value="7.0" label="relative weight given to statistical model, higher values will favour the canonical pronunciation"/>
        <param name="insprob" type="float" value="0.0" label="Insertion probability - higher values will reduce the probability of deletions of phonemic segments"/>
        <param name="noinitialfinalsilence" type="select" label="Suppress initial and final silence">
            <option value="no">No</option>
            <option value="yes">Yes</option>
        </param>
        <param name="outsymbol" type="select" label="Output symbols">
            <option value="sampa">SAMPA</option>
            <option value="ipa">IPA (utf-8)</option>
            <option value="manner">IPA manner class</option>
            <option value="place">IPA place of articulation</option>
        </param>
    </inputs>

    <outputs>
        <data format="TextGrid" name="output" label="TextGrid" />
    </outputs>


    <tests>
        <test>
            <param name="signal" type="wav" value="1_1119_2_22_001-ch6-speaker16.wav"/>
            <param name="bpf" value="1_1119_2_22_001.bpf"/>
            <param name="language" value="aus"/>
            <param name="insorttextgrid" value="true" />
            <param name="inskantextgrid" value="true" />
            <output name="output" file="1_1119_2_22_001.TextGrid"/>
        </test>
    </tests>

    <help>Run the MAUS forced aligner on an audio file given a transcription in BPF format.
        Output is a TextGrid file containing the phonemic annotation with start/end times for each
        segment.</help>

    <citations>
        <citation type='bibtex'>
            @inproceedings{Strunk2011,
                author = {Strunk, Jan and Schiel, Florian and Seifart, Frank},
                booktitle = {Proceedings of the Ninth International Conference on Language Resources and Evaluation},
                publisher = {European Language Resources Association (ELRA)},
                isbn = {978-2-9517408-8-4},
                keywords = {forced alignment,language documentation corpora,word times},
                title = {{Untrained Forced Alignment of Transcriptions and Audio for Language Documentation Corpora using WebMAUS}},
                url = {http://www.bas.uni-muenchen.de/forschung/publikationen/Schiel-LREC2014.pdf},
                year = {2014}
            }
        </citation>
    </citations>
</tool>
