changeset 0:d4c27fdc928b draft

planemo upload commit 7b5663b41b2dc11f9e375b8f386bc31855800bcf-dirty
author stevecassidy
date Wed, 16 Nov 2016 15:00:24 -0500
parents
children 4162c1e2ad5f
files __init__.py __init__.pyc items_to_bpf.py items_to_bpf.xml maus.xml
diffstat 4 files changed, 266 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
Binary file __init__.pyc has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/items_to_bpf.py	Wed Nov 16 15:00:24 2016 -0500
@@ -0,0 +1,145 @@
+from __future__ import print_function
+import json
+import argparse
+import pyalveo
+import sys
+import os
+from fnmatch import fnmatch
+import csv
+import re
+
+
+def parser():
+    """Parse the command line and return the resulting argparse namespace.
+
+    The three options --item_list, --lexicon and --output_path are all
+    required string values.
+    """
+    cli = argparse.ArgumentParser(
+        description="Generate BPF Orthographic Transcription from Item List")
+    # every option shares the same settings and differs only in name and help
+    options = (
+        ('--item_list', "File containing list of item URLs"),
+        ('--lexicon', "File containing lexicon (tsv)"),
+        ('--output_path', "Path to output file"),
+    )
+    for flag, help_text in options:
+        cli.add_argument(flag, required=True, action="store", type=str, help=help_text)
+    return cli.parse_args()
+
+def read_item_list(filename):
+    """Read (prompt, item URL) pairs from a tab-separated item list file.
+
+    The first line of the file is used as the header row and must name
+    both a Prompt and an ItemURL column; any other columns are ignored.
+
+    @type filename: C{String}
+    @param filename: path of the tab-separated item list file
+
+    @rtype: C{list} of (C{String}, C{String}) tuples, or C{None}
+    @returns: one (Prompt, ItemURL) tuple per data row, in file order, or
+    None when the file is empty or lacks either required column
+    """
+
+    with open(filename) as fd:
+        csvreader = csv.DictReader(fd, dialect='excel-tab')
+        print("CSV", csvreader.fieldnames)
+        # fieldnames is None for an empty file; fall back to an empty list
+        # so the membership tests below cannot raise TypeError
+        fieldnames = csvreader.fieldnames or []
+        if 'ItemURL' not in fieldnames or 'Prompt' not in fieldnames:
+            return None
+        return [(row['Prompt'], row['ItemURL']) for row in csvreader]
+
+# Galaxy discovers the dataset designation and type from file names shaped
+# like "<designation>#<ext>"
+FNPAT = "%(designation)s#%(ext)s"
+
+def galaxy_name(itemurl, ext):
+    """Build a file name that Galaxy dataset discovery can pick up.
+
+    The designation is whatever follows the last '/' in the item URL.
+
+    @type itemurl: C{String}
+    @param itemurl: the item URL from Alveo
+
+    @type ext: C{String}
+    @param ext: the datatype extension for the resulting file
+
+    @rtype: C{String}
+    @returns: the designation and extension combined through FNPAT
+    """
+    designation = itemurl.rsplit('/', 1)[-1]
+    return FNPAT % dict(designation=designation, ext=ext)
+
+
+class IncompleteLexiconError(Exception):
+    """Raised when a transcript contains a word the lexicon does not cover."""
+
+
+def build_bpf(ortho_trans, lexicon):
+    """ Given an orthographic transcript, generate a BPF-format phonetic
+        transcription for passing to MAUS, using the specified lexicon.
+
+        The transcript is split on whitespace and on the characters
+        . , ! ? " and -; empty pieces are dropped and every word is
+        lower-cased before it is looked up. The result lists one ORT line
+        per word followed by one KAN line per word, numbered from 0.
+
+        @type ortho_trans: C{String}
+        @param ortho_trans: the orthographic transcript
+        @type lexicon: C{Dict}
+        @param lexicon: the lexicon to use to translate (lower-case) words
+        to phonetic symbols
+
+        @rtype: C{String}
+        @returns: the BPF-formatted transcript, without a trailing newline
+
+        @raises IncompleteLexiconError: if there is a word appearing in the
+        orthographic transcript that is not covered by the lexicon
+
+    """
+
+    spl = re.compile(r'[\s.,!?"\-]')
+    words = [w.lower() for w in spl.split(ortho_trans) if w]
+    ort = []
+    kan = []
+
+    for n, word in enumerate(words):
+        # only the lexicon lookup can fail, so it alone sits in the try
+        try:
+            pron = lexicon[word]
+        except KeyError:
+            raise IncompleteLexiconError("'" + word +
+                                         "' not present in lexicon")
+        ort.append("ORT: %d %s" % (n, word))
+        kan.append("KAN: %d %s" % (n, pron))
+
+    nl = u"\n"
+    return nl.join(ort) + nl + nl.join(kan)
+
+
+def load_lexicon(lexiconfile):
+    """ Load the given file as a lexicon dictionary.
+        Should be a tsv file with two columns, first column
+        is orthography, second is phonetic transcription.
+        Blank lines are skipped.
+
+        @type lexiconfile: C{String}
+        @param lexiconfile: the filename of the lexicon file
+
+        @rtype: C{Dict}
+        @returns: the lexicon, as a dictionary with orthographic entries as
+        keys and phonetic transcriptions (without line terminator) as values
+
+        @raises ValueError: if a non-blank line does not consist of exactly
+        two tab-separated columns
+
+    """
+    lex = {}
+
+    with open(lexiconfile) as f:
+        for lineno, line in enumerate(f, 1):
+            # drop the line terminator, otherwise it becomes part of the
+            # pronunciation stored in the dictionary
+            line = line.rstrip('\r\n')
+            if not line.strip():
+                continue
+            fields = line.split('\t')
+            if len(fields) != 2:
+                raise ValueError("%s, line %d: expected two tab-separated "
+                                 "columns, found %d"
+                                 % (lexiconfile, lineno, len(fields)))
+            orth, pron = fields
+            lex[orth] = pron
+
+    return lex
+
+
+def list_to_bpf(item_list, lexicon, output_path):
+    """
+    Write one BPF file per item into the output directory.
+
+    The directory is created when it does not exist yet. Each file is
+    named by galaxy_name() from the item URL with the extension 'par'
+    and contains the text build_bpf() returns for the item's prompt.
+
+    :type item_list: iterable of (prompt, ItemURL) pairs
+    :param item_list: the items to generate BPF files for
+
+    :type lexicon: Dict
+    :param lexicon: the lexicon handed on to build_bpf()
+
+    :type output_path: String
+    :param output_path: directory the BPF files are written to
+    """
+    if not os.path.exists(output_path):
+        os.makedirs(output_path)
+
+    for prompt, itemURL in item_list:
+        target = os.path.join(output_path, galaxy_name(itemURL, 'par'))
+        transcript = build_bpf(prompt, lexicon)
+        with open(target, 'w') as out:
+            out.write(transcript)
+
+
+def main():
+    """Command-line entry point.
+
+    Reads the item list and the lexicon named on the command line and
+    writes one BPF file per item into the requested output directory.
+    Exits with an error message when the item list cannot be used.
+    """
+    args = parser()
+    item_list = read_item_list(args.item_list)
+    if item_list is None:
+        # read_item_list() returns None when a required column is missing;
+        # stop here rather than fail later while iterating over None
+        sys.exit("Item list '%s' must have 'Prompt' and 'ItemURL' column headers"
+                 % args.item_list)
+    lexicon = load_lexicon(args.lexicon)
+    list_to_bpf(item_list, lexicon, args.output_path)
+
+
+if __name__ == '__main__':
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/items_to_bpf.xml	Wed Nov 16 15:00:24 2016 -0500
@@ -0,0 +1,25 @@
+<tool id="items_to_bpf" name="Item List to BPF" version="0.01" force_history_refresh="True">
+    <description></description>
+
+    <command interpreter="python">
+        items_to_bpf.py --lexicon ${lexicon} --item_list ${item_list} --output_path BPFData
+    </command>
+
+    <inputs>
+        <param name="item_list" type="data" format="item_list" label="Item List" help=""/>
+        <param name="lexicon" type="data" format="tabular" label="Lexicon" help=""/>
+
+        <param name="job_name" type="text" size="25"
+               label="Supply a name for the output to remind you what it contains" value="BPF Files"/>
+    </inputs>
+
+    <outputs>
+        <collection type="list" label="$job_name" name="output1">
+            <discover_datasets pattern="(?P&lt;designation&gt;[^#]+)#(?P&lt;ext&gt;.+)" directory="BPFData"/>
+        </collection>
+    </outputs>
+
+    <help>Generate a collection of BPF annotation files from an item list
+    that contains the item orthography. These files are then suitable for input
+    to the MAUS forced alignment system.</help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/maus.xml	Wed Nov 16 15:00:24 2016 -0500
@@ -0,0 +1,96 @@
+<tool id="maus_bpf" name="MAUS Align" version="0.01" force_history_refresh="True">
+    <description>from BPF transcription</description>
+
+    <requirements>
+        <container type="docker">stevecassidy/maus</container>
+    </requirements>
+
+    <command>
+        ln -s "${signal}" input.wav &amp;&amp;
+        /home/maus/maus OUTFORMAT=mau LANGUAGE=$language
+        BPF=$bpf INSKANTEXTGRID=$inskantextgrid INSORTTEXTGRID=$insorttextgrid
+        MODUS=$modus MAUSSHIFT=$mausshift MINPAUSLEN=$minpauslen WEIGHT=$weight
+        INSPROB=$insprob NOINITIALFINALSILENCE=$noinitialfinalsilence OUTSYMBOL=$outsymbol
+        OUT=$output SIGNAL=input.wav
+    </command>
+
+    <inputs>
+        <param name="signal" type="data" format="wav" label="Audio File"/>
+        <param name="bpf" type="data" format="par" label="BPF File" help="Orthographic Transcript as BPF"/>
+        <param name="language" type="select" label="Language">
+            <option value="aus">Australian English</option>
+            <option value="sampa">SAMPA</option>
+            <option value="deu">German</option>
+            <option value="gsw-CH">Swiss German</option>
+            <option value="eng">British English</option>
+            <option value="fin">Finnish</option>
+            <option value="fra">French</option>
+            <option value="eng-US">US English</option>
+            <option value="nld">Dutch</option>
+            <option value="spa">Spanish</option>
+            <option value="ita">Italian</option>
+            <option value="por">Portuguese</option>
+            <option value="hun">Hungarian</option>
+            <option value="ekk">Estonian</option>
+            <option value="pol">Polish</option>
+            <option value="nze">NZ English</option>
+            <option value="kat">Georgian</option>
+            <option value="rus">Russian</option>
+        </param>
+        <param name="insorttextgrid" type="boolean" value="true" label="Add Orthography to TextGrid output"/>
+        <param name="inskantextgrid" type="boolean" value="true" label="Add canonical phonemic transcription to TextGrid output"/>
+        <param name="modus" type="select" label="Mode">
+            <option value="standard">Normal MAUS technique</option>
+            <option value="align">Do not model pronunciation, align the given phonemic transcript</option>
+            <option value="bigram">Use phone recognition constrained by a Bigram model</option>
+        </param>
+        <param name="mausshift" type="integer" value="10" label="Shift found segment boundaries by this many milliseconds"/>
+        <param name="minpauslen" type="integer" label="suppress inter-word silence less than n*10ms, ie. enter 5 here to suppress silence less than 50ms" value='5'/>
+        <param name="weight" type="float" value="7.0" label="relative weight given to statistical model, higher values will favour the canonical pronunciation"/>
+        <param name="insprob" type="float" value="0.0" label="Insertion probability - higher values will reduce the probability of deletions of phonemic segments"/>
+        <param name="noinitialfinalsilence" type="select" label="Suppress initial and final silence">
+            <option value="no">No</option>
+            <option value="yes">Yes</option>
+        </param>
+        <param name="outsymbol" type="select" label="Output symbols">
+            <option value="sampa">SAMPA</option>
+            <option value="ipa">IPA (utf-8)</option>
+            <option value="manner">IPA manner class</option>
+            <option value="place">IPA place of articulation</option>
+        </param>
+    </inputs>
+
+    <outputs>
+        <data format="TextGrid" name="output" label="TextGrid" />
+    </outputs>
+
+
+    <tests>
+        <test>
+            <param name="signal" type="wav" value="1_1119_2_22_001-ch6-speaker16.wav"/>
+            <param name="bpf" value="1_1119_2_22_001.bpf"/>
+            <param name="language" value="aus"/>
+            <param name="insorttextgrid" value="true" />
+            <param name="inskantextgrid" value="true" />
+            <output name="output" file="1_1119_2_22_001.TextGrid"/>
+        </test>
+    </tests>
+
+    <help>Run the MAUS forced aligner on an audio file given a transcription in BPF format.
+    Output is a TextGrid file containing the phonemic annotation with start/end times for each
+    segment.</help>
+
+    <citations>
+        <citation type='bibtex'>
+            @inproceedings{Strunk2011,
+            booktitle = {Proceedings of the Ninth International Conference on Language Resources and Evaluation},
+            publisher = {European Language Resources Association (ELRA)},
+            isbn = {978-2-9517408-8-4},
+            keywords = {forced alignment,language documentation corpora,word times},
+            title = {{Untrained Forced Alignment of Transcriptions and Audio for Language Documentation Corpora using WebMAUS}},
+            url = {http://www.bas.uni-muenchen.de/forschung/publikationen/Schiel-LREC2014.pdf},
+            year = {2014}
+            }
+        </citation>
+    </citations>
+</tool>