diff items_to_bpf.py @ 0:d4c27fdc928b draft

planemo upload commit 7b5663b41b2dc11f9e375b8f386bc31855800bcf-dirty
author stevecassidy
date Wed, 16 Nov 2016 15:00:24 -0500
parents
children 4162c1e2ad5f
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/items_to_bpf.py	Wed Nov 16 15:00:24 2016 -0500
@@ -0,0 +1,145 @@
+from __future__ import print_function
+import json
+import argparse
+import pyalveo
+import sys
+import os
+from fnmatch import fnmatch
+import csv
+import re
+
+
+def parser():
+    """Parse the command line options of this script.
+
+    The three options --item_list, --lexicon and --output_path are all
+    required and are stored as plain strings.
+
+    @rtype: C{argparse.Namespace}
+    @returns: the parsed options, with attributes C{item_list},
+    C{lexicon} and C{output_path}
+    """
+
+    # (flag, help text) for every option; they all share the same settings
+    options = (
+        ('--item_list', "File containing list of item URLs"),
+        ('--lexicon', "File containing lexicon (tsv)"),
+        ('--output_path', "Path to output file"),
+    )
+
+    argparser = argparse.ArgumentParser(
+        description="Generate BPF Orthographic Transcription from Item List")
+    for flag, helptext in options:
+        argparser.add_argument(flag, required=True, action="store", type=str, help=helptext)
+
+    return argparser.parse_args()
+
+def read_item_list(filename):
+    """Read an item list from a tab-separated file.
+
+    The file must have a header row containing at least the columns
+    Prompt and ItemURL.
+
+    @type filename: C{String}
+    @param filename: path of the tabular item list file
+
+    @rtype: C{List} or C{None}
+    @returns: a list of (Prompt, ItemURL) tuples, one per data row, or
+    None if either required column is missing (including the case of an
+    empty file, which has no header at all)
+    """
+
+    with open(filename) as fd:
+        csvreader = csv.DictReader(fd, dialect='excel-tab')
+        print("CSV", csvreader.fieldnames)
+        # fieldnames is None when the file is empty; treat that the same
+        # as a header without the required columns
+        fieldnames = csvreader.fieldnames or []
+        if 'ItemURL' not in fieldnames or 'Prompt' not in fieldnames:
+            return None
+        return [(row['Prompt'], row['ItemURL']) for row in csvreader]
+
+# this file name pattern allows galaxy to discover the dataset designation and type
+FNPAT = "%(designation)s#%(ext)s"
+
+def galaxy_name(itemurl, ext):
+    """Construct a filename suitable for dataset discovery
+    by Galaxy.
+
+    The designation is the last path component of the item URL. Any
+    trailing slashes are ignored so that they do not produce an empty
+    designation (which would make every such item share one filename).
+
+    @type itemurl: C{String}
+    @param itemurl: the item URL from Alveo
+
+    @type ext: C{String}
+    @param ext: the datatype extension for the resulting file
+
+    @rtype: C{String}
+    @returns: the filename, formatted according to FNPAT
+    """
+
+    itemname = itemurl.rstrip('/').split('/')[-1]
+    fname = FNPAT % {'designation': itemname, 'ext': ext}
+
+    return fname
+
+
+class IncompleteLexiconError(Exception):
+    """Raised when a word of a transcript has no entry in the lexicon."""
+
+
+def build_bpf(ortho_trans, lexicon):
+    """ Given an orthographic transcript, generate a BPF-format phonetic
+        transcription for passing to MAUS, using the specified lexicon.
+
+        The transcript is split on whitespace and the punctuation
+        characters . , ! ? " and -, and every word is lower-cased before
+        it is looked up. The result holds one ORT line per word followed
+        by one KAN line per word, numbered from 0.
+
+        @type ortho_trans: C{String}
+        @param ortho_trans: the (space-separated) orthographic transcript
+        @type lexicon: C{Dict}
+        @param lexicon: the lexicon to use to translate words to phonetic symbols
+
+        @rtype: C{String}
+        @returns: the BPF-formatted transcript
+
+        @raises IncompleteLexiconError: if there is a word appearing in the
+        orthographic transcript that is not covered by the lexicon
+
+    """
+
+    spl = re.compile(r'[\s.,!?"\-]')
+    words = [w.lower() for w in spl.split(ortho_trans) if w]
+    ort = []
+    kan = []
+
+    for n, word in enumerate(words):
+        # only the lookup can fail, so keep the try body to that one line
+        try:
+            pron = lexicon[word]
+        except KeyError:
+            raise IncompleteLexiconError("'" + word +
+                                         "' not present in lexicon")
+        ort.append("ORT: %d %s" % (n, word))
+        kan.append("KAN: %d %s" % (n, pron))
+
+    nl = u"\n"
+    return nl.join(ort) + nl + nl.join(kan)
+
+
+def load_lexicon(lexiconfile):
+    """ Load the given file as a lexicon dictionary.
+        Should be a tsv file with two columns, first column
+        is orthography, second is phonetic transcription.
+        Blank lines are skipped.
+
+        @type lexiconfile: C{String}
+        @param lexiconfile: the filename of the lexicon file
+
+        @rtype: C{Dict}
+        @returns: the lexicon, as a dictionary with orthographic entries as keys
+
+        @raises ValueError: if a non-blank line does not consist of exactly
+        two tab-separated columns
+
+    """
+    lex = {}
+
+    with open(lexiconfile) as f:
+        for lineno, line in enumerate(f, 1):
+            # drop the line terminator so it does not end up inside the
+            # pronunciation (and from there in the generated KAN lines)
+            line = line.rstrip('\r\n')
+            if not line.strip():
+                continue
+            fields = line.split('\t')
+            if len(fields) != 2:
+                raise ValueError("%s line %d: expected 2 tab-separated "
+                                 "columns, found %d"
+                                 % (lexiconfile, lineno, len(fields)))
+            orth, pron = fields
+            lex[orth] = pron
+
+    return lex
+
+
+def list_to_bpf(item_list, lexicon, output_path):
+    """
+    Write one BPF file per entry of the item list.
+    Each entry is a (prompt, ItemURL) pair: the prompt is converted to
+    BPF text with the lexicon, and the URL determines the name of the
+    output file (with the 'par' extension).
+
+    :type item_list: iterable of (String, String)
+    :param item_list: (prompt, ItemURL) pairs to process
+
+    :type lexicon: Dict
+    :param lexicon: lexicon passed on to build_bpf
+
+    :type output_path: String
+    :param output_path: directory to write the BPF files to; created
+        if it does not exist yet
+    """
+    missing = not os.path.exists(output_path)
+    if missing:
+        os.makedirs(output_path)
+
+    for entry in item_list:
+        prompt, item_url = entry
+        out_name = galaxy_name(item_url, 'par')
+        # build the text before opening the file, so a failed conversion
+        # does not leave an empty file behind
+        content = build_bpf(prompt, lexicon)
+        destination = os.path.join(output_path, out_name)
+        with open(destination, 'w') as bpf_file:
+            bpf_file.write(content)
+
+
+def main():
+    """Script entry point: read the item list and the lexicon named on
+    the command line and write one BPF file per item to the output path.
+
+    Exits with an error message if the item list lacks the required
+    columns.
+    """
+    args = parser()
+    item_list = read_item_list(args.item_list)
+    # read_item_list signals a missing Prompt/ItemURL column with None;
+    # stop here with a clear message instead of failing on iteration later
+    if item_list is None:
+        sys.exit("Item list '%s' must be a tab-separated file with "
+                 "'Prompt' and 'ItemURL' columns" % args.item_list)
+    lexicon = load_lexicon(args.lexicon)
+    list_to_bpf(item_list, lexicon, args.output_path)
+
+
+if __name__ == '__main__':
+    main()