Mercurial > repos > stevecassidy > maus
diff items_to_bpf.py @ 0:d4c27fdc928b draft
planemo upload commit 7b5663b41b2dc11f9e375b8f386bc31855800bcf-dirty
| author | stevecassidy |
|---|---|
| date | Wed, 16 Nov 2016 15:00:24 -0500 |
| parents | |
| children | 4162c1e2ad5f |
line wrap: on
line diff
# items_to_bpf.py
#
# Generate one BPF (BAS Partitur Format) orthographic/canonical transcription
# file per item in a tab-separated item list, using a tab-separated lexicon
# to map each word to its phonetic transcription.
from __future__ import print_function

import argparse
import csv
import json
import os
import re
import sys
from fnmatch import fnmatch

try:
    import pyalveo
except ImportError:
    # Nothing in this module references pyalveo, so a missing install
    # must not prevent the tool from running.
    pyalveo = None


# this file name pattern allows galaxy to discover the dataset designation and type
FNPAT = "%(designation)s#%(ext)s"

# Characters that separate words in an orthographic transcript: whitespace
# and basic punctuation.  Compiled once rather than on every build_bpf call.
_WORD_SEPARATORS = re.compile(r'[\s.,!?"\-]')


class IncompleteLexiconError(Exception):
    """Raised when a word of a transcript has no entry in the lexicon."""


def parser():
    """Parse the command line arguments of the tool.

    @rtype: C{argparse.Namespace}
    @returns: namespace with the attributes C{item_list}, C{lexicon}
        and C{output_path}, all of them required strings
    """
    argparser = argparse.ArgumentParser(description="Generate BPF Orthographic Transcription from Item List")
    argparser.add_argument('--item_list', required=True, action="store", type=str, help="File containing list of item URLs")
    argparser.add_argument('--lexicon', required=True, action="store", type=str, help="File containing lexicon (tsv)")
    argparser.add_argument('--output_path', required=True, action="store", type=str, help="Path to output file")
    return argparser.parse_args()


def read_item_list(filename):
    """Read an item list from a tab-separated file with a header row.

    The header must contain both an C{ItemURL} and a C{Prompt} column.

    @type filename: C{String}
    @param filename: path of the item list file

    @rtype: C{List} or C{None}
    @returns: a list of C{(prompt, item_url)} tuples, one per data row, or
        C{None} when the file is empty or lacks one of the required columns
    """
    with open(filename) as fd:
        csvreader = csv.DictReader(fd, dialect='excel-tab')
        fieldnames = csvreader.fieldnames
        print("CSV", fieldnames)
        # fieldnames is None when the file has no header row at all; a
        # membership test on None would raise TypeError.
        if not fieldnames:
            return None
        if 'ItemURL' not in fieldnames or 'Prompt' not in fieldnames:
            return None
        return [(row['Prompt'], row['ItemURL']) for row in csvreader]


def galaxy_name(itemurl, ext):
    """Construct a filename suitable for dataset discovery
    by Galaxy.

    @type itemurl: C{String}
    @param itemurl: the item URL from Alveo

    @type ext: C{String}
    @param ext: the datatype extension for the resulting file

    @rtype: C{String}
    @returns: C{<last path segment of itemurl>#<ext>}
    """
    # Drop trailing slashes first, otherwise the last segment is empty and
    # every such item would map to the same "#<ext>" file name.
    itemname = itemurl.rstrip('/').split('/')[-1]
    return FNPAT % {'designation': itemname, 'ext': ext}


def build_bpf(ortho_trans, lexicon):
    """Given an orthographic transcript, generate a BPF-format phonetic
    transcription for passing to MAUS, using the specified lexicon.

    Words are lower-cased before lookup, so the lexicon keys are matched
    against lower-case forms.

    @type ortho_trans: C{String}
    @param ortho_trans: the orthographic transcript; words are separated
        by whitespace or any of C{. , ! ? " -}
    @type lexicon: C{Dict}
    @param lexicon: the lexicon to use to translate words to phonetic symbols

    @rtype: C{String}
    @returns: the BPF-formatted transcript: all C{ORT:} lines followed by
        all C{KAN:} lines, numbered from 0, without a trailing newline

    @raises IncompleteLexiconError: if there is a word appearing in the
        orthographic transcript that is not covered by the lexicon
    """
    words = [w.lower() for w in _WORD_SEPARATORS.split(ortho_trans) if w]
    ort = []
    kan = []

    for n, word in enumerate(words):
        # Only the lookup can raise KeyError; keep the try body minimal.
        try:
            pron = lexicon[word]
        except KeyError:
            raise IncompleteLexiconError("'" + word +
                                         "' not present in lexicon")
        ort.append("ORT: %d %s" % (n, word))
        kan.append("KAN: %d %s" % (n, pron))

    nl = u"\n"
    return nl.join(ort) + nl + nl.join(kan)


def load_lexicon(lexiconfile):
    """Load the given file as a lexicon dictionary.
    Should be a tsv file with two columns, first column
    is orthography, second is phonetic transcription.
    Blank lines are ignored.

    @type lexiconfile: C{String}
    @param lexiconfile: the filename of the lexicon file

    @rtype: C{Dict}
    @returns: the lexicon, as a dictionary with orthographic entries as keys

    @raises ValueError: if a non-blank line does not have exactly two
        tab-separated columns
    """
    lex = {}

    with open(lexiconfile) as f:
        for lineno, line in enumerate(f, 1):
            # Remove the line terminator so it does not end up inside the
            # transcription (and from there inside every KAN line).
            line = line.rstrip('\r\n')
            if not line.strip():
                continue
            fields = line.split('\t')
            if len(fields) != 2:
                raise ValueError(
                    "%s, line %d: expected 2 tab-separated columns, found %d"
                    % (lexiconfile, lineno, len(fields)))
            orth, pron = fields
            lex[orth] = pron

    return lex


def list_to_bpf(item_list, lexicon, output_path):
    """Generate a BPF file for each item in this item list.
    Items consist of (prompt, ItemURL). URL is used to generate output
    file name.

    @type item_list: C{List}
    @param item_list: C{(prompt, item_url)} tuples

    @type lexicon: C{Dict}
    @param lexicon: the lexicon used to transcribe each prompt

    @type output_path: C{String}
    @param output_path: directory to write the BPF files to; created
        if it does not exist

    @raises IncompleteLexiconError: if a prompt contains a word that is
        not covered by the lexicon
    """
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    for prompt, itemURL in item_list:
        fname = galaxy_name(itemURL, 'par')
        # Build the text before opening the file so that a failed lookup
        # does not leave an empty output file behind.
        bpftext = build_bpf(prompt, lexicon)
        with open(os.path.join(output_path, fname), 'w') as out:
            out.write(bpftext)


def main():
    """Command line entry point: item list + lexicon -> BPF files."""
    args = parser()
    item_list = read_item_list(args.item_list)
    if item_list is None:
        # read_item_list signals a missing header this way; stop with a
        # readable message instead of failing later while iterating None.
        sys.exit("Item list '%s' must have 'ItemURL' and 'Prompt' columns"
                 % args.item_list)
    lexicon = load_lexicon(args.lexicon)
    list_to_bpf(item_list, lexicon, args.output_path)


if __name__ == '__main__':
    main()
