stevecassidy/maus: items_to_bpf.py @ 0:d4c27fdc928b (draft)
planemo upload commit 7b5663b41b2dc11f9e375b8f386bc31855800bcf-dirty
|          |                                 |
|----------|---------------------------------|
| author   | stevecassidy                    |
| date     | Wed, 16 Nov 2016 15:00:24 -0500 |
| parents  |                                 |
| children | 4162c1e2ad5f                    |
comparison: -1:000000000000 (new file) to 0:d4c27fdc928b
```python
from __future__ import print_function
import argparse
import csv
import os
import re


class IncompleteLexiconError(Exception):
    """Raised when a word in the orthographic transcript is not covered by the lexicon."""
    pass


def parser():
    parser = argparse.ArgumentParser(description="Generate BPF Orthographic Transcription from Item List")
    parser.add_argument('--item_list', required=True, action="store", type=str, help="File containing list of item URLs")
    parser.add_argument('--lexicon', required=True, action="store", type=str, help="File containing lexicon (tsv)")
    parser.add_argument('--output_path', required=True, action="store", type=str, help="Path to output file")
    return parser.parse_args()


def read_item_list(filename):
    """Read an item list from a file, which should be a tab-separated
    file with column headers Prompt and ItemURL.
    Return a list of (prompt, itemurl) tuples, or None if either
    required column is missing."""

    with open(filename) as fd:
        csvreader = csv.DictReader(fd, dialect='excel-tab')
        print("CSV", csvreader.fieldnames)
        if 'ItemURL' not in csvreader.fieldnames:
            return None
        if 'Prompt' not in csvreader.fieldnames:
            return None
        itemurls = []
        for row in csvreader:
            itemurls.append((row['Prompt'], row['ItemURL']))

    return itemurls


# this file name pattern allows galaxy to discover the dataset designation and type
FNPAT = "%(designation)s#%(ext)s"


def galaxy_name(itemurl, ext):
    """Construct a filename suitable for dataset discovery
    by Galaxy.

    @type itemurl: C{String}
    @param itemurl: the item URL from Alveo

    @type ext: C{String}
    @param ext: the datatype extension for the resulting file
    """

    itemname = itemurl.split('/')[-1]
    fname = FNPAT % {'designation': itemname, 'ext': ext}

    return fname


def build_bpf(ortho_trans, lexicon):
    """Given an orthographic transcript, generate a BPF-format phonetic
    transcription for passing to MAUS, using the specified lexicon.

    @type ortho_trans: C{String}
    @param ortho_trans: the (space-separated) orthographic transcript
    @type lexicon: C{Dict}
    @param lexicon: the lexicon used to translate words to phonetic symbols

    @rtype: C{String}
    @returns: the BPF-formatted transcript

    @raises IncompleteLexiconError: if a word appearing in the
    orthographic transcript is not covered by the lexicon
    """

    # split on whitespace and common punctuation, dropping empty tokens
    spl = re.compile(r'[\s.,!?"\-]')
    words = [w.lower() for w in spl.split(ortho_trans) if w]
    ort = []
    kan = []

    for n, word in enumerate(words):
        try:
            ort.append("ORT: %d %s" % (n, word))
            kan.append("KAN: %d %s" % (n, lexicon[word]))
        except KeyError:
            raise IncompleteLexiconError("'" + word +
                                         "' not present in lexicon")

    nl = u"\n"
    return nl.join(ort) + nl + nl.join(kan)


def load_lexicon(lexiconfile):
    """Load the given file as a lexicon dictionary.
    The file should be tab separated with two columns: the first
    column is the orthography, the second is the phonetic transcription.

    @type lexiconfile: C{String}
    @param lexiconfile: the filename of the lexicon file

    @rtype: C{Dict}
    @returns: the lexicon, as a dictionary with orthographic entries as keys
    """
    lex = {}

    with open(lexiconfile) as f:
        for line in f:
            line = line.rstrip('\n')
            if not line:
                continue
            # split on the first tab only, so transcriptions may contain spaces
            orth, pron = line.split('\t', 1)
            lex[orth] = pron

    return lex


def list_to_bpf(item_list, lexicon, output_path):
    """Generate a BPF file for each item in the item list.
    Items are (prompt, ItemURL) pairs; the URL is used to generate
    the output file name.

    :type item_list: list of (String, String)
    :param item_list: (prompt, ItemURL) pairs to convert

    :type lexicon: Dict
    :param lexicon: lexicon mapping orthography to phonetic transcription

    :type output_path: String
    :param output_path: directory to write the BPF files to
    """
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    for prompt, itemURL in item_list:

        fname = galaxy_name(itemURL, 'par')
        bpftext = build_bpf(prompt, lexicon)
        with open(os.path.join(output_path, fname), 'w') as out:
            out.write(bpftext)


def main():
    args = parser()
    item_list = read_item_list(args.item_list)
    if item_list is None:
        raise ValueError("item list must contain Prompt and ItemURL columns")
    lexicon = load_lexicon(args.lexicon)
    list_to_bpf(item_list, lexicon, args.output_path)


if __name__ == '__main__':
    main()
```
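For reference, a minimal sketch of how `build_bpf` and `galaxy_name` behave, assuming the file above is importable as `items_to_bpf`; the lexicon entries, transcriptions and item URL below are invented for illustration and are not taken from any real Alveo collection or lexicon:

```python
# Sketch: exercise build_bpf() and galaxy_name() directly.
# Assumes items_to_bpf.py (above) is on the Python path; the lexicon
# entries and the item URL are hypothetical.
from items_to_bpf import build_bpf, galaxy_name, IncompleteLexiconError

toy_lexicon = {
    "hello": "h @ l @ U",   # made-up phonetic transcription
    "world": "w 3: l d",    # made-up phonetic transcription
}

print(build_bpf("Hello, world!", toy_lexicon))
# One ORT and one KAN line per word:
#   ORT: 0 hello
#   ORT: 1 world
#   KAN: 0 h @ l @ U
#   KAN: 1 w 3: l d

# The output filename encodes the item designation and the datatype
# extension so that Galaxy can discover the dataset:
print(galaxy_name("https://example.org/catalog/demo/item_001", "par"))
# -> item_001#par

# A word missing from the lexicon raises IncompleteLexiconError:
try:
    build_bpf("hello there", toy_lexicon)
except IncompleteLexiconError as e:
    print(e)
```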

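An end-to-end sketch of the same conversion, again assuming the module is importable as `items_to_bpf`; the file paths, URL and lexicon contents are illustrative only:

```python
# Build the two tab-separated input files the tool expects, then run
# the conversion.  Paths, URLs and lexicon entries are hypothetical.
import os
import tempfile

from items_to_bpf import load_lexicon, read_item_list, list_to_bpf

workdir = tempfile.mkdtemp()

# Item list: tab-separated, must contain the columns Prompt and ItemURL.
item_list_file = os.path.join(workdir, "items.tsv")
with open(item_list_file, "w") as f:
    f.write("Prompt\tItemURL\n")
    f.write("hello world\thttps://example.org/catalog/demo/item_001\n")

# Lexicon: tab-separated, orthography then phonetic transcription.
lexicon_file = os.path.join(workdir, "lexicon.tsv")
with open(lexicon_file, "w") as f:
    f.write("hello\th @ l @ U\n")
    f.write("world\tw 3: l d\n")

output_path = os.path.join(workdir, "bpf")
items = read_item_list(item_list_file)   # [('hello world', 'https://...item_001')]
lexicon = load_lexicon(lexicon_file)     # {'hello': 'h @ l @ U', 'world': 'w 3: l d'}
list_to_bpf(items, lexicon, output_path)

print(os.listdir(output_path))           # ['item_001#par']
```

The equivalent command-line invocation uses the flags defined in `parser()`: `python items_to_bpf.py --item_list items.tsv --lexicon lexicon.tsv --output_path bpf/`.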