comparison items_to_bpf.py @ 0:d4c27fdc928b draft

planemo upload commit 7b5663b41b2dc11f9e375b8f386bc31855800bcf-dirty
author stevecassidy
date Wed, 16 Nov 2016 15:00:24 -0500
parents
children 4162c1e2ad5f
comparison
equal deleted inserted replaced
-1:000000000000 0:d4c27fdc928b
1 from __future__ import print_function
2 import json
3 import argparse
4 import pyalveo
5 import sys
6 import os
7 from fnmatch import fnmatch
8 import csv
9 import re
10
11
def parser():
    """Parse the command line of the tool.

    Three options are defined, all of them required: ``--item_list``,
    ``--lexicon`` and ``--output_path``.

    @rtype: C{argparse.Namespace}
    @returns: the parsed arguments, with attributes C{item_list},
        C{lexicon} and C{output_path}
    """
    cli = argparse.ArgumentParser(
        description="Generate BPF Orthographic Transcription from Item List")

    # (flag, help text) for every required string option
    required_options = (
        ('--item_list', "File containing list of item URLs"),
        ('--lexicon', "File containing lexicon (tsv)"),
        ('--output_path', "Path to output file"),
    )
    for flag, help_text in required_options:
        cli.add_argument(flag, required=True, action="store", type=str,
                         help=help_text)

    return cli.parse_args()
18
def read_item_list(filename):
    """Read an item list from a tab-separated file.

    The file must start with a header row that contains at least the
    columns C{Prompt} and C{ItemURL}; any other columns are ignored.

    @type filename: C{String}
    @param filename: path of the tabular item list file

    @rtype: C{List} or C{None}
    @returns: a list of C{(prompt, item URL)} tuples, one per data row in
        file order, or C{None} if the file is empty or its header lacks
        one of the required columns
    """
    with open(filename) as fd:
        csvreader = csv.DictReader(fd, dialect='excel-tab')
        # Diagnostic output: shows which columns were found in the header.
        print("CSV", csvreader.fieldnames)

        # fieldnames is None (not an empty list) when the file has no header
        # row at all; normalise it so the membership tests cannot raise.
        fieldnames = csvreader.fieldnames or []
        if 'ItemURL' not in fieldnames or 'Prompt' not in fieldnames:
            return None

        return [(row['Prompt'], row['ItemURL']) for row in csvreader]
37
# This file name pattern allows Galaxy to discover the dataset designation
# and type.
FNPAT = "%(designation)s#%(ext)s"


def galaxy_name(itemurl, ext):
    """Construct a filename suitable for dataset discovery by Galaxy.

    The designation is the last path component of the item URL. Trailing
    slashes are ignored so that a URL such as C{.../item/} does not yield
    an empty designation.

    @type itemurl: C{String}
    @param itemurl: the item URL from Alveo

    @type ext: C{String}
    @param ext: the datatype extension for the resulting file

    @rtype: C{String}
    @returns: the file name, formatted according to C{FNPAT}
    """
    # Strip trailing slashes first; otherwise split('/')[-1] would be ''.
    itemname = itemurl.rstrip('/').split('/')[-1]
    return FNPAT % {'designation': itemname, 'ext': ext}
56
57
class IncompleteLexiconError(Exception):
    """Raised when a transcript contains a word that is not in the lexicon."""


def build_bpf(ortho_trans, lexicon):
    """ Given an orthographic transcript, generate a BPF-format phonetic
    transcription for passing to MAUS, using the specified lexicon.

    The transcript is split on whitespace and on the punctuation
    characters C{. , ! ? " -}, and every word is lower-cased before it is
    looked up. The result has one C{ORT:} line per word followed by one
    C{KAN:} line per word, each carrying the zero-based word index.

    @type ortho_trans: C{String}
    @param ortho_trans: the (space-separated) orthographic transcript
    @type lexicon: C{Dict}
    @param lexicon: the lexicon to use to translate words to phonetic symbols

    @rtype: C{String}
    @returns: the BPF-formatted transcript

    @raises IncompleteLexiconError: if there is a word appearing in the
        orthographic transcript that is not covered by the lexicon

    """
    splitter = re.compile(r'[\s.,!?"\-]')
    # Splitting on adjacent separators yields empty strings; drop them.
    words = [w.lower() for w in splitter.split(ortho_trans) if w]
    ort = []
    kan = []

    for n, word in enumerate(words):
        # Only the lookup can fail, so it is the only statement guarded.
        try:
            pron = lexicon[word]
        except KeyError:
            raise IncompleteLexiconError("'" + word +
                                         "' not present in lexicon")
        ort.append("ORT: %d %s" % (n, word))
        kan.append("KAN: %d %s" % (n, pron))

    nl = u"\n"
    return nl.join(ort) + nl + nl.join(kan)
90
91
def load_lexicon(lexiconfile):
    """ Load the given file as a lexicon dictionary.
    Should be a tsv file with two columns, first column
    is orthography, second is phonetic transcription.
    Blank lines are ignored.

    @type lexiconfile: C{String}
    @param lexiconfile: the filename of the lexicon file

    @rtype: C{Dict}
    @returns: the lexicon, as a dictionary with orthographic entries as keys

    @raises ValueError: if a non-blank line does not consist of exactly
        two tab-separated columns

    """
    lex = {}

    with open(lexiconfile) as f:
        for lineno, line in enumerate(f, 1):
            # Remove the line terminator, otherwise it ends up inside the
            # phonetic transcription and in every generated KAN line.
            line = line.rstrip('\r\n')
            if not line.strip():
                continue

            fields = line.split('\t')
            if len(fields) != 2:
                raise ValueError(
                    "%s, line %d: expected 2 tab-separated columns, found %d"
                    % (lexiconfile, lineno, len(fields)))

            orth, pron = fields
            lex[orth] = pron

    return lex
112
113
def list_to_bpf(item_list, lexicon, output_path):
    """
    Write one BPF file per entry of the item list.

    The output directory is created if it does not exist yet. For every
    entry the file name is derived from the item URL (with the C{par}
    extension) and the file content is the BPF transcription of the prompt.

    @type item_list: C{List}
    @param item_list: C{(prompt, item URL)} pairs

    @type lexicon: C{Dict}
    @param lexicon: the lexicon used to translate the prompt words

    @type output_path: C{String}
    @param output_path: directory the BPF files are written to
    """
    directory_present = os.path.exists(output_path)
    if not directory_present:
        os.makedirs(output_path)

    for entry in item_list:
        prompt, item_url = entry

        # Build the transcription before opening the file, so that a
        # failure there does not leave an empty output file behind.
        target_name = galaxy_name(item_url, 'par')
        transcription = build_bpf(prompt, lexicon)

        target_path = os.path.join(output_path, target_name)
        with open(target_path, 'w') as handle:
            handle.write(transcription)
135
136
def main():
    """Entry point: read the item list and the lexicon named on the
    command line and write one BPF file per item to the output path.

    Exits with an error message if the item list does not have the
    C{Prompt} and C{ItemURL} columns.
    """
    args = parser()

    item_list = read_item_list(args.item_list)
    if item_list is None:
        # read_item_list signals a missing column with None; stop here with
        # a readable message rather than failing later while iterating.
        sys.exit("Item list '%s' must be a tab-separated file with "
                 "'Prompt' and 'ItemURL' columns" % args.item_list)

    lexicon = load_lexicon(args.lexicon)
    list_to_bpf(item_list, lexicon, args.output_path)


if __name__ == '__main__':
    main()