# HG changeset patch # User stevecassidy # Date 1471319146 14400 # Node ID 6fef3489d97c86d628f6d3979b4a942885033c90 # Parent bfe39bd252df89460ee5cb271a3211e04b389525 planemo upload commit 0fca33c3b7285bd31f6c7380393d08bbdad4e4d6 diff -r bfe39bd252df -r 6fef3489d97c alveo_api_key.pyc Binary file alveo_api_key.pyc has changed diff -r bfe39bd252df -r 6fef3489d97c alveo_api_key.xml --- a/alveo_api_key.xml Mon Jul 18 23:49:40 2016 -0400 +++ b/alveo_api_key.xml Mon Aug 15 23:45:46 2016 -0400 @@ -2,7 +2,7 @@ for use with Alveo tools - pyalveo + pyalveo @@ -19,7 +19,7 @@ - + diff -r bfe39bd252df -r 6fef3489d97c alveo_get_item_data.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alveo_get_item_data.py Mon Aug 15 23:45:46 2016 -0400 @@ -0,0 +1,100 @@ +from __future__ import print_function +import json +import argparse +import pyalveo +import sys +import os +from fnmatch import fnmatch +import csv + +API_URL = 'https://app.alveo.edu.au' # TODO: export constants to a separate module + +def parser(): + parser = argparse.ArgumentParser(description="Downloads documents in an Alveo Item List") + parser.add_argument('--api_key', required=True, action="store", type=str, help="Alveo API key") + parser.add_argument('--item_list', required=True, action="store", type=str, help="File containing list of item URLs") + parser.add_argument('--patterns', required=True, action="store", type=str, help="File patterns to download") + parser.add_argument('--output_path', required=True, action="store", type=str, help="Path to output file") + return parser.parse_args() + +def read_item_list(filename, client): + """Read an item list from a file + which should be a tabular formatted file + with one column header ItemURL. + Return an instance of ItemGroup""" + + with open(filename) as fd: + csvreader = csv.DictReader(fd, dialect='excel-tab') + if 'ItemURL' not in csvreader.fieldnames: + return None + itemurls = [] + for row in csvreader: + itemurls.append(row['ItemURL']) + + itemlist = pyalveo.ItemGroup(itemurls, client) + + return itemlist + +# this file name pattern allows galaxy to discover the dataset designation and type +FNPAT = "%(designation)s#%(ext)s" + +def galaxy_name(itemname, fname): + """construct a filename suitable for Galaxy dataset discovery + designation - (dataset identifier) is the file basename + ext - defines the dataset type and is the file extension + """ + + root, ext = os.path.splitext(fname) + ext = ext[1:] # remove initial . + fname = FNPAT % {'designation': itemname, 'ext': ext} + + return fname + +def download_documents(item_list, patterns, output_path): + """ + Downloads a list of documents to the directory specificed by output_path. + + :type documents: list of pyalveo.Document + :param documents: Documents to download + + :type output_path: String + :param output_path: directory to download to the documents to + """ + if not os.path.exists(output_path): + os.makedirs(output_path) + + downloaded = [] + + items = item_list.get_all() + filtered_documents = [] + for item in items: + documents = item.get_documents() + for doc in documents: + for pattern in patterns: + if not pattern == '' and fnmatch(doc.get_filename(), pattern): + fname = galaxy_name(item.metadata()['alveo:metadata']['dc:identifier'], doc.get_filename()) + try: + doc.download_content(dir_path=output_path, filename=fname) + downloaded.append(doc.get_filename()) + except: + # maybe it doesn't exist or we have no access + # TODO: report this + pass + return downloaded + +def main(): + args = parser() + try: + api_key = open(args.api_key, 'r').read().strip() + + client = pyalveo.Client(api_url=API_URL, api_key=api_key) + + item_list = read_item_list(args.item_list, client) + patterns = args.patterns.split(',') + downloaded = download_documents(item_list, patterns, args.output_path) + except pyalveo.APIError as e: + print("ERROR: " + str(e), file=sys.stderr) + sys.exit(1) + +if __name__ == '__main__': + main() diff -r bfe39bd252df -r 6fef3489d97c alveo_get_item_data.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alveo_get_item_data.xml Mon Aug 15 23:45:46 2016 -0400 @@ -0,0 +1,68 @@ + + Downloads files from the items in an Galaxy list of items + + + pyalveo + + + + alveo_get_item_data.py --api_key $api_key --item_list $item_list --patterns $patterns,$patternselect --output_path ItemListData + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Downloads files from a local list of Alveo items. You can download all files or those matching + a wildcard pattern (e.g. *.txt). Results will be stored as a dataset collection in + your history. + + + @article{cassidy2014alveo, + title={The alveo virtual laboratory: a web based repository API}, + author={Cassidy, Steve and Estival, Dominique and Jones, Tim and Sefton, Peter and Burnham, Denis and Burghold, Jared and others}, + year={2014}, + publisher={Reykjavik, Iceland: European Language Resources Association} + } + + + diff -r bfe39bd252df -r 6fef3489d97c alveo_get_primary_text.xml --- a/alveo_get_primary_text.xml Mon Jul 18 23:49:40 2016 -0400 +++ b/alveo_get_primary_text.xml Mon Aug 15 23:45:46 2016 -0400 @@ -2,7 +2,7 @@ Downloads primary text from the items in an Alveo Item List - pyalveo + pyalveo diff -r bfe39bd252df -r 6fef3489d97c alveo_item_list_downloader.py --- a/alveo_item_list_downloader.py Mon Jul 18 23:49:40 2016 -0400 +++ b/alveo_item_list_downloader.py Mon Aug 15 23:45:46 2016 -0400 @@ -23,15 +23,20 @@ # this file name pattern allows galaxy to discover the dataset designation and type FNPAT = "%(designation)s_%(ext)s" -def galaxy_name(fname): - """construct a filename suitable for Galaxy dataset discovery""" + +def galaxy_name(itemname, fname): + """construct a filename suitable for Galaxy dataset discovery + designation - (dataset identifier) is the file basename + ext - defines the dataset type and is the file extension + """ root, ext = os.path.splitext(fname) ext = ext[1:] # remove initial . - fname = FNPAT % {'designation': fname, 'ext': ext} + fname = FNPAT % {'designation': itemname, 'ext': ext} return fname + def download_documents(item_list, patterns, output_path): """ Downloads a list of documents to the directory specificed by output_path. @@ -54,7 +59,7 @@ for doc in documents: for pattern in patterns: if not pattern == '' and fnmatch(doc.get_filename(), pattern): - fname = galaxy_name(doc.get_filename()) + fname = galaxy_name(item.metadata()['alveo:metadata']['dc:identifier'], doc.get_filename()) try: doc.download_content(dir_path=output_path, filename=fname) downloaded.append(doc.get_filename()) diff -r bfe39bd252df -r 6fef3489d97c alveo_item_list_downloader.xml --- a/alveo_item_list_downloader.xml Mon Jul 18 23:49:40 2016 -0400 +++ b/alveo_item_list_downloader.xml Mon Aug 15 23:45:46 2016 -0400 @@ -2,7 +2,7 @@ Downloads files from the items in an Alveo Item List - pyalveo + pyalveo @@ -51,7 +51,7 @@ - + @@ -66,7 +66,7 @@ - + diff -r bfe39bd252df -r 6fef3489d97c alveo_item_list_importer.pyc Binary file alveo_item_list_importer.pyc has changed diff -r bfe39bd252df -r 6fef3489d97c alveo_item_list_importer.xml --- a/alveo_item_list_importer.xml Mon Jul 18 23:49:40 2016 -0400 +++ b/alveo_item_list_importer.xml Mon Aug 15 23:45:46 2016 -0400 @@ -2,7 +2,7 @@ Retrieves item list metadata. - pyalveo + pyalveo diff -r bfe39bd252df -r 6fef3489d97c austalk-select-hVd-words.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/austalk-select-hVd-words.py Mon Aug 15 23:45:46 2016 -0400 @@ -0,0 +1,87 @@ +from __future__ import print_function +import argparse +import pyalveo +import sys + +API_URL = 'https://app.alveo.edu.au/' +PREFIXES = """ +PREFIX dc: +PREFIX austalk: +PREFIX olac: +PREFIX ausnc: +PREFIX foaf: +PREFIX dbpedia: +PREFIX rdf: +PREFIX rdfs: +PREFIX geo: +PREFIX iso639schema: +PREFIX austalkid: +PREFIX iso639: +PREFIX xsd: +PREFIX is: +PREFIX iso: +PREFIX dada: """ + +def parser(): + parser = argparse.ArgumentParser(description="Retrieves Alveo Item Lists") + parser.add_argument('--api_key', required=True, action="store", type=str, help="Alveo API key") + parser.add_argument('--speaker', required=True, action="store", type=str, help="Speaker identifier") + parser.add_argument('--words', required=False, default='all', action="store", type=str, help="Word group (all, monopthongs, dipthongs)") + parser.add_argument('--output', required=True, action="store", type=str, help="output file name") + return parser.parse_args() + +def find_hVd_words(api_key, speakerid, output, words='all'): + """Find words in the Austalk corpus + """ + + client = pyalveo.Client(api_key, API_URL) + + query = PREFIXES + """ +SELECT distinct ?item ?prompt ?compname +WHERE { + ?item a ausnc:AusNCObject . + ?item olac:speaker ?speaker . + ?speaker austalk:id "%s" . + ?item austalk:prompt ?prompt . + ?item austalk:componentName ?compname . + """ % speakerid + + hVdWords = { + 'monopthongs': ['head', 'had', 'hud', 'heed', 'hid', 'hood', 'hod', "whod"], + 'dipthongs': ['herd', 'howd', 'hoyd', 'haired', 'hard', 'heared'] + } + + if words == 'all': + words = hVdWords['monopthongs'] + hVdWords['dipthongs'] + else: + words = hVdWords[words] + + filterclause = 'FILTER regex(?prompt, "^' + filterclause += '$|^'.join(words) + filterclause += '$", "i")\n' + + query += filterclause + "}" + + result = client.sparql_query('austalk', query) + + items = [] + for b in result['results']['bindings']: + items.append((b['prompt']['value'], b['item']['value'])) + + with open(output, 'w') as out: + out.write("Speaker\tPrompt\tItemURL\n") + for item in items: + out.write(speakerid + "\t" + item[0] + "\t" + item[1] + "\n") + + +def main(): + args = parser() + try: + api_key = open(args.api_key, 'r').read().strip() + find_hVd_words(api_key, args.speaker, args.output, args.words) + except Exception as e: + print("ERROR: " + str(e), file=sys.stderr) + sys.exit(1) + +if __name__ == '__main__': + main() diff -r bfe39bd252df -r 6fef3489d97c austalk-select-hVd-words.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/austalk-select-hVd-words.xml Mon Aug 15 23:45:46 2016 -0400 @@ -0,0 +1,44 @@ + + for a single speaker + + + pyalveo + + + + austalk-select-hVd-words.py --api_key $api_key --speaker $speaker --words $words --output $output + + + + + + + + + + + + + + + + + + + + + Find items corresponding to the hVd words for this Austalk speaker. + + + + @inproceedings{Buschmeir2013, + author = {{Hendrik Buschmeier}, Marcin Wlodarczak}, + booktitle = {Tagungsband der 24. Konferenz zur Elektronischen Sprachsignalverarbeitung (ESSV 2013)}, + pages = {152--157}, + title = {{TextGridTools: A TextGrid Processing and Analysis Toolkit for Python}}, + year = {2013} + } + + + diff -r bfe39bd252df -r 6fef3489d97c test-data/api-key.dat --- a/test-data/api-key.dat Mon Jul 18 23:49:40 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1 +0,0 @@ -insert your api key here diff -r bfe39bd252df -r 6fef3489d97c test-data/item-lists.dat --- a/test-data/item-lists.dat Mon Jul 18 23:49:40 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,15 +0,0 @@ -austalk_catepillar (309) https://app.alveo.edu.au/item_lists/64 -different (888) https://app.alveo.edu.au/item_lists/132 -gum-tree (58) https://app.alveo.edu.au/item_lists/84 -M&D_Test_140904 (10) https://app.alveo.edu.au/item_lists/168 -rose (245) https://app.alveo.edu.au/item_lists/82 -thistle (16) https://app.alveo.edu.au/item_lists/83 -ace-specialised (122) https://app.alveo.edu.au/item_lists/178 -austalk_hide (42) https://app.alveo.edu.au/item_lists/251 -austalk-male-digits (144) https://app.alveo.edu.au/item_lists/412 -COOEE ALL (1354) https://app.alveo.edu.au/item_lists/95 -cooee sample (129) https://app.alveo.edu.au/item_lists/53 -dialogue-all (76) https://app.alveo.edu.au/item_lists/116 -dialogue-sample (6) https://app.alveo.edu.au/item_lists/180 -mdsample (20) https://app.alveo.edu.au/item_lists/52 -one austalk sample (1) https://app.alveo.edu.au/item_lists/179 diff -r bfe39bd252df -r 6fef3489d97c test_alveo_api_key.py --- a/test_alveo_api_key.py Mon Jul 18 23:49:40 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,25 +0,0 @@ -import unittest -import os -import alveo_api_key -import pyalveo -from mock import Mock - -class TestAlveoAPIKey(unittest.TestCase): - - OUTPUT_PATH = 'test.txt' - API_KEY = 'test123' - MOCK_CLIENT = Mock(pyalveo) - - def test_write_key(self): - alveo_api_key.write_key(self.API_KEY, self.OUTPUT_PATH, self.MOCK_CLIENT) - actual = open(self.OUTPUT_PATH, 'r').read() - self.assertEqual(self.API_KEY, actual) - - def tearDown(self): - try: - os.remove(self.OUTPUT_PATH) - except OSError: - pass - -if __name__ == '__main__': - unittest.main() \ No newline at end of file diff -r bfe39bd252df -r 6fef3489d97c test_alveo_item_list_downloader.py --- a/test_alveo_item_list_downloader.py Mon Jul 18 23:49:40 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,28 +0,0 @@ -import unittest -import os -import json -import alveo_item_list_importer -import pyalveo -from mock import Mock - -class TestAlveoItemListImporter(unittest.TestCase): - - API_KEY = 'test123' - OUTPUT_PATH = 'test.csv' - ITEM_LIST = '{"shared": [{"shared": true, "num_items": 309, "name": "austalk_catepillar", "item_list_url": "https://app.alveo.edu.au/item_lists/64"}]}' - CSV_CONTENTS = 'austalk_catepillar (309)\thttps://app.alveo.edu.au/item_lists/64\n' - - def test_write_table(self): - api_list = json.loads(self.ITEM_LIST) - alveo_item_list_importer.write_table(api_list, self.OUTPUT_PATH) - actual = open(self.OUTPUT_PATH, 'r').read() - self.assertEqual(self.CSV_CONTENTS, actual) - - def tearDown(self): - try: - os.remove(self.OUTPUT_PATH) - except OSError: - pass - -if __name__ == '__main__': - unittest.main() \ No newline at end of file diff -r bfe39bd252df -r 6fef3489d97c test_alveo_item_list_importer.py --- a/test_alveo_item_list_importer.py Mon Jul 18 23:49:40 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,28 +0,0 @@ -import unittest -import os -import json -import alveo_item_list_importer -import pyalveo -from mock import Mock - -class TestAlveoItemListImporter(unittest.TestCase): - - API_KEY = 'test123' - OUTPUT_PATH = 'test.csv' - ITEM_LIST = '{"shared": [{"shared": true, "num_items": 309, "name": "austalk_catepillar", "item_list_url": "https://app.alveo.edu.au/item_lists/64"}]}' - CSV_CONTENTS = 'austalk_catepillar (309)\thttps://app.alveo.edu.au/item_lists/64\n' - - def test_write_table(self): - api_list = json.loads(self.ITEM_LIST) - alveo_item_list_importer.write_table(api_list, self.OUTPUT_PATH) - actual = open(self.OUTPUT_PATH, 'r').read() - self.assertEqual(self.CSV_CONTENTS, actual) - - def tearDown(self): - try: - os.remove(self.OUTPUT_PATH) - except OSError: - pass - -if __name__ == '__main__': - unittest.main() \ No newline at end of file diff -r bfe39bd252df -r 6fef3489d97c tool_dependencies.xml --- a/tool_dependencies.xml Mon Jul 18 23:49:40 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,10 +0,0 @@ - - - - - - pyalveo==0.4 - - - -