Mercurial > repos > stevecassidy > alveoimport
changeset 1:6fef3489d97c draft
planemo upload commit 0fca33c3b7285bd31f6c7380393d08bbdad4e4d6
author | stevecassidy |
---|---|
date | Mon, 15 Aug 2016 23:45:46 -0400 |
parents | bfe39bd252df |
children | 7b6021997b8e |
files | alveo_api_key.pyc alveo_api_key.xml alveo_get_item_data.py alveo_get_item_data.xml alveo_get_primary_text.xml alveo_item_list_downloader.py alveo_item_list_downloader.xml alveo_item_list_importer.pyc alveo_item_list_importer.xml austalk-select-hVd-words.py austalk-select-hVd-words.xml test-data/api-key.dat test-data/item-lists.dat test_alveo_api_key.py test_alveo_item_list_downloader.py test_alveo_item_list_importer.py tool_dependencies.xml |
diffstat | 17 files changed, 315 insertions(+), 118 deletions(-) [+] |
line wrap: on
line diff
--- a/alveo_api_key.xml Mon Jul 18 23:49:40 2016 -0400 +++ b/alveo_api_key.xml Mon Aug 15 23:45:46 2016 -0400 @@ -2,7 +2,7 @@ <description>for use with Alveo tools</description> <requirements> - <requirement type="package" version="0.4">pyalveo</requirement> + <requirement type="package" version="0.6">pyalveo</requirement> </requirements> <command interpreter="python"> @@ -19,7 +19,7 @@ <tests> <test> - <param name="api_key" value="9swHm5MgVxdnuhrqdqwk" /> + <param name="api_key" value="your api key here" /> <output name="output" file="api-key.dat" compare="contains" /> </test> </tests>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alveo_get_item_data.py Mon Aug 15 23:45:46 2016 -0400 @@ -0,0 +1,100 @@ +from __future__ import print_function +import json +import argparse +import pyalveo +import sys +import os +from fnmatch import fnmatch +import csv + +API_URL = 'https://app.alveo.edu.au' # TODO: export constants to a separate module + +def parser(): + parser = argparse.ArgumentParser(description="Downloads documents in an Alveo Item List") + parser.add_argument('--api_key', required=True, action="store", type=str, help="Alveo API key") + parser.add_argument('--item_list', required=True, action="store", type=str, help="File containing list of item URLs") + parser.add_argument('--patterns', required=True, action="store", type=str, help="File patterns to download") + parser.add_argument('--output_path', required=True, action="store", type=str, help="Path to output file") + return parser.parse_args() + +def read_item_list(filename, client): + """Read an item list from a file + which should be a tabular formatted file + with one column header ItemURL. + Return an instance of ItemGroup""" + + with open(filename) as fd: + csvreader = csv.DictReader(fd, dialect='excel-tab') + if 'ItemURL' not in csvreader.fieldnames: + return None + itemurls = [] + for row in csvreader: + itemurls.append(row['ItemURL']) + + itemlist = pyalveo.ItemGroup(itemurls, client) + + return itemlist + +# this file name pattern allows galaxy to discover the dataset designation and type +FNPAT = "%(designation)s#%(ext)s" + +def galaxy_name(itemname, fname): + """construct a filename suitable for Galaxy dataset discovery + designation - (dataset identifier) is the file basename + ext - defines the dataset type and is the file extension + """ + + root, ext = os.path.splitext(fname) + ext = ext[1:] # remove initial . + fname = FNPAT % {'designation': itemname, 'ext': ext} + + return fname + +def download_documents(item_list, patterns, output_path): + """ + Downloads a list of documents to the directory specificed by output_path. + + :type documents: list of pyalveo.Document + :param documents: Documents to download + + :type output_path: String + :param output_path: directory to download to the documents to + """ + if not os.path.exists(output_path): + os.makedirs(output_path) + + downloaded = [] + + items = item_list.get_all() + filtered_documents = [] + for item in items: + documents = item.get_documents() + for doc in documents: + for pattern in patterns: + if not pattern == '' and fnmatch(doc.get_filename(), pattern): + fname = galaxy_name(item.metadata()['alveo:metadata']['dc:identifier'], doc.get_filename()) + try: + doc.download_content(dir_path=output_path, filename=fname) + downloaded.append(doc.get_filename()) + except: + # maybe it doesn't exist or we have no access + # TODO: report this + pass + return downloaded + +def main(): + args = parser() + try: + api_key = open(args.api_key, 'r').read().strip() + + client = pyalveo.Client(api_url=API_URL, api_key=api_key) + + item_list = read_item_list(args.item_list, client) + patterns = args.patterns.split(',') + downloaded = download_documents(item_list, patterns, args.output_path) + except pyalveo.APIError as e: + print("ERROR: " + str(e), file=sys.stderr) + sys.exit(1) + +if __name__ == '__main__': + main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alveo_get_item_data.xml Mon Aug 15 23:45:46 2016 -0400 @@ -0,0 +1,68 @@ +<tool id="alveo_get_item_data" name="Get Alveo Data for Items" version="0.01" force_history_refresh="True"> + <description>Downloads files from the items in an Galaxy list of items</description> + + <requirements> + <requirement type="package" version="0.6">pyalveo</requirement> + </requirements> + + <command interpreter="python"> + alveo_get_item_data.py --api_key $api_key --item_list $item_list --patterns $patterns,$patternselect --output_path ItemListData + </command> + + <inputs> + <param name="api_key" type="data" format="txt" label="API Key" help="Your Alveo API key"/> + <param name="item_list" type="data" format="tabular" label="Item List (table)" help=""/> + + <param name="patternselect" type="select" multiple="true" label="Predefined imports" display="checkboxes"> + <option value='*'>All Files</option> + <option value='*speaker16.wav'>Austalk 16bit/16kHz Speaker Headset WAV (*speaker16.wav)</option> + <option value='*plain.txt'>Plain text documents (*plain.txt)</option> + <option value='*.txt'>All text documents (*.txt)</option> + <option value=''>Other - enter pattern below</option> + </param> + + <param name="patterns" type="text" label="File patterns to import" + optional="true" + help="One or more file patterns separated by commas eg. *.wav,*.txt"/> + + <param name="job_name" type="text" size="25" + label="Supply a name for the output to remind you what it contains" value="Alveo Data"/> + </inputs> + + <outputs> + <collection type="list" label="$job_name" name="output1"> + <discover_datasets pattern="(?P<designation>[^#]+)#(?P<ext>.+)" directory="ItemListData"/> + </collection> + </outputs> + + <tests> + <test> + <param name="api_key" value="api-key.dat"/> + <param name="item_list" value="hvd-sample-items.dat"/> + <param name="patterns" value="*.TextGrid"/> + <param name="patternselect" value=""/> + <param name="output_path" value="test_out"/> + <output_collection name="output1" type="list" count="5"> + <element name="1_1308_2_22_023"> + <assert_contents> + <has_text_matching expression="xmax = 1.020000"/> + </assert_contents> + </element> + </output_collection> + </test> + </tests> + + <help>Downloads files from a local list of Alveo items. You can download all files or those matching + a wildcard pattern (e.g. *.txt). Results will be stored as a dataset collection in + your history.</help> + <citations> + <citation type='bibtex'> + @article{cassidy2014alveo, + title={The alveo virtual laboratory: a web based repository API}, + author={Cassidy, Steve and Estival, Dominique and Jones, Tim and Sefton, Peter and Burnham, Denis and Burghold, Jared and others}, + year={2014}, + publisher={Reykjavik, Iceland: European Language Resources Association} + } + </citation> + </citations> +</tool>
--- a/alveo_get_primary_text.xml Mon Jul 18 23:49:40 2016 -0400 +++ b/alveo_get_primary_text.xml Mon Aug 15 23:45:46 2016 -0400 @@ -2,7 +2,7 @@ <description>Downloads primary text from the items in an Alveo Item List</description> <requirements> - <requirement type="package" version="0.4">pyalveo</requirement> + <requirement type="package" version="0.6">pyalveo</requirement> </requirements> <command interpreter="python">
--- a/alveo_item_list_downloader.py Mon Jul 18 23:49:40 2016 -0400 +++ b/alveo_item_list_downloader.py Mon Aug 15 23:45:46 2016 -0400 @@ -23,15 +23,20 @@ # this file name pattern allows galaxy to discover the dataset designation and type FNPAT = "%(designation)s_%(ext)s" -def galaxy_name(fname): - """construct a filename suitable for Galaxy dataset discovery""" + +def galaxy_name(itemname, fname): + """construct a filename suitable for Galaxy dataset discovery + designation - (dataset identifier) is the file basename + ext - defines the dataset type and is the file extension + """ root, ext = os.path.splitext(fname) ext = ext[1:] # remove initial . - fname = FNPAT % {'designation': fname, 'ext': ext} + fname = FNPAT % {'designation': itemname, 'ext': ext} return fname + def download_documents(item_list, patterns, output_path): """ Downloads a list of documents to the directory specificed by output_path. @@ -54,7 +59,7 @@ for doc in documents: for pattern in patterns: if not pattern == '' and fnmatch(doc.get_filename(), pattern): - fname = galaxy_name(doc.get_filename()) + fname = galaxy_name(item.metadata()['alveo:metadata']['dc:identifier'], doc.get_filename()) try: doc.download_content(dir_path=output_path, filename=fname) downloaded.append(doc.get_filename())
--- a/alveo_item_list_downloader.xml Mon Jul 18 23:49:40 2016 -0400 +++ b/alveo_item_list_downloader.xml Mon Aug 15 23:45:46 2016 -0400 @@ -2,7 +2,7 @@ <description>Downloads files from the items in an Alveo Item List</description> <requirements> - <requirement type="package" version="0.4">pyalveo</requirement> + <requirement type="package" version="0.6">pyalveo</requirement> </requirements> <command interpreter="python"> @@ -51,7 +51,7 @@ <param name="patternselect" value="*plain.txt"/> <param name="output_path" value="test_out"/> <output_collection name="output1" type="list" count="6"> - <element name="GCSAusE02-plain.txt"> + <element name="GCSAusE02"> <assert_contents> <has_text_matching expression="background noises"/> </assert_contents> @@ -66,7 +66,7 @@ <param name="patternselect" value=""/> <param name="output_path" value="test_out"/> <output_collection name="output1" type="list" count="6"> - <element name="GCSAusE02-plain.txt"> + <element name="GCSAusE02"> <assert_contents> <has_text_matching expression="background noises"/> </assert_contents>
--- a/alveo_item_list_importer.xml Mon Jul 18 23:49:40 2016 -0400 +++ b/alveo_item_list_importer.xml Mon Aug 15 23:45:46 2016 -0400 @@ -2,7 +2,7 @@ <description>Retrieves item list metadata.</description> <requirements> - <requirement type="package" version="0.4">pyalveo</requirement> + <requirement type="package" version="0.6">pyalveo</requirement> </requirements> <command interpreter="python">
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/austalk-select-hVd-words.py Mon Aug 15 23:45:46 2016 -0400 @@ -0,0 +1,87 @@ +from __future__ import print_function +import argparse +import pyalveo +import sys + +API_URL = 'https://app.alveo.edu.au/' +PREFIXES = """ +PREFIX dc:<http://purl.org/dc/terms/> +PREFIX austalk:<http://ns.austalk.edu.au/> +PREFIX olac:<http://www.language-archives.org/OLAC/1.1/> +PREFIX ausnc:<http://ns.ausnc.org.au/schemas/ausnc_md_model/> +PREFIX foaf:<http://xmlns.com/foaf/0.1/> +PREFIX dbpedia:<http://dbpedia.org/ontology/> +PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#> +PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#> +PREFIX geo:<http://www.w3.org/2003/01/geo/wgs84_pos#> +PREFIX iso639schema:<http://downlode.org/rdf/iso-639/schema#> +PREFIX austalkid:<http://id.austalk.edu.au/> +PREFIX iso639:<http://downlode.org/rdf/iso-639/languages#> +PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> +PREFIX is: <http://purl.org/ontology/is/core#> +PREFIX iso: <http://purl.org/iso25964/skos-thes#> +PREFIX dada: <http://purl.org/dada/schema/0.2#>""" + +def parser(): + parser = argparse.ArgumentParser(description="Retrieves Alveo Item Lists") + parser.add_argument('--api_key', required=True, action="store", type=str, help="Alveo API key") + parser.add_argument('--speaker', required=True, action="store", type=str, help="Speaker identifier") + parser.add_argument('--words', required=False, default='all', action="store", type=str, help="Word group (all, monopthongs, dipthongs)") + parser.add_argument('--output', required=True, action="store", type=str, help="output file name") + return parser.parse_args() + +def find_hVd_words(api_key, speakerid, output, words='all'): + """Find words in the Austalk corpus + """ + + client = pyalveo.Client(api_key, API_URL) + + query = PREFIXES + """ +SELECT distinct ?item ?prompt ?compname +WHERE { + ?item a ausnc:AusNCObject . + ?item olac:speaker ?speaker . + ?speaker austalk:id "%s" . + ?item austalk:prompt ?prompt . + ?item austalk:componentName ?compname . + """ % speakerid + + hVdWords = { + 'monopthongs': ['head', 'had', 'hud', 'heed', 'hid', 'hood', 'hod', "whod"], + 'dipthongs': ['herd', 'howd', 'hoyd', 'haired', 'hard', 'heared'] + } + + if words == 'all': + words = hVdWords['monopthongs'] + hVdWords['dipthongs'] + else: + words = hVdWords[words] + + filterclause = 'FILTER regex(?prompt, "^' + filterclause += '$|^'.join(words) + filterclause += '$", "i")\n' + + query += filterclause + "}" + + result = client.sparql_query('austalk', query) + + items = [] + for b in result['results']['bindings']: + items.append((b['prompt']['value'], b['item']['value'])) + + with open(output, 'w') as out: + out.write("Speaker\tPrompt\tItemURL\n") + for item in items: + out.write(speakerid + "\t" + item[0] + "\t" + item[1] + "\n") + + +def main(): + args = parser() + try: + api_key = open(args.api_key, 'r').read().strip() + find_hVd_words(api_key, args.speaker, args.output, args.words) + except Exception as e: + print("ERROR: " + str(e), file=sys.stderr) + sys.exit(1) + +if __name__ == '__main__': + main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/austalk-select-hVd-words.xml Mon Aug 15 23:45:46 2016 -0400 @@ -0,0 +1,44 @@ +<tool id="austalk-select-hvd-words" name="Find HVD words in Austalk" version="0.01" force_history_refresh="True"> + <description>for a single speaker</description> + + <requirements> + <requirement type="package" version="0.6">pyalveo</requirement> + </requirements> + + <command interpreter="python"> + austalk-select-hVd-words.py --api_key $api_key --speaker $speaker --words $words --output $output + </command> + + <inputs> + <param name="api_key" type="data" format="txt" label="API Key" help="Your Alveo API key"/> + <param name="speaker" type="text" format="text" label="Speaker ID" help="e.g. 1_123"/> + <param name="words" type="select" multiple="false" label="Word List" display="radioboxes"> + <option value='all'>All hVd words</option> + <option value='monopthongs'>hVd monopthongs</option> + <option value='dipthongs'>hVd dipthongs</option> + </param> + <param name="job_name" type="text" size="25" + label="Supply a name for the output to remind you what it contains" value="Query Results"/> + </inputs> + + <outputs> + <data format="tabular" name="output" label="$job_name" /> + </outputs> + + <tests> + </tests> + + <help>Find items corresponding to the hVd words for this Austalk speaker.</help> + + <citations> + <citation type='bibtex'> + @inproceedings{Buschmeir2013, + author = {{Hendrik Buschmeier}, Marcin Wlodarczak}, + booktitle = {Tagungsband der 24. Konferenz zur Elektronischen Sprachsignalverarbeitung (ESSV 2013)}, + pages = {152--157}, + title = {{TextGridTools: A TextGrid Processing and Analysis Toolkit for Python}}, + year = {2013} + } + </citation> + </citations> +</tool>
--- a/test-data/api-key.dat Mon Jul 18 23:49:40 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1 +0,0 @@ -insert your api key here
--- a/test-data/item-lists.dat Mon Jul 18 23:49:40 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,15 +0,0 @@ -austalk_catepillar (309) https://app.alveo.edu.au/item_lists/64 -different (888) https://app.alveo.edu.au/item_lists/132 -gum-tree (58) https://app.alveo.edu.au/item_lists/84 -M&D_Test_140904 (10) https://app.alveo.edu.au/item_lists/168 -rose (245) https://app.alveo.edu.au/item_lists/82 -thistle (16) https://app.alveo.edu.au/item_lists/83 -ace-specialised (122) https://app.alveo.edu.au/item_lists/178 -austalk_hide (42) https://app.alveo.edu.au/item_lists/251 -austalk-male-digits (144) https://app.alveo.edu.au/item_lists/412 -COOEE ALL (1354) https://app.alveo.edu.au/item_lists/95 -cooee sample (129) https://app.alveo.edu.au/item_lists/53 -dialogue-all (76) https://app.alveo.edu.au/item_lists/116 -dialogue-sample (6) https://app.alveo.edu.au/item_lists/180 -mdsample (20) https://app.alveo.edu.au/item_lists/52 -one austalk sample (1) https://app.alveo.edu.au/item_lists/179
--- a/test_alveo_api_key.py Mon Jul 18 23:49:40 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,25 +0,0 @@ -import unittest -import os -import alveo_api_key -import pyalveo -from mock import Mock - -class TestAlveoAPIKey(unittest.TestCase): - - OUTPUT_PATH = 'test.txt' - API_KEY = 'test123' - MOCK_CLIENT = Mock(pyalveo) - - def test_write_key(self): - alveo_api_key.write_key(self.API_KEY, self.OUTPUT_PATH, self.MOCK_CLIENT) - actual = open(self.OUTPUT_PATH, 'r').read() - self.assertEqual(self.API_KEY, actual) - - def tearDown(self): - try: - os.remove(self.OUTPUT_PATH) - except OSError: - pass - -if __name__ == '__main__': - unittest.main() \ No newline at end of file
--- a/test_alveo_item_list_downloader.py Mon Jul 18 23:49:40 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,28 +0,0 @@ -import unittest -import os -import json -import alveo_item_list_importer -import pyalveo -from mock import Mock - -class TestAlveoItemListImporter(unittest.TestCase): - - API_KEY = 'test123' - OUTPUT_PATH = 'test.csv' - ITEM_LIST = '{"shared": [{"shared": true, "num_items": 309, "name": "austalk_catepillar", "item_list_url": "https://app.alveo.edu.au/item_lists/64"}]}' - CSV_CONTENTS = 'austalk_catepillar (309)\thttps://app.alveo.edu.au/item_lists/64\n' - - def test_write_table(self): - api_list = json.loads(self.ITEM_LIST) - alveo_item_list_importer.write_table(api_list, self.OUTPUT_PATH) - actual = open(self.OUTPUT_PATH, 'r').read() - self.assertEqual(self.CSV_CONTENTS, actual) - - def tearDown(self): - try: - os.remove(self.OUTPUT_PATH) - except OSError: - pass - -if __name__ == '__main__': - unittest.main() \ No newline at end of file
--- a/test_alveo_item_list_importer.py Mon Jul 18 23:49:40 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,28 +0,0 @@ -import unittest -import os -import json -import alveo_item_list_importer -import pyalveo -from mock import Mock - -class TestAlveoItemListImporter(unittest.TestCase): - - API_KEY = 'test123' - OUTPUT_PATH = 'test.csv' - ITEM_LIST = '{"shared": [{"shared": true, "num_items": 309, "name": "austalk_catepillar", "item_list_url": "https://app.alveo.edu.au/item_lists/64"}]}' - CSV_CONTENTS = 'austalk_catepillar (309)\thttps://app.alveo.edu.au/item_lists/64\n' - - def test_write_table(self): - api_list = json.loads(self.ITEM_LIST) - alveo_item_list_importer.write_table(api_list, self.OUTPUT_PATH) - actual = open(self.OUTPUT_PATH, 'r').read() - self.assertEqual(self.CSV_CONTENTS, actual) - - def tearDown(self): - try: - os.remove(self.OUTPUT_PATH) - except OSError: - pass - -if __name__ == '__main__': - unittest.main() \ No newline at end of file
--- a/tool_dependencies.xml Mon Jul 18 23:49:40 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,10 +0,0 @@ -<?xml version="1.0"?> -<tool_dependency> - <package name="pyalveo" version="0.4"> - <install version="1.0"> - <actions> - <action type="setup_virtualenv">pyalveo==0.4</action> - </actions> - </install> - </package> -</tool_dependency>