Mercurial > repos > stevecassidy > alveoimport
changeset 0:bfe39bd252df draft
planemo upload commit 5de43e6a614de2a1b2065bc63823ecc9854ebb32-dirty
author | stevecassidy |
---|---|
date | Mon, 18 Jul 2016 23:49:40 -0400 |
parents | |
children | 6fef3489d97c |
files | alveo_api_key.py alveo_api_key.pyc alveo_api_key.xml alveo_get_primary_text.py alveo_get_primary_text.xml alveo_item_list_downloader.py alveo_item_list_downloader.xml alveo_item_list_importer.py alveo_item_list_importer.pyc alveo_item_list_importer.xml test-data/api-key.dat test-data/item-lists.dat test_alveo_api_key.py test_alveo_item_list_downloader.py test_alveo_item_list_importer.py tool_dependencies.xml |
diffstat | 16 files changed, 577 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
# alveo_api_key.py -- Galaxy tool script that stores an Alveo API key in a file.
from __future__ import print_function

import argparse
import sys

import pyalveo

API_URL = 'https://app.alveo.edu.au'


def parser():
    """Parse the command line and return the resulting argparse namespace.

    Both ``--api_key`` and ``--output_path`` are required.
    """
    arg_parser = argparse.ArgumentParser(description="Stores an Alveo API key")
    arg_parser.add_argument('--api_key', required=True, action="store", type=str,
                            help="Alveo API key")
    arg_parser.add_argument('--output_path', required=True, action="store", type=str,
                            help="File to store the API key in")
    return arg_parser.parse_args()


def write_key(api_key, output_path, client_module=pyalveo):
    """Build an Alveo client for the key, then write the key to a file.

    :type api_key: String
    :param api_key: Alveo API key

    :type output_path: String
    :param output_path: Path to the file to store the API key in

    :type client_module: module
    :param client_module: Module providing ``Client`` (replaceable for testing
        purposes), defaults to pyalveo

    :raises: pyalveo.APIError if the API request is not successful

    """
    # Creating the client is the only check performed on the key; the instance
    # is not needed afterwards.  NOTE(review): presumably the constructor
    # contacts the API and raises for a bad key -- confirm against pyalveo.
    # Keyword arguments are used (as in the other Alveo scripts) so the call
    # does not depend on the positional order of Client's parameters.
    client_module.Client(api_key=api_key, api_url=API_URL)

    # The context manager closes the file even if the write fails.
    with open(output_path, 'w') as outfile:
        outfile.write(api_key)


def main():
    """Command-line entry point: report any failure on stderr and exit 1."""
    args = parser()
    try:
        write_key(args.api_key, args.output_path)
    except Exception as e:
        print("ERROR: " + str(e), file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()
<!-- alveo_api_key.xml: Galaxy tool wrapper for alveo_api_key.py -->
<tool id="alveo_api_key_storer" name="Store Alveo API Key" version="0.01" force_history_refresh="True">
    <description>for use with Alveo tools</description>

    <requirements>
        <requirement type="package" version="0.4">pyalveo</requirement>
    </requirements>

    <!-- Values are single-quoted so the shell passes each one to the script
         as one literal argument. -->
    <command interpreter="python">
        alveo_api_key.py --api_key '$api_key' --output_path '$output'
    </command>

    <inputs>
        <param name="api_key" type="text" size="30" label="API Key" help="Your Alveo API key"/>
    </inputs>

    <outputs>
        <data format="txt" name="output" label="Alveo API key" />
    </outputs>

    <tests>
        <test>
            <!-- NOTE(review): this value looks like a real API key committed to
                 the repository. If it is, revoke it and use a placeholder that
                 matches test-data/api-key.dat. -->
            <param name="api_key" value="9swHm5MgVxdnuhrqdqwk" />
            <output name="output" file="api-key.dat" compare="contains" />
        </test>
    </tests>

    <help>Stores Alveo API keys for use with the Alveo Galaxy tools.</help>

    <citations>
        <citation type='bibtex'>
            @article{cassidy2014alveo,
              title={The alveo virtual laboratory: a web based repository API},
              author={Cassidy, Steve and Estival, Dominique and Jones, Tim and Sefton, Peter and Burnham, Denis and Burghold, Jared and others},
              year={2014},
              publisher={Reykjavik, Iceland: European Language Resources Association}
            }
        </citation>
    </citations>
</tool>
# alveo_get_primary_text.py -- Galaxy tool script that saves the primary text
# of every item in an Alveo Item List into a directory.
from __future__ import print_function

import argparse
import io
import json
import os
import pprint
import sys
from fnmatch import fnmatch

import pyalveo

API_URL = 'https://app.alveo.edu.au'  # TODO: export constants to a separate module

# this file name pattern allows galaxy to discover the dataset designation and type
FNPAT = "%(designation)s_%(ext)s"


def parser():
    """Parse the command line and return the resulting argparse namespace.

    ``--api_key`` is the path of a file holding the key (``main`` reads it),
    not the key itself.
    """
    arg_parser = argparse.ArgumentParser(description="Downloads documents in an Alveo Item List")
    arg_parser.add_argument('--api_key', required=True, action="store", type=str,
                            help="Alveo API key")
    arg_parser.add_argument('--item_list_url', required=True, action="store", type=str,
                            help="Item List to download")
    arg_parser.add_argument('--output_path', required=True, action="store", type=str,
                            help="Path to output file")
    return arg_parser.parse_args()


def get_item_list(api_key, item_list_url):
    """Return the item list at ``item_list_url`` fetched with a pyalveo client."""
    client = pyalveo.Client(api_key=api_key, api_url=API_URL)
    return client.get_item_list(item_list_url)


def galaxy_name(fname, ext):
    """construct a filename suitable for Galaxy dataset discovery"""
    return FNPAT % {'designation': fname, 'ext': ext}


def _write_text(path, content):
    """Write ``content`` to ``path``: bytes verbatim, text encoded as UTF-8."""
    if isinstance(content, bytes):
        with open(path, 'wb') as out:
            out.write(content)
    else:
        with io.open(path, 'w', encoding='utf-8') as out:
            out.write(content)


def download_documents(item_list, output_path):
    """
    Saves the primary text of each item in an item list to output_path.

    Items whose primary text is None are skipped.

    :type item_list: object providing get_all()
    :param item_list: item list whose items are to be saved

    :type output_path: String
    :param output_path: directory to write the text files to (created if missing)

    :rtype: list of String
    :returns: paths of the files that were written
    """
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    downloaded = []

    for item in item_list.get_all():
        md = item.metadata()
        fname = os.path.join(output_path,
                             galaxy_name(md['alveo:metadata']['dc:identifier'], 'txt'))
        content = item.get_primary_text()
        if content is not None:
            _write_text(fname, content)
            # Record what was written so the return value is meaningful.
            downloaded.append(fname)

    return downloaded


def main():
    """Command-line entry point: report API errors on stderr and exit 1."""
    args = parser()
    try:
        with open(args.api_key, 'r') as key_file:
            api_key = key_file.read().strip()
        item_list = get_item_list(api_key, args.item_list_url)
        downloaded = download_documents(item_list, args.output_path)
        # write out a list of downloaded files as a result?
    except pyalveo.APIError as e:
        print("ERROR: " + str(e), file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()
<!-- alveo_get_primary_text.xml: Galaxy tool wrapper for alveo_get_primary_text.py -->
<tool id="alveo_get_primary_text" name="Get Text from Alveo" version="0.01" force_history_refresh="True">
    <description>Downloads primary text from the items in an Alveo Item List</description>

    <requirements>
        <requirement type="package" version="0.4">pyalveo</requirement>
    </requirements>

    <!-- Values are single-quoted so the shell passes each one to the script
         as one literal argument. The output directory name must match the
         discover_datasets directory below. -->
    <command interpreter="python">
        alveo_get_primary_text.py --api_key '$api_key' --item_list_url '$item_list_url' --output_path ItemListData
    </command>

    <inputs>
        <param name="api_key" type="data" format="txt" label="API Key" help="Your Alveo API key"/>
        <param name="import_list" type="data" format="tabular" label="Imported Alveo Item List" help=""/>

        <param name="item_list_url" type="select" label="Alveo Item List" help="The Alveo Item List you wish to import">
            <options from_dataset="import_list">
                <column name="name" index="0"/>
                <column name="value" index="1"/>
            </options>
        </param>

        <param name="job_name" type="text" size="25"
               label="Supply a name for the outputs to remind you what they contain" value="Item List downloaded from Alveo"/>
    </inputs>

    <outputs>
        <collection type="list" label="$job_name" name="output1">
            <!-- The script names files "designation_ext". The split is made at
                 the LAST underscore so identifiers that themselves contain
                 underscores are kept whole. Angle brackets are written as
                 entities because a raw less-than sign is not allowed inside an
                 XML attribute value. -->
            <discover_datasets pattern="(?P&lt;designation&gt;.+)_(?P&lt;ext&gt;[^_]+)" directory="ItemListData"/>
        </collection>
    </outputs>

    <tests>
        <!-- The tests no longer set "output_path": the tool declares no such
             input (the output directory is fixed in the command above). -->
        <test>
            <param name="api_key" value="api-key.dat"/>
            <param name="import_list" value="item-lists.dat"/>
            <param name="item_list_url" value="https://app.alveo.edu.au/item_lists/180"/>
            <output_collection name="output1" type="list" count="6">
                <element name="GCSAusE02">
                    <assert_contents>
                        <has_text_matching expression="background noises"/>
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <test>
            <param name="api_key" value="api-key.dat"/>
            <param name="import_list" value="item-lists.dat"/>
            <param name="item_list_url" value="https://app.alveo.edu.au/item_lists/52"/>
            <output_collection name="output1" type="list" count="0">
            </output_collection>
        </test>
    </tests>

    <help>Downloads the primary text for each item from an Alveo Item List</help>
    <citations>
        <citation type='bibtex'>
            @article{cassidy2014alveo,
              title={The alveo virtual laboratory: a web based repository API},
              author={Cassidy, Steve and Estival, Dominique and Jones, Tim and Sefton, Peter and Burnham, Denis and Burghold, Jared and others},
              year={2014},
              publisher={Reykjavik, Iceland: European Language Resources Association}
            }
        </citation>
    </citations>
</tool>
# alveo_item_list_downloader.py -- Galaxy tool script that downloads the
# documents of an Alveo Item List whose file names match given patterns.
from __future__ import print_function

import argparse
import json
import os
import sys
from fnmatch import fnmatch

import pyalveo

API_URL = 'https://app.alveo.edu.au'  # TODO: export constants to a separate module

# this file name pattern allows galaxy to discover the dataset designation and type
FNPAT = "%(designation)s_%(ext)s"


def parser():
    """Parse the command line and return the resulting argparse namespace.

    ``--api_key`` is the path of a file holding the key (``main`` reads it),
    and ``--patterns`` is a comma-separated list of file name patterns.
    """
    arg_parser = argparse.ArgumentParser(description="Downloads documents in an Alveo Item List")
    arg_parser.add_argument('--api_key', required=True, action="store", type=str,
                            help="Alveo API key")
    arg_parser.add_argument('--item_list_url', required=True, action="store", type=str,
                            help="Item List to download")
    arg_parser.add_argument('--patterns', required=True, action="store", type=str,
                            help="File patterns to download")
    arg_parser.add_argument('--output_path', required=True, action="store", type=str,
                            help="Path to output file")
    return arg_parser.parse_args()


def get_item_list(api_key, item_list_url):
    """Return the item list at ``item_list_url`` fetched with a pyalveo client."""
    client = pyalveo.Client(api_key=api_key, api_url=API_URL)
    return client.get_item_list(item_list_url)


def galaxy_name(fname):
    """construct a filename suitable for Galaxy dataset discovery

    The designation is the complete original file name (extension included)
    and the extension, without its dot, is repeated after the underscore,
    e.g. ``a-plain.txt`` becomes ``a-plain.txt_txt``.
    """
    ext = os.path.splitext(fname)[1][1:]  # remove initial .
    return FNPAT % {'designation': fname, 'ext': ext}


def download_documents(item_list, patterns, output_path):
    """
    Downloads the matching documents of an item list to output_path.

    A document is downloaded once if its file name matches at least one
    pattern, however many patterns it matches.  Empty patterns are ignored
    and surrounding whitespace is stripped.  A document that cannot be
    downloaded is reported and skipped; the remaining ones are still fetched.

    :type item_list: object providing get_all()
    :param item_list: item list whose documents are to be downloaded

    :type patterns: list of String
    :param patterns: fnmatch-style file name patterns

    :type output_path: String
    :param output_path: directory to download the documents to (created if missing)

    :rtype: list of String
    :returns: original file names of the documents that were downloaded
    """
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    wanted = [p.strip() for p in patterns if p.strip()]
    downloaded = []

    for item in item_list.get_all():
        for doc in item.get_documents():
            filename = doc.get_filename()
            if not any(fnmatch(filename, p) for p in wanted):
                continue
            try:
                doc.download_content(dir_path=output_path, filename=galaxy_name(filename))
            except Exception as e:
                # maybe it doesn't exist or we have no access: keep going, but
                # say so.  Reported on stdout rather than stderr because
                # NOTE(review): Galaxy may treat any stderr output from this
                # tool as a job failure -- confirm.
                print("WARNING: could not download %s: %s" % (filename, e))
                continue
            downloaded.append(filename)

    return downloaded


def main():
    """Command-line entry point: report API errors on stderr and exit 1."""
    args = parser()
    try:
        with open(args.api_key, 'r') as key_file:
            api_key = key_file.read().strip()
        item_list = get_item_list(api_key, args.item_list_url)
        patterns = args.patterns.split(',')
        downloaded = download_documents(item_list, patterns, args.output_path)
        # write out a list of downloaded files as a result?
    except pyalveo.APIError as e:
        print("ERROR: " + str(e), file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()
<!-- alveo_item_list_downloader.xml: Galaxy tool wrapper for alveo_item_list_downloader.py -->
<tool id="alveo_item_list_downloader" name="Get Files from Alveo" version="0.01" force_history_refresh="True">
    <description>Downloads files from the items in an Alveo Item List</description>

    <requirements>
        <requirement type="package" version="0.4">pyalveo</requirement>
    </requirements>

    <!-- Values are single-quoted so the shell passes each one to the script
         as one literal argument; in particular the wildcard characters in the
         patterns must reach the script unexpanded. The output directory name
         must match the discover_datasets directory below. -->
    <command interpreter="python">
        alveo_item_list_downloader.py --api_key '$api_key' --item_list_url '$item_list_url' --patterns '$patterns,$patternselect' --output_path ItemListData
    </command>

    <inputs>
        <param name="api_key" type="data" format="txt" label="API Key" help="Your Alveo API key"/>
        <param name="import_list" type="data" format="tabular" label="Imported Alveo Item List" help=""/>

        <param name="item_list_url" type="select" label="Alveo Item List" help="The Alveo Item List you wish to import">
            <options from_dataset="import_list">
                <column name="name" index="0"/>
                <column name="value" index="1"/>
            </options>
        </param>

        <param name="patternselect" type="select" multiple="true" label="Predefined imports" display="checkboxes">
            <option value='*'>All Files</option>
            <option value='*speaker16.wav'>Austalk 16bit/16kHz Speaker Headset WAV (*speaker16.wav)</option>
            <option value='*plain.txt'>Plain text documents (*plain.txt)</option>
            <option value='*.txt'>All text documents (*.txt)</option>
            <option value=''>Other - enter pattern below</option>
        </param>

        <param name="patterns" type="text" label="File patterns to import"
               optional="true"
               help="One or more file patterns separated by commas eg. *.wav,*.txt"/>

        <param name="job_name" type="text" size="25"
               label="Supply a name for the outputs to remind you what they contain" value="Item List downloaded from Alveo"/>
    </inputs>

    <outputs>
        <collection type="list" label="$job_name" name="output1">
            <!-- The script names files "designation_ext". The split is made at
                 the LAST underscore so file names that themselves contain
                 underscores are kept whole. Angle brackets are written as
                 entities because a raw less-than sign is not allowed inside an
                 XML attribute value. -->
            <discover_datasets pattern="(?P&lt;designation&gt;.+)_(?P&lt;ext&gt;[^_]+)" directory="ItemListData"/>
        </collection>
    </outputs>

    <tests>
        <!-- The tests no longer set "output_path": the tool declares no such
             input (the output directory is fixed in the command above). -->
        <test>
            <param name="api_key" value="api-key.dat"/>
            <param name="import_list" value="item-lists.dat"/>
            <param name="item_list_url" value="https://app.alveo.edu.au/item_lists/180"/>
            <param name="patterns" value=""/>
            <param name="patternselect" value="*plain.txt"/>
            <output_collection name="output1" type="list" count="6">
                <element name="GCSAusE02-plain.txt">
                    <assert_contents>
                        <has_text_matching expression="background noises"/>
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <test>
            <param name="api_key" value="api-key.dat"/>
            <param name="import_list" value="item-lists.dat"/>
            <param name="item_list_url" value="https://app.alveo.edu.au/item_lists/180"/>
            <param name="patterns" value="*plain.txt"/>
            <param name="patternselect" value=""/>
            <output_collection name="output1" type="list" count="6">
                <element name="GCSAusE02-plain.txt">
                    <assert_contents>
                        <has_text_matching expression="background noises"/>
                    </assert_contents>
                </element>
            </output_collection>
        </test>
    </tests>

    <help>Downloads files from an Alveo Item list. You can download all files or those matching
        a wildcard pattern (e.g. *.txt). Results will be stored as a dataset collection in
        your history.</help>
    <citations>
        <citation type='bibtex'>
            @article{cassidy2014alveo,
              title={The alveo virtual laboratory: a web based repository API},
              author={Cassidy, Steve and Estival, Dominique and Jones, Tim and Sefton, Peter and Burnham, Denis and Burghold, Jared and others},
              year={2014},
              publisher={Reykjavik, Iceland: European Language Resources Association}
            }
        </citation>
    </citations>
</tool>
# alveo_item_list_importer.py -- Galaxy tool script that writes the user's
# Alveo Item Lists to a tab-separated file.
from __future__ import print_function

import argparse
import io
import json
import sys

import pyalveo

API_URL = 'https://app.alveo.edu.au'  # TODO: export constants to a separate module


def parser():
    """Parse the command line and return the resulting argparse namespace.

    ``--api_key`` is the path of a file holding the key (``main`` reads it),
    not the key itself.
    """
    arg_parser = argparse.ArgumentParser(description="Retrieves Alveo Item Lists")
    arg_parser.add_argument('--api_key', required=True, action="store", type=str,
                            help="Alveo API key")
    arg_parser.add_argument('--output', required=True, action="store", type=str,
                            help="Path to output file")
    return arg_parser.parse_args()


# TODO: export common function to helper module
def get_item_lists(api_key):
    """Return the item lists reported by a pyalveo client built from ``api_key``."""
    client = pyalveo.Client(api_key=api_key, api_url=API_URL)
    return client.get_item_lists()


def write_table(item_lists, filename):
    """Write one ``name (num_items)<TAB>item_list_url`` line per item list.

    :type item_lists: dict
    :param item_lists: maps a group name to a list of dicts, each with the
        keys 'name', 'num_items' and 'item_list_url'

    :type filename: String
    :param filename: path of the file to write (UTF-8 encoded)
    """
    # io.open with an explicit encoding behaves the same on Python 2 and 3 and
    # does not fail on non-ASCII list names.
    with io.open(filename, 'w', encoding='utf-8') as outfile:
        # .values() exists on both Python 2 and 3 (itervalues() is 2-only).
        for list_set in item_lists.values():
            for item_list in list_set:
                outfile.write(u"%s (%d)\t%s\n" % (item_list['name'],
                                                  item_list['num_items'],
                                                  item_list['item_list_url']))


def main():
    """Command-line entry point: report any failure on stderr and exit 1."""
    args = parser()
    try:
        with open(args.api_key, 'r') as key_file:
            api_key = key_file.read().strip()
        item_lists = get_item_lists(api_key)
        if item_lists:
            write_table(item_lists, args.output)
    except Exception as e:
        print("ERROR: " + str(e), file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()
<!-- alveo_item_list_importer.xml: Galaxy tool wrapper for alveo_item_list_importer.py -->
<tool id="alveo_item_list_importer" name="Get Alveo Item Lists" version="0.01" force_history_refresh="True">
    <description>Retrieves item list metadata.</description>

    <requirements>
        <requirement type="package" version="0.4">pyalveo</requirement>
    </requirements>

    <!-- Values are single-quoted so the shell passes each one to the script
         as one literal argument. -->
    <command interpreter="python">
        alveo_item_list_importer.py --api_key '$api_key' --output '$item_list'
    </command>

    <inputs>
        <param name="api_key" type="data" format="txt" label="API Key" help="Your Alveo API key"/>
        <param name="job_name" type="text" size="25"
               label="Supply a name for the outputs to remind you what they contain" value="Alveo Item Lists"/>
    </inputs>

    <outputs>
        <data format="tabular" name="item_list" label="${job_name}"/>
    </outputs>

    <help>Import Item Lists from Alveo. This imports the lists, but does not download the individual items.
        That task is performed by the *Get Files from Alveo* tool.
    </help>

    <citations>
        <citation type='bibtex'>
            @article{cassidy2014alveo,
              title={The alveo virtual laboratory: a web based repository API},
              author={Cassidy, Steve and Estival, Dominique and Jones, Tim and Sefton, Peter and Burnham, Denis and Burghold, Jared and others},
              year={2014},
              publisher={Reykjavik, Iceland: European Language Resources Association}
            }
        </citation>
    </citations>
</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/api-key.dat Mon Jul 18 23:49:40 2016 -0400 @@ -0,0 +1,1 @@ +insert your api key here
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/item-lists.dat Mon Jul 18 23:49:40 2016 -0400 @@ -0,0 +1,15 @@ +austalk_catepillar (309) https://app.alveo.edu.au/item_lists/64 +different (888) https://app.alveo.edu.au/item_lists/132 +gum-tree (58) https://app.alveo.edu.au/item_lists/84 +M&D_Test_140904 (10) https://app.alveo.edu.au/item_lists/168 +rose (245) https://app.alveo.edu.au/item_lists/82 +thistle (16) https://app.alveo.edu.au/item_lists/83 +ace-specialised (122) https://app.alveo.edu.au/item_lists/178 +austalk_hide (42) https://app.alveo.edu.au/item_lists/251 +austalk-male-digits (144) https://app.alveo.edu.au/item_lists/412 +COOEE ALL (1354) https://app.alveo.edu.au/item_lists/95 +cooee sample (129) https://app.alveo.edu.au/item_lists/53 +dialogue-all (76) https://app.alveo.edu.au/item_lists/116 +dialogue-sample (6) https://app.alveo.edu.au/item_lists/180 +mdsample (20) https://app.alveo.edu.au/item_lists/52 +one austalk sample (1) https://app.alveo.edu.au/item_lists/179
# test_alveo_api_key.py -- unit test for alveo_api_key.write_key
import unittest
import os
import alveo_api_key
import pyalveo
from mock import Mock


class TestAlveoAPIKey(unittest.TestCase):
    """Checks that write_key builds a client and stores the key it was given."""

    OUTPUT_PATH = 'test.txt'
    API_KEY = 'test123'
    # Mock standing in for the pyalveo module, so write_key uses it instead of
    # the real client.
    MOCK_CLIENT = Mock(pyalveo)

    def test_write_key(self):
        alveo_api_key.write_key(self.API_KEY, self.OUTPUT_PATH, self.MOCK_CLIENT)
        # Read the result back through a context manager so the handle is
        # closed before tearDown removes the file.
        with open(self.OUTPUT_PATH, 'r') as key_file:
            actual = key_file.read()
        self.assertEqual(self.API_KEY, actual)
        # The key is only meant to be stored after a client has been built.
        self.assertTrue(self.MOCK_CLIENT.Client.called)

    def tearDown(self):
        # The file is absent when write_key failed before creating it.
        try:
            os.remove(self.OUTPUT_PATH)
        except OSError:
            pass


if __name__ == '__main__':
    unittest.main()
# test_alveo_item_list_downloader.py -- unit tests for alveo_item_list_downloader
#
# This file previously held a verbatim copy of the importer tests (which remain
# in test_alveo_item_list_importer.py) and never exercised the downloader.
import unittest
import os
import json
import alveo_item_list_importer
import alveo_item_list_downloader
import pyalveo
from mock import Mock


def _mock_document(filename):
    """Return a Mock document whose get_filename() gives ``filename``."""
    doc = Mock()
    doc.get_filename.return_value = filename
    return doc


def _mock_item_list(documents):
    """Return a Mock item list holding one item with the given documents."""
    item = Mock()
    item.get_documents.return_value = documents
    item_list = Mock()
    item_list.get_all.return_value = [item]
    return item_list


class TestAlveoItemListDownloader(unittest.TestCase):
    """Checks file naming and pattern filtering without touching the network."""

    OUTPUT_PATH = 'test_downloads'

    def test_galaxy_name(self):
        # The designation keeps the full file name; the extension is repeated.
        self.assertEqual('GCSAusE02-plain.txt_txt',
                         alveo_item_list_downloader.galaxy_name('GCSAusE02-plain.txt'))

    def test_download_documents_filters_by_pattern(self):
        wanted = _mock_document('a-plain.txt')
        skipped = _mock_document('a.wav')
        item_list = _mock_item_list([wanted, skipped])

        # The empty pattern must be ignored rather than match anything.
        downloaded = alveo_item_list_downloader.download_documents(
            item_list, ['', '*.txt'], self.OUTPUT_PATH)

        self.assertEqual(['a-plain.txt'], downloaded)
        wanted.download_content.assert_called_once_with(
            dir_path=self.OUTPUT_PATH, filename='a-plain.txt_txt')
        self.assertFalse(skipped.download_content.called)

    def test_download_documents_skips_failures(self):
        broken = _mock_document('broken.txt')
        broken.download_content.side_effect = IOError('no access')
        good = _mock_document('good.txt')
        item_list = _mock_item_list([broken, good])

        downloaded = alveo_item_list_downloader.download_documents(
            item_list, ['*.txt'], self.OUTPUT_PATH)

        self.assertEqual(['good.txt'], downloaded)

    def tearDown(self):
        # download_documents creates the directory; the mocked downloads leave
        # it empty, so a plain rmdir is enough.
        try:
            os.rmdir(self.OUTPUT_PATH)
        except OSError:
            pass


if __name__ == '__main__':
    unittest.main()
# test_alveo_item_list_importer.py -- unit test for alveo_item_list_importer.write_table
import unittest
import os
import json
import alveo_item_list_importer
import pyalveo
from mock import Mock


class TestAlveoItemListImporter(unittest.TestCase):
    """Checks the tab-separated table written for a decoded item-list response."""

    API_KEY = 'test123'
    OUTPUT_PATH = 'test.csv'
    # JSON with the shape write_table reads: group name -> list of item lists.
    ITEM_LIST = '{"shared": [{"shared": true, "num_items": 309, "name": "austalk_catepillar", "item_list_url": "https://app.alveo.edu.au/item_lists/64"}]}'
    # Expected output line: "name (num_items)<TAB>item_list_url".
    CSV_CONTENTS = 'austalk_catepillar (309)\thttps://app.alveo.edu.au/item_lists/64\n'

    def test_write_table(self):
        api_list = json.loads(self.ITEM_LIST)
        alveo_item_list_importer.write_table(api_list, self.OUTPUT_PATH)
        # Read the result back through a context manager so the handle is
        # closed before tearDown removes the file.
        with open(self.OUTPUT_PATH, 'r') as table_file:
            actual = table_file.read()
        self.assertEqual(self.CSV_CONTENTS, actual)

    def tearDown(self):
        # The file is absent when write_table failed before creating it.
        try:
            os.remove(self.OUTPUT_PATH)
        except OSError:
            pass


if __name__ == '__main__':
    unittest.main()
<?xml version="1.0"?>
<!-- tool_dependencies.xml: declares the pyalveo 0.4 package named in the
     requirements of the Alveo tool wrappers, installed into a virtualenv. -->
<tool_dependency>
    <package name="pyalveo" version="0.4">
        <install version="1.0">
            <actions>
                <action type="setup_virtualenv">pyalveo==0.4</action>
            </actions>
        </install>
    </package>
</tool_dependency>