Mercurial > repos > stevecassidy > alveoimport

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/alveo_api_key.py	Mon Jul 18 23:49:40 2016 -0400
@@ -0,0 +1,44 @@
+from __future__ import print_function
+import argparse
+import pyalveo
+import sys
+
+API_URL = 'https://app.alveo.edu.au'
+
+def parser():
+    """Parse the command-line arguments of the API key storage script.
+
+    :rtype: argparse.Namespace
+    :returns: the parsed arguments, with ``api_key`` and ``output_path`` attributes
+    """
+    # A distinct local name keeps the ArgumentParser from shadowing this function.
+    arg_parser = argparse.ArgumentParser(description="Stores an Alveo API key in a file")
+    arg_parser.add_argument('--api_key', required=True, action="store", type=str, help="Alveo API key")
+    arg_parser.add_argument('--output_path', required=True, action="store", type=str, help="File to store the API key in")
+    return arg_parser.parse_args()
+
+def write_key(api_key, output_path, client_module=pyalveo):
+    """Creates an Alveo client for an API key and writes the key to a file.
+
+    :type api_key: String
+    :param api_key: Alveo API key
+
+    :type output_path: String
+    :param output_path: Path to the file to store the API key in
+
+    :type client_module: module
+    :param client_module: Module providing the ``Client`` class (used for
+        testing purposes), defaults to pyalveo
+
+    :raises: pyalveo.APIError if the API request is not successful
+
+    """
+    # The client is built only for its side effect: presumably pyalveo rejects
+    # an invalid key here, so that no file is written for it -- TODO confirm.
+    # Keyword arguments match the other Alveo scripts and do not depend on the
+    # positional order of Client's parameters.
+    client_module.Client(api_key=api_key, api_url=API_URL)
+    # The with-block closes the file even when the write fails.
+    with open(output_path, 'w') as outfile:
+        outfile.write(api_key)
+
+def main():
+    """Store the API key given on the command line.
+
+    Any exception is reported on stderr as "ERROR: <message>" and the script
+    then exits with status 1.
+    """
+    options = parser()
+    try:
+        write_key(options.api_key, options.output_path)
+    except Exception as err:
+        message = "ERROR: " + str(err)
+        print(message, file=sys.stderr)
+        sys.exit(1)
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
Binary file alveo_api_key.pyc has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/alveo_api_key.xml	Mon Jul 18 23:49:40 2016 -0400
@@ -0,0 +1,39 @@
+<tool id="alveo_api_key_storer" name="Store Alveo API Key" version="0.01" force_history_refresh="True">
+    <description>for use with Alveo tools</description>
+
+    <requirements>
+        <requirement type="package" version="0.4">pyalveo</requirement>
+    </requirements>
+
+    <!-- Values are single-quoted so the shell passes each one to the script as one literal argument. -->
+    <command interpreter="python">
+        alveo_api_key.py --api_key '$api_key' --output_path '$output'
+    </command>
+
+    <inputs>
+        <param name="api_key" type="text" size="30" label="API Key" help="Your Alveo API key"/>
+    </inputs>
+
+    <outputs>
+        <data format="txt" name="output" label="Alveo API key" />
+    </outputs>
+
+    <tests>
+        <test>
+            <param name="api_key" value="9swHm5MgVxdnuhrqdqwk" />
+            <output name="output" file="api-key.dat" compare="contains" />
+        </test>
+    </tests>
+
+    <help>Stores Alveo API keys for use with the Alveo Galaxy tools.</help>
+
+    <citations>
+        <citation type='bibtex'>
+            @article{cassidy2014alveo,
+              title={The alveo virtual laboratory: a web based repository API},
+              author={Cassidy, Steve and Estival, Dominique and Jones, Tim and Sefton, Peter and Burnham, Denis and Burghold, Jared and others},
+              year={2014},
+              publisher={Reykjavik, Iceland: European Language Resources Association}
+            }
+        </citation>
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/alveo_get_primary_text.py	Mon Jul 18 23:49:40 2016 -0400
@@ -0,0 +1,71 @@
+from __future__ import print_function
+import json
+import argparse
+import pyalveo
+import sys
+import os
+from fnmatch import fnmatch
+
+API_URL = 'https://app.alveo.edu.au' # TODO: export constants to a separate module
+
+def parser():
+    """Read the command-line options of the primary text download script.
+
+    :rtype: argparse.Namespace
+    :returns: parsed options ``api_key``, ``item_list_url`` and ``output_path``
+    """
+    cli = argparse.ArgumentParser(description="Downloads documents in an Alveo Item List")
+    # (flag, help text) pairs; every option is a required string
+    options = (
+        ('--api_key', "Alveo API key"),
+        ('--item_list_url', "Item List to download"),
+        ('--output_path', "Path to output file"),
+    )
+    for flag, help_text in options:
+        cli.add_argument(flag, required=True, action="store", type=str, help=help_text)
+    return cli.parse_args()
+
+def get_item_list(api_key, item_list_url):
+    """Fetch the item list at item_list_url through a client created for api_key."""
+    alveo = pyalveo.Client(api_url=API_URL, api_key=api_key)
+    item_list = alveo.get_item_list(item_list_url)
+    return item_list
+
+# Galaxy discovers each dataset's designation and type from a file name of this form
+FNPAT = "%(designation)s_%(ext)s"
+
+def galaxy_name(fname, ext):
+    """Return the file name to save a dataset under so Galaxy can discover it.
+
+    :param fname: used as the dataset designation
+    :param ext: used as the dataset type
+    """
+    return FNPAT % dict(designation=fname, ext=ext)
+import pprint
+def download_documents(item_list, output_path):
+    """
+    Saves the primary text of every item in an item list to the directory
+    specified by output_path.
+
+    Items whose get_primary_text() returns None are skipped.
+
+    :type item_list: item list as returned by get_item_list()
+    :param item_list: Item list whose items are downloaded
+
+    :type output_path: String
+    :param output_path: directory to write the text files to, created if
+        it does not exist
+
+    :rtype: list of String
+    :returns: paths of the files that were written
+    """
+    if not os.path.exists(output_path):
+        os.makedirs(output_path)
+
+    downloaded = []
+
+    for item in item_list.get_all():
+        md = item.metadata()
+        # the item identifier becomes the designation Galaxy shows for the dataset
+        fname = os.path.join(output_path, galaxy_name(md['alveo:metadata']['dc:identifier'], 'txt'))
+        content = item.get_primary_text()
+        if content is None:
+            continue
+        with open(fname, 'w') as out:
+            out.write(content)
+        # record the file so the returned list reflects what was written
+        downloaded.append(fname)
+
+    return downloaded
+
+def main():
+    """Download the primary text of each item in the requested item list.
+
+    The --api_key argument is the path of a file holding the key. An Alveo
+    API error is reported on stderr and ends the script with exit status 1.
+    """
+    args = parser()
+    try:
+        # the with-block closes the key file as soon as it has been read
+        with open(args.api_key, 'r') as key_file:
+            api_key = key_file.read().strip()
+        item_list = get_item_list(api_key, args.item_list_url)
+        download_documents(item_list, args.output_path)
+        # TODO: write out a list of downloaded files as a result?
+    except pyalveo.APIError as e:
+        print("ERROR: " + str(e), file=sys.stderr)
+        sys.exit(1)
+
+if __name__ == '__main__':
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/alveo_get_primary_text.xml	Mon Jul 18 23:49:40 2016 -0400
@@ -0,0 +1,68 @@
+<tool id="alveo_get_primary_text" name="Get Text from Alveo" version="0.01" force_history_refresh="True">
+    <description>Downloads primary text from the items in an Alveo Item List</description>
+
+    <requirements>
+        <requirement type="package" version="0.4">pyalveo</requirement>
+    </requirements>
+
+    <!-- Values are single-quoted so the shell passes each one to the script as one literal argument. -->
+    <command interpreter="python">
+        alveo_get_primary_text.py --api_key '$api_key' --item_list_url '$item_list_url' --output_path ItemListData
+    </command>
+
+    <inputs>
+        <param name="api_key" type="data" format="txt" label="API Key" help="Your Alveo API key"/>
+        <param name="import_list" type="data" format="tabular" label="Imported Alveo Item List" help=""/>
+
+        <param name="item_list_url" type="select" label="Alveo Item List" help="The Alveo Item List you wish to import">
+            <options from_dataset="import_list">
+                <column name="name" index="0"/>
+                <column name="value" index="1"/>
+            </options>
+        </param>
+
+        <param name="job_name" type="text" size="25"
+               label="Supply a name for the outputs to remind you what they contain" value="Item List downloaded from Alveo"/>
+    </inputs>
+
+    <outputs>
+        <collection type="list" label="$job_name" name="output1">
+            <discover_datasets pattern="(?P&lt;designation&gt;[^_]+)_(?P&lt;ext&gt;.+)" directory="ItemListData"/>
+        </collection>
+    </outputs>
+
+    <tests>
+        <test>
+            <param name="api_key" value="api-key.dat"/>
+            <param name="import_list" value="item-lists.dat"/>
+            <param name="item_list_url" value="https://app.alveo.edu.au/item_lists/180"/>
+            <param name="output_path" value="test_out"/>
+            <output_collection name="output1" type="list" count="6">
+                    <element name="GCSAusE02">
+                        <assert_contents>
+                            <has_text_matching expression="background noises"/>
+                        </assert_contents>
+                    </element>
+            </output_collection>
+        </test>
+        <test>
+            <param name="api_key" value="api-key.dat"/>
+            <param name="import_list" value="item-lists.dat"/>
+            <param name="item_list_url" value="https://app.alveo.edu.au/item_lists/52"/>
+            <param name="output_path" value="test_out"/>
+            <output_collection name="output1" type="list" count="0">
+            </output_collection>
+        </test>
+    </tests>
+
+    <help>Downloads the primary text for each item from an Alveo Item List</help>
+    <citations>
+        <citation type='bibtex'>
+            @article{cassidy2014alveo,
+              title={The alveo virtual laboratory: a web based repository API},
+              author={Cassidy, Steve and Estival, Dominique and Jones, Tim and Sefton, Peter and Burnham, Denis and Burghold, Jared and others},
+              year={2014},
+              publisher={Reykjavik, Iceland: European Language Resources Association}
+            }
+        </citation>
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/alveo_item_list_downloader.py	Mon Jul 18 23:49:40 2016 -0400
@@ -0,0 +1,80 @@
+from __future__ import print_function
+import json
+import argparse
+import pyalveo
+import sys
+import os
+from fnmatch import fnmatch
+
+API_URL = 'https://app.alveo.edu.au' # TODO: export constants to a separate module
+
+def parser():
+    """Read the command-line options of the item list download script.
+
+    :rtype: argparse.Namespace
+    :returns: parsed options ``api_key``, ``item_list_url``, ``patterns``
+        and ``output_path``
+    """
+    cli = argparse.ArgumentParser(description="Downloads documents in an Alveo Item List")
+    # (flag, help text) pairs; every option is a required string
+    options = (
+        ('--api_key', "Alveo API key"),
+        ('--item_list_url', "Item List to download"),
+        ('--patterns', "File patterns to download"),
+        ('--output_path', "Path to output file"),
+    )
+    for flag, help_text in options:
+        cli.add_argument(flag, required=True, action="store", type=str, help=help_text)
+    return cli.parse_args()
+
+def get_item_list(api_key, item_list_url):
+    """Fetch the item list at item_list_url through a client created for api_key."""
+    alveo = pyalveo.Client(api_url=API_URL, api_key=api_key)
+    item_list = alveo.get_item_list(item_list_url)
+    return item_list
+
+# Galaxy discovers each dataset's designation and type from a file name of this form
+FNPAT = "%(designation)s_%(ext)s"
+
+def galaxy_name(fname):
+    """Return the file name to save a document under so Galaxy can discover it.
+
+    The complete file name (extension included) is used as the designation
+    and the extension, without its leading dot, as the type.
+
+    NOTE(review): the discover_datasets pattern in the tool XML ends the
+    designation at the first underscore, so a file name that itself contains
+    an underscore would be split there -- confirm this is intended.
+    """
+    suffix = os.path.splitext(fname)[1]
+    return FNPAT % dict(designation=fname, ext=suffix[1:])
+
+def download_documents(item_list, patterns, output_path):
+    """
+    Downloads the documents of an item list whose file names match any of
+    the given patterns to the directory specified by output_path.
+
+    :type item_list: item list as returned by get_item_list()
+    :param item_list: Item list whose documents are downloaded
+
+    :type patterns: list of String
+    :param patterns: fnmatch-style file name patterns, empty strings are ignored
+
+    :type output_path: String
+    :param output_path: directory to download the documents to, created if
+        it does not exist
+
+    :rtype: list of String
+    :returns: file names of the documents that were downloaded
+    """
+    if not os.path.exists(output_path):
+        os.makedirs(output_path)
+
+    # an empty pattern (e.g. from a form field left blank) selects nothing
+    wanted = [pattern for pattern in patterns if pattern != '']
+    downloaded = []
+
+    for item in item_list.get_all():
+        for doc in item.get_documents():
+            filename = doc.get_filename()
+            # any() stops at the first match, so a document matching several
+            # patterns is downloaded and listed only once
+            if not any(fnmatch(filename, pattern) for pattern in wanted):
+                continue
+            fname = galaxy_name(filename)
+            try:
+                doc.download_content(dir_path=output_path, filename=fname)
+                downloaded.append(filename)
+            except Exception as e:
+                # maybe it doesn't exist or we have no access: carry on with
+                # the remaining documents but say which one was skipped.
+                # Reported on stdout because Galaxy may treat stderr output
+                # as a job failure -- TODO confirm for the target Galaxy version.
+                print("WARNING: could not download %s: %s" % (filename, e))
+    return downloaded
+
+def main():
+    """Download the documents of the requested item list that match the patterns.
+
+    The --api_key argument is the path of a file holding the key and
+    --patterns is a comma-separated list of file name patterns. An Alveo API
+    error is reported on stderr and ends the script with exit status 1.
+    """
+    args = parser()
+    try:
+        # the with-block closes the key file as soon as it has been read
+        with open(args.api_key, 'r') as key_file:
+            api_key = key_file.read().strip()
+        item_list = get_item_list(api_key, args.item_list_url)
+        patterns = args.patterns.split(',')
+        download_documents(item_list, patterns, args.output_path)
+        # TODO: write out a list of downloaded files as a result?
+    except pyalveo.APIError as e:
+        print("ERROR: " + str(e), file=sys.stderr)
+        sys.exit(1)
+
+if __name__ == '__main__':
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/alveo_item_list_downloader.xml	Mon Jul 18 23:49:40 2016 -0400
@@ -0,0 +1,91 @@
+<tool id="alveo_item_list_downloader" name="Get Files from Alveo" version="0.01" force_history_refresh="True">
+    <description>Downloads files from the items in an Alveo Item List</description>
+
+    <requirements>
+        <requirement type="package" version="0.4">pyalveo</requirement>
+    </requirements>
+
+    <!-- Values are single-quoted so the shell does not expand wildcard patterns such as *.txt
+         and passes each value to the script as one literal argument. -->
+    <command interpreter="python">
+        alveo_item_list_downloader.py --api_key '$api_key' --item_list_url '$item_list_url' --patterns '$patterns,$patternselect' --output_path ItemListData
+    </command>
+
+    <inputs>
+        <param name="api_key" type="data" format="txt" label="API Key" help="Your Alveo API key"/>
+        <param name="import_list" type="data" format="tabular" label="Imported Alveo Item List" help=""/>
+
+        <param name="item_list_url" type="select" label="Alveo Item List" help="The Alveo Item List you wish to import">
+            <options from_dataset="import_list">
+                <column name="name" index="0"/>
+                <column name="value" index="1"/>
+            </options>
+        </param>
+
+        <param name="patternselect" type="select" multiple="true" label="Predefined imports" display="checkboxes">
+            <option value='*'>All Files</option>
+            <option value='*speaker16.wav'>Austalk 16bit/16kHz Speaker Headset WAV (*speaker16.wav)</option>
+            <option value='*plain.txt'>Plain text documents (*plain.txt)</option>
+            <option value='*.txt'>All text documents (*.txt)</option>
+            <option value=''>Other - enter pattern below</option>
+        </param>
+
+        <param name="patterns" type="text" label="File patterns to import"
+               optional="true"
+               help="One or more file patterns separated by commas eg. *.wav,*.txt"/>
+
+        <param name="job_name" type="text" size="25"
+               label="Supply a name for the outputs to remind you what they contain" value="Item List downloaded from Alveo"/>
+    </inputs>
+
+    <outputs>
+        <collection type="list" label="$job_name" name="output1">
+            <discover_datasets pattern="(?P&lt;designation&gt;[^_]+)_(?P&lt;ext&gt;.+)" directory="ItemListData"/>
+        </collection>
+    </outputs>
+
+    <tests>
+        <test>
+            <param name="api_key" value="api-key.dat"/>
+            <param name="import_list" value="item-lists.dat"/>
+            <param name="item_list_url" value="https://app.alveo.edu.au/item_lists/180"/>
+            <param name="patterns" value=""/>
+            <param name="patternselect" value="*plain.txt"/>
+            <param name="output_path" value="test_out"/>
+            <output_collection name="output1" type="list" count="6">
+                    <element name="GCSAusE02-plain.txt">
+                        <assert_contents>
+                            <has_text_matching expression="background noises"/>
+                        </assert_contents>
+                    </element>
+            </output_collection>
+        </test>
+        <test>
+            <param name="api_key" value="api-key.dat"/>
+            <param name="import_list" value="item-lists.dat"/>
+            <param name="item_list_url" value="https://app.alveo.edu.au/item_lists/180"/>
+            <param name="patterns" value="*plain.txt"/>
+            <param name="patternselect" value=""/>
+            <param name="output_path" value="test_out"/>
+            <output_collection name="output1" type="list" count="6">
+                    <element name="GCSAusE02-plain.txt">
+                        <assert_contents>
+                            <has_text_matching expression="background noises"/>
+                        </assert_contents>
+                    </element>
+            </output_collection>
+        </test>
+    </tests>
+
+    <help>Downloads files from an Alveo Item list. You can download all files or those matching
+        a wildcard pattern (e.g. *.txt).  Results will be stored as a dataset collection in
+        your history.</help>
+    <citations>
+        <citation type='bibtex'>
+            @article{cassidy2014alveo,
+              title={The alveo virtual laboratory: a web based repository API},
+              author={Cassidy, Steve and Estival, Dominique and Jones, Tim and Sefton, Peter and Burnham, Denis and Burghold, Jared and others},
+              year={2014},
+              publisher={Reykjavik, Iceland: European Language Resources Association}
+            }
+        </citation>
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/alveo_item_list_importer.py	Mon Jul 18 23:49:40 2016 -0400
@@ -0,0 +1,40 @@
+from __future__ import print_function
+import json
+import argparse
+import pyalveo
+import sys
+
+API_URL = 'https://app.alveo.edu.au' # TODO: export constants to a separate module
+
+
+def parser():
+    """Read the command-line options of the item list import script.
+
+    :rtype: argparse.Namespace
+    :returns: parsed options ``api_key`` and ``output``
+    """
+    cli = argparse.ArgumentParser(description="Retrieves Alveo Item Lists")
+    # (flag, help text) pairs; every option is a required string
+    options = (
+        ('--api_key', "Alveo API key"),
+        ('--output', "Path to output file"),
+    )
+    for flag, help_text in options:
+        cli.add_argument(flag, required=True, action="store", type=str, help=help_text)
+    return cli.parse_args()
+
+# TODO: export common function to helper module
+def get_item_lists(api_key):
+    """Return what the Alveo client created for api_key reports as its item lists."""
+    alveo = pyalveo.Client(api_url=API_URL, api_key=api_key)
+    item_lists = alveo.get_item_lists()
+    return item_lists
+
+def write_table(item_lists, filename):
+    """Write the item lists to a tab-separated file, one item list per line.
+
+    Each line holds "<name> (<num_items>)", a tab and the item list URL.
+
+    :type item_lists: dict
+    :param item_lists: maps a group key (e.g. "shared") to a list of dicts,
+        each with the keys 'name', 'num_items' and 'item_list_url'
+
+    :type filename: String
+    :param filename: path of the file to write
+    """
+    with open(filename, 'w') as outfile:
+        # values() rather than itervalues(): it exists on both Python 2 and 3
+        for list_set in item_lists.values():
+            for item_list in list_set:
+                outfile.write("%s (%d)\t%s\n" % (item_list['name'], item_list['num_items'], item_list['item_list_url']))
+
+def main():
+    """Write the item lists retrieved for the API key to the --output file.
+
+    The --api_key argument is the path of a file holding the key. Nothing is
+    written when no item lists are returned. Any exception is reported on
+    stderr and ends the script with exit status 1.
+    """
+    args = parser()
+    try:
+        # the with-block closes the key file as soon as it has been read
+        with open(args.api_key, 'r') as key_file:
+            api_key = key_file.read().strip()
+        item_lists = get_item_lists(api_key)
+        if item_lists:
+            write_table(item_lists, args.output)
+    except Exception as e:
+        print("ERROR: " + str(e), file=sys.stderr)
+        sys.exit(1)
+
+
+if __name__ == '__main__':
+    main()
Binary file alveo_item_list_importer.pyc has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/alveo_item_list_importer.xml	Mon Jul 18 23:49:40 2016 -0400
@@ -0,0 +1,37 @@
+<tool id="alveo_item_list_importer" name="Get Alveo Item Lists" version="0.01" force_history_refresh="True">
+    <description>Retrieves item list metadata.</description>
+
+    <requirements>
+        <requirement type="package" version="0.4">pyalveo</requirement>
+    </requirements>
+
+    <!-- Values are single-quoted so the shell passes each one to the script as one literal argument. -->
+    <command interpreter="python">
+        alveo_item_list_importer.py --api_key '$api_key' --output '$item_list'
+    </command>
+
+
+    <inputs>
+        <param name="api_key" type="data" format="txt" label="API Key" help="Your Alveo API key"/>
+        <param name="job_name" type="text" size="25"
+               label="Supply a name for the outputs to remind you what they contain" value="Alveo Item Lists"/>
+    </inputs>
+
+    <outputs>
+        <data format="tabular" name="item_list" label="${job_name}"/>
+    </outputs>
+
+    <help>Import Item Lists from Alveo. This imports the lists, but does not download the individual items.
+        That task is performed by the *Get Files from Alveo* tool.
+    </help>
+
+    <citations>
+        <citation type='bibtex'>
+            @article{cassidy2014alveo,
+              title={The alveo virtual laboratory: a web based repository API},
+              author={Cassidy, Steve and Estival, Dominique and Jones, Tim and Sefton, Peter and Burnham, Denis and Burghold, Jared and others},
+              year={2014},
+              publisher={Reykjavik, Iceland: European Language Resources Association}
+            }
+        </citation>
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/api-key.dat	Mon Jul 18 23:49:40 2016 -0400
@@ -0,0 +1,1 @@
+insert your api key here
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/item-lists.dat	Mon Jul 18 23:49:40 2016 -0400
@@ -0,0 +1,15 @@
+austalk_catepillar (309)	https://app.alveo.edu.au/item_lists/64
+different (888)	https://app.alveo.edu.au/item_lists/132
+gum-tree (58)	https://app.alveo.edu.au/item_lists/84
+M&D_Test_140904 (10)	https://app.alveo.edu.au/item_lists/168
+rose (245)	https://app.alveo.edu.au/item_lists/82
+thistle (16)	https://app.alveo.edu.au/item_lists/83
+ace-specialised (122)	https://app.alveo.edu.au/item_lists/178
+austalk_hide (42)	https://app.alveo.edu.au/item_lists/251
+austalk-male-digits (144)	https://app.alveo.edu.au/item_lists/412
+COOEE ALL (1354)	https://app.alveo.edu.au/item_lists/95
+cooee sample (129)	https://app.alveo.edu.au/item_lists/53
+dialogue-all (76)	https://app.alveo.edu.au/item_lists/116
+dialogue-sample (6)	https://app.alveo.edu.au/item_lists/180
+mdsample (20)	https://app.alveo.edu.au/item_lists/52
+one austalk sample (1)	https://app.alveo.edu.au/item_lists/179
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test_alveo_api_key.py	Mon Jul 18 23:49:40 2016 -0400
@@ -0,0 +1,25 @@
+import unittest
+import os
+import alveo_api_key
+import pyalveo
+from mock import Mock
+
+class TestAlveoAPIKey(unittest.TestCase):
+  """Tests alveo_api_key.write_key with a mock in place of the pyalveo module."""
+
+  OUTPUT_PATH = 'test.txt'
+  API_KEY = 'test123'
+  # stands in for pyalveo so that no request is sent to the Alveo server
+  MOCK_CLIENT = Mock(pyalveo)
+
+  def test_write_key(self):
+    """write_key stores exactly the key it was given."""
+    alveo_api_key.write_key(self.API_KEY, self.OUTPUT_PATH, self.MOCK_CLIENT)
+    # close the file again before tearDown removes it
+    with open(self.OUTPUT_PATH, 'r') as written:
+      actual = written.read()
+    self.assertEqual(self.API_KEY, actual)
+
+  def tearDown(self):
+    """Remove the output file; a missing file is not an error."""
+    try:
+      os.remove(self.OUTPUT_PATH)
+    except OSError:
+      pass
+
+if __name__ == '__main__':
+    unittest.main()
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test_alveo_item_list_downloader.py	Mon Jul 18 23:49:40 2016 -0400
@@ -0,0 +1,28 @@
+import unittest
+import os
+import json
+import alveo_item_list_importer
+import pyalveo
+from mock import Mock
+
+# NOTE(review): this class is identical to the one in test_alveo_item_list_importer.py
+# and exercises alveo_item_list_importer only; alveo_item_list_downloader has no
+# test of its own here -- confirm whether that was intended.
+class TestAlveoItemListImporter(unittest.TestCase):
+  """Tests alveo_item_list_importer.write_table against a sample API response."""
+
+  API_KEY = 'test123'
+  OUTPUT_PATH = 'test.csv'
+  ITEM_LIST = '{"shared": [{"shared": true, "num_items": 309, "name": "austalk_catepillar", "item_list_url": "https://app.alveo.edu.au/item_lists/64"}]}'
+  CSV_CONTENTS = 'austalk_catepillar (309)\thttps://app.alveo.edu.au/item_lists/64\n'
+
+  def test_write_table(self):
+    """write_table emits one "<name> (<num_items>)<TAB><url>" line per item list."""
+    api_list = json.loads(self.ITEM_LIST)
+    alveo_item_list_importer.write_table(api_list, self.OUTPUT_PATH)
+    # close the file again before tearDown removes it
+    with open(self.OUTPUT_PATH, 'r') as written:
+      actual = written.read()
+    self.assertEqual(self.CSV_CONTENTS, actual)
+
+  def tearDown(self):
+    """Remove the output file; a missing file is not an error."""
+    try:
+      os.remove(self.OUTPUT_PATH)
+    except OSError:
+      pass
+
+if __name__ == '__main__':
+    unittest.main()
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test_alveo_item_list_importer.py	Mon Jul 18 23:49:40 2016 -0400
@@ -0,0 +1,28 @@
+import unittest
+import os
+import json
+import alveo_item_list_importer
+import pyalveo
+from mock import Mock
+
+class TestAlveoItemListImporter(unittest.TestCase):
+  """Tests alveo_item_list_importer.write_table against a sample API response."""
+
+  API_KEY = 'test123'
+  OUTPUT_PATH = 'test.csv'
+  ITEM_LIST = '{"shared": [{"shared": true, "num_items": 309, "name": "austalk_catepillar", "item_list_url": "https://app.alveo.edu.au/item_lists/64"}]}'
+  CSV_CONTENTS = 'austalk_catepillar (309)\thttps://app.alveo.edu.au/item_lists/64\n'
+
+  def test_write_table(self):
+    """write_table emits one "<name> (<num_items>)<TAB><url>" line per item list."""
+    api_list = json.loads(self.ITEM_LIST)
+    alveo_item_list_importer.write_table(api_list, self.OUTPUT_PATH)
+    # close the file again before tearDown removes it
+    with open(self.OUTPUT_PATH, 'r') as written:
+      actual = written.read()
+    self.assertEqual(self.CSV_CONTENTS, actual)
+
+  def tearDown(self):
+    """Remove the output file; a missing file is not an error."""
+    try:
+      os.remove(self.OUTPUT_PATH)
+    except OSError:
+      pass
+
+if __name__ == '__main__':
+    unittest.main()
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Mon Jul 18 23:49:40 2016 -0400
@@ -0,0 +1,10 @@
+<?xml version="1.0"?>
+<tool_dependency>
+   <package name="pyalveo" version="0.4">
+       <install version="1.0">
+           <actions>
+               <action type="setup_virtualenv">pyalveo==0.4</action>
+           </actions>
+       </install>
+   </package>
+</tool_dependency>