Mercurial > repos > stevecassidy > alveoimport

Binary file alveo_api_key.pyc has changed
--- a/alveo_api_key.xml	Mon Jul 18 23:49:40 2016 -0400
+++ b/alveo_api_key.xml	Mon Aug 15 23:45:46 2016 -0400
@@ -2,7 +2,7 @@
     <description>for use with Alveo tools</description>

     <requirements>
-        <requirement type="package" version="0.4">pyalveo</requirement>
+        <requirement type="package" version="0.6">pyalveo</requirement>
     </requirements>

     <command interpreter="python">
@@ -19,7 +19,7 @@

     <tests>
         <test>
-            <param name="api_key" value="9swHm5MgVxdnuhrqdqwk" />
+            <param name="api_key" value="your api key here" />
             <output name="output" file="api-key.dat" compare="contains" />
         </test>
     </tests>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/alveo_get_item_data.py	Mon Aug 15 23:45:46 2016 -0400
@@ -0,0 +1,100 @@
+from __future__ import print_function
+import json
+import argparse
+import pyalveo
+import sys
+import os
+from fnmatch import fnmatch
+import csv
+
+API_URL = 'https://app.alveo.edu.au' # TODO: export constants to a separate module
+
+def parser():
+    parser = argparse.ArgumentParser(description="Downloads documents in an Alveo Item List")
+    parser.add_argument('--api_key', required=True, action="store", type=str, help="Alveo API key")
+    parser.add_argument('--item_list', required=True, action="store", type=str, help="File containing list of item URLs")
+    parser.add_argument('--patterns', required=True, action="store", type=str, help="File patterns to download")
+    parser.add_argument('--output_path', required=True, action="store", type=str, help="Path to output file")
+    return parser.parse_args()
+
+def read_item_list(filename, client):
+    """Read an item list from a file
+    which should be a tabular formatted file
+    with one column header ItemURL.
+    Return an instance of ItemGroup"""
+
+    with open(filename) as fd:
+        csvreader = csv.DictReader(fd, dialect='excel-tab')
+        if 'ItemURL' not in csvreader.fieldnames:
+            return None
+        itemurls = []
+        for row in csvreader:
+            itemurls.append(row['ItemURL'])
+
+    itemlist = pyalveo.ItemGroup(itemurls, client)
+
+    return itemlist
+
+# this file name pattern allows galaxy to discover the dataset designation and type
+FNPAT = "%(designation)s#%(ext)s"
+
+def galaxy_name(itemname, fname):
+    """construct a filename suitable for Galaxy dataset discovery
+    designation - (dataset identifier) is the file basename
+    ext - defines the dataset type and is the file extension
+    """
+
+    root, ext = os.path.splitext(fname)
+    ext = ext[1:] # remove initial .
+    fname = FNPAT % {'designation': itemname, 'ext': ext}
+
+    return fname
+
+def download_documents(item_list, patterns, output_path):
+    """
+    Downloads a list of documents to the directory specificed by output_path.
+
+    :type documents: list of pyalveo.Document
+    :param documents: Documents to download
+
+    :type output_path: String
+    :param output_path: directory to download to the documents to
+    """
+    if not os.path.exists(output_path):
+        os.makedirs(output_path)
+
+    downloaded = []
+
+    items = item_list.get_all()
+    filtered_documents = []
+    for item in items:
+        documents = item.get_documents()
+        for doc in documents:
+            for pattern in patterns:
+                if not pattern == '' and fnmatch(doc.get_filename(), pattern):
+                    fname = galaxy_name(item.metadata()['alveo:metadata']['dc:identifier'], doc.get_filename())
+                    try:
+                        doc.download_content(dir_path=output_path, filename=fname)
+                        downloaded.append(doc.get_filename())
+                    except:
+                        # maybe it doesn't exist or we have no access
+                        # TODO: report this
+                        pass
+    return downloaded
+
+def main():
+    args = parser()
+    try:
+        api_key = open(args.api_key, 'r').read().strip()
+
+        client = pyalveo.Client(api_url=API_URL, api_key=api_key)
+
+        item_list = read_item_list(args.item_list, client)
+        patterns = args.patterns.split(',')
+        downloaded = download_documents(item_list, patterns, args.output_path)
+    except pyalveo.APIError as e:
+        print("ERROR: " + str(e), file=sys.stderr)
+        sys.exit(1)
+
+if __name__ == '__main__':
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/alveo_get_item_data.xml	Mon Aug 15 23:45:46 2016 -0400
@@ -0,0 +1,68 @@
+<tool id="alveo_get_item_data" name="Get Alveo Data for Items" version="0.01" force_history_refresh="True">
+    <description>Downloads files from the items in a Galaxy list of items</description>
+
+    <requirements>
+        <requirement type="package" version="0.6">pyalveo</requirement>
+    </requirements>
+
+    <command interpreter="python">
+        alveo_get_item_data.py --api_key $api_key --item_list $item_list --patterns $patterns,$patternselect --output_path ItemListData
+    </command>
+
+    <inputs>
+        <param name="api_key" type="data" format="txt" label="API Key" help="Your Alveo API key"/>
+        <param name="item_list" type="data" format="tabular" label="Item List (table)" help=""/>
+
+        <param name="patternselect" type="select" multiple="true" label="Predefined imports" display="checkboxes">
+            <option value='*'>All Files</option>
+            <option value='*speaker16.wav'>Austalk 16bit/16kHz Speaker Headset WAV (*speaker16.wav)</option>
+            <option value='*plain.txt'>Plain text documents (*plain.txt)</option>
+            <option value='*.txt'>All text documents (*.txt)</option>
+            <option value=''>Other - enter pattern below</option>
+        </param>
+
+        <param name="patterns" type="text" label="File patterns to import"
+               optional="true"
+               help="One or more file patterns separated by commas eg. *.wav,*.txt"/>
+
+        <param name="job_name" type="text" size="25"
+               label="Supply a name for the output to remind you what it contains" value="Alveo Data"/>
+    </inputs>
+
+    <outputs>
+        <collection type="list" label="$job_name" name="output1">
+            <discover_datasets pattern="(?P&lt;designation&gt;[^#]+)#(?P&lt;ext&gt;.+)" directory="ItemListData"/>
+        </collection>
+    </outputs>
+
+    <tests>
+        <test>
+            <param name="api_key" value="api-key.dat"/>
+            <param name="item_list" value="hvd-sample-items.dat"/>
+            <param name="patterns" value="*.TextGrid"/>
+            <param name="patternselect" value=""/>
+            <param name="output_path" value="test_out"/>
+            <output_collection name="output1" type="list" count="5">
+                    <element name="1_1308_2_22_023">
+                        <assert_contents>
+                            <has_text_matching expression="xmax = 1.020000"/>
+                        </assert_contents>
+                    </element>
+            </output_collection>
+        </test>
+    </tests>
+
+    <help>Downloads files from a local list of Alveo items. You can download all files or those matching
+        a wildcard pattern (e.g. *.txt).  Results will be stored as a dataset collection in
+        your history.</help>
+    <citations>
+        <citation type='bibtex'>
+            @article{cassidy2014alveo,
+              title={The alveo virtual laboratory: a web based repository API},
+              author={Cassidy, Steve and Estival, Dominique and Jones, Tim and Sefton, Peter and Burnham, Denis and Burghold, Jared and others},
+              year={2014},
+              publisher={Reykjavik, Iceland: European Language Resources Association}
+            }
+        </citation>
+    </citations>
+</tool>
--- a/alveo_get_primary_text.xml	Mon Jul 18 23:49:40 2016 -0400
+++ b/alveo_get_primary_text.xml	Mon Aug 15 23:45:46 2016 -0400
@@ -2,7 +2,7 @@
     <description>Downloads primary text from the items in an Alveo Item List</description>

     <requirements>
-        <requirement type="package" version="0.4">pyalveo</requirement>
+        <requirement type="package" version="0.6">pyalveo</requirement>
     </requirements>

     <command interpreter="python">
--- a/alveo_item_list_downloader.py	Mon Jul 18 23:49:40 2016 -0400
+++ b/alveo_item_list_downloader.py	Mon Aug 15 23:45:46 2016 -0400
@@ -23,15 +23,20 @@
 # this file name pattern allows galaxy to discover the dataset designation and type
 FNPAT = "%(designation)s_%(ext)s"

-def galaxy_name(fname):
-    """construct a filename suitable for Galaxy dataset discovery"""
+
+def galaxy_name(itemname, fname):
+    """construct a filename suitable for Galaxy dataset discovery
+    designation - (dataset identifier) is the file basename
+    ext - defines the dataset type and is the file extension
+    """

     root, ext = os.path.splitext(fname)
     ext = ext[1:] # remove initial .
-    fname = FNPAT % {'designation': fname, 'ext': ext}
+    fname = FNPAT % {'designation': itemname, 'ext': ext}

     return fname

+
 def download_documents(item_list, patterns, output_path):
     """
     Downloads a list of documents to the directory specificed by output_path.
@@ -54,7 +59,7 @@
         for doc in documents:
             for pattern in patterns:
                 if not pattern == '' and fnmatch(doc.get_filename(), pattern):
-                    fname = galaxy_name(doc.get_filename())
+                    fname = galaxy_name(item.metadata()['alveo:metadata']['dc:identifier'], doc.get_filename())
                     try:
                         doc.download_content(dir_path=output_path, filename=fname)
                         downloaded.append(doc.get_filename())
--- a/alveo_item_list_downloader.xml	Mon Jul 18 23:49:40 2016 -0400
+++ b/alveo_item_list_downloader.xml	Mon Aug 15 23:45:46 2016 -0400
@@ -2,7 +2,7 @@
     <description>Downloads files from the items in an Alveo Item List</description>

     <requirements>
-        <requirement type="package" version="0.4">pyalveo</requirement>
+        <requirement type="package" version="0.6">pyalveo</requirement>
     </requirements>

     <command interpreter="python">
@@ -51,7 +51,7 @@
             <param name="patternselect" value="*plain.txt"/>
             <param name="output_path" value="test_out"/>
             <output_collection name="output1" type="list" count="6">
-                    <element name="GCSAusE02-plain.txt">
+                    <element name="GCSAusE02">
                         <assert_contents>
                             <has_text_matching expression="background noises"/>
                         </assert_contents>
@@ -66,7 +66,7 @@
             <param name="patternselect" value=""/>
             <param name="output_path" value="test_out"/>
             <output_collection name="output1" type="list" count="6">
-                    <element name="GCSAusE02-plain.txt">
+                    <element name="GCSAusE02">
                         <assert_contents>
                             <has_text_matching expression="background noises"/>
                         </assert_contents>
Binary file alveo_item_list_importer.pyc has changed
--- a/alveo_item_list_importer.xml	Mon Jul 18 23:49:40 2016 -0400
+++ b/alveo_item_list_importer.xml	Mon Aug 15 23:45:46 2016 -0400
@@ -2,7 +2,7 @@
     <description>Retrieves item list metadata.</description>

     <requirements>
-        <requirement type="package" version="0.4">pyalveo</requirement>
+        <requirement type="package" version="0.6">pyalveo</requirement>
     </requirements>

     <command interpreter="python">
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/austalk-select-hVd-words.py	Mon Aug 15 23:45:46 2016 -0400
@@ -0,0 +1,87 @@
+from __future__ import print_function
+import argparse
+import pyalveo
+import sys
+
+API_URL = 'https://app.alveo.edu.au/'
+PREFIXES = """
+PREFIX dc:<http://purl.org/dc/terms/>
+PREFIX austalk:<http://ns.austalk.edu.au/>
+PREFIX olac:<http://www.language-archives.org/OLAC/1.1/>
+PREFIX ausnc:<http://ns.ausnc.org.au/schemas/ausnc_md_model/>
+PREFIX foaf:<http://xmlns.com/foaf/0.1/>
+PREFIX dbpedia:<http://dbpedia.org/ontology/>
+PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>
+PREFIX geo:<http://www.w3.org/2003/01/geo/wgs84_pos#>
+PREFIX iso639schema:<http://downlode.org/rdf/iso-639/schema#>
+PREFIX austalkid:<http://id.austalk.edu.au/>
+PREFIX iso639:<http://downlode.org/rdf/iso-639/languages#>
+PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
+PREFIX is: <http://purl.org/ontology/is/core#>
+PREFIX iso: <http://purl.org/iso25964/skos-thes#>
+PREFIX dada: <http://purl.org/dada/schema/0.2#>"""
+
+def parser():
+    parser = argparse.ArgumentParser(description="Retrieves Alveo Item Lists")
+    parser.add_argument('--api_key', required=True, action="store", type=str, help="Alveo API key")
+    parser.add_argument('--speaker', required=True, action="store", type=str, help="Speaker identifier")
+    parser.add_argument('--words', required=False, default='all', action="store", type=str, help="Word group (all, monopthongs, dipthongs)")
+    parser.add_argument('--output', required=True, action="store", type=str, help="output file name")
+    return parser.parse_args()
+
+def find_hVd_words(api_key, speakerid, output, words='all'):
+    """Find words in the Austalk corpus
+    """
+
+    client = pyalveo.Client(api_key, API_URL)
+
+    query = PREFIXES + """
+SELECT distinct ?item ?prompt ?compname
+WHERE {
+  ?item a ausnc:AusNCObject .
+  ?item olac:speaker ?speaker .
+  ?speaker austalk:id "%s" .
+  ?item austalk:prompt ?prompt .
+  ?item austalk:componentName ?compname .
+ """ % speakerid
+
+    hVdWords = {
+        'monopthongs': ['head', 'had', 'hud', 'heed', 'hid', 'hood', 'hod', "whod"],
+        'dipthongs': ['herd', 'howd', 'hoyd', 'haired', 'hard', 'heared']
+        }
+
+    if words == 'all':
+        words = hVdWords['monopthongs'] + hVdWords['dipthongs']
+    else:
+        words = hVdWords[words]
+
+    filterclause = 'FILTER regex(?prompt, "^'
+    filterclause += '$|^'.join(words)
+    filterclause += '$", "i")\n'
+
+    query += filterclause + "}"
+
+    result = client.sparql_query('austalk', query)
+
+    items = []
+    for b in result['results']['bindings']:
+        items.append((b['prompt']['value'], b['item']['value']))
+
+    with open(output, 'w') as out:
+        out.write("Speaker\tPrompt\tItemURL\n")
+        for item in items:
+            out.write(speakerid + "\t" + item[0] + "\t" + item[1] + "\n")
+
+
+def main():
+    args = parser()
+    try:
+        api_key = open(args.api_key, 'r').read().strip()
+        find_hVd_words(api_key, args.speaker, args.output, args.words)
+    except Exception as e:
+        print("ERROR: " + str(e), file=sys.stderr)
+        sys.exit(1)
+
+if __name__ == '__main__':
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/austalk-select-hVd-words.xml	Mon Aug 15 23:45:46 2016 -0400
@@ -0,0 +1,44 @@
+<tool id="austalk-select-hvd-words" name="Find HVD words in Austalk" version="0.01" force_history_refresh="True">
+    <description>for a single speaker</description>
+
+    <requirements>
+        <requirement type="package" version="0.6">pyalveo</requirement>
+    </requirements>
+
+    <command interpreter="python">
+        austalk-select-hVd-words.py  --api_key $api_key --speaker $speaker --words $words --output $output
+    </command>
+
+    <inputs>
+        <param name="api_key" type="data" format="txt" label="API Key" help="Your Alveo API key"/>
+        <param name="speaker" type="text" format="text" label="Speaker ID" help="e.g. 1_123"/>
+        <param name="words" type="select" multiple="false" label="Word List" display="radioboxes">
+            <option value='all'>All hVd words</option>
+            <option value='monopthongs'>hVd monopthongs</option>
+            <option value='dipthongs'>hVd dipthongs</option>
+        </param>
+        <param name="job_name" type="text" size="25"
+               label="Supply a name for the output to remind you what it contains" value="Query Results"/>
+    </inputs>
+
+    <outputs>
+        <data format="tabular" name="output" label="$job_name" />
+    </outputs>
+
+    <tests>
+    </tests>
+
+    <help>Find items corresponding to the hVd words for this Austalk speaker.</help>
+
+    <citations>
+        <citation type='bibtex'>
+            @inproceedings{Buschmeir2013,
+            author = {{Hendrik Buschmeier}, Marcin Wlodarczak},
+            booktitle = {Tagungsband der 24. Konferenz zur Elektronischen Sprachsignalverarbeitung (ESSV 2013)},
+            pages = {152--157},
+            title = {{TextGridTools: A TextGrid Processing and Analysis Toolkit for Python}},
+            year = {2013}
+            }
+        </citation>
+    </citations>
+</tool>
--- a/test-data/api-key.dat	Mon Jul 18 23:49:40 2016 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,1 +0,0 @@
-insert your api key here
--- a/test-data/item-lists.dat	Mon Jul 18 23:49:40 2016 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,15 +0,0 @@
-austalk_catepillar (309)	https://app.alveo.edu.au/item_lists/64
-different (888)	https://app.alveo.edu.au/item_lists/132
-gum-tree (58)	https://app.alveo.edu.au/item_lists/84
-M&D_Test_140904 (10)	https://app.alveo.edu.au/item_lists/168
-rose (245)	https://app.alveo.edu.au/item_lists/82
-thistle (16)	https://app.alveo.edu.au/item_lists/83
-ace-specialised (122)	https://app.alveo.edu.au/item_lists/178
-austalk_hide (42)	https://app.alveo.edu.au/item_lists/251
-austalk-male-digits (144)	https://app.alveo.edu.au/item_lists/412
-COOEE ALL (1354)	https://app.alveo.edu.au/item_lists/95
-cooee sample (129)	https://app.alveo.edu.au/item_lists/53
-dialogue-all (76)	https://app.alveo.edu.au/item_lists/116
-dialogue-sample (6)	https://app.alveo.edu.au/item_lists/180
-mdsample (20)	https://app.alveo.edu.au/item_lists/52
-one austalk sample (1)	https://app.alveo.edu.au/item_lists/179
--- a/test_alveo_api_key.py	Mon Jul 18 23:49:40 2016 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,25 +0,0 @@
-import unittest
-import os
-import alveo_api_key
-import pyalveo
-from mock import Mock
-
-class TestAlveoAPIKey(unittest.TestCase):
-
-  OUTPUT_PATH = 'test.txt'
-  API_KEY = 'test123'
-  MOCK_CLIENT = Mock(pyalveo)
-
-  def test_write_key(self):
-    alveo_api_key.write_key(self.API_KEY, self.OUTPUT_PATH, self.MOCK_CLIENT)
-    actual = open(self.OUTPUT_PATH, 'r').read()
-    self.assertEqual(self.API_KEY, actual)
-
-  def tearDown(self):
-    try:
-      os.remove(self.OUTPUT_PATH)
-    except OSError:
-      pass
-
-if __name__ == '__main__':
-    unittest.main()
\ No newline at end of file
--- a/test_alveo_item_list_downloader.py	Mon Jul 18 23:49:40 2016 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,28 +0,0 @@
-import unittest
-import os
-import json
-import alveo_item_list_importer
-import pyalveo
-from mock import Mock
-
-class TestAlveoItemListImporter(unittest.TestCase):
-
-  API_KEY = 'test123'
-  OUTPUT_PATH = 'test.csv'
-  ITEM_LIST = '{"shared": [{"shared": true, "num_items": 309, "name": "austalk_catepillar", "item_list_url": "https://app.alveo.edu.au/item_lists/64"}]}'
-  CSV_CONTENTS = 'austalk_catepillar (309)\thttps://app.alveo.edu.au/item_lists/64\n'
-
-  def test_write_table(self):
-    api_list = json.loads(self.ITEM_LIST)
-    alveo_item_list_importer.write_table(api_list, self.OUTPUT_PATH)
-    actual = open(self.OUTPUT_PATH, 'r').read()
-    self.assertEqual(self.CSV_CONTENTS, actual)
-
-  def tearDown(self):
-    try:
-      os.remove(self.OUTPUT_PATH)
-    except OSError:
-      pass
-
-if __name__ == '__main__':
-    unittest.main()
\ No newline at end of file
--- a/test_alveo_item_list_importer.py	Mon Jul 18 23:49:40 2016 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,28 +0,0 @@
-import unittest
-import os
-import json
-import alveo_item_list_importer
-import pyalveo
-from mock import Mock
-
-class TestAlveoItemListImporter(unittest.TestCase):
-
-  API_KEY = 'test123'
-  OUTPUT_PATH = 'test.csv'
-  ITEM_LIST = '{"shared": [{"shared": true, "num_items": 309, "name": "austalk_catepillar", "item_list_url": "https://app.alveo.edu.au/item_lists/64"}]}'
-  CSV_CONTENTS = 'austalk_catepillar (309)\thttps://app.alveo.edu.au/item_lists/64\n'
-
-  def test_write_table(self):
-    api_list = json.loads(self.ITEM_LIST)
-    alveo_item_list_importer.write_table(api_list, self.OUTPUT_PATH)
-    actual = open(self.OUTPUT_PATH, 'r').read()
-    self.assertEqual(self.CSV_CONTENTS, actual)
-
-  def tearDown(self):
-    try:
-      os.remove(self.OUTPUT_PATH)
-    except OSError:
-      pass
-
-if __name__ == '__main__':
-    unittest.main()
\ No newline at end of file
--- a/tool_dependencies.xml	Mon Jul 18 23:49:40 2016 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,10 +0,0 @@
-<?xml version="1.0"?>
-<tool_dependency>
-   <package name="pyalveo" version="0.4">
-       <install version="1.0">
-           <actions>
-               <action type="setup_virtualenv">pyalveo==0.4</action>
-           </actions>
-       </install>
-   </package>
-</tool_dependency>