diff alveo_get_primary_text.py @ 14:a38315ecf593 draft

planemo upload for repository https://github.com/Alveo/alveo-galaxy-tools commit b5b26e9118f2ad8af109d606746b39a5588f0511-dirty
author stevecassidy
date Wed, 01 Nov 2017 01:18:15 -0400
parents 2f4907372748
children 3fd0f8f1f3ce
line wrap: on
line diff
--- a/alveo_get_primary_text.py	Wed Feb 01 22:34:24 2017 -0500
+++ b/alveo_get_primary_text.py	Wed Nov 01 01:18:15 2017 -0400
@@ -1,14 +1,11 @@
 from __future__ import print_function
-import json
 import argparse
 import pyalveo
 import sys
 import os
-from fnmatch import fnmatch
-import csv
 
+from util import API_URL, read_item_list
 
-API_URL = 'https://app.alveo.edu.au' # TODO: export constants to a separate module
 
 def parser():
     parser = argparse.ArgumentParser(description="Downloads documents in an Alveo Item List")
@@ -17,13 +14,11 @@
     parser.add_argument('--output_path', required=True, action="store", type=str, help="Path to output file")
     return parser.parse_args()
 
-def get_item_list(api_key, item_list_url):
-    client = pyalveo.Client(api_key=api_key, api_url=API_URL, use_cache=False)
-    return client.get_item_list(item_list_url)
 
 # this file name pattern allows galaxy to discover the dataset designation and type
 FNPAT = "%(designation)s_%(ext)s"
 
+
 def galaxy_name(fname, ext):
     """construct a filename suitable for Galaxy dataset discovery"""
 
@@ -31,13 +26,13 @@
 
     return fname
 
-import pprint
-def download_documents(item_list, output_path):
+
+def download_text(item_list, output_path):
     """
-    Downloads a list of documents to the directory specificed by output_path.
+    Downloads primary text from a list of items to the directory specified by output_path.
 
-    :type documents: list of pyalveo.Document
-    :param documents: Documents to download
+    :type item_list: ItemGroup
+    :param item_list: item list to download
 
     :type output_path: String
     :param output_path: directory to download to the documents to
@@ -48,34 +43,16 @@
     downloaded = []
 
     items = item_list.get_all()
-    filtered_documents = []
     for item in items:
         md = item.metadata()
-        fname = os.path.join(output_path, galaxy_name(md['alveo:metadata']['dc:identifier'], 'txt'))
+        fname = os.path.join(output_path, galaxy_name(md['alveo:metadata']['dcterms:identifier'], 'txt'))
         content = item.get_primary_text()
-        if not content == None:
+        if content is not None:
             with open(fname, 'w') as out:
                 out.write(content)
 
     return downloaded
 
-def read_item_list(filename, client):
-    """Read an item list from a file
-    which should be a tabular formatted file
-    with one column header ItemURL.
-    Return an instance of ItemGroup"""
-
-    with open(filename) as fd:
-        csvreader = csv.DictReader(fd, dialect='excel-tab')
-        if 'ItemURL' not in csvreader.fieldnames:
-            return None
-        itemurls = []
-        for row in csvreader:
-            itemurls.append(row['ItemURL'])
-
-    itemlist = pyalveo.ItemGroup(itemurls, client)
-
-    return itemlist
 
 def main():
     args = parser()
@@ -83,10 +60,11 @@
         api_key = open(args.api_key, 'r').read().strip()
         client = pyalveo.Client(api_url=API_URL, api_key=api_key, use_cache=False)
         item_list = read_item_list(args.item_list, client)
-        downloaded = download_documents(item_list, args.output_path)
+        download_text(item_list, args.output_path)
     except pyalveo.APIError as e:
         print("ERROR: " + str(e), file=sys.stderr)
         sys.exit(1)
 
+
 if __name__ == '__main__':
     main()