Mercurial > repos > stevecassidy > alveoimport
view alveo_get_item_data.py @ 18:5e1b7d922ea3 draft default tip
planemo upload for repository https://github.com/Alveo/alveo-galaxy-tools commit 53cde5cc9b8c1adcccdc3cfa52d8ca82079aeda7
author | stevecassidy |
---|---|
date | Mon, 15 Jan 2018 18:34:57 -0500 |
parents | a38315ecf593 |
children |
line wrap: on
line source
from __future__ import print_function import argparse import pyalveo import sys import os from fnmatch import fnmatch from util import API_URL, read_item_list def parser(): p = argparse.ArgumentParser(description="Downloads documents in an Alveo Item List") p.add_argument('--api_key', required=True, action="store", type=str, help="Alveo API key") p.add_argument('--item_list', required=True, action="store", type=str, help="File containing list of item URLs") p.add_argument('--patterns', required=False, action="store", type=str, help="File patterns to download") p.add_argument('--primarytext', required=False, action="store_true", help="Download only primary text") p.add_argument('--output_path', required=True, action="store", type=str, help="Path to output file") return p.parse_args() # this file name pattern allows galaxy to discover the dataset designation and type FNPAT = "%(designation)s#%(ext)s" def galaxy_name(fname, ext): """construct a filename suitable for Galaxy dataset discovery""" fname = FNPAT % {'designation': fname, 'ext': ext} return fname def download_documents(item_list, output_path, patterns=None): """ Downloads a list of documents to the directory specified by output_path. :type documents: list of pyalveo.Document :param documents: Documents to download :type patterns: string :param patterns: Glob pattern for files to download, may include the special pattern PRIMARYTEXT which returns the primary text of each item as ITEMNAME.txt :type output_path: String :param output_path: directory to download to the documents to """ if not os.path.exists(output_path): os.makedirs(output_path) items = item_list.get_all() for item in items: md = item.metadata() ident = md['alveo:metadata']['dcterms:identifier'] documents = item.get_documents() for doc in documents: for pattern in patterns: if pattern == 'PRIMARYTEXT': fname = os.path.join(output_path, galaxy_name(ident, 'txt')) content = item.get_primary_text() if content is not None: with open(fname, 'w') as out: out.write(content.decode('utf-8')) elif not pattern == '' and fnmatch(doc.get_filename(), pattern): root, ext = os.path.splitext(doc.get_filename()) ext = ext[1:] # remove initial . fname = galaxy_name(root, ext) try: doc.download_content(dir_path=output_path, filename=fname) except pyalveo.APIError as e: print("ERROR: " + str(e), file=sys.stderr) sys.exit(1) def main(): args = parser() try: api_key = open(args.api_key, 'r').read().strip() client = pyalveo.Client(api_url=API_URL, api_key=api_key, use_cache=False) item_list = read_item_list(args.item_list, client) patterns = args.patterns.split(',') download_documents(item_list, args.output_path, patterns=patterns) except pyalveo.APIError as e: print("ERROR: " + str(e), file=sys.stderr) sys.exit(1) if __name__ == '__main__': main()