Mercurial > repos > stevecassidy > alveoimport
view alveo_get_primary_text.py @ 4:3a9f20428cff draft
planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
author | stevecassidy |
---|---|
date | Wed, 31 Aug 2016 22:07:27 -0400 |
parents | |
children | 2f4907372748 |
line wrap: on
line source
from __future__ import print_function import json import argparse import pyalveo import sys import os from fnmatch import fnmatch API_URL = 'https://app.alveo.edu.au' # TODO: export constants to a separate module def parser(): parser = argparse.ArgumentParser(description="Downloads documents in an Alveo Item List") parser.add_argument('--api_key', required=True, action="store", type=str, help="Alveo API key") parser.add_argument('--item_list_url', required=True, action="store", type=str, help="Item List to download") parser.add_argument('--output_path', required=True, action="store", type=str, help="Path to output file") return parser.parse_args() def get_item_list(api_key, item_list_url): client = pyalveo.Client(api_key=api_key, api_url=API_URL, use_cache=False) return client.get_item_list(item_list_url) # this file name pattern allows galaxy to discover the dataset designation and type FNPAT = "%(designation)s_%(ext)s" def galaxy_name(fname, ext): """construct a filename suitable for Galaxy dataset discovery""" fname = FNPAT % {'designation': fname, 'ext': ext} return fname import pprint def download_documents(item_list, output_path): """ Downloads a list of documents to the directory specificed by output_path. :type documents: list of pyalveo.Document :param documents: Documents to download :type output_path: String :param output_path: directory to download to the documents to """ if not os.path.exists(output_path): os.makedirs(output_path) downloaded = [] items = item_list.get_all() filtered_documents = [] for item in items: md = item.metadata() fname = os.path.join(output_path, galaxy_name(md['alveo:metadata']['dc:identifier'], 'txt')) content = item.get_primary_text() if not content == None: with open(fname, 'w') as out: out.write(content) return downloaded def main(): args = parser() try: api_key = open(args.api_key, 'r').read().strip() item_list = get_item_list(api_key, args.item_list_url) downloaded = download_documents(item_list, args.output_path) # write out a list of downloaded files as a result? except pyalveo.APIError as e: print("ERROR: " + str(e), file=sys.stderr) sys.exit(1) if __name__ == '__main__': main()