Mercurial > repos > stevecassidy > alveoimport
diff alveo_get_primary_text.py @ 14:a38315ecf593 draft
planemo upload for repository https://github.com/Alveo/alveo-galaxy-tools commit b5b26e9118f2ad8af109d606746b39a5588f0511-dirty
author | stevecassidy |
---|---|
date | Wed, 01 Nov 2017 01:18:15 -0400 |
parents | 2f4907372748 |
children | 3fd0f8f1f3ce |
line wrap: on
line diff
```diff
--- a/alveo_get_primary_text.py Wed Feb 01 22:34:24 2017 -0500
+++ b/alveo_get_primary_text.py Wed Nov 01 01:18:15 2017 -0400
@@ -1,14 +1,11 @@
 from __future__ import print_function
 
-import json
 import argparse
 import pyalveo
 import sys
 import os
-from fnmatch import fnmatch
-import csv
+from util import API_URL, read_item_list
 
-API_URL = 'https://app.alveo.edu.au' # TODO: export constants to a separate module
 
 def parser():
     parser = argparse.ArgumentParser(description="Downloads documents in an Alveo Item List")
@@ -17,13 +14,11 @@
     parser.add_argument('--output_path', required=True, action="store", type=str, help="Path to output file")
     return parser.parse_args()
 
-def get_item_list(api_key, item_list_url):
-    client = pyalveo.Client(api_key=api_key, api_url=API_URL, use_cache=False)
-    return client.get_item_list(item_list_url)
 
 # this file name pattern allows galaxy to discover the dataset designation and type
 FNPAT = "%(designation)s_%(ext)s"
 
+
 def galaxy_name(fname, ext):
     """construct a filename suitable for Galaxy dataset discovery"""
 
@@ -31,13 +26,13 @@
     return fname
 
 
-import pprint
-def download_documents(item_list, output_path):
+
+def download_text(item_list, output_path):
     """
-    Downloads a list of documents to the directory specificed by output_path.
+    Downloads primary text from a list of items to the directory specified by output_path.
 
-    :type documents: list of pyalveo.Document
-    :param documents: Documents to download
+    :type item_list: ItemGroup
+    :param item_list: item list to download
 
     :type output_path: String
     :param output_path: directory to download to the documents to
@@ -48,34 +43,16 @@
     downloaded = []
 
     items = item_list.get_all()
-    filtered_documents = []
     for item in items:
         md = item.metadata()
-        fname = os.path.join(output_path, galaxy_name(md['alveo:metadata']['dc:identifier'], 'txt'))
+        fname = os.path.join(output_path, galaxy_name(md['alveo:metadata']['dcterms:identifier'], 'txt'))
         content = item.get_primary_text()
-        if not content == None:
+        if content is not None:
             with open(fname, 'w') as out:
                 out.write(content)
 
     return downloaded
 
-def read_item_list(filename, client):
-    """Read an item list from a file
-    which should be a tabular formatted file
-    with one column header ItemURL.
-    Return an instance of ItemGroup"""
-
-    with open(filename) as fd:
-        csvreader = csv.DictReader(fd, dialect='excel-tab')
-        if 'ItemURL' not in csvreader.fieldnames:
-            return None
-        itemurls = []
-        for row in csvreader:
-            itemurls.append(row['ItemURL'])
-
-    itemlist = pyalveo.ItemGroup(itemurls, client)
-
-    return itemlist
 
 def main():
     args = parser()
@@ -83,10 +60,11 @@
         api_key = open(args.api_key, 'r').read().strip()
         client = pyalveo.Client(api_url=API_URL, api_key=api_key, use_cache=False)
         item_list = read_item_list(args.item_list, client)
-        downloaded = download_documents(item_list, args.output_path)
+        download_text(item_list, args.output_path)
     except pyalveo.APIError as e:
         print("ERROR: " + str(e), file=sys.stderr)
         sys.exit(1)
 
+
 if __name__ == '__main__':
     main()
```