Mercurial > repos > stevecassidy > alveoimport
comparison alveo_get_primary_text.py @ 0:bfe39bd252df draft
planemo upload commit 5de43e6a614de2a1b2065bc63823ecc9854ebb32-dirty
author | stevecassidy |
---|---|
date | Mon, 18 Jul 2016 23:49:40 -0400 |
parents | |
children | 7b6021997b8e |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:bfe39bd252df |
---|---|
1 from __future__ import print_function | |
2 import json | |
3 import argparse | |
4 import pyalveo | |
5 import sys | |
6 import os | |
7 from fnmatch import fnmatch | |
8 | |
# Base URL handed to pyalveo.Client as the Alveo API endpoint.
# TODO: export constants to a separate module
API_URL = "https://app.alveo.edu.au"
10 | |
def parser():
    """
    Parse the command line arguments of this script.

    All three options are required:

    --api_key        path to a file whose content is the Alveo API key
                     (main() opens and reads this file)
    --item_list_url  URL of the Alveo item list to download
    --output_path    directory the downloaded texts are written to

    :rtype: argparse.Namespace
    :returns: the parsed arguments, as attributes api_key, item_list_url
        and output_path
    """
    # named arg_parser so the local does not shadow this function's own name
    arg_parser = argparse.ArgumentParser(description="Downloads documents in an Alveo Item List")
    arg_parser.add_argument('--api_key', required=True, action="store", type=str,
                            help="Path to a file containing the Alveo API key")
    arg_parser.add_argument('--item_list_url', required=True, action="store", type=str,
                            help="Item List to download")
    arg_parser.add_argument('--output_path', required=True, action="store", type=str,
                            help="Path to the output directory")
    return arg_parser.parse_args()
17 | |
def get_item_list(api_key, item_list_url):
    """
    Fetch an item list through the Alveo API.

    :type api_key: String
    :param api_key: API key used to create the pyalveo client

    :type item_list_url: String
    :param item_list_url: URL of the item list to fetch

    :returns: whatever pyalveo.Client.get_item_list returns for the URL
    """
    alveo = pyalveo.Client(api_url=API_URL, api_key=api_key)
    item_list = alveo.get_item_list(item_list_url)
    return item_list
21 | |
# Galaxy works out each dataset's designation and type from a file name
# that follows this pattern.
FNPAT = "%(designation)s_%(ext)s"


def galaxy_name(fname, ext):
    """
    Build a file name suitable for Galaxy dataset discovery.

    :param fname: designation of the dataset
    :param ext: type (extension) of the dataset

    :returns: FNPAT with fname and ext filled in, i.e. "<fname>_<ext>"
    """
    return FNPAT % dict(designation=fname, ext=ext)
31 import pprint | |
def download_documents(item_list, output_path):
    """
    Downloads the primary text of every item in an item list to the
    directory specified by output_path.

    Each item that has a primary text is written to its own file, named by
    galaxy_name() from the item's 'dc:identifier' metadata value and the
    'txt' extension. Items whose primary text is None are skipped.

    :type item_list: item list object providing get_all(), as returned by
        get_item_list()
    :param item_list: the items whose primary text is downloaded

    :type output_path: String
    :param output_path: directory to download the texts to; it is created
        if it does not exist yet

    :rtype: list of String
    :returns: paths of the files that were written, in item order
    """
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    downloaded = []

    for item in item_list.get_all():
        md = item.metadata()
        fname = os.path.join(output_path, galaxy_name(md['alveo:metadata']['dc:identifier'], 'txt'))
        content = item.get_primary_text()
        if content is None:
            # this item has no primary text, so there is nothing to write
            continue

        # NOTE(review): the text is written with the platform default
        # encoding - confirm that is suitable for non-ASCII documents
        with open(fname, 'w') as out:
            out.write(content)
        # record the file so the returned list reflects what was written
        downloaded.append(fname)

    return downloaded
58 | |
def main():
    """
    Command line entry point.

    Reads the API key from the file named by --api_key, fetches the item
    list at --item_list_url and downloads the primary text of its items
    into --output_path.

    If the key file cannot be read, or pyalveo reports an API error, the
    error is printed to stderr and the process exits with status 1.
    """
    args = parser()

    try:
        # --api_key names a file holding the key; the with-block makes sure
        # the file handle is closed again
        with open(args.api_key, 'r') as key_file:
            api_key = key_file.read().strip()
    except IOError as e:
        print("ERROR: " + str(e), file=sys.stderr)
        sys.exit(1)

    try:
        item_list = get_item_list(api_key, args.item_list_url)
        download_documents(item_list, args.output_path)
        # write out a list of downloaded files as a result?
    except pyalveo.APIError as e:
        print("ERROR: " + str(e), file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()