comparison alveo_get_primary_text.py @ 0:bfe39bd252df draft

planemo upload commit 5de43e6a614de2a1b2065bc63823ecc9854ebb32-dirty
author stevecassidy
date Mon, 18 Jul 2016 23:49:40 -0400
parents
children 7b6021997b8e
comparison
equal deleted inserted replaced
-1:000000000000 0:bfe39bd252df
1 from __future__ import print_function
2 import json
3 import argparse
4 import pyalveo
5 import sys
6 import os
7 from fnmatch import fnmatch
8
# Base URL of the Alveo server that every API client in this script talks to.
# TODO: export constants to a separate module
API_URL = "https://app.alveo.edu.au"
10
def parser():
    """Parse this tool's command-line arguments.

    All three options are required:

    * ``--api_key``: path of a file holding the Alveo API key. main() opens
      and reads this file; the key itself is not passed on the command line.
    * ``--item_list_url``: the item list to download.
    * ``--output_path``: directory that the downloaded texts are written to.

    :rtype: argparse.Namespace
    :returns: the values parsed from ``sys.argv``
    """
    arg_parser = argparse.ArgumentParser(
        description="Downloads documents in an Alveo Item List")
    # The help strings describe how the values are really used: the key is
    # read from a file and the output path is created as a directory.
    arg_parser.add_argument('--api_key', required=True, action="store", type=str,
                            help="Path to a file containing the Alveo API key")
    arg_parser.add_argument('--item_list_url', required=True, action="store", type=str,
                            help="Item List to download")
    arg_parser.add_argument('--output_path', required=True, action="store", type=str,
                            help="Path to the output directory")
    return arg_parser.parse_args()
17
def get_item_list(api_key, item_list_url):
    """Fetch an item list through a pyalveo client bound to API_URL.

    :type api_key: String
    :param api_key: Alveo API key used to construct the client

    :type item_list_url: String
    :param item_list_url: identifies the item list; handed unchanged to
        ``Client.get_item_list``

    :returns: whatever ``pyalveo.Client.get_item_list`` returns
    """
    alveo = pyalveo.Client(api_key=api_key, api_url=API_URL)
    item_list = alveo.get_item_list(item_list_url)
    return item_list
21
# Galaxy discovers a dataset's designation and type from its file name,
# so output files are named "<designation>_<ext>".
FNPAT = "%(designation)s_%(ext)s"


def galaxy_name(fname, ext):
    """Build a file name suitable for Galaxy dataset discovery.

    :param fname: value used as the dataset designation
    :param ext: value used as the dataset type (extension)

    :returns: FNPAT filled in with the two values, i.e. ``<fname>_<ext>``
    """
    return FNPAT % dict(designation=fname, ext=ext)
31 import pprint
def download_documents(item_list, output_path):
    """
    Downloads the primary text of every item in an item list to the
    directory specified by output_path.

    Each text is saved under the name that galaxy_name() builds from the
    item's ``dc:identifier`` metadata value and the extension ``txt``.
    Items whose primary text is None are skipped.

    :type item_list: object providing ``get_all()`` (presumably a pyalveo
        item list -- confirm against get_item_list())
    :param item_list: items whose primary text should be downloaded

    :type output_path: String
    :param output_path: directory to download the documents to; created
        when it does not exist yet

    :rtype: list of String
    :returns: paths of the files that were written
    """
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    downloaded = []

    for item in item_list.get_all():
        md = item.metadata()
        fname = os.path.join(output_path, galaxy_name(md['alveo:metadata']['dc:identifier'], 'txt'))
        content = item.get_primary_text()
        if content is None:
            # no primary text for this item: nothing to write
            continue

        # NOTE(review): the text may arrive as raw bytes -- confirm against
        # pyalveo. Bytes cannot be written to a text-mode file on Python 3,
        # so the mode is chosen from the content type.
        mode = 'wb' if isinstance(content, bytes) else 'w'
        with open(fname, mode) as out:
            out.write(content)

        # record the file so callers actually learn what was downloaded
        downloaded.append(fname)

    return downloaded
58
def main():
    """Command-line entry point: download the primary texts of an item list.

    Reads the API key from the file named by ``--api_key``, fetches the
    item list given by ``--item_list_url`` and saves its texts into
    ``--output_path``. A pyalveo.APIError is reported on stderr and the
    process exits with status 1.
    """
    args = parser()
    try:
        # --api_key names a file holding the key; the with-block guarantees
        # the handle is closed instead of leaking until garbage collection
        with open(args.api_key, 'r') as key_file:
            api_key = key_file.read().strip()
        item_list = get_item_list(api_key, args.item_list_url)
        download_documents(item_list, args.output_path)
        # write out a list of downloaded files as a result?
    except pyalveo.APIError as e:
        print("ERROR: " + str(e), file=sys.stderr)
        sys.exit(1)
69
# Run the downloader only when executed as a script, not when imported.
if __name__ == "__main__":
    main()