annotate alveo_get_item_data.py @ 10:e2989a1d751d draft

planemo upload commit 063e049d569aeb45e1008cbf044f2dad850eca3d-dirty
author stevecassidy
date Wed, 07 Dec 2016 19:09:55 -0500
parents 5a8d9ddabec4
children be3fd14899a1
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
4
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
1 from __future__ import print_function
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
2 import json
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
3 import argparse
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
4 import pyalveo
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
5 import sys
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
6 import os
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
7 from fnmatch import fnmatch
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
8 import csv
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
9
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
10 API_URL = 'https://app.alveo.edu.au' # TODO: export constants to a separate module
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
11
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
12 def parser():
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
13 parser = argparse.ArgumentParser(description="Downloads documents in an Alveo Item List")
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
14 parser.add_argument('--api_key', required=True, action="store", type=str, help="Alveo API key")
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
15 parser.add_argument('--item_list', required=True, action="store", type=str, help="File containing list of item URLs")
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
16 parser.add_argument('--patterns', required=True, action="store", type=str, help="File patterns to download")
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
17 parser.add_argument('--output_path', required=True, action="store", type=str, help="Path to output file")
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
18 return parser.parse_args()
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
19
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
20 def read_item_list(filename, client):
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
21 """Read an item list from a file
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
22 which should be a tabular formatted file
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
23 with one column header ItemURL.
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
24 Return an instance of ItemGroup"""
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
25
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
26 with open(filename) as fd:
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
27 csvreader = csv.DictReader(fd, dialect='excel-tab')
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
28 if 'ItemURL' not in csvreader.fieldnames:
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
29 return None
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
30 itemurls = []
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
31 for row in csvreader:
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
32 itemurls.append(row['ItemURL'])
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
33
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
34 itemlist = pyalveo.ItemGroup(itemurls, client)
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
35
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
36 return itemlist
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
37
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
38 # this file name pattern allows galaxy to discover the dataset designation and type
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
39 FNPAT = "%(designation)s#%(ext)s"
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
40
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
41 def galaxy_name(itemname, fname):
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
42 """construct a filename suitable for Galaxy dataset discovery
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
43 designation - (dataset identifier) is the file basename
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
44 ext - defines the dataset type and is the file extension
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
45 """
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
46
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
47 root, ext = os.path.splitext(fname)
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
48 ext = ext[1:] # remove initial .
10
e2989a1d751d planemo upload commit 063e049d569aeb45e1008cbf044f2dad850eca3d-dirty
stevecassidy
parents: 7
diff changeset
49 fname = FNPAT % {'designation': fname, 'ext': ext}
4
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
50
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
51 return fname
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
52
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
53 def download_documents(item_list, patterns, output_path):
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
54 """
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
55 Downloads a list of documents to the directory specificed by output_path.
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
56
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
57 :type documents: list of pyalveo.Document
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
58 :param documents: Documents to download
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
59
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
60 :type output_path: String
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
61 :param output_path: directory to download to the documents to
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
62 """
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
63 if not os.path.exists(output_path):
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
64 os.makedirs(output_path)
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
65
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
66 downloaded = []
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
67
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
68 items = item_list.get_all()
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
69 filtered_documents = []
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
70 for item in items:
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
71 documents = item.get_documents()
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
72 for doc in documents:
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
73 for pattern in patterns:
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
74 if not pattern == '' and fnmatch(doc.get_filename(), pattern):
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
75 fname = galaxy_name(item.metadata()['alveo:metadata']['dc:identifier'], doc.get_filename())
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
76 try:
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
77 doc.download_content(dir_path=output_path, filename=fname)
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
78 downloaded.append(doc.get_filename())
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
79 except:
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
80 # maybe it doesn't exist or we have no access
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
81 # TODO: report this
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
82 pass
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
83 return downloaded
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
84
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
85 def main():
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
86 args = parser()
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
87 try:
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
88 api_key = open(args.api_key, 'r').read().strip()
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
89
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
90 client = pyalveo.Client(api_url=API_URL, api_key=api_key, use_cache=False)
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
91
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
92 item_list = read_item_list(args.item_list, client)
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
93 patterns = args.patterns.split(',')
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
94 downloaded = download_documents(item_list, patterns, args.output_path)
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
95 except pyalveo.APIError as e:
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
96 print("ERROR: " + str(e), file=sys.stderr)
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
97 sys.exit(1)
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
98
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
99 if __name__ == '__main__':
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
100 main()