annotate alveo_get_primary_text.py @ 10:e2989a1d751d draft

planemo upload commit 063e049d569aeb45e1008cbf044f2dad850eca3d-dirty
author stevecassidy
date Wed, 07 Dec 2016 19:09:55 -0500
parents 2f4907372748
children a38315ecf593
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
4
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
1 from __future__ import print_function
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
2 import json
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
3 import argparse
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
4 import pyalveo
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
5 import sys
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
6 import os
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
7 from fnmatch import fnmatch
9
2f4907372748 planemo upload commit 7b5663b41b2dc11f9e375b8f386bc31855800bcf-dirty
stevecassidy
parents: 4
diff changeset
8 import csv
2f4907372748 planemo upload commit 7b5663b41b2dc11f9e375b8f386bc31855800bcf-dirty
stevecassidy
parents: 4
diff changeset
9
4
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
10
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
# Base URL of the Alveo server that is handed to pyalveo.Client.
# TODO: export constants to a separate module
API_URL = 'https://app.alveo.edu.au'
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
12
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
def parser():
    """Parse this script's command-line arguments from ``sys.argv``.

    All three options are required:

    * ``--api_key``: path to a file whose contents are the Alveo API key
      (``main`` opens and reads this file, it is not the key itself)
    * ``--item_list``: tab-separated file with an ``ItemURL`` column
    * ``--output_path``: directory the downloaded text files are written to

    :rtype: argparse.Namespace
    :returns: the parsed arguments
    """
    # Named arg_parser so the local does not shadow this function's own name.
    arg_parser = argparse.ArgumentParser(
        description="Downloads the primary text of each item in an Alveo Item List")
    arg_parser.add_argument('--api_key', required=True, action="store", type=str,
                            help="Path to a file containing the Alveo API key")
    arg_parser.add_argument('--item_list', required=True, action="store", type=str,
                            help="File containing list of item URLs")
    arg_parser.add_argument('--output_path', required=True, action="store", type=str,
                            help="Path to the output directory")
    return arg_parser.parse_args()
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
19
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
def get_item_list(api_key, item_list_url):
    """Fetch the item list found at item_list_url.

    A new pyalveo client is created for the call, pointed at API_URL and
    with caching switched off.

    :type api_key: String
    :param api_key: the Alveo API key used to authenticate the client

    :type item_list_url: String
    :param item_list_url: URL of the item list to retrieve

    :returns: whatever ``pyalveo.Client.get_item_list`` returns for the URL
    """
    alveo = pyalveo.Client(use_cache=False, api_url=API_URL, api_key=api_key)
    item_list = alveo.get_item_list(item_list_url)
    return item_list
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
23
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
# Galaxy discovers the dataset designation and type from file names that
# follow this pattern.
FNPAT = "%(designation)s_%(ext)s"


def galaxy_name(fname, ext):
    """Build a file name suitable for Galaxy dataset discovery.

    :type fname: String
    :param fname: the dataset designation (placed before the underscore)

    :type ext: String
    :param ext: the dataset type (placed after the underscore)

    :rtype: String
    :returns: FNPAT filled in with the designation and the type
    """
    return FNPAT % dict(designation=fname, ext=ext)
9
2f4907372748 planemo upload commit 7b5663b41b2dc11f9e375b8f386bc31855800bcf-dirty
stevecassidy
parents: 4
diff changeset
33
4
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
34 import pprint
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
def download_documents(item_list, output_path):
    """
    Write the primary text of every item in item_list into output_path.

    Each item that has a primary text is saved as one file whose name is
    built by galaxy_name() from the item's ``dc:identifier`` metadata and
    the extension ``txt``.  Items whose primary text is None are skipped.

    :type item_list: pyalveo.ItemGroup (any object with a get_all() method)
    :param item_list: the items whose primary text should be downloaded

    :type output_path: String
    :param output_path: directory to write the text files to; it is
        created when it does not exist yet

    :rtype: list of String
    :returns: paths of the files that were actually written
    """
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    downloaded = []

    for item in item_list.get_all():
        md = item.metadata()
        fname = os.path.join(output_path, galaxy_name(md['alveo:metadata']['dc:identifier'], 'txt'))
        content = item.get_primary_text()
        if content is None:
            # no primary text for this item, nothing to write
            continue
        # NOTE(review): content presumably arrives as raw bytes from pyalveo
        # on Python 3 -- confirm.  Choosing the mode from the actual type
        # keeps the write from raising TypeError either way.
        mode = 'wb' if isinstance(content, bytes) else 'w'
        with open(fname, mode) as out:
            out.write(content)
        # record the file so the caller learns what was written
        downloaded.append(fname)

    return downloaded
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
61
9
2f4907372748 planemo upload commit 7b5663b41b2dc11f9e375b8f386bc31855800bcf-dirty
stevecassidy
parents: 4
diff changeset
def read_item_list(filename, client):
    """Read an item list from a tab-separated file.

    The file must start with a header row that has a column named
    ItemURL; every following row contributes the value of that column.
    Rows whose ItemURL is blank or missing are ignored.

    :type filename: String
    :param filename: path of the tabular file to read

    :param client: the pyalveo client handed on to pyalveo.ItemGroup

    :returns: a pyalveo.ItemGroup holding the item URLs, or None when the
        file has no ItemURL column (this includes an empty file)
    """
    with open(filename) as fd:
        csvreader = csv.DictReader(fd, dialect='excel-tab')
        # fieldnames is None when the file has no header row at all (for
        # example an empty file); treat that the same as a missing column
        # instead of failing on the membership test.
        if 'ItemURL' not in (csvreader.fieldnames or ()):
            return None
        # a short or blank row yields None / '' here; there is no URL to keep
        itemurls = [row['ItemURL'] for row in csvreader if row['ItemURL']]

    return pyalveo.ItemGroup(itemurls, client)
2f4907372748 planemo upload commit 7b5663b41b2dc11f9e375b8f386bc31855800bcf-dirty
stevecassidy
parents: 4
diff changeset
79
4
3a9f20428cff planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff changeset
def main():
    """Command-line entry point.

    Parses the arguments, reads the API key from the file named by
    ``--api_key``, builds the item list from the ``--item_list`` file and
    writes the primary text of each item into ``--output_path``.

    Exits with status 1 and a message on stderr when the item list file
    has no ItemURL column or when pyalveo raises an APIError.
    """
    args = parser()
    try:
        # --api_key names a file holding the key; the with block makes sure
        # the handle is closed again.
        with open(args.api_key, 'r') as key_file:
            api_key = key_file.read().strip()
        client = pyalveo.Client(api_url=API_URL, api_key=api_key, use_cache=False)
        item_list = read_item_list(args.item_list, client)
        if item_list is None:
            # read_item_list signals a missing ItemURL column with None;
            # report it instead of failing later with an AttributeError.
            print("ERROR: no ItemURL column found in " + args.item_list, file=sys.stderr)
            sys.exit(1)
        download_documents(item_list, args.output_path)
    except pyalveo.APIError as e:
        print("ERROR: " + str(e), file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()