Mercurial > repos > stevecassidy > alveoimport
annotate alveo_get_primary_text.py @ 9:2f4907372748 draft
planemo upload commit 7b5663b41b2dc11f9e375b8f386bc31855800bcf-dirty
author | stevecassidy |
---|---|
date | Wed, 16 Nov 2016 15:01:03 -0500 |
parents | 3a9f20428cff |
children | a38315ecf593 |
rev | line source |
---|---|
4
3a9f20428cff
planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff
changeset
|
1 from __future__ import print_function |
3a9f20428cff
planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff
changeset
|
2 import json |
3a9f20428cff
planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff
changeset
|
3 import argparse |
3a9f20428cff
planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff
changeset
|
4 import pyalveo |
3a9f20428cff
planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff
changeset
|
5 import sys |
3a9f20428cff
planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff
changeset
|
6 import os |
3a9f20428cff
planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff
changeset
|
7 from fnmatch import fnmatch |
9
2f4907372748
planemo upload commit 7b5663b41b2dc11f9e375b8f386bc31855800bcf-dirty
stevecassidy
parents:
4
diff
changeset
|
8 import csv |
2f4907372748
planemo upload commit 7b5663b41b2dc11f9e375b8f386bc31855800bcf-dirty
stevecassidy
parents:
4
diff
changeset
|
9 |
4
3a9f20428cff
planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff
changeset
|
10 |
3a9f20428cff
planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff
changeset
|
# Base URL of the Alveo API that the pyalveo clients in this file connect to.
# TODO: export constants to a separate module
API_URL = 'https://app.alveo.edu.au'
3a9f20428cff
planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff
changeset
|
12 |
3a9f20428cff
planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff
changeset
|
def parser(argv=None):
    """Define the command-line interface and parse the arguments.

    :type argv: list of String, or None
    :param argv: argument strings to parse; when None (the default)
        argparse takes them from ``sys.argv[1:]``

    :rtype: argparse.Namespace
    :returns: namespace with the ``api_key``, ``item_list`` and
        ``output_path`` attributes
    """
    # Named arg_parser so the local does not shadow this function's own name.
    arg_parser = argparse.ArgumentParser(description="Downloads documents in an Alveo Item List")
    # main() opens this value as a file and reads the key from it, so the
    # help text describes a file rather than the key itself.
    arg_parser.add_argument('--api_key', required=True, action="store", type=str, help="File containing the Alveo API key")
    arg_parser.add_argument('--item_list', required=True, action="store", type=str, help="File containing list of item URLs")
    # download_documents() treats this value as a directory.
    arg_parser.add_argument('--output_path', required=True, action="store", type=str, help="Path to output directory")
    return arg_parser.parse_args(argv)
3a9f20428cff
planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff
changeset
|
19 |
3a9f20428cff
planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff
changeset
|
def get_item_list(api_key, item_list_url):
    """Fetch an item list through a freshly created pyalveo client.

    :param api_key: API key handed to the pyalveo client
    :param item_list_url: URL of the item list to retrieve

    :returns: whatever ``pyalveo.Client.get_item_list`` returns for the URL
    """
    # A non-caching client pointed at API_URL is created for each call.
    alveo = pyalveo.Client(use_cache=False, api_url=API_URL, api_key=api_key)
    fetched = alveo.get_item_list(item_list_url)
    return fetched
3a9f20428cff
planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff
changeset
|
23 |
3a9f20428cff
planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff
changeset
|
# Galaxy works out a dataset's designation and type from its file name,
# so output files follow the pattern "<designation>_<ext>".
FNPAT = "%(designation)s_%(ext)s"


def galaxy_name(fname, ext):
    """Build a file name that Galaxy dataset discovery can interpret.

    :param fname: value used as the dataset designation
    :param ext: value used as the dataset type/extension

    :returns: the two values joined according to FNPAT
    """
    fields = dict(designation=fname, ext=ext)
    return FNPAT % fields
9
2f4907372748
planemo upload commit 7b5663b41b2dc11f9e375b8f386bc31855800bcf-dirty
stevecassidy
parents:
4
diff
changeset
|
33 |
4
3a9f20428cff
planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff
changeset
|
34 import pprint |
3a9f20428cff
planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff
changeset
|
def download_documents(item_list, output_path):
    """
    Writes the primary text of each item in item_list to output_path.

    Every item that has a primary text is saved to its own file, named
    from the item's ``dc:identifier`` metadata via galaxy_name() with a
    ``txt`` extension. Items whose primary text is None are skipped.

    :type item_list: pyalveo.ItemGroup
    :param item_list: the items to download; must provide get_all()

    :type output_path: String
    :param output_path: directory to write the files to (created when it
        does not exist yet)

    :rtype: list of String
    :returns: paths of the files that were written
    """
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    downloaded = []

    for item in item_list.get_all():
        md = item.metadata()
        fname = os.path.join(output_path, galaxy_name(md['alveo:metadata']['dc:identifier'], 'txt'))
        content = item.get_primary_text()
        if content is None:
            # no primary text for this item, nothing to write
            continue

        # NOTE(review): the text may arrive as bytes or as text depending on
        # the Python and pyalveo versions in use -- confirm. Text is encoded
        # explicitly so the write does not depend on the locale encoding,
        # and bytes are written through unchanged.
        if not isinstance(content, bytes):
            content = content.encode('utf-8')
        with open(fname, 'wb') as out:
            out.write(content)

        # record the file so the caller learns what was actually written
        downloaded.append(fname)

    return downloaded
3a9f20428cff
planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff
changeset
|
61 |
9
2f4907372748
planemo upload commit 7b5663b41b2dc11f9e375b8f386bc31855800bcf-dirty
stevecassidy
parents:
4
diff
changeset
|
def read_item_list(filename, client):
    """Read an item list from a file
    which should be a tabular (tab separated) formatted file
    with one column header ItemURL.

    Rows whose ItemURL value is missing or empty are ignored.

    :param filename: path of the tabular file to read
    :param client: pyalveo client passed on to the ItemGroup

    :returns: an instance of pyalveo.ItemGroup holding the item URLs, or
        None when the file has no ItemURL column (this includes an
        empty file)
    """

    with open(filename) as fd:
        csvreader = csv.DictReader(fd, dialect='excel-tab')
        # fieldnames is None when the file has no header row at all, so
        # fall back to an empty tuple to keep the membership test valid.
        if 'ItemURL' not in (csvreader.fieldnames or ()):
            return None
        # Short rows yield None for the missing column; skip those and
        # empty cells rather than passing them on as URLs.
        itemurls = [row['ItemURL'] for row in csvreader if row['ItemURL']]

    return pyalveo.ItemGroup(itemurls, client)
2f4907372748
planemo upload commit 7b5663b41b2dc11f9e375b8f386bc31855800bcf-dirty
stevecassidy
parents:
4
diff
changeset
|
79 |
4
3a9f20428cff
planemo upload commit f36456464c692ed9d39a9cf654d09fe793113cce-dirty
stevecassidy
parents:
diff
changeset
|
def main():
    """Command-line entry point.

    Reads the API key from the file named by ``--api_key``, loads the item
    URLs from the ``--item_list`` file and writes the primary text of each
    item into the ``--output_path`` directory.

    Exits with status 1 when the item list file has no ItemURL column or
    when pyalveo raises an APIError.
    """
    args = parser()
    try:
        # read the key inside a with-block so the file handle is closed
        with open(args.api_key, 'r') as key_file:
            api_key = key_file.read().strip()
        client = pyalveo.Client(api_url=API_URL, api_key=api_key, use_cache=False)
        item_list = read_item_list(args.item_list, client)
        if item_list is None:
            # read_item_list returns None when the ItemURL column is
            # missing; report it instead of failing on None later on.
            print("ERROR: no ItemURL column found in " + args.item_list, file=sys.stderr)
            sys.exit(1)
        download_documents(item_list, args.output_path)
    except pyalveo.APIError as e:
        print("ERROR: " + str(e), file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()