annotate data_manager/data_manager_plant_tribes_scaffolds_download.py @ 2:b3fc4b35e3f5 draft

Uploaded
author iuc
date Tue, 14 Feb 2017 13:50:55 -0500
parents f5e3438468c7
children fe92a529ed01
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
b418349edb0d Uploaded
iuc
parents:
diff changeset
1 #!/usr/bin/env python
b418349edb0d Uploaded
iuc
parents:
diff changeset
2 #
b418349edb0d Uploaded
iuc
parents:
diff changeset
3 # Data manager for downloading Plant Tribes scaffolds data.
b418349edb0d Uploaded
iuc
parents:
diff changeset
4 import argparse
b418349edb0d Uploaded
iuc
parents:
diff changeset
5 import json
b418349edb0d Uploaded
iuc
parents:
diff changeset
6 import os
b418349edb0d Uploaded
iuc
parents:
diff changeset
7 import shutil
b418349edb0d Uploaded
iuc
parents:
diff changeset
8 import sys
b418349edb0d Uploaded
iuc
parents:
diff changeset
9 import tarfile
b418349edb0d Uploaded
iuc
parents:
diff changeset
10 import urllib2
b418349edb0d Uploaded
iuc
parents:
diff changeset
11 import zipfile
b418349edb0d Uploaded
iuc
parents:
diff changeset
12
b418349edb0d Uploaded
iuc
parents:
diff changeset
13
b418349edb0d Uploaded
iuc
parents:
diff changeset
# Data table names used by download() when the caller does not supply any.
DEFAULT_DATA_TABLE_NAMES = ["plant_tribes_scaffolds"]
b418349edb0d Uploaded
iuc
parents:
diff changeset
15
b418349edb0d Uploaded
iuc
parents:
diff changeset
16
b418349edb0d Uploaded
iuc
parents:
diff changeset
def add_data_table_entry(data_manager_dict, data_table_name, data_table_entry):
    """Append ``data_table_entry`` to the named table inside ``data_manager_dict``.

    The dictionary is modified in place: a ``'data_tables'`` mapping and the
    list for ``data_table_name`` are created on first use.

    Returns the same ``data_manager_dict`` object that was passed in.
    """
    data_tables = data_manager_dict.setdefault('data_tables', {})
    data_tables.setdefault(data_table_name, []).append(data_table_entry)
    return data_manager_dict
b418349edb0d Uploaded
iuc
parents:
diff changeset
22
b418349edb0d Uploaded
iuc
parents:
diff changeset
23
b418349edb0d Uploaded
iuc
parents:
diff changeset
def make_directory(dir):
    """Create ``dir`` (including missing parents) unless the path already exists.

    Returns ``dir`` so the call can be used in an assignment such as
    ``target = make_directory(path)``; previously the function returned
    ``None``, which left such callers holding ``None`` instead of a path.
    """
    if not os.path.exists(dir):
        os.makedirs(dir)
    return dir
b418349edb0d Uploaded
iuc
parents:
diff changeset
27
b418349edb0d Uploaded
iuc
parents:
diff changeset
28
b418349edb0d Uploaded
iuc
parents:
diff changeset
def remove_directory(dir):
    """Recursively delete ``dir`` if the path exists; do nothing otherwise."""
    if os.path.exists(dir):
        shutil.rmtree(dir)
b418349edb0d Uploaded
iuc
parents:
diff changeset
32
b418349edb0d Uploaded
iuc
parents:
diff changeset
33
1
f5e3438468c7 Uploaded
iuc
parents: 0
diff changeset
def extract_archive(file_path, work_directory):
    """Extract a tar (any compression) or zip archive into ``work_directory``.

    If ``file_path`` is neither a tar nor a zip archive the function returns
    without doing anything.  The archive handle is always closed, even when
    extraction fails.
    """
    if tarfile.is_tarfile(file_path):
        archive = tarfile.open(file_path, 'r:*')
    elif zipfile.is_zipfile(file_path):
        archive = zipfile.ZipFile(file_path, 'r')
    else:
        # Not a recognised archive format: leave the file as it is.
        return
    try:
        # NOTE(review): extractall() writes wherever the member names point.
        # The archive is downloaded from a caller-supplied URL, so confirm the
        # source is trusted or validate member paths before extraction.
        archive.extractall(work_directory)
    finally:
        archive.close()
f5e3438468c7 Uploaded
iuc
parents: 0
diff changeset
42
f5e3438468c7 Uploaded
iuc
parents: 0
diff changeset
43
f5e3438468c7 Uploaded
iuc
parents: 0
diff changeset
def move_files(work_directory, target_directory):
    """Move every entry found in ``work_directory`` into ``target_directory``.

    Both files and sub-directories are moved; ``work_directory`` itself is
    left in place (empty).
    """
    for filename in os.listdir(work_directory):
        shutil.move(os.path.join(work_directory, filename), target_directory)
f5e3438468c7 Uploaded
iuc
parents: 0
diff changeset
48
f5e3438468c7 Uploaded
iuc
parents: 0
diff changeset
49
f5e3438468c7 Uploaded
iuc
parents: 0
diff changeset
def url_download(url, work_directory):
    """Download ``url`` into ``work_directory`` and return the local file path.

    The local file is named after the last path component of ``url``.

    Any error raised while opening the URL or writing the file is reported on
    stderr and then re-raised, so the caller never receives a path to a file
    that is missing or only partially written.
    """
    # Imported locally so the function runs on Python 2 (urllib2) as well as
    # Python 3 (urllib.request) without touching the file-level imports.
    try:
        from urllib2 import Request, urlopen
    except ImportError:
        from urllib.request import Request, urlopen

    file_path = os.path.join(work_directory, os.path.basename(url))
    src = None
    try:
        src = urlopen(Request(url))
        with open(file_path, 'wb') as dst:
            shutil.copyfileobj(src, dst)
    except Exception as e:
        sys.stderr.write('%s\n' % str(e))
        raise
    finally:
        if src is not None:
            src.close()
    return file_path
f5e3438468c7 Uploaded
iuc
parents: 0
diff changeset
72
f5e3438468c7 Uploaded
iuc
parents: 0
diff changeset
73
f5e3438468c7 Uploaded
iuc
parents: 0
diff changeset
def download(target_file_path, web_url, config_web_url, description, data_table_names=DEFAULT_DATA_TABLE_NAMES):
    """Download scaffolds data and default configs and build the data table dict.

    Parameters:
        target_file_path: directory that receives the extracted files.
        web_url: URL of the scaffolds archive.
        config_web_url: URL of the default configuration archive.
        description: text stored in the entry's ``description`` field.
        data_table_names: names of the data tables the entry is added to.

    Returns a dictionary of the form ``{'data_tables': {name: [entry]}}`` where
    ``entry`` holds ``value``, ``name``, ``path``, ``description`` and
    ``config_path``.  When a directory listing has several items, the last one
    listed determines the corresponding fields.
    """
    data_manager_dict = {}
    data_table_entry = {}
    # Download the scaffolds data.
    work_directory = os.path.abspath(os.path.join(os.getcwd(), 'scaffolds'))
    make_directory(work_directory)
    file_path = url_download(web_url, work_directory)
    extract_archive(file_path, work_directory)
    os.remove(file_path)
    # Move the scaffolds data files into the defined output directory.
    # The path is used directly rather than make_directory()'s return value,
    # which is not guaranteed to be the directory path.
    make_directory(target_file_path)
    target_directory = target_file_path
    move_files(work_directory, target_directory)
    remove_directory(work_directory)
    # Populate the data table entry with the scaffolds data.
    for listed_name in os.listdir(target_directory):
        full_path = os.path.abspath(os.path.join(target_directory, listed_name))
        entry_name = "%s" % os.path.basename(listed_name)
        data_table_entry['value'] = entry_name
        data_table_entry['name'] = entry_name
        data_table_entry['path'] = full_path
        data_table_entry['description'] = description
    # Download the default configuration files.
    work_directory = os.path.abspath(os.path.join(os.getcwd(), 'configs'))
    make_directory(work_directory)
    file_path = url_download(config_web_url, work_directory)
    extract_archive(file_path, work_directory)
    os.remove(file_path)
    # NOTE(review): this deletes the scaffolds files moved in above, although
    # data_table_entry['path'] still refers to them.  Kept as written because
    # the intended on-disk layout is not visible here; confirm before relying
    # on 'path'.
    shutil.rmtree(target_directory)
    # Move the default configuration files into the defined output directory.
    make_directory(target_file_path)
    target_directory = target_file_path
    move_files(work_directory, target_directory)
    remove_directory(work_directory)
    # Populate the data table entry with the default configs location.
    for listed_name in os.listdir(target_directory):
        full_path = os.path.abspath(os.path.join(target_directory, listed_name))
        data_table_entry['config_path'] = full_path
    # Populate the data_manager_dict.
    for data_table_name in data_table_names:
        data_manager_dict = add_data_table_entry(data_manager_dict, data_table_name, data_table_entry)
    return data_manager_dict
b418349edb0d Uploaded
iuc
parents:
diff changeset
114
b418349edb0d Uploaded
iuc
parents:
diff changeset
115
b418349edb0d Uploaded
iuc
parents:
diff changeset
# Command-line interface of the data manager script.
parser = argparse.ArgumentParser()
parser.add_argument('--description', dest='description', default=None, help='Description')
parser.add_argument('--name', dest='name', help='Data table entry unique ID')
parser.add_argument('--out_file', dest='out_file', help='JSON output file')
parser.add_argument('--web_url', dest='web_url', help='URL for downloading scaffolds')
parser.add_argument('--config_web_url', dest='config_web_url', help='URL for downloading default configs')

args = parser.parse_args()

# Some magic happens with tools of type "manage_data" in that the output
# file contains some JSON data that allows us to define the target directory.
with open(args.out_file) as fh:
    params = json.load(fh)
target_file_path = params['output_data'][0]['extra_files_path']

# A missing description becomes an empty string; otherwise trim whitespace.
description = '' if args.description is None else args.description.strip()

# Get the scaffolds data.
data_manager_dict = download(target_file_path, args.web_url, args.config_web_url, description)
# Write the JSON output dataset.  Text mode is used because json.dumps()
# returns text, which cannot be written to a binary-mode file on Python 3.
with open(args.out_file, 'w') as fh:
    fh.write(json.dumps(data_manager_dict))