json_data_source.py @ 0:44119c5d8cc6
Initial commit.
author:   Matt Shirley <mdshw5@gmail.com>
date:     Sun, 29 Jun 2014 10:52:24 -0400
parents:  (none)
children: 988f34ef5c9f
#!/usr/bin/env python
import json
import optparse
import urllib
import os.path

CHUNK_SIZE = 2**20  # 1 MB


def chunk_write( source_stream, target_stream, source_method="read", target_method="write" ):
    """ Copy from source_stream to target_stream in CHUNK_SIZE pieces. """
    source_method = getattr( source_stream, source_method )
    target_method = getattr( target_stream, target_method )
    while True:
        chunk = source_method( CHUNK_SIZE )
        if chunk:
            target_method( chunk )
        else:
            break

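# A minimal usage sketch (hypothetical URL and path): stream a remote file to
# disk in 1 MB chunks without loading it all into memory:
#   chunk_write( urllib.urlopen( 'http://example.org/data.wig' ), open( 'out.wig', 'wb' ) )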

def deconstruct_multi_filename( multi_filename ):
    keys = [ 'primary', 'id', 'name', 'visible', 'file_type' ]
    return dict( zip( keys, multi_filename.split('_') ) )


def construct_multi_filename( id, name, file_type ):
    """ Implementation of *Number of Output datasets cannot be determined
    until tool run* from documentation_.

    .. _documentation: http://wiki.galaxyproject.org/Admin/Tools/Multiple%20Output%20Files
    """
    filename = "%s_%s_%s_%s_%s" % ( 'primary', id, name, 'visible', file_type )
    return filename

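# For illustration (hypothetical values): construct_multi_filename( '2', 'output1', 'wig' )
# returns 'primary_2_output1_visible_wig', which deconstruct_multi_filename() maps back to
# {'primary': 'primary', 'id': '2', 'name': 'output1', 'visible': 'visible', 'file_type': 'wig'},
# provided the name itself contains no underscores.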

def download_from_query( query_data, target_output_filename ):
    """ Download the file described by the JSON query data and write it to
    target_output_filename.
    """
    query_url = query_data.get( 'url' )
    query_file_type = query_data.get( 'extension' )
    query_stream = urllib.urlopen( query_url )
    output_stream = open( target_output_filename, 'wb' )
    chunk_write( query_stream, output_stream )
    query_stream.close()
    output_stream.close()


def download_extra_data( query_ext_data, base_path ):
    """ Download any extra files listed under 'extra_data' to paths relative to base_path. """
    for ext_data in query_ext_data:
        query_stream = urllib.urlopen( ext_data.get( 'url' ) )
        output_stream = open( os.path.normpath( '/'.join( [ base_path, ext_data.get( 'path' ) ] ) ), 'wb' )
        chunk_write( query_stream, output_stream )
        query_stream.close()
        output_stream.close()


def download_from_json_data( options, args ):
    """ Parse the returned JSON data and download files. Write metadata
    to a flat JSON file.
    """
    json_params = json.loads( open( options.json_param_file, 'r' ).read() )
    datasource_params = json_params.get( 'param_dict' )
    dataset_id = base_dataset_id = json_params['output_data'][0]['dataset_id']
    hda_id = json_params['output_data'][0]['hda_id']
    dataset_url = json_params['url']

    url_param = datasource_params.get( 'URL', None )  # 'URL' assumed as the request parameter name
    output_filename = datasource_params.get( "output1", None )
    output_base_path = options.path

    # get JSON response from data source
    # TODO: make sure response is not enormous
    query_params = json.loads( urllib.urlopen( dataset_url ).read() )
    metadata_to_write = []
    # download and write files
    for query_item in query_params:
        if isinstance( query_item, list ):
            # do something with the nested list as a collection
            for query_subitem in query_item:
                multi_name = construct_multi_filename( hda_id, output_filename, query_subitem.get( 'extension' ) )
                target_output_filename = os.path.normpath( '/'.join( [ output_base_path, multi_name ] ) )
                download_from_query( query_subitem, target_output_filename )
                if query_subitem.get( 'extra_data' ):
                    download_extra_data( query_subitem.get( 'extra_data' ), output_base_path )
                metadata_to_write.append( query_subitem )

        elif isinstance( query_item, dict ):
            # what is the difference between hda_id and dataset_id?
            multi_name = construct_multi_filename( hda_id, output_filename, query_item.get( 'extension' ) )
            target_output_filename = os.path.normpath( '/'.join( [ output_base_path, multi_name ] ) )
            download_from_query( query_item, target_output_filename )
            if query_item.get( 'extra_data' ):
                download_extra_data( query_item.get( 'extra_data' ), output_base_path )
            metadata_to_write.append( query_item )

    with open( json_params['job_config']['TOOL_PROVIDED_JOB_METADATA_FILE'], 'wb' ) as metadata_parameter_file:
        # write JSON metadata from flattened list
        metadata_parameter_file.write( json.dumps( metadata_to_write ) )


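# For reference, download_from_json_data() assumes a JSON parameter file shaped
# roughly like this: a sketch reconstructed from the keys the code reads, with
# hypothetical values.
#   { "param_dict": { "URL": "http://data_source/query", "output1": "output1" },
#     "output_data": [ { "dataset_id": 1, "hda_id": 1 } ],
#     "url": "http://data_source/query",
#     "job_config": { "TOOL_PROVIDED_JOB_METADATA_FILE": "galaxy.json" } }
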
def __main__():
    """ Read the JSON return from a data source. Parse each line and request
    the data, download to "newfilepath", and write metadata.

    Schema
    ------

    [ { "url": "http://url_of_file",
        "name": "encode WigData",
        "extension": "wig",
        "metadata": { "db_key": "hg19" },
        "extra_data": [ { "url": "http://url_of_ext_file",
                          "path": "rel/path/to/ext_file" }
                      ]
      }
    ]

    """
    # Parse the command line options
    usage = "Usage: json_data_source.py max_size --json_param_file filename [options]"
    parser = optparse.OptionParser( usage=usage )
    parser.add_option("-j", "--json_param_file", type="string",
                      action="store", dest="json_param_file", help="json schema return data")
    parser.add_option("-p", "--path", type="string",
                      action="store", dest="path", help="new file path")

    (options, args) = parser.parse_args()
    download_from_json_data( options, args )


if __name__ == "__main__": __main__()
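
As a usage sketch (paths hypothetical), Galaxy would invoke the script with the
parameter file it generates for the job:

    python json_data_source.py --json_param_file json_params.txt --path /tmp/job_output

The parameter file should follow the shape sketched before __main__(), and the
data source response fetched from its 'url' should match the schema in the
__main__() docstring.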