# HG changeset patch
# User Matt Shirley
# Date 1404053544 14400
# Node ID 44119c5d8cc6785ea80cc25f73cbf67c1f115882
Initial commit.

diff -r 000000000000 -r 44119c5d8cc6 json_data_source.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/json_data_source.py	Sun Jun 29 10:52:24 2014 -0400
@@ -0,0 +1,129 @@
+#!/usr/bin/env python
+import json
+import optparse
+import urllib
+import os.path
+
+CHUNK_SIZE = 2**20  # 1 MB
+
+
+def chunk_write( source_stream, target_stream, source_method="read", target_method="write" ):
+    source_method = getattr( source_stream, source_method )
+    target_method = getattr( target_stream, target_method )
+    while True:
+        chunk = source_method( CHUNK_SIZE )
+        if chunk:
+            target_method( chunk )
+        else:
+            break
+
+
+def deconstruct_multi_filename( multi_filename ):
+    keys = [ 'primary', 'id', 'name', 'visible', 'file_type' ]
+    return dict( zip( keys, multi_filename.split('_') ) )
+
+
+def construct_multi_filename( id, name, file_type ):
+    """ Implementation of *Number of Output datasets cannot be determined until tool run* from documentation_.
+
+    .. _documentation: http://wiki.galaxyproject.org/Admin/Tools/Multiple%20Output%20Files
+    """
+    filename = "%s_%s_%s_%s_%s" % ( 'primary', id, name, 'visible', file_type )
+    return filename
+
+
+def download_from_query( query_data, target_output_filename ):
+    """ Download the file described by the JSON query data and write it to target_output_filename.
+    """
+    query_url = query_data.get( 'url' )
+    query_file_type = query_data.get( 'extension' )
+    query_stream = urllib.urlopen( query_url )
+    output_stream = open( target_output_filename, 'wb' )
+    chunk_write( query_stream, output_stream )
+    query_stream.close()
+    output_stream.close()
+
+
+def download_extra_data( query_ext_data, base_path ):
+    """ Download each extra_data file and write it under base_path at its relative path.
+    """
+    for ext_data in query_ext_data:
+        query_stream = urllib.urlopen( ext_data.get( 'url' ) )
+        output_stream = open( os.path.normpath( '/'.join( [ base_path, ext_data.get( 'path' ) ] ) ), 'wb' )
+        chunk_write( query_stream, output_stream )
+        query_stream.close()
+        output_stream.close()
+
+
+def download_from_json_data( options, args ):
+    """ Parse the returned JSON data and download files. Write metadata
+    to a flat JSON file.
+    """
+    json_params = json.loads( open( options.json_param_file, 'r' ).read() )
+    datasource_params = json_params.get( 'param_dict' )
+    dataset_id = base_dataset_id = json_params['output_data'][0]['dataset_id']
+    hda_id = json_params['output_data'][0]['hda_id']
+    dataset_url = json_params['url']
+
+    # assumed name of the data source's URL parameter in the Galaxy param_dict
+    file_url_name = 'URL'
+    url_param = datasource_params.get( file_url_name, None )
+    output_filename = datasource_params.get( "output1", None )
+    output_base_path = options.newfilepath
+
+    # get JSON response from data source
+    # TODO: make sure response is not enormous
+    query_params = json.loads( urllib.urlopen( dataset_url ).read() )
+    metadata_to_write = []
+    # download and write files
+    for query_item in query_params:
+        if isinstance( query_item, list ):
+            # treat the nested list as a collection: download each entry individually
+            for query_subitem in query_item:
+                extension = query_subitem.get( 'extension' )
+                multi_name = construct_multi_filename( hda_id, output_filename, extension )
+                target_output_filename = os.path.normpath( '/'.join( [ output_base_path, multi_name ] ) )
+                download_from_query( query_subitem, target_output_filename )
+                if query_subitem.get( 'extra_data' ):
+                    download_extra_data( query_subitem.get( 'extra_data' ), output_base_path )
+                metadata_to_write.append( query_subitem )
+
+        elif isinstance( query_item, dict ):
+            # what is the difference between hda_id and dataset_id?
+            extension = query_item.get( 'extension' )
+            multi_name = construct_multi_filename( hda_id, output_filename, extension )
+            target_output_filename = os.path.normpath( '/'.join( [ output_base_path, multi_name ] ) )
+            download_from_query( query_item, target_output_filename )
+            if query_item.get( 'extra_data' ):
+                download_extra_data( query_item.get( 'extra_data' ), output_base_path )
+            metadata_to_write.append( query_item )
+
+    with open( json_params['job_config']['TOOL_PROVIDED_JOB_METADATA_FILE'], 'wb' ) as metadata_parameter_file:
+        # write JSON metadata from the flattened list
+        metadata_parameter_file.write( json.dumps( metadata_to_write ) )
+
+
+def __main__():
+    """ Read the JSON return from a data source. Parse each line and request
+    the data, download to "newfilepath", and write metadata.
+
+    Schema
+    ------
+
+        [ {"url":"http://url_of_file",
+           "name":"encode WigData",
+           "extension":"wig",
+           "metadata":{"db_key":"hg19"},
+           "extra_data":[ {"url":"http://url_of_ext_file",
+                           "path":"rel/path/to/ext_file"}
+                        ]
+          }
+        ]
+
+    """
+    # Parse the command line options
+    usage = "Usage: json_data_source.py max_size --json_param_file filename [options]"
+    parser = optparse.OptionParser( usage=usage )
+    parser.add_option("-j", "--json_param_file", type="string",
+                      action="store", dest="json_param_file", help="json schema return data")
+    parser.add_option("-p", "--path", type="string",
+                      action="store", dest="newfilepath", help="new file path")
+
+    (options, args) = parser.parse_args()
+    download_from_json_data( options, args )
+
+
+if __name__ == "__main__": __main__()
diff -r 000000000000 -r 44119c5d8cc6 json_data_source.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/json_data_source.xml	Sun Jun 29 10:52:24 2014 -0400
@@ -0,0 +1,13 @@
+
+
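
A quick illustration of the multi-output filename convention implemented by construct_multi_filename() and deconstruct_multi_filename() above; the id, name, and file type used here are arbitrary example values:

    # round-trip of the "primary_<id>_<name>_visible_<file_type>" pattern
    name = construct_multi_filename( 42, 'results', 'wig' )
    # name == 'primary_42_results_visible_wig'
    fields = deconstruct_multi_filename( name )
    # fields == {'primary': 'primary', 'id': '42', 'name': 'results',
    #            'visible': 'visible', 'file_type': 'wig'}

Because deconstruct_multi_filename() splits on every underscore, a dataset name that itself contains underscores would not round-trip cleanly; that limitation is inherent to the convention.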
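For context, a minimal sketch of driving the script by hand: it writes a hypothetical json_param_file containing the keys download_from_json_data() reads (param_dict, output_data, url, job_config) and then invokes the tool. Every URL, id, and filename below is a made-up placeholder, the 'URL' key mirrors the assumption noted in the code, and in normal use Galaxy generates this parameter file itself when the data source tool runs.

    # illustrative only: all values are placeholders, nothing here is fixed by this commit
    import json
    import subprocess

    params = {
        'param_dict': { 'URL': 'http://example.org/datasource', 'output1': 'results' },
        'output_data': [ { 'dataset_id': 1, 'hda_id': 1 } ],
        # must return a JSON list matching the schema in __main__'s docstring
        'url': 'http://example.org/datasource/return.json',
        'job_config': { 'TOOL_PROVIDED_JOB_METADATA_FILE': 'galaxy.json' },
    }
    with open( 'json_params.json', 'w' ) as handle:
        handle.write( json.dumps( params ) )

    # downloads land in the current directory; metadata is written to galaxy.json
    subprocess.call( [ 'python', 'json_data_source.py',
                       '--json_param_file', 'json_params.json',
                       '--path', '.' ] )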