json_data_source: json_data_source.py annotate

annotate json_data_source.py @ 0:44119c5d8cc6

Initial commit.

author	Matt Shirley <mdshw5@gmail.com>
date	Sun, 29 Jun 2014 10:52:24 -0400
parents
children	988f34ef5c9f

rev	line source
0 44119c5d8cc6 Initial commit. Matt Shirley <mdshw5@gmail.com> parents: diff changeset	1 #!/usr/bin/env python
44119c5d8cc6 Initial commit. Matt Shirley <mdshw5@gmail.com> parents: diff changeset	2 import json
44119c5d8cc6 Initial commit. Matt Shirley <mdshw5@gmail.com> parents: diff changeset	3 import optparse
44119c5d8cc6 Initial commit. Matt Shirley <mdshw5@gmail.com> parents: diff changeset	4 import urllib
44119c5d8cc6 Initial commit. Matt Shirley <mdshw5@gmail.com> parents: diff changeset	5 import os.path
44119c5d8cc6 Initial commit. Matt Shirley <mdshw5@gmail.com> parents: diff changeset	6
CHUNK_SIZE = 2**20  # 1 MiB (1048576); size argument passed to each read call in chunk_write
44119c5d8cc6 Initial commit. Matt Shirley <mdshw5@gmail.com> parents: diff changeset	8
44119c5d8cc6 Initial commit. Matt Shirley <mdshw5@gmail.com> parents: diff changeset	9
def chunk_write(source_stream, target_stream, source_method="read", target_method="write"):
    """Copy everything from source_stream to target_stream, one chunk at a time.

    source_method and target_method are the *names* of the methods to look up
    on the respective streams. The reader is called with CHUNK_SIZE as its
    only argument and the writer with each chunk it returned. Copying stops
    at the first falsy chunk (for example an empty string at end of stream).
    """
    read_chunk = getattr(source_stream, source_method)
    write_chunk = getattr(target_stream, target_method)
    chunk = read_chunk(CHUNK_SIZE)
    while chunk:
        write_chunk(chunk)
        chunk = read_chunk(CHUNK_SIZE)
44119c5d8cc6 Initial commit. Matt Shirley <mdshw5@gmail.com> parents: diff changeset	19
44119c5d8cc6 Initial commit. Matt Shirley <mdshw5@gmail.com> parents: diff changeset	20
def deconstruct_multi_filename(multi_filename):
    """Split a multi-output filename on '_' and label the pieces.

    The pieces are paired, in order, with the keys 'primary', 'id', 'name',
    'visible' and 'file_type'. Pairing stops at the shorter of the two
    sequences, so surplus pieces are dropped and missing pieces simply leave
    their keys out of the returned dict.
    """
    field_names = ('primary', 'id', 'name', 'visible', 'file_type')
    pieces = multi_filename.split('_')
    fields = {}
    for field_name, piece in zip(field_names, pieces):
        fields[field_name] = piece
    return fields
44119c5d8cc6 Initial commit. Matt Shirley <mdshw5@gmail.com> parents: diff changeset	24
44119c5d8cc6 Initial commit. Matt Shirley <mdshw5@gmail.com> parents: diff changeset	25
def construct_multi_filename(id, name, file_type):
    """Build a 'primary_<id>_<name>_visible_<file_type>' output filename.

    This follows the naming scheme described under "Number of Output datasets
    cannot be determined until tool run" in the Galaxy documentation_.

    .. _documentation: http://wiki.galaxyproject.org/Admin/Tools/Multiple%20Output%20Files
    """
    parts = ('primary', id, name, 'visible', file_type)
    return "%s_%s_%s_%s_%s" % parts
44119c5d8cc6 Initial commit. Matt Shirley <mdshw5@gmail.com> parents: diff changeset	32
44119c5d8cc6 Initial commit. Matt Shirley <mdshw5@gmail.com> parents: diff changeset	33
def download_from_query(query_data, target_output_filename):
    """Download the file described by one query item and save it locally.

    Args:
        query_data: mapping for a single item of the data source's JSON
            response; only its 'url' entry is read here.
        target_output_filename: path of the local file to create or
            overwrite; it is opened in binary mode.

    Both the remote stream and the output file are closed even when the
    transfer fails part-way, so a failed download does not leak handles.
    """
    query_stream = urllib.urlopen(query_data.get('url'))
    try:
        with open(target_output_filename, 'wb') as output_stream:
            chunk_write(query_stream, output_stream)
    finally:
        query_stream.close()
44119c5d8cc6 Initial commit. Matt Shirley <mdshw5@gmail.com> parents: diff changeset	44
44119c5d8cc6 Initial commit. Matt Shirley <mdshw5@gmail.com> parents: diff changeset	45
def download_extra_data(query_ext_data, base_path):
    """Download every extra-data entry of a query item below base_path.

    Args:
        query_ext_data: iterable of mappings, each providing a 'url' to fetch
            and a 'path' (joined onto base_path) to write the content to.
        base_path: directory that the 'path' values are joined onto.

    Missing parent directories of a target file are created first, and both
    streams are closed even if a transfer fails.
    """
    for ext_data in query_ext_data:
        # NOTE(review): 'path' is taken from the data source's response and
        # normpath() does not stop it from pointing outside base_path (e.g.
        # via '..'); consider validating it before trusting remote servers.
        target_path = os.path.normpath('/'.join([base_path, ext_data.get('path')]))
        target_dir = os.path.dirname(target_path)
        if target_dir and not os.path.isdir(target_dir):
            os.makedirs(target_dir)
        query_stream = urllib.urlopen(ext_data.get('url'))
        try:
            with open(target_path, 'wb') as output_stream:
                chunk_write(query_stream, output_stream)
        finally:
            query_stream.close()
44119c5d8cc6 Initial commit. Matt Shirley <mdshw5@gmail.com> parents: diff changeset	53
44119c5d8cc6 Initial commit. Matt Shirley <mdshw5@gmail.com> parents: diff changeset	54
def _download_query_item(query_item, hda_id, output_filename, output_base_path):
    """Download one query item, plus any 'extra_data' it lists, into output_base_path."""
    # TODO: what is the difference between hda_id and dataset_id?
    # NOTE(review): the file name depends only on hda_id, output_filename and
    # the item's extension, so two items sharing an extension are written to
    # the same path and the later one overwrites the earlier one.
    multi_name = construct_multi_filename(hda_id, output_filename, query_item.get('extension'))
    target_output_filename = os.path.normpath('/'.join([output_base_path, multi_name]))
    download_from_query(query_item, target_output_filename)
    extra_data = query_item.get('extra_data')
    if extra_data:
        download_extra_data(extra_data, output_base_path)


def download_from_json_data(options, args):
    """Parse the returned JSON data and download files. Write metadata
    to flat JSON file.

    Args:
        options: parsed command-line options. ``json_param_file`` names the
            JSON parameter file; the output directory is read from
            ``newfilepath`` (the dest of ``--path``), falling back to a
            ``path`` attribute if one is present.
        args: positional command-line arguments (currently unused).

    Raises:
        ValueError: if no output directory was supplied.

    The parameter file must provide 'url', 'output_data' (whose first entry
    has an 'hda_id'), 'param_dict' and
    job_config['TOOL_PROVIDED_JOB_METADATA_FILE']. The response fetched from
    'url' is iterated item by item: a dict is downloaded directly, a nested
    list has each of its entries downloaded, anything else is ignored.
    """
    with open(options.json_param_file, 'r') as json_param_file:
        json_params = json.loads(json_param_file.read())
    datasource_params = json_params.get('param_dict')
    hda_id = json_params['output_data'][0]['hda_id']
    dataset_url = json_params['url']

    output_filename = datasource_params.get("output1", None)
    # The option parser stores --path under dest "newfilepath"; also accept a
    # plain "path" attribute for callers that build the options themselves.
    output_base_path = getattr(options, 'newfilepath', None)
    if output_base_path is None:
        output_base_path = getattr(options, 'path', None)
    if output_base_path is None:
        raise ValueError("no output path given (use --path)")

    # get JSON response from data source
    # TODO: make sure response is not enormous
    query_stream = urllib.urlopen(dataset_url)
    try:
        query_params = json.loads(query_stream.read())
    finally:
        query_stream.close()

    metadata_to_write = []
    # download and write files
    for query_item in query_params:
        if isinstance(query_item, list):
            # a nested list is flattened: each entry is handled on its own
            items = query_item
        elif isinstance(query_item, dict):
            items = [query_item]
        else:
            continue
        for item in items:
            _download_query_item(item, hda_id, output_filename, output_base_path)
            metadata_to_write.append(item)

    with open(json_params['job_config']['TOOL_PROVIDED_JOB_METADATA_FILE'], 'wb') as metadata_parameter_file:
        # write JSON metadata from flattened list
        metadata_parameter_file.write(json.dumps(metadata_to_write))
44119c5d8cc6 Initial commit. Matt Shirley <mdshw5@gmail.com> parents: diff changeset	97
44119c5d8cc6 Initial commit. Matt Shirley <mdshw5@gmail.com> parents: diff changeset	98
def __main__():
    """ Read the JSON return from a data source. Parse each line and request
    the data, download to "newfilepath", and write metadata.

    Both --json_param_file and --path must be given; if either is missing
    the parser prints the usage message and exits instead of failing later
    with an unrelated error.

    Schema
    ------

        [ {"url":"http://url_of_file",
           "name":"encode WigData",
           "extension":"wig",
           "metadata":{"db_key":"hg19"},
           "extra_data":[ {"url":"http://url_of_ext_file",
                           "path":"rel/path/to/ext_file"}
                        ]
          }
        ]

    """
    # Parse the command line options
    usage = "Usage: json_data_source.py max_size --json_param_file filename [options]"
    parser = optparse.OptionParser(usage=usage)
    parser.add_option("-j", "--json_param_file", type="string",
                      action="store", dest="json_param_file", help="json schema return data")
    parser.add_option("-p", "--path", type="string",
                      action="store", dest="newfilepath", help="new file path")

    (options, args) = parser.parse_args()
    # parser.error() prints usage plus the message and exits with status 2.
    if not options.json_param_file:
        parser.error("--json_param_file is required")
    if not options.newfilepath:
        parser.error("--path is required")
    download_from_json_data(options, args)
44119c5d8cc6 Initial commit. Matt Shirley <mdshw5@gmail.com> parents: diff changeset	127
44119c5d8cc6 Initial commit. Matt Shirley <mdshw5@gmail.com> parents: diff changeset	128
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    __main__()

Mercurial > repos > matt-shirley > json_data_source

annotate json_data_source.py @ 0:44119c5d8cc6