annotate json_data_source.py @ 4:96103d66b7af

Properly handle extra data paths, write complete line separated JSON for Galaxy to scoop up and set metadata.
author Matt Shirley <mdshw5@gmail.com>
date Wed, 02 Jul 2014 09:33:03 -0400
parents 988f34ef5c9f
children 33fa019735a4
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
1 #!/usr/bin/env python
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
2 import json
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
3 import optparse
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
4 import urllib
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
5 import os.path
4
96103d66b7af Properly handle extra data paths, write complete line separated JSON for Galaxy to scoop up and set metadata.
Matt Shirley <mdshw5@gmail.com>
parents: 3
diff changeset
6 import os
96103d66b7af Properly handle extra data paths, write complete line separated JSON for Galaxy to scoop up and set metadata.
Matt Shirley <mdshw5@gmail.com>
parents: 3
diff changeset
7 from operator import itemgetter
0
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
8
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
# Number of bytes requested from the source per read when streaming (1 MiB).
CHUNK_SIZE = 1 << 20
# Characters kept verbatim when a URL is turned into a dataset name; note that
# '_' is not among them.
VALID_CHARS = '.-()[]0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ '
0
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
11
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
12
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
def chunk_write( source_stream, target_stream, source_method = "read", target_method="write" ):
    """ Copy data from source_stream to target_stream in CHUNK_SIZE pieces.

    source_method and target_method are the *names* of the attributes to
    call on the respective streams; the source callable is invoked with
    CHUNK_SIZE and the target callable with each chunk it returned. Copying
    stops at the first falsy chunk. Neither stream is closed here.
    """
    pull = getattr( source_stream, source_method )
    push = getattr( target_stream, target_method )
    piece = pull( CHUNK_SIZE )
    while piece:
        push( piece )
        piece = pull( CHUNK_SIZE )
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
22
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
23
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
def deconstruct_multi_filename( multi_filename ):
    """ Split an underscore-delimited multi-output filename into a dict.

    The pieces are assigned, in order, to the keys 'primary', 'id', 'name',
    'visible' and 'file_type'. Surplus pieces are dropped and, when there
    are fewer than five pieces, the trailing keys are simply absent.
    """
    field_names = ( 'primary', 'id', 'name', 'visible', 'file_type' )
    pieces = multi_filename.split( '_' )
    parsed = {}
    for field, piece in zip( field_names, pieces ):
        parsed[ field ] = piece
    return parsed
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
27
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
28
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
def construct_multi_filename( id, name, file_type ):
    """ Build the filename Galaxy expects for an output discovered at run time.

    Implementation of *Number of Output datasets cannot be determined until tool run* from documentation_.
    .. _documentation: http://wiki.galaxyproject.org/Admin/Tools/Multiple%20Output%20Files

    The result has the form ``primary_<id>_<name>_visible_<file_type>``.
    """
    # Fixed words and placeholders are joined with the same '_' separator.
    template = '_'.join( [ 'primary', '%s', '%s', 'visible', '%s' ] )
    return template % ( id, name, file_type )
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
35
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
36
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
def download_from_query( query_data, target_output_filename ):
    """ Download file from the json data and write it to target_output_filename.

    query_data is a mapping whose 'url' entry names the resource to fetch.
    The response is streamed to disk with chunk_write, and both the
    response and the output file are closed even when the copy fails.
    """
    query_url = query_data.get( 'url' )
    query_stream = urllib.urlopen( query_url )
    try:
        # 'wb': the payload is copied byte-for-byte, whatever its type.
        with open( target_output_filename, 'wb' ) as output_stream:
            chunk_write( query_stream, output_stream )
    finally:
        query_stream.close()
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
47
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
48
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
def download_extra_data( query_ext_data, base_path ):
    """ Download any extra data defined in the JSON.

    query_ext_data is an iterable of mappings with a 'url' to fetch and a
    'path' giving the destination relative to base_path.

    NOTE: the "path" value comes from the remote JSON response, so it is
    checked before anything is fetched or written: a path that resolves
    outside base_path (for example via "..") raises ValueError. A leading
    "/" is not an escape because the path is joined textually onto
    base_path.

    Missing directories (including base_path itself) are created on
    demand; directories that already exist are reused, so several files
    may share a directory and a path with no directory part is allowed.
    """
    # Trailing separator so "/data/x_files2" is not taken for a child of
    # "/data/x_files"; os.path.join( ..., '' ) also copes with a root base.
    base_prefix = os.path.join( os.path.abspath( base_path ), '' )
    for ext_data in query_ext_data:
        ext_path = ext_data.get( 'path' )
        target_path = os.path.normpath( '/'.join( [ base_path, ext_path ] ) )
        if not os.path.abspath( target_path ).startswith( base_prefix ):
            raise ValueError( "extra_data path %r is outside of %r" % ( ext_path, base_path ) )
        target_dir = os.path.dirname( target_path )
        # os.makedirs raises if the leaf directory is already there.
        if not os.path.isdir( target_dir ):
            os.makedirs( target_dir )
        query_stream = urllib.urlopen( ext_data.get( 'url' ) )
        try:
            with open( target_path, 'wb' ) as output_stream:
                chunk_write( query_stream, output_stream )
        finally:
            query_stream.close()
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
66
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
67
4
96103d66b7af Properly handle extra data paths, write complete line separated JSON for Galaxy to scoop up and set metadata.
Matt Shirley <mdshw5@gmail.com>
parents: 3
diff changeset
def metadata_to_json( dataset_id, metadata, filename, ds_type='dataset', primary=False):
    """ Serialise one dataset description as a single JSON line.

    The record carries 'type', 'ext', 'filename', 'name' and 'metadata'
    (the latter three 'ext'/'name'/'metadata' values are read from the
    'extension', 'name' and 'metadata' entries of the metadata mapping).
    When the mapping has a truthy 'extra_data' entry, 'extra_data' is set
    to "<filename>_files". dataset_id is stored under 'base_dataset_id'
    when primary is true and under 'dataset_id' otherwise.

    Returns the JSON text terminated by a newline.
    """
    record = {
        'type': ds_type,
        'ext': metadata.get( 'extension' ),
        'filename': filename,
        'name': metadata.get( 'name' ),
        'metadata': metadata.get( 'metadata' ),
    }
    if metadata.get( 'extra_data', None ):
        record[ 'extra_data' ] = filename + '_files'
    id_key = 'base_dataset_id' if primary else 'dataset_id'
    record[ id_key ] = dataset_id
    return json.dumps( record ) + "\n"
96103d66b7af Properly handle extra data paths, write complete line separated JSON for Galaxy to scoop up and set metadata.
Matt Shirley <mdshw5@gmail.com>
parents: 3
diff changeset
82
96103d66b7af Properly handle extra data paths, write complete line separated JSON for Galaxy to scoop up and set metadata.
Matt Shirley <mdshw5@gmail.com>
parents: 3
diff changeset
83
96103d66b7af Properly handle extra data paths, write complete line separated JSON for Galaxy to scoop up and set metadata.
Matt Shirley <mdshw5@gmail.com>
parents: 3
diff changeset
def download_files_and_write_metadata(query_item, json_params, output_base_path, metadata_parameter_file, primary):
    """ Main work function that operates on the JSON representation of
    one dataset and its metadata. Returns True.

    query_item is one dataset description from the data source response
    ('url', 'extension' and optionally 'extra_data' are read here).
    json_params is the parsed tool job configuration.

    When primary is false the file is written to the configured output
    filename. When primary is true it is written under output_base_path
    using a name built by construct_multi_filename from the hda id, the
    sanitised URL and the extension.

    Any 'extra_data' entries are downloaded into "<target>_files", and one
    line of JSON metadata is appended to metadata_parameter_file.

    download_from_json_data assigns the True returned here back to its
    `primary` flag, so only the first item processed in a run is handled
    with primary false.
    """
    # Only three of the eight configuration values are needed here; the
    # underscore-prefixed names document the tuple layout.
    ( _dataset_url, output_filename,
      _extra_files_path, _file_name,
      _ext, _out_data_name,
      hda_id, dataset_id ) = set_up_config_values( json_params )
    extension = query_item.get( 'extension' )
    filename = query_item.get( 'url' )
    extra_data = query_item.get( 'extra_data', None )
    if primary:
        # Replace every character outside VALID_CHARS (including '_', the
        # field separator of the multi-output name) with '-'.
        filename = ''.join( c if c in VALID_CHARS else '-' for c in filename )
        name = construct_multi_filename( hda_id, filename, extension )
        target_output_filename = os.path.normpath( '/'.join( [ output_base_path, name ] ) )
    else:
        target_output_filename = output_filename
    download_from_query( query_item, target_output_filename )
    if extra_data:
        download_extra_data( extra_data, '_'.join( [ target_output_filename, 'files' ] ) )
    metadata_parameter_file.write( metadata_to_json( dataset_id, query_item,
                                                     target_output_filename,
                                                     primary=primary ) )
    return True
96103d66b7af Properly handle extra data paths, write complete line separated JSON for Galaxy to scoop up and set metadata.
Matt Shirley <mdshw5@gmail.com>
parents: 3
diff changeset
108
96103d66b7af Properly handle extra data paths, write complete line separated JSON for Galaxy to scoop up and set metadata.
Matt Shirley <mdshw5@gmail.com>
parents: 3
diff changeset
109
96103d66b7af Properly handle extra data paths, write complete line separated JSON for Galaxy to scoop up and set metadata.
Matt Shirley <mdshw5@gmail.com>
parents: 3
diff changeset
def set_up_config_values(json_params):
    """ Extract the configuration values the tool needs from the parsed
    job parameters.

    Reads 'URL' and 'output1' from json_params['param_dict'] and six fields
    from the first entry of json_params['output_data'].

    Returns the 8-tuple
    (dataset_url, output_filename, extra_files_path, file_name,
     ext, out_data_name, hda_id, dataset_id).
    output_filename is None when 'output1' is absent; a missing output
    field raises KeyError.
    """
    param_dict = json_params.get( 'param_dict' )
    dataset_url = param_dict.get( 'URL' )
    output_filename = param_dict.get( 'output1', None )
    first_output = json_params.get( 'output_data' )[0]
    output_keys = ( 'extra_files_path', 'file_name', 'ext',
                    'out_data_name', 'hda_id', 'dataset_id' )
    output_values = tuple( first_output[ key ] for key in output_keys )
    return ( dataset_url, output_filename ) + output_values
96103d66b7af Properly handle extra data paths, write complete line separated JSON for Galaxy to scoop up and set metadata.
Matt Shirley <mdshw5@gmail.com>
parents: 3
diff changeset
124
96103d66b7af Properly handle extra data paths, write complete line separated JSON for Galaxy to scoop up and set metadata.
Matt Shirley <mdshw5@gmail.com>
parents: 3
diff changeset
125
0
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
def download_from_json_data( options, args ):
    """ Parse the returned JSON data and download files. Write metadata
    to flat JSON file.

    options.json_param_file is the tool job configuration (JSON) and
    options.path is the directory used for additional outputs; args is
    accepted but not used.

    The job configuration supplies the data source URL and the path of the
    metadata file (job_config -> TOOL_PROVIDED_JOB_METADATA_FILE). The URL
    must return a JSON list whose items are dataset dicts or lists of
    dataset dicts; items of any other type are skipped. Every file handle
    and the HTTP response are closed even if a download fails.
    """
    output_base_path = options.path
    # read tool job configuration file and parse parameters we need
    with open( options.json_param_file, 'r' ) as param_file:
        json_params = json.loads( param_file.read() )
    # Only the URL is needed here; the per-item helper re-reads the rest.
    dataset_url = set_up_config_values( json_params )[0]
    # line separated JSON file to contain all dataset metadata
    metadata_parameter_file = open( json_params['job_config']['TOOL_PROVIDED_JOB_METADATA_FILE'], 'wb' )
    try:
        # get JSON response from data source
        # TODO: make sure response is not enormous
        response = urllib.urlopen( dataset_url )
        try:
            query_params = json.loads( response.read() )
        finally:
            response.close()
        # download and write files
        primary = False
        for query_item in query_params:
            if isinstance( query_item, list ):
                # TODO: do something with the nested list as a collection
                batch = query_item
            elif isinstance( query_item, dict ):
                batch = [ query_item ]
            else:
                continue
            for item in batch:
                primary = download_files_and_write_metadata( item, json_params, output_base_path,
                                                             metadata_parameter_file, primary )
    finally:
        metadata_parameter_file.close()
0
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
157
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
def __main__():
    """ Command-line entry point: parse the options and run the download.

    Options: -j/--json_param_file and -p/--path. Both are handed, together
    with any positional arguments, to download_from_json_data.

    Schema
    ------

        [ {"url":"http://url_of_file",
           "name":"encode WigData",
           "extension":"wig",
           "metadata":{"db_key":"hg19"},
           "extra_data":[ {"url":"http://url_of_ext_file",
                           "path":"rel/path/to/ext_file"}
                        ]
          }
        ]

    """
    # Parse the command line options
    usage = "Usage: json_data_source.py max_size --json_param_file filename [options]"
    parser = optparse.OptionParser( usage=usage )
    # ( short flag, long flag, destination attribute, help text )
    option_specs = (
        ( "-j", "--json_param_file", "json_param_file", "json schema return data" ),
        ( "-p", "--path", "path", "new file path" ),
    )
    for short_flag, long_flag, dest, help_text in option_specs:
        parser.add_option( short_flag, long_flag, type="string",
                           action="store", dest=dest, help=help_text )

    options, args = parser.parse_args()
    download_from_json_data( options, args )


if __name__ == "__main__":
    __main__()