annotate json_data_source.py @ 7:ac7b4cab83cd default tip

Fix syntax.
author Matt Shirley <mdshw5@gmail.com>
date Wed, 27 Aug 2014 09:34:17 -0400
parents 46b589e9747a
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
1 #!/usr/bin/env python
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
2 import json
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
3 import optparse
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
4 import urllib
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
5 import os.path
4
96103d66b7af Properly handle extra data paths, write complete line separated JSON for Galaxy to scoop up and set metadata.
Matt Shirley <mdshw5@gmail.com>
parents: 3
diff changeset
6 import os
96103d66b7af Properly handle extra data paths, write complete line separated JSON for Galaxy to scoop up and set metadata.
Matt Shirley <mdshw5@gmail.com>
parents: 3
diff changeset
7 from operator import itemgetter
0
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
8
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
# Number of bytes requested from the source per read while streaming a
# download to disk (2**20 bytes = 1 MiB).
CHUNK_SIZE = 2**20
# Characters kept as-is when a URL is turned into an output file name; every
# other character is replaced with '-'. Note that '_' is deliberately absent.
VALID_CHARS = '.-()[]0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ '
0
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
11
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
12
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
def chunk_write( source_stream, target_stream, source_method = "read", target_method="write", chunk_size=None ):
    """ Copy everything from source_stream to target_stream, one chunk at a time.

    source_method / target_method are the *names* of the methods used to pull
    a chunk from the source and push it to the target; they default to the
    file-like "read" and "write".

    chunk_size is the size passed to the source method on every call. When it
    is None (the default) the module-level CHUNK_SIZE is used, so existing
    callers are unaffected.

    Copying stops as soon as the source method returns a falsy value
    (e.g. an empty string at end of stream). Neither stream is closed here.
    """
    if chunk_size is None:
        chunk_size = CHUNK_SIZE
    read_chunk = getattr( source_stream, source_method )
    write_chunk = getattr( target_stream, target_method )
    while True:
        chunk = read_chunk( chunk_size )
        if not chunk:
            break
        write_chunk( chunk )
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
22
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
23
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
def deconstruct_multi_filename( multi_filename ):
    """ Split a 'primary_<id>_<name>_visible_<file_type>' file name into a dict.

    The returned dict maps 'primary', 'id', 'name', 'visible' and 'file_type'
    to the corresponding '_'-separated fields. If the file name has fewer than
    five fields only the leading keys are present.

    When there are more than five fields the surplus is assumed to come from
    underscores inside the name (the only free-form field written by
    construct_multi_filename), so the middle fields are re-joined into 'name'
    instead of shifting 'visible' and 'file_type' onto the wrong values.
    """
    keys = [ 'primary', 'id', 'name', 'visible', 'file_type' ]
    parts = multi_filename.split( '_' )
    if len( parts ) > len( keys ):
        # first two and last two fields are fixed; everything between is the name
        parts = parts[ :2 ] + [ '_'.join( parts[ 2:-2 ] ) ] + parts[ -2: ]
    return dict( zip( keys, parts ) )
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
27
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
28
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
def construct_multi_filename( id, name, file_type ):
    """ Build the file name 'primary_<id>_<name>_visible_<file_type>'.

    Implementation of *Number of Output datasets cannot be determined until
    tool run* from documentation_.

    .. _documentation: http://wiki.galaxyproject.org/Admin/Tools/Multiple%20Output%20Files

    Each argument is formatted with %s, so non-string values (such as an
    integer id) are accepted. The literal fields 'primary' and 'visible' are
    always used.
    """
    fields = ( 'primary', id, name, 'visible', file_type )
    return "%s_%s_%s_%s_%s" % fields
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
35
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
36
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
def download_from_query( query_data, target_output_filename ):
    """ Download the file described by query_data to target_output_filename.

    query_data is a dict; its 'url' value is opened and the response body is
    streamed with chunk_write into target_output_filename, which is opened in
    binary write mode (any existing content is overwritten).

    Both the response and the output file are closed even if the copy fails.
    """
    # urlopen lives in urllib.request on Python 3 and in urllib on Python 2
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen
    query_stream = urlopen( query_data.get( 'url' ) )
    try:
        with open( target_output_filename, 'wb' ) as output_stream:
            chunk_write( query_stream, output_stream )
    finally:
        query_stream.close()
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
47
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
48
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
def download_extra_data( query_ext_data, base_path ):
    """ Download any extra data defined in the JSON.

    query_ext_data is an iterable of dicts, each with a 'url' to fetch and a
    'path' giving where to store it *relative to base_path*. Missing
    directories (including base_path itself) are created as needed; an
    already existing directory is fine.

    NOTE: the "path" value comes from the remote JSON and is therefore
    untrusted. It is always interpreted relative to base_path (a leading '/'
    does not make it absolute), and a ValueError is raised before anything is
    downloaded if the normalised result would fall outside base_path, e.g.
    through '..' components. The check is purely lexical; symlinks already
    present below base_path are not resolved.

    Raises ValueError for an entry without a 'path' or with a 'path' that
    escapes base_path.
    """
    # urlopen lives in urllib.request on Python 3 and in urllib on Python 2
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen
    # trailing separator so that '/data/x_evil' is not accepted for base '/data/x'
    allowed_prefix = os.path.abspath( base_path ).rstrip( os.sep ) + os.sep
    for ext_data in query_ext_data:
        ext_path = ext_data.get( 'path' )
        if not ext_path:
            raise ValueError( "extra_data entry has no 'path': %r" % ( ext_data, ) )
        target = os.path.abspath( '/'.join( [ base_path, ext_path ] ) )
        if not target.startswith( allowed_prefix ):
            raise ValueError( "extra_data path %r escapes %r" % ( ext_path, base_path ) )
        target_dir = os.path.dirname( target )
        # os.makedirs raises if the directory exists, so only call it when needed
        if not os.path.isdir( target_dir ):
            os.makedirs( target_dir )
        query_stream = urlopen( ext_data.get( 'url' ) )
        try:
            with open( target, 'wb' ) as output_stream:
                chunk_write( query_stream, output_stream )
        finally:
            query_stream.close()
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
66
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
67
4
96103d66b7af Properly handle extra data paths, write complete line separated JSON for Galaxy to scoop up and set metadata.
Matt Shirley <mdshw5@gmail.com>
parents: 3
diff changeset
def metadata_to_json( dataset_id, metadata, filename, ds_type='dataset', primary=False):
    """ Return one newline-terminated line of JSON describing a dataset.

    metadata is the dict describing one item of the data source response; its
    'extension', 'name' and 'metadata' values are copied to the 'ext', 'name'
    and 'metadata' keys of the output (None when absent). filename and ds_type
    are stored under 'filename' and 'type'.

    If the item has a non-empty 'extra_data' entry, 'extra_files' is set to
    '<filename>_files'.
    NOTE(review): download_files_and_write_metadata stores the extra files in
    '<filename>files' (no underscore) -- confirm which of the two is intended.

    dataset_id is stored under 'base_dataset_id' when primary is true and
    under 'dataset_id' otherwise.
    """
    meta_dict = dict( type = ds_type,
                      ext = metadata.get( 'extension' ),
                      filename = filename,
                      name = metadata.get( 'name' ),
                      metadata = metadata.get( 'metadata' ) )
    if metadata.get( 'extra_data', None ):
        meta_dict[ 'extra_files' ] = '_'.join( [ filename, 'files' ] )
    id_key = 'base_dataset_id' if primary else 'dataset_id'
    meta_dict[ id_key ] = dataset_id
    return "%s\n" % json.dumps( meta_dict )
96103d66b7af Properly handle extra data paths, write complete line separated JSON for Galaxy to scoop up and set metadata.
Matt Shirley <mdshw5@gmail.com>
parents: 3
diff changeset
82
96103d66b7af Properly handle extra data paths, write complete line separated JSON for Galaxy to scoop up and set metadata.
Matt Shirley <mdshw5@gmail.com>
parents: 3
diff changeset
83
96103d66b7af Properly handle extra data paths, write complete line separated JSON for Galaxy to scoop up and set metadata.
Matt Shirley <mdshw5@gmail.com>
parents: 3
diff changeset
def download_files_and_write_metadata(query_item, json_params, output_base_path, metadata_parameter_file, primary):
    """ Main work function that operates on the JSON representation of
    one dataset and its metadata. Returns True.

    query_item is the dict for one dataset of the data source response,
    json_params the parsed tool job configuration, and metadata_parameter_file
    an open, writable file that receives one metadata line per dataset.

    When primary is false the data is written to the output file named in the
    job configuration and described as type 'dataset'. When primary is true a
    new file is created below output_base_path; its name is built by
    construct_multi_filename from the hda id, the item's URL (with every
    character outside VALID_CHARS replaced by '-') and the item's extension,
    and it is described as type 'new_primary_dataset'.

    The metadata line is written before the download starts. If the item has
    'extra_data', those files are downloaded into '<target file name>files'.
    NOTE(review): metadata_to_json advertises '<target file name>_files' (with
    an underscore) as the extra files location -- confirm which is intended.

    The return value is always True; the caller feeds it back as `primary`
    for the next item, so every item after the first is handled as primary.
    """
    # only three of the eight configuration values are needed here
    ( _dataset_url, output_filename,
      _extra_files_path, _file_name,
      _ext, _out_data_name,
      hda_id, dataset_id ) = set_up_config_values(json_params)
    extension = query_item.get( 'extension' )
    filename = query_item.get( 'url' )
    extra_data = query_item.get( 'extra_data', None )
    if primary:
        filename = ''.join( c if c in VALID_CHARS else '-' for c in filename )
        name = construct_multi_filename( hda_id, filename, extension )
        target_output_filename = os.path.normpath( '/'.join( [ output_base_path, name ] ) )
        ds_type = 'new_primary_dataset'
    else:
        target_output_filename = output_filename
        ds_type = 'dataset'
    metadata_parameter_file.write( metadata_to_json( dataset_id, query_item,
                                                     target_output_filename,
                                                     ds_type=ds_type,
                                                     primary=primary) )
    download_from_query( query_item, target_output_filename )
    if extra_data:
        extra_files_path = ''.join( [ target_output_filename, 'files' ] )
        download_extra_data( extra_data, extra_files_path )
    return True
96103d66b7af Properly handle extra data paths, write complete line separated JSON for Galaxy to scoop up and set metadata.
Matt Shirley <mdshw5@gmail.com>
parents: 3
diff changeset
114
96103d66b7af Properly handle extra data paths, write complete line separated JSON for Galaxy to scoop up and set metadata.
Matt Shirley <mdshw5@gmail.com>
parents: 3
diff changeset
115
96103d66b7af Properly handle extra data paths, write complete line separated JSON for Galaxy to scoop up and set metadata.
Matt Shirley <mdshw5@gmail.com>
parents: 3
diff changeset
def set_up_config_values(json_params):
    """ Extract the configuration values needed from the parsed job parameters.

    json_params is the dict loaded from the tool job configuration file.
    Returns the 8-tuple

        (dataset_url, output_filename, extra_files_path, file_name,
         ext, out_data_name, hda_id, dataset_id)

    where dataset_url and output_filename are the 'URL' and 'output1' entries
    of json_params['param_dict'] (None when absent) and the remaining six are
    read from the first element of json_params['output_data'].

    Raises KeyError if one of those six keys is missing from that element.
    """
    datasource_params = json_params.get( 'param_dict' )
    dataset_url = datasource_params.get( 'URL' )
    output_filename = datasource_params.get( 'output1', None )
    # only the first declared output is consulted
    first_output = json_params.get( 'output_data' )[0]
    output_values = itemgetter('extra_files_path', 'file_name', 'ext',
                               'out_data_name', 'hda_id', 'dataset_id')(first_output)
    return (dataset_url, output_filename) + output_values
96103d66b7af Properly handle extra data paths, write complete line separated JSON for Galaxy to scoop up and set metadata.
Matt Shirley <mdshw5@gmail.com>
parents: 3
diff changeset
130
96103d66b7af Properly handle extra data paths, write complete line separated JSON for Galaxy to scoop up and set metadata.
Matt Shirley <mdshw5@gmail.com>
parents: 3
diff changeset
131
0
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
def download_from_json_data( options, args ):
    """ Parse the returned JSON data and download files. Write metadata
    to flat JSON file.

    options must provide `json_param_file` (path of the tool job
    configuration file) and `path` (directory for additional output files).
    args is accepted for interface compatibility and is not used.

    The data source URL is taken from the job configuration; its response is
    expected to be a JSON list whose elements are dataset dicts or lists of
    dataset dicts. Elements of any other type are skipped. The first dataset
    goes to the predefined tool output, every later one becomes a new primary
    dataset.
    """
    # urlopen lives in urllib.request on Python 3 and in urllib on Python 2
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen
    output_base_path = options.path
    # read tool job configuration file and parse parameters we need
    with open( options.json_param_file, 'r' ) as param_file:
        json_params = json.load( param_file )
    dataset_url = set_up_config_values(json_params)[0]
    # line separated JSON file to contain all dataset metadata; opened in text
    # mode because the metadata lines are text, not bytes
    metadata_path = json_params['job_config']['TOOL_PROVIDED_JOB_METADATA_FILE']
    with open( metadata_path, 'w' ) as metadata_parameter_file:
        # get JSON response from data source
        # TODO: make sure response is not enormous
        response = urlopen( dataset_url )
        try:
            query_params = json.loads( response.read().decode( 'utf-8' ) )
        finally:
            response.close()
        # download and write files
        primary = False
        for query_item in query_params:
            if isinstance( query_item, list ):
                # TODO: do something with the nested list as a collection
                datasets = query_item
            elif isinstance( query_item, dict ):
                datasets = [ query_item ]
            else:
                continue
            for dataset in datasets:
                primary = download_files_and_write_metadata(dataset, json_params, output_base_path,
                                                            metadata_parameter_file, primary)
0
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
163
44119c5d8cc6 Initial commit.
Matt Shirley <mdshw5@gmail.com>
parents:
diff changeset
def __main__():
    """ Read the JSON return from a data source. Parse each line and request
    the data, download to "newfilepath", and write metadata.

    Command line options are -j/--json_param_file (required, path of the tool
    job configuration file) and -p/--path (directory for new files). The
    parser exits with a usage error when --json_param_file is missing.

    Schema
    ------

        [ {"url":"http://url_of_file",
           "name":"encode WigData",
           "extension":"wig",
           "metadata":{"dbkey":"hg19"},
           "extra_data":[ {"url":"http://url_of_ext_file",
                           "path":"rel/path/to/ext_file"}
                        ]
          }
        ]

    """
    # Parse the command line options
    usage = "Usage: json_data_source.py max_size --json_param_file filename [options]"
    parser = optparse.OptionParser(usage = usage)
    parser.add_option("-j", "--json_param_file", type="string",
                      action="store", dest="json_param_file", help="json schema return data")
    parser.add_option("-p", "--path", type="string",
                      action="store", dest="path", help="new file path")

    (options, args) = parser.parse_args()
    if not options.json_param_file:
        # fail with a usage message instead of a TypeError from open(None)
        parser.error( "--json_param_file is required" )
    download_from_json_data( options, args )


if __name__ == "__main__":
    __main__()