#!/usr/bin/env python
import socket, urllib, sys, os
from galaxy import eggs  # eggs needs to be imported so that galaxy.util can find the docutils egg
from galaxy.util.json import from_json_string, to_json_string
from galaxy.util import get_charset_from_http_headers
import galaxy.model  # need to import model before sniff to resolve a circular import dependency
from galaxy.datatypes import sniff
import tarfile
import re

# Map each Galaxy output to a regex matched against the 'type' field of the
# metadata files in the downloaded tarball.
filemap = [
    ('genomic', r'genomic(Segment|Matrix)$'),
    ('clinical', r'clinicalMatrix$'),
]

files = {
    'genomic': sys.argv[1],
    'clinical': sys.argv[2]
}

max_file_size = int( sys.argv[3] )  # argv[3] arrives as a string; coerce so the size comparison below works

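# For reference, a sketch of the tarball layout this script expects (the
# dataset names here are illustrative, not taken from a real archive): a data
# file such as 'HiSeqV2' travels with a metadata file 'HiSeqV2.json' whose
# 'type' field (e.g. 'genomicMatrix') is matched against the patterns above.
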
def file_type(file):
    # Each data file in the tarball is described by a matching .json metadata
    # file; its 'type' field identifies the dataset kind.
    with open(file) as f:
        return from_json_string(f.read())['type']

def stop_err( msg ):
    sys.stderr.write( msg )
    sys.exit( 1 )

def load_input_parameters( filename, erase_file = True ):
    datasource_params = {}
    try:
        # Preferred format: a JSON blob containing a 'param_dict' entry.
        json_params = from_json_string( open( filename, 'r' ).read() )
        datasource_params = json_params.get( 'param_dict' )
    except:
        # Fallback format: tab-separated key/value pairs, one per line.
        json_params = None
        for line in open( filename, 'r' ):
            try:
                line = line.strip()
                fields = line.split( '\t' )
                datasource_params[ fields[0] ] = fields[1]
            except:
                continue
    if erase_file:
        open( filename, 'w' ).close()  # open file for writing, then close; removes params from file
    return json_params, datasource_params

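# For reference, a sketch of the two parameter-file formats accepted above
# (the URL value is illustrative):
#
#   {"param_dict": {"URL": "https://example.org/download", "URL_method": "get"}}
#
# or, in the legacy tab-separated fallback, one key/value pair per line:
#
#   URL<TAB>https://example.org/download
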
def load_file(files):
    filename = files['genomic']
    job_params, params = load_input_parameters( filename, False )
    URL = params.get( 'URL', None )  # using exactly 'URL' indicates that only one dataset is being downloaded
    URL_method = params.get( 'URL_method', None )
    if not URL:
        stop_err( 'The remote data source application has not sent back a URL parameter in the request.' )
    socket.setdefaulttimeout( 600 )
    try:
        if not URL_method or URL_method == 'get':
            page = urllib.urlopen( URL )
        elif URL_method == 'post':
            page = urllib.urlopen( URL, urllib.urlencode( params ) )
    except Exception, e:
        stop_err( 'The remote data source application may be offline, please try again later. Error: %s' % str( e ) )
    if max_file_size:
        file_size = int( page.info().get( 'Content-Length', 0 ) )
        if file_size > max_file_size:
            stop_err( 'The size of the data (%d bytes) you have requested exceeds the maximum allowed (%d bytes) on this server.' % ( file_size, max_file_size ) )
    try:
        # Stream the HTTP response into the output file, honoring the charset
        # advertised in the response headers.
        cur_filename, is_multi_byte = sniff.stream_to_open_named_file( page, os.open( filename, os.O_WRONLY | os.O_CREAT ), filename, source_encoding=get_charset_from_http_headers( page.headers ) )
    except Exception, e:
        stop_err( 'Unable to fetch %s:\n%s' % ( URL, e ) )


# Download the tarball to the 'genomic' output path, then unpack it in place;
# the renames below overwrite that path with the actual genomic data file.
load_file(files)

tar = tarfile.open(files['genomic'])
names = tar.getnames()
metafiles = [n for n in names if n.endswith('.json')]
tar.extractall()

# Pair each data file (the metadata file name minus '.json') with its type.
withtype = [(file_type(fn), fn[0:-len('.json')]) for fn in metafiles]
try:
    # For each required output, take the first extracted file whose type
    # matches the pattern, and pair it with the Galaxy output path.
    renames = [(next(n for (t, n) in withtype if re.search(pat, t)), name) for (name, pat) in filemap]
except StopIteration:
    stop_err( 'Missing required file type in tarball' )
for (frm, to) in renames:
    os.rename(frm, files[to])
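
# A minimal sketch of how this script is invoked (the script and file names
# are hypothetical; in practice the Galaxy tool wrapper supplies real paths):
#
#   python xena_import.py genomic_output.dat clinical_output.dat 50000000
#
# argv[1] and argv[2] are the Galaxy output dataset paths for the genomic and
# clinical matrices; argv[3] is the maximum allowed download size in bytes
# (0 disables the size check).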