Mercurial repository: melissacline / ucsc_xena_platform
File: ucsc_xena_download.py — changeset 0:8bb037f88ed2 ("Uploaded")
Author: melissacline
Date: Tue, 13 Jan 2015 23:37:23 -0500
Parents: none (initial revision); comparison against the null revision -1:000000000000.
1 #!/usr/bin/env python | |
2 import socket, urllib, sys, os | |
3 from galaxy import eggs #eggs needs to be imported so that galaxy.util can find docutils egg... | |
4 from galaxy.util.json import from_json_string, to_json_string | |
5 from galaxy.util import get_charset_from_http_headers | |
6 import galaxy.model # need to import model before sniff to resolve a circular import dependency | |
7 from galaxy.datatypes import sniff | |
8 import tarfile | |
9 import re | |
10 | |
# Map each Galaxy output slot to the regex that identifies its Xena dataset
# type (matched against the 'type' field of the tarball's .json metadata).
filemap = [
    ('genomic', r'genomic(Segment|Matrix)$'),
    ('clinical', r'clinicalMatrix$'),
]

# Output dataset paths supplied by the Galaxy tool wrapper on the command line.
files = {
    'genomic': sys.argv[1],
    'clinical': sys.argv[2]
}

# Maximum allowed download size in bytes; 0 (or a blank argument) disables the
# check. BUG FIX: sys.argv values are strings — the original kept this as a
# str, so the later `file_size > max_file_size` (int vs str) never triggered
# in Python 2, and the '%d' error format would have raised if it had.
max_file_size = int(sys.argv[3]) if sys.argv[3].strip() else 0
def file_type(file):
    """Return the dataset type declared in the JSON metadata file *file*.

    Reads the whole file, parses it as JSON, and returns its 'type' field.
    """
    with open(file) as handle:
        metadata = from_json_string(handle.read())
    return metadata['type']
26 | |
def stop_err( msg ):
    """Write *msg* to stderr and abort the script with a failure status.

    BUG FIX: the original called sys.exit() with no argument, which exits
    with status 0 and so reports success to the calling process even though
    an error occurred. Exit 1 instead.
    """
    sys.stderr.write( msg )
    sys.exit( 1 )
30 | |
def load_input_parameters( filename, erase_file = True ):
    """Parse the Galaxy-written parameter file at *filename*.

    First tries the JSON format (returning its 'param_dict' entry); if that
    fails for any reason, falls back to the legacy tab-separated
    key<TAB>value format. Returns (json_params, datasource_params), where
    json_params is None when the fallback path was taken. When *erase_file*
    is true, the file is truncated afterwards so the parameters are not
    left on disk.
    """
    datasource_params = {}
    try:
        # Use context managers so file handles are always closed (the
        # original leaked both handles).
        with open( filename, 'r' ) as fh:
            json_params = from_json_string( fh.read() )
        datasource_params = json_params.get( 'param_dict' )
    except Exception:
        # JSON parse failed: fall back to the legacy tab-separated format.
        json_params = None
        with open( filename, 'r' ) as fh:
            for line in fh:
                try:
                    fields = line.strip().split( '\t' )
                    datasource_params[ fields[0] ] = fields[1]
                except IndexError:
                    # Line has no tab-separated value; skip it (the original
                    # used a bare except for the same purpose).
                    continue
    if erase_file:
        open( filename, 'w' ).close() #open file for writing, then close, removes params from file
    return json_params, datasource_params
48 | |
def load_file(files):
    """Download the Xena data tarball described by the Galaxy parameter file.

    Reads the parameter file at files['genomic'] to obtain the download URL
    and HTTP method, fetches the remote resource, and streams the response
    body back over that same path (so the parameter file is replaced by the
    downloaded payload). Aborts via stop_err() on any failure.
    """
    filename = files['genomic']
    # erase_file=False: keep the parameters on disk; the download below
    # overwrites this same path anyway.
    job_params, params = load_input_parameters( filename, False )
    URL = params.get( 'URL', None ) #using exactly URL indicates that only one dataset is being downloaded
    URL_method = params.get( 'URL_method', None )
    # Allow up to 10 minutes for the remote Xena hub to respond.
    socket.setdefaulttimeout( 600 )
    try:
        if not URL_method or URL_method == 'get':
            page = urllib.urlopen( URL )
        elif URL_method == 'post':
            # POST sends the full parameter dict back to the data source.
            page = urllib.urlopen( URL, urllib.urlencode( params ) )
    except Exception, e:
        stop_err( 'The remote data source application may be off line, please try again later. Error: %s' % str( e ) )
    if max_file_size:
        # NOTE(review): ensure the module-level max_file_size is an int —
        # sys.argv gives a str, and in Python 2 an int > str comparison is
        # always False, so as originally written this limit never triggers
        # (and the '%d' format below would raise if it did).
        file_size = int( page.info().get( 'Content-Length', 0 ) )
        if file_size > max_file_size:
            stop_err( 'The size of the data (%d bytes) you have requested exceeds the maximum allowed (%d bytes) on this server.' % ( file_size, max_file_size ) )
    try:
        # Stream the HTTP body into the dataset path, honoring the response
        # charset; a raw os.open descriptor is handed to the sniff helper.
        cur_filename, is_multi_byte = sniff.stream_to_open_named_file( page, os.open( filename, os.O_WRONLY | os.O_CREAT ), filename, source_encoding=get_charset_from_http_headers( page.headers ) )
    except Exception, e:
        stop_err( 'Unable to fetch %s:\n%s' % ( URL, e ) )
70 | |
# Fetch the tarball; the 'genomic' output path doubles as the Galaxy
# parameter file and is overwritten by the download.
load_file(files)

# Unpack the Xena tarball into the working directory.
# NOTE(review): extractall() on a downloaded tarball is vulnerable to
# path-traversal ('../') member names; consider validating members first.
tar = tarfile.open(files['genomic'])
names = tar.getnames()
metafiles = [n for n in names if n.endswith('.json')]
tar.extractall()

# Each '<dataset>.json' metadata file describes the sibling data file
# '<dataset>'; pair each declared 'type' with that data-file name.
# (Loop variable renamed from 'file', which shadowed the builtin.)
withtype = [(file_type(meta), meta[0:-len(".json")]) for meta in metafiles]
try:
    # For each output slot, take the first dataset whose type matches the
    # slot's regex. BUG FIX: the original called .next() on the generator,
    # which is Python 2-only; the next() builtin (available since 2.6)
    # behaves identically and also works on Python 3.
    renames = [(next(n for (t, n) in withtype if re.search(pat, t)), name)
               for (name, pat) in filemap]
except StopIteration:
    stop_err( 'Missing required file type in tarball' )
for (frm, to) in renames:
    os.rename(frm, files[to])